In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.datasets import fetch_openml
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
from scipy.stats import mode 
from scipy.spatial.distance import cdist 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
from random import randint

# PROBLEM 1: KMeans Theory

# A) 

Proof:

Given the centroids $(\mu)$, the E-step in K-means clustering updates each $(\pi)$'s membership in the cluster. K-means seeks to reduce the sum of squares inside a cluster. the separation between each datapoint and the centroid that is allocated to it. The E-step distributes the new clusters in a way that minimizes the sum of squares for point $X_i$.

$ \frac{\partial J}{\partial \pi_{ik}} = \sum_{i} \sum_{k} \pi_{ik} \||X_{i} - \mu_{k}||^{2}  $

-> $\pi_{ik}$ = {1 if k = $argmin_{j}||x^{i}$ - $\mu_{j}||^{2}$, else 0}


As a result of each data point being allocated to the centroid that is closest to it, this phase makes sure that the total of squares inside the cluster is reduced. The E-step changes $(\pi)$ to meet the minimal objective given the current centroid by reducing the within cluster sum of squares.

# B)

Proof:

J = $ \sum_{i} \sum_{k} \pi_{ik} \cdot ||X_{i} - \mu_{k}||^{2} $

 $ \frac{\partial J}{\partial \mu_{k}}$ = $\frac{\partial (\sum_{i} \sum_{k} \pi_{ik} \cdot ||X_{i} - \mu_{k}||^{2})}{\partial \mu_{k}}$

$\frac{\partial J}{\partial \mu_{k}}$ = $ (-2  \cdot \sum_{i} \cdot ||X_{i} - \mu_{k}||^{2})$

0 = $(-2) \cdot \sum_{i} \cdot \pi_{ik} \cdot (X_{i} - \mu_{k}) $

0 = $ \sum_{i}(\pi_{ik} \cdot X_{i}) -  \sum_{i} (\pi_{ik} \cdot \mu_{k})$

$\mu_{k}$ = $\frac{\sum_{i}(\pi_{ik} \cdot X_{i})}{\sum_{i} (\pi_{ik})}$

# C)

The algorithm does not necessarily converge to the global minimum objective value because it is sensitive to the initial placement of the cluster centers and not necessarily a convex function. KMeans algorithm is a hill-climbing optimization technique, which get stuck in a local minima, a solution optimal for the current starting condition but not necessarily optimal for the overall problem. In other words, the algorithm may converge to a suboptimal solution if the initial placement of the cluster centers does not correspond to the global minimum objective value.

# KMeans Implementation

In [None]:
import numpy as np
from scipy.spatial.distance import cdist 
 

def kmeans(x,k, max_iter):
    idx = np.random.choice(len(x), k, replace = False)

    #Randomly choosing Centroids 
    centroids = x[idx, :] 
     
    #finding the distance between centroids and all the data points
    distances = cdist(x, centroids ,'euclidean') 
     
    #Centroid with the minimum Distance
    points = np.array([np.argmin(i) for i in distances]) 
     
    for _ in range(max_iter): 
        centroids = []
        for idx in range(k):
            temp_cent = x[points==idx].mean(axis=0) 
            centroids.append(temp_cent)
 
        centroids = np.vstack(centroids) #Updated Centroids 
         
        distances = cdist(x, centroids ,'euclidean')
        points = np.array([np.argmin(i) for i in distances])
         
    return points, centroids

In [None]:
def kmeans_test(x_test,centroids):

    distances = cdist(x_test, centroids ,'euclidean') 
    predictions = np.array([np.argmin(i) for i in distances]) 
    return predictions

# MNIST

In [None]:
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
X_train = X_train.astype("float32") / 255
X_test = X_test.astype("float32") / 255
X_train = X_train.reshape(-1,784)
X_test = X_test.reshape(-1,784)

In [None]:
def calculate_dist(k, max_iter , X_train, X_test):
  points,centroids=kmeans(X_train, k, max_iter)
  preds=kmeans_test(X_test, centroids)
  dist = np.linalg.norm(centroids[preds]- X_test)
  return dist

# K = 5

In [None]:
# Perform K-means clustering on training data
points, centroids = kmeans(X_train, 5, 100)

# Assign test data to clusters based on centroids
preds = kmeans_test(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(5):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    corr_true = mode(cluster_labels)
    if len(corr_true.mode) > 0:
        corr_true = corr_true.mode[0]
    else:
        # choose an alternative value if the mode is empty
        corr_true = ...
    dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds]

# Calculate the confusion matrix
mat = confusion_matrix(pred, y_test)

In [None]:
def G(mat):
    n = mat.shape[0]
    G = 0
    for i in range(n):
        cluster_sum = np.sum(mat[i])
        if cluster_sum == 0:
            continue
        prob = mat[i] / cluster_sum
        G = 1- np.sum(prob ** 2)
    return G

# Calculate the Gini index
G = G(mat)
print("Gini impurity is: ", G)

Gini impurity is:  0.7331416260598607


In [None]:
def purity(mat):
    n = mat.shape[0]
    purity = 0
    for i in range(n):
        cluster_sum = np.sum(mat[i])
        if cluster_sum == 0:
            continue
        proportions = mat[i] / cluster_sum
        majority_proportion = np.max(proportions)
        purity += majority_proportion
    return purity / n

# Calculate the Purity
purity = purity(mat)
print("Purity value is: ", purity)

Purity value is:  0.26013659308530734


# K = 10

In [None]:
dist = calculate_dist(10, 100, X_train, X_test)
print("Euclidean distance: ", dist)

In [None]:
points, centroids = kmeans(X_train, 10, 100)

preds = kmeans_test(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(10):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    corr_true = mode(cluster_labels)
    if len(corr_true.mode) > 0:
        corr_true = corr_true.mode[0]
    else:
        # choose an alternative value if the mode is empty
        corr_true = ...
    dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds]


mat = confusion_matrix(pred, y_test)

In [None]:
def G(mat):
    n = mat.shape[0]
    G = 0
    for i in range(n):
        cluster_sum = np.sum(mat[i])
        if cluster_sum == 0:
            continue
        prob = mat[i] / cluster_sum
        G = 1- np.sum(prob ** 2)
    return G

# Calculate the Gini index
G = G(mat)
print("Gini impurity is: ", G)

Gini impurity is:  0.5729176260932288


In [None]:
def purity(mat):
    n = mat.shape[0]
    purity = 0
    for i in range(n):
        cluster_sum = np.sum(mat[i])
        if cluster_sum == 0:
            continue
        proportions = mat[i] / cluster_sum
        majority_proportion = np.max(proportions)
        purity += majority_proportion
    return purity / n

# Calculate the Purity
purity = purity(mat)
print("Purity value is: ", purity)

Purity value is:  0.5018118721055854


# K = 20

In [None]:
points, centroids = kmeans(X_train, 20, 100)

preds = kmeans_test(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(20):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    corr_true = mode(cluster_labels)
    if len(corr_true.mode) > 0:
        corr_true = corr_true.mode[0]
    else:
        # choose an alternative value if the mode is empty
        corr_true = ...
    dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds]

mat = confusion_matrix(pred, y_test)

Gini index is:  0.4177644189187681
Purity value is:  0.8988314967590009


# FASHION Dataset

In [None]:
(X_train, y_train), (X_test, y_test)=keras.datasets.fashion_mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
X_train = X_train.astype("float32") / 255
X_test = X_test.astype("float32") / 255
X_train = X_train.reshape(-1,784)
X_test = X_test.reshape(-1,784)

# K = 5

In [None]:
points, centroids = kmeans(X_train, 5, 100)
preds = kmeans_test(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(5):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    corr_true = mode(cluster_labels)
    if len(corr_true.mode) > 0:
        corr_true = corr_true.mode[0]
    else:
        # choose an alternative value if the mode is empty
        corr_true = ...
    dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds]


mat = confusion_matrix(pred, y_test)

Gini index is:  0.8454950949493623
Purity value is:  0.7161713003976493


# K = 10

In [None]:
points, centroids = kmeans(X_train, 10, 100)
preds = kmeans_test(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(10):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    corr_true = mode(cluster_labels)
    if len(corr_true.mode) > 0:
        corr_true = corr_true.mode[0]
    else:
        # choose an alternative value if the mode is empty
        corr_true = ...
    dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds]

mat = confusion_matrix(pred, y_test)

Gini index is:  0.6316574296987714
Purity value is:  0.8881901002963009


# K = 20

In [None]:
points, centroids = kmeans(X_train, 20, 100)


preds = kmeans_test(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(20):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    corr_true = mode(cluster_labels)
    if len(corr_true.mode) > 0:
        corr_true = corr_true.mode[0]
    else:
        # choose an alternative value if the mode is empty
        corr_true = ...
    dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds]

# Calculating confusion matrix
mat = confusion_matrix(pred, y_test)

Gini index is:  0.48077835519197615
Purity value is:  0.9872704080933963


# 20NG

In [None]:
import numpy as np
from scipy.spatial.distance import cdist

def kmeans_sparse(X, k, no_of_iterations):
    idx = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[idx, :]
    
    for _ in range(no_of_iterations):
        distances = cdist(X, centroids, metric='euclidean')
        points = np.argmin(distances, axis=1)
        new_centroids = np.array([X[points==i].mean(axis=0) for i in range(k)])
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return centroids

def kmeans_test_sparse(X_test, centroids):
    distances = cdist(X_test, centroids, metric='euclidean')
    predictions = np.argmin(distances, axis=1)
    return predictions

def calculate_dist_sparse(k, iterations, X_train, X_test):
    centroids = kmeans_sparse(X_train, k, iterations)
    predictions = kmeans_test_sparse(X_test, centroids)
    return np.sum([np.linalg.norm(X_test[i]- centroids[predictions[i]]) for i in range(X_test.shape[0])])

In [None]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer(stop_words = 'english')
X_train = vectorizer.fit_transform(newsgroups_train.data)

In [None]:
X_train = np.array(X_train.todense())

In [None]:
newsgroups_test = fetch_20newsgroups(subset='test')
X_test = vectorizer.transform(newsgroups_test.data)
X_test = np.array(X_test.todense())

In [None]:
y_test = newsgroups_test.target

In [None]:
X_train = X_train[:10000]
X_test = X_test[:10000]
y_test = y_test[:10000]

# K = 5

In [None]:
import numpy as np
from scipy.spatial.distance import cdist
from collections import Counter
from sklearn.metrics import confusion_matrix

centroids = kmeans_sparse(X_train, 5, 1000)
preds = kmeans_test_sparse(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(5):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    if len(cluster_labels) > 0:
        corr_true = Counter(cluster_labels).most_common(1)[0][0]
        dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds if p in dct]

# Calculating confusion matrix
mat = confusion_matrix(pred, y_test)

# Calculating frequency matrix
M = np.sum(mat, axis=1)
M = np.expand_dims(M, axis=0)
M = np.repeat(a=M, repeats = 20, axis=0).T

# Calculating Gini index
G = 1 - np.sum(np.square(mat / (M + 1e-6)), axis=1)
print("Gini index is: ", np.sum(np.multiply(G, M)) / np.sum(M))

# Calculating Purity
E = -np.sum(np.multiply((mat / (M + 1e-6)), np.log((mat / (M + 1e-6) + 1e-6))), axis=1)
print("Purity value is: ", np.sum(np.multiply(E, M)) / np.sum(M))

Gini index is:  0.9702607005168902
Purity value is:  0.5892332975550049


# K = 10

In [None]:
centroids = kmeans_sparse(X_train, 10, 1000)
preds = kmeans_test_sparse(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(10):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    if len(cluster_labels) > 0:
        corr_true = Counter(cluster_labels).most_common(1)[0][0]
        dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds if p in dct]

# Calculating confusion matrix
mat = confusion_matrix(pred, y_test)

# Calculating frequency matrix
M = np.sum(mat, axis=1)
M = np.expand_dims(M, axis=0)
M = np.repeat(a=M, repeats = 20, axis=0).T

# Calculating Gini index
G = 1 - np.sum(np.square(mat / (M + 1e-6)), axis=1)
print("Gini index is: ", np.sum(np.multiply(G, M)) / np.sum(M))

# Calculating Purity
E = -np.sum(np.multiply((mat / (M + 1e-6)), np.log((mat / (M + 1e-6) + 1e-6))), axis=1)
print("Purity value is: ", np.sum(np.multiply(E, M)) / np.sum(M))

Gini index is:  0.9012154616187498
Purity value is:  0.8153369171179389


# K = 20

In [None]:
from collections import Counter
centroids = kmeans_sparse(X_train, 20, 1000)
preds = kmeans_test_sparse(X_test, centroids)

# Calculate the mode of each cluster
dct = {}
for i in range(20):
    cluster_indices = np.where(preds == i)
    cluster_labels = y_test[cluster_indices]
    if len(cluster_labels) > 0:
        corr_true = Counter(cluster_labels).most_common(1)[0][0]
        dct[i] = corr_true

# Map each prediction to the mode of its corresponding cluster
pred = [dct[p] for p in preds if p in dct]

# Calculating confusion matrix
mat = confusion_matrix(pred, y_test)

In [None]:
def G(mat):
    n = mat.shape[0]
    G = 0
    for i in range(n):
        cluster_sum = np.sum(mat[i])
        if cluster_sum == 0:
            continue
        prob = mat[i] / cluster_sum
        G = 1- np.sum(prob ** 2)
    return G

# Calculate the Gini index
G = G(mat)
print("Gini impurity is: ", G)

Gini impurity is:  0.318181818181818


In [None]:
def purity(mat):
    n = mat.shape[0]
    purity = 0
    for i in range(n):
        cluster_sum = np.sum(mat[i])
        if cluster_sum == 0:
            continue
        proportions = mat[i] / cluster_sum
        majority_proportion = np.max(proportions)
        purity += majority_proportion
    return purity / n

# Calculate the Purity
purity = purity(mat)
print("Purity value is: ", purity)

Purity value is:  0.3770146664896794


# PROBLEM 3 : Gaussian Mixture on toy data

In [None]:
def parse_data ():

  with open("/content/drive/MyDrive/2gaussian.txt", 'r') as f:
    lines = f.readlines()
    X = []
    for line in lines:
        data = [float(line.split(' ')[0]), float(line.split(' ')[1])]
        X.append(data)
  return np.array(X)

In [None]:
X = parse_data()
print(X)

[[7.57104365 3.53027417]
 [7.33721752 4.26271316]
 [3.07182783 1.11801871]
 ...
 [5.61639331 3.77793239]
 [8.59215378 3.6349037 ]
 [3.02221288 3.78337346]]


In [None]:
def gaussian_mixture(X, mu, cov) -> np.array:
    n = X.shape[1]
    difference = (X - mu).T

    base = 1 / ((2 * np.pi) ** (n / 2) * np.linalg.det(cov) ** 0.5)
    exponent_value = -0.5 * np.dot(np.dot(difference.T, np.linalg.inv(cov)), difference)
    exponent = np.exp(exponent_value)

    return np.diagonal( base *  exponent).reshape(-1, 1)

In [None]:
def initialize_clusters(X, k) -> np.array:
    
    pi = [ 1/k for i in range(0,k) ]
    mu = [ X[randint(0,len(X)-1),:] for i in range(0,k) ]
    cov = [ [ np.identity(X.shape[1] ,dtype = np.float64) ] for i in range(0,k) ]
    
    clusters = []
    for i in range(k):
        cluster = {}
        cluster['pi'] = pi[i]
        cluster['mu'] = mu[i]
        cluster['cov'] = cov[i]
        clusters.append(cluster)
    
    return clusters

In [None]:
def E_step(X, clusters) -> dict:
    expectation = np.zeros((X.shape[0], 1), dtype = np.float64)    
    
    for cluster in clusters:
        pi = cluster['pi']
        mu = cluster['mu']
        cov = cluster['cov']

        weight = (pi * gaussian_mixture(X, mu, cov)).astype(np.float64)
        
        for i in range(X.shape[0]):
            expectation[i] += weight[i]
        
        cluster['weight'] = weight
        cluster['expectation'] = expectation

    for cluster in clusters:
        cluster['weight'] /= cluster['expectation']

    return cluster

In [None]:
def M_step(X, clusters) -> dict:
    X_len = float(X.shape[0])
  
    for cluster in clusters:
        weight = cluster['weight']
        cov = np.zeros((X.shape[1], X.shape[1]))
        sum_weights = np.sum(weight, axis=0)
        pi = sum_weights / X_len
        mu = np.sum(weight * X, axis=0) / sum_weights
        
        for i in range(X.shape[0]):
            difference = (X[i] - mu).reshape(-1, 1)
            cov += weight[i] * np.dot(difference, difference.T)
        cov = cov / sum_weights
        
        cluster['pi'] = pi
        cluster['mu'] = mu
        cluster['cov'] = cov

    return clusters

In [None]:
def get_likelihood(X, clusters) -> list:

    likelihoods = np.log(np.array([cluster['expectation'] for cluster in clusters]))
    sum_log_likelihood = np.sum(likelihoods)

    return [sum_log_likelihood, likelihoods]

In [None]:
k = 2
cycles = 100

clusters = initialize_clusters(X, k = 2)
likelihoods = np.zeros((cycles, ))

updated_likelihood = 0 
for i in range(cycles):
  E_step(X, clusters)
  M_step(X, clusters)

  result = get_likelihood(X, clusters)
  likelihood, sample_likelihoods = result[0], result[1]

  if likelihood == updated_likelihood: break
  else: 
    updated_likelihood = likelihood
    #print('Cycle: ', i + 1, '  |  Likelihood: ', likelihood)

n = 0
clusters
for cluster in clusters:
  n += 1
  print('\nCluster  :  ', n )
  mu =  cluster['mu'] 
  cov = cluster['cov']

  print('Mean : ', mu)
  print('Cov_matrix : \n', np.array(cov))


Cluster  :   1
Mean :  [2.99413184 3.0520966 ]
Cov_matrix : 
 [[1.01023429 0.02719139]
 [0.02719139 2.93782295]]

Cluster  :   2
Mean :  [7.01314832 3.98313419]
Cov_matrix : 
 [[0.97475891 0.4974703 ]
 [0.4974703  1.00114259]]


In [None]:
n1 = n2 = 0

for i in range(X.shape[0]):
    if clusters[0]['weight'][i][0] >= clusters[1]['weight'][i][0]:
        n1 += 1
    else:
        n2 += 1
        
print("Number of data points in Cluster 1:", n1)
print("Number of data points in Cluster 2:", n2)

Number of data points in Cluster 1: 1991
Number of data points in Cluster 2: 4009


In [None]:
def parse_data_ ():

  with open("/content/drive/MyDrive/3gaussian.txt", 'r') as f:
    lines = f.readlines()
    X_3g = []
    for line in lines:
        data = [float(line.split(' ')[0]), float(line.split(' ')[1])]
        X_3g.append(data)
  return np.array(X_3g)

In [None]:
X_3g = parse_data_()
print(X_3g)

[[2.94693347 3.16222499]
 [5.98399602 4.84671738]
 [5.30142995 8.16811309]
 ...
 [6.27055168 2.83700248]
 [5.27935185 7.87197636]
 [7.26196796 4.58568396]]


In [None]:
k = 3
cycles = 220

clusters = initialize_clusters(X_3g, k = 3)
likelihoods = np.zeros((cycles, ))

updated_likelihood = 0 
for i in range(cycles):
  E_step(X_3g, clusters)
  M_step(X_3g, clusters)

  result = get_likelihood(X_3g, clusters)
  likelihood, sample_likelihoods = result[0], result[1]

  if likelihood == updated_likelihood: break
  else: 
    updated_likelihood = likelihood
    #print('Cycle: ', i + 1, '  |  Likelihood: ', likelihood)

n = 0
clusters
for cluster in clusters:
  n += 1
  print('\nCluster  :  ', n )
  mu =  cluster['mu'] 
  cov = cluster['cov']

  print('Mean : ', mu)
  print('Cov_matrix : \n', np.array(cov))


Cluster  :   1
Mean :  [3.03968828 3.0484741 ]
Cov_matrix : 
 [[1.02849913 0.0268159 ]
 [0.0268159  3.38466419]]

Cluster  :   2
Mean :  [5.01172171 7.00146622]
Cov_matrix : 
 [[0.97972161 0.18516294]
 [0.18516294 0.97455232]]

Cluster  :   3
Mean :  [7.02156142 4.01546065]
Cov_matrix : 
 [[0.99041327 0.50095954]
 [0.50095954 0.99564873]]


In [None]:
n1 = n2 = n3 = 0

for i in range(X_3g.shape[0]):
    if clusters[0]['weight'][i][0] >= clusters[1]['weight'][i][0] and clusters[0]['weight'][i][0] >= clusters[2]['weight'][i][0]:
        n1 += 1
    elif clusters[1]['weight'][i][0] >= clusters[0]['weight'][i][0] and clusters[1]['weight'][i][0] >= clusters[2]['weight'][i][0]:
        n3 += 1
    else:
        n2 += 1
        
print("Number of data points in Cluster 1:", n1)
print("Number of data points in Cluster 2:", n2)
print("Number of data points in Cluster 3:", n3)

Number of data points in Cluster 1: 1964
Number of data points in Cluster 2: 3004
Number of data points in Cluster 3: 5032


# PROBLEM 4 : Gaussian Mixture on real data

# Fashion Dataset

In [None]:
(X_train, y_train), (X_test, y_test)=keras.datasets.fashion_mnist.load_data()
X_train = X_train.astype("float32") / 255
X_test = X_test.astype("float32") / 255
X_train = X_train.reshape(-1,784)
X_test = X_test.reshape(-1,784)

In [None]:
arrays=[]
for i in range(10):
  arrays.append(np.array(X_train)[np.where(y_train == i)])

In [None]:
proba=[]
for k in range(10):
  gm = GaussianMixture(n_components = 10, random_state = 42, covariance_type = "diag").fit(arrays[k])
  score = gm.score_samples(np.array(X_test))
  probs = score * len(arrays[k])
  proba.append(probs)

In [None]:
preds = np.argmax(np.array(proba), axis=0)

In [None]:
print("Accuracy on test data: ", accuracy_score(preds, y_test)*100 )

Accuracy on test data:  74.37


# Spambase Dataset

In [None]:
data_spam = pd.read_csv('spambase.data', header=None)
data_spam.rename(columns={57:'is_spam'}, inplace=True)

In [None]:
data_spam

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,is_spam
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [None]:
X = data_spam.drop(['is_spam'], axis = 1)
y = data_spam['is_spam']

In [None]:
# Scale the features, as the original values have wide ranges
X = StandardScaler().fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, stratify = y)

In [None]:
ham = np.array(X_train)[np.where(y_train == 1)]

In [None]:
spam = np.array(X_train)[np.where(y_train == 0)]

In [None]:
gm_ham = GaussianMixture(n_components = 2, random_state = 0, covariance_type = "diag").fit(ham)

In [None]:
gm_spam = GaussianMixture(n_components = 7, random_state = 0, covariance_type = "diag").fit(spam)

In [None]:
pred_ham = gm_ham.score_samples(np.array(X_test))

In [None]:
pred_spam = gm_spam.score_samples(np.array(X_test))

In [None]:
preds=[]

for i in range(len(X_test)):
  if(pred_ham[i]*len(ham) > pred_spam[i]*len(spam)):
    preds.append(1)
  else:
    preds.append(0)

In [None]:
print("Accuracy on test data: ", accuracy_score(preds, y_test)*100 )  

Accuracy on test data:  90.33659066232356
