In [67]:
import numpy as np
import copy
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
import pandas as pd

In [68]:

n=100
#number of samples per class: n
k=2
#number of classes: k
d=2
#dimensionality of samples: d
mus=np.random.random((k,d))*20-10 
#mean of the multivariate normal distribution | mus[i,j] is the mean for the ith cluster at the jth dimension
# sigma=[np.eye(d) for i in range(k)]
#standard deviations are all assumed to be one. As the covariance matrix is identity the dimensions can be sampled independently
# Pc=np.ones(k)/k
#setting priors: Pc

#Data Generation
X=[]
for i in range(n):
    for j in range(k):
#for each class generate n tuples of size d, taken from the distribution N(mus[j],1)
        point=[mean+np.random.randn() for mean in mus[j]]
        X.append(point)
#we have our dataset, where each point is a d+1 dimensional tuple where the last position represents its class 

X = np.asarray(X)

In [77]:
def w_mat(i,j):

    # A nxd matrix holding whose elements are tuples of the type (i,j)
    # this matrix acts as pointer to respective to elements in  X, new_mus,new_cov

    # a = multivariate_normal.pdf(\
    #         X[index_matrix[i][j][0]],\
    #         mean=new_mus[index_matrix[i][j][1]],\
    #         cov=new_cov[index_matrix[i][j][1]],\
    #         allow_singular=True\
    #     )
    # b = new_priors[index_matrix[i][j][1]]

    a = multivariate_normal.pdf(X[i],mean=new_mus[j],cov=new_cov[j],allow_singular=True)
    b = new_priors[j]
    # print(i,j,a*b)
    return a*b

def tabulate(x, y, f):
   """Return a table of f(x, y)."""
   #* is to unpack the two arrays which results after meshing
   return np.vectorize(f)(* np.meshgrid(x, y))


def cov_loop(n,m):
    cov_var = np.zeros((X.shape[1],X.shape[1]),dtype=float)
    for j in range(n):
        cov_var += (W[m][j]/sum_w[m]) * (((X[j] - new_mus[m]).T) @ (X[j] - new_mus[m]))
    return cov_var

In [70]:

#dimension of the dataset
d = X.shape[1]

#number of points in the dataset
n = X.shape[0]

# index_matrix = np.empty((n,k),dtype=(int,2))
# for ii in range(n):
#     for jj in range(k):
#         index_matrix[ii][jj] = (ii,jj)

'''initialising means ---------------------------------'''
# random means
# old_mus = np.array([np.random.rand(d) for _ in range(k)])

# select random datapoints from data as initial means
# random_rows = np.random.choice(X.shape[0], size=k, replace=False)
# old_mus = copy.deepcopy(X[random_rows, :])

# choose means from unifrom random between 'a' and 'b' 
old_mus = np.random.random((k,d))*20-10 
# old_mus = np.random.random((k,d))*np.max(X)-np.min(X)


new_mus = copy.deepcopy(old_mus)
print('original',mus)
print("initial", new_mus)
'''-------------------------------------------------------'''


'''initalising covariance matrices-------------------------'''
#identity matrices of dxd dimension as covariance matrices
old_cov = np.asarray([np.eye(d) for _ in range(k)])
new_cov = copy.deepcopy(old_cov)
'''-------------------------------------------------------'''

'''initialising priors-------------------------------------'''
#priors are 1/k where k is the number of classes
old_priors = np.full((k),1/k)
new_priors = copy.deepcopy(old_priors)
'''-------------------------------------------------------'''

#stopping condition
eps = 1e-10

#starting time
t = 0

#aritifcal condition to enter the loop
obt_eps = 1

t_data=[i[:d] for i in X]
while (obt_eps > eps) or (t==0):
    t += 1

    '''Begin Expectation step---------------------------------------------------------'''
    #get the W matrix
    #unnormalised
    W_temp = copy.deepcopy(tabulate(list(range(n)),list(range(k)),w_mat))
    temp = copy.deepcopy(W_temp)
    #normalised
    W = copy.deepcopy(temp/temp.sum(axis=0))

    if np.isfinite(W).all():
        pass
    else:
        W_temp_where = np.where(temp.sum(axis=0)==0.0)
        for ii in W_temp_where:
            W[:,ii] = 1/k
    '''End Expectation step------------------------------------------------------------'''

    #get sum of W for each cluster
    temp = copy.deepcopy(W)
    sum_w = copy.deepcopy(temp.sum(axis=1))

    denoms=[sum(W[i,:]) for i in range(k)]

    '''Begin Maximization step---------------------------------------------------------'''
    # re-estimate means
    old_mus = copy.deepcopy(new_mus)
    #unnormalised
    new_mus_temp = copy.deepcopy(W @ X)
    temp = copy.deepcopy(new_mus_temp)
    #normalised
    new_mus = copy.deepcopy(temp/sum_w[:,None])

    # re-estimate covariance
    old_cov = copy.deepcopy(new_cov)
    #unnormalised
    # new_cov_temp = copy.deepcopy([cov_loop(n,a) for a in range(k)])
    # temp = copy.deepcopy(new_cov_temp)
    #normalised
    # new_cov = copy.deepcopy([temp[a]/sum_w[a] for a in range(k)])
    new_cov = copy.deepcopy(new_cov)



    # re-estimate priors
    old_priors = copy.deepcopy(new_priors)
    new_priors = copy.deepcopy(sum_w/n)
    '''End Maximization step---------------------------------------------------------'''

    obt_eps = np.sum([np.linalg.norm(new_mus[a]-old_mus[a]) for a in range(k)])
print("final", t,n,k,d,new_mus)


original [[-8.25453067 -0.83934678]
 [ 2.58580405  3.80500123]]
initial [[ 8.65497865  9.96211716]
 [-3.75478124  1.73118803]]
final 4 200 2 2 [[ 2.68373966  3.80735947]
 [-8.12263715 -0.83833731]]


In [46]:
# print('Original means of the data for class 1 are: {} and the predicted means are {}'.format(mus[0],means[0]))
# print('Original means of the data for class 2 are: {} and the predicted means are {}'.format(mus[1],means[1]))
# print('The Original covariance matrix is an Identity matrix for both the classes.')
# print('Predicted covariance matrices of the data for class 1 is: \n {} \n and for class 2 is \n {}'.format(sigmas[0],sigmas[1]))


In [71]:
#\s+ since dataset has one or more white spaces
df = pd.read_csv("Dataset.data",sep='\s+',header=None)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7400 entries, 0 to 7399
Data columns (total 21 columns):
0     7400 non-null float64
1     7400 non-null float64
2     7400 non-null float64
3     7400 non-null float64
4     7400 non-null float64
5     7400 non-null float64
6     7400 non-null float64
7     7400 non-null float64
8     7400 non-null float64
9     7400 non-null float64
10    7400 non-null float64
11    7400 non-null float64
12    7400 non-null float64
13    7400 non-null float64
14    7400 non-null float64
15    7400 non-null float64
16    7400 non-null float64
17    7400 non-null float64
18    7400 non-null float64
19    7400 non-null float64
20    7400 non-null int64
dtypes: float64(20), int64(1)
memory usage: 1.2 MB


In [73]:
X = df.to_numpy()
X = X[:,:-1]

mus = np.asarray(
    [ [0]*X.shape[1],
        [2/np.sqrt(20)]*X.shape[1]
        ]
)

In [79]:
k = 2
#dimension of the dataset
d = X.shape[1]

#number of points in the dataset
n = X.shape[0]

# index_matrix = np.empty((n,k),dtype=(int,2))
# for ii in range(n):
#     for jj in range(k):
#         index_matrix[ii][jj] = (ii,jj)

'''initialising means ---------------------------------'''
# random means
# old_mus = np.array([np.random.rand(d) for _ in range(k)])

# select random datapoints from data as initial means
random_rows = np.random.choice(X.shape[0], size=k, replace=False)
old_mus = copy.deepcopy(X[random_rows, :])

# choose means from unifrom random between 'a' and 'b' 
# old_mus = np.random.random((k,d))*20-10 
# old_mus = np.random.random((k,d))*np.max(X)-np.min(X)


new_mus = copy.deepcopy(old_mus)
print('original',mus)
print("initial", new_mus)
'''-------------------------------------------------------'''


'''initalising covariance matrices-------------------------'''
#identity matrices of dxd dimension as covariance matrices
old_cov = np.asarray([np.eye(d) for _ in range(k)])
new_cov = copy.deepcopy(old_cov)
'''-------------------------------------------------------'''

'''initialising priors-------------------------------------'''
#priors are 1/k where k is the number of classes
old_priors = np.full((k),1/k)
new_priors = copy.deepcopy(old_priors)
'''-------------------------------------------------------'''

#stopping condition
eps = 1e-10

#starting time
t = 0

#aritifcal condition to enter the loop
obt_eps = 1

while (obt_eps > eps) or (t==0):
    t += 1

    '''Begin Expectation step---------------------------------------------------------'''
    #get the W matrix
    #unnormalised
    W_temp = copy.deepcopy(tabulate(list(range(n)),list(range(k)),w_mat))
    temp = copy.deepcopy(W_temp)
    #normalised
    W = copy.deepcopy(temp/temp.sum(axis=0))

    # if np.isfinite(W).all():
    #     pass
    # else:
    #     W_temp_where = np.where(temp.sum(axis=0)==0.0)
    #     for ii in W_temp_where:
    #         W[:,ii] = 1/k
    '''End Expectation step------------------------------------------------------------'''

    #get sum of W for each cluster
    temp = copy.deepcopy(W)
    sum_w = copy.deepcopy(temp.sum(axis=1))


    '''Begin Maximization step---------------------------------------------------------'''
    # re-estimate means
    old_mus = copy.deepcopy(new_mus)
    #unnormalised
    new_mus_temp = copy.deepcopy(W @ X)
    temp = copy.deepcopy(new_mus_temp)
    #normalised
    new_mus = copy.deepcopy(temp/sum_w[:,None])

    # re-estimate covariance
    old_cov = copy.deepcopy(new_cov)
    #unnormalised
    new_cov_temp = copy.deepcopy([cov_loop(n,a) for a in range(k)])
    temp = copy.deepcopy(new_cov_temp)
    #normalised
    new_cov = copy.deepcopy([temp[a]/sum_w[a] for a in range(k)])


    # re-estimate priors
    old_priors = copy.deepcopy(new_priors)
    new_priors = copy.deepcopy(sum_w/n)
    '''End Maximization step---------------------------------------------------------'''

    obt_eps = np.sum([np.linalg.norm(new_mus[a]-old_mus[a]) for a in range(k)])
print("final", t,new_mus)


original [[0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.       ]
 [0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136
  0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136
  0.4472136 0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
initial [[ 1.5616  1.3861 -1.4968  0.0423 -0.5552 -0.1046  0.7733 -0.3488 -0.6817
   2.0427  0.2368 -0.4047  1.6686  1.8561  0.0464  1.3579  0.0431  0.7537
   1.2374  0.9743]
 [ 1.2947  0.1998  1.8239 -0.8114  0.6682 -1.0951  1.2324  0.0799  1.4349
  -1.266   0.5435  1.1515  0.2636  0.8395  1.5961  0.5107  0.702   2.0212
  -0.574   0.1599]]
final 28 [[-0.14980326 -0.09255678 -0.17987472 -0.10762475 -0.12251715 -0.12481045
  -0.1372542  -0.09997686 -0.17708276 -0.10363845 -0.12277684 -0.1443822
  -0.13757871 -0.16771707 -0.13804202 -0.14643852 -0.09917042 -0.16315795
  -0.15543415 -0.