In [116]:
import numpy as np
import copy
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pkl

# Synthetic Datapoints to test the EM algorithm while building

In [117]:
n=100 #number of points
d=2 #dimension of each point
k=2 #clusters

#randomly generate k number of  d-dimensional means
mus=np.random.random((k,d))*20-10 

#Data Generation
X=[]
for i in range(n):
    for j in range(k):
        X.append([mean+np.random.randn() for mean in mus[j]])
#we have our dataset, where each point is a d+1 dimensional tuple where the last position represents its class 

X = np.asarray(X)

# EM Algorithm

In [35]:
def EM(X,k):

    def w_mat(i,j):
    
        a = multivariate_normal.pdf(X[i],mean=new_mus[j],cov=new_cov[j],allow_singular=True)
        b = new_priors[j]
        # print(i,j,a*b)
        return a*b

    def tabulate(x, y, f):
        """Return a table of f(x, y)."""
        #* is to unpack the two arrays which results after meshing
        return np.vectorize(f)(* np.meshgrid(x, y))


    # def cov_loop(n,m):
    #     cov_var = np.zeros((X.shape[1],X.shape[1]),dtype=float)
    #     for j in range(n):
    #         cov_var += (W[m][j]/sum_w[m]) * (((X[j] - new_mus[m]).T) @ (X[j] - new_mus[m]))
    #     return cov_var

    
    #dimension of the dataset
    d = X.shape[1]

    #number of points in the dataset
    n = X.shape[0]

    '''initialising means ---------------------------------'''
    # random means
    # old_mus = np.array([np.random.rand(d) for _ in range(k)])

    # select random datapoints from data as initial means
    random_rows = np.random.choice(X.shape[0], size=k, replace=False)
    old_mus = copy.deepcopy(X[random_rows, :])

    # choose means from unifrom random between 'a' and 'b' 
    # old_mus = np.random.random((k,d))*20-10 
    # old_mus = np.random.random((k,d))*np.max(X)-np.min(X)


    new_mus = copy.deepcopy(old_mus)
    # print('original',mus)
    # print("initial", new_mus)
    '''-------------------------------------------------------'''


    '''initalising covariance matrices-------------------------'''
    #identity matrices of dxd dimension as covariance matrices
    old_cov = np.asarray([np.eye(d) for _ in range(k)])
    new_cov = copy.deepcopy(old_cov)
    '''-------------------------------------------------------'''

    '''initialising priors-------------------------------------'''
    #priors are 1/k where k is the number of classes
    old_priors = np.full((k),1/k)
    new_priors = copy.deepcopy(old_priors)
    '''-------------------------------------------------------'''

    #stopping condition
    eps = 1e-07

    #starting time
    t = 0

    #aritifcal condition to enter the loop
    obt_eps = 1

    t_data=[i[:d] for i in X]
    while (obt_eps > eps) or (t==0):
        t += 1

        '''Begin Expectation step---------------------------------------------------------'''
        #get the W matrix
        #unnormalised
        W_temp = copy.deepcopy(tabulate(list(range(n)),list(range(k)),w_mat))
        temp = copy.deepcopy(W_temp)
        #normalised
        W = copy.deepcopy(temp/temp.sum(axis=0))

        #it so happens that sometime, the probability of a point being assinged to any of the cluster is 0.
        #this is probably an artefact of multivariate.normal.pdf giving very small values 
        # and the 64 bit precision of the computer rounding stuff off to 0 or nan. 
        #When this happens, this part of the assigns the point to any of the clusters with equal probability.
        if np.isfinite(W).all():
            pass
        else:
            W_temp_where = np.where(temp.sum(axis=0)==0.0)
            for ii in W_temp_where:
                W[:,ii] = 1/k
        '''End Expectation step------------------------------------------------------------'''

        #get sum of W for each cluster
        temp = copy.deepcopy(W)
        sum_w = copy.deepcopy(temp.sum(axis=1))

        '''Begin Maximization step---------------------------------------------------------'''
        # re-estimate means
        old_mus = copy.deepcopy(new_mus)
        #unnormalised
        new_mus_temp = copy.deepcopy(W @ X)
        temp = copy.deepcopy(new_mus_temp)
        #normalised
        new_mus = copy.deepcopy(temp/sum_w[:,None])

        # re-estimate covariance
        old_cov = copy.deepcopy(new_cov)
        #unnormalised
        # new_cov_temp = copy.deepcopy([cov_loop(n,a) for a in range(k)])
        # temp = copy.deepcopy(new_cov_temp)
        #normalised
        # new_cov = copy.deepcopy([temp[a]/sum_w[a] for a in range(k)])
        new_cov = [
                    np.sum(
                            [
                                W[i,j]/(sum_w[i])*
                                (
                                    (np.expand_dims(X[j]-new_mus[i], axis=0).T) @ 
                                    np.expand_dims(X[j]-new_mus[i], axis=0)
                                ) 
                                for j in range(n)
                            ],axis=0
                        ) 
                        for i in range(k)
                     ]
        # new_cov = copy.deepcopy([cov_loop(n,a) for a in range(k)])



        # re-estimate priors
        old_priors = copy.deepcopy(new_priors)
        new_priors = copy.deepcopy(sum_w/n)
        '''End Maximization step---------------------------------------------------------'''

        obt_eps = np.sum([np.linalg.norm(new_mus[a]-old_mus[a]) for a in range(k)])
    # print("final", t,n,k,d,new_mus) 
    return W,new_mus,new_cov,t


# Ringnorm analysis

In [113]:
# Loading data
# \s+ since dataset has one or more white spaces
df = pd.read_csv("Dataset.data",sep='\s+',header=None)

X = df.to_numpy()
X = X[:,:-1]

mus = np.asarray(
    [ [0]*X.shape[1],
        [2/np.sqrt(20)]*X.shape[1]
        ]
)

In [114]:
posteriors,output_mus,output_covs,iterations = {},{},{},{}

for k in [2,3,4,5]:
    data = {}
    for e in range(20):
        print(f"k={k},e={e}")
        p,m,c,i = EM(X,k)
        posteriors[e] = p
        output_mus[e] = m
        output_covs[e] = c
        iterations[e] = i
    data['posteriors'] = posteriors
    data['output_mus'] = output_mus
    data['output_covs'] = output_covs
    data['iterations'] = iterations
    with open(f"ringnorm{k}.pkl",'wb') as f:
        pkl.dump(data,f)
        

original [[ 1.          0.          0.83442204  0.0858972   0.80170573  0.16723071
   0.71591711  0.20012378  0.62357124  0.22496222  0.54688533  0.21417987
   0.48527187  0.16627956  0.44523907  0.12209133  0.42215538  0.04068591
   0.41434782 -0.01020809  0.43676009 -0.03676911  0.45467787 -0.05496471
   0.47747858 -0.0706012   0.49878218 -0.05196422  0.48604551 -0.02940329
   0.4782444  -0.0176112   0.45136324 -0.00797689]
 [ 1.          0.          0.83442204  0.0858972   0.80170573  0.16723071
   0.71591711  0.20012378  0.62357124  0.22496222  0.54688533  0.21417987
   0.48527187  0.16627956  0.44523907  0.12209133  0.42215538  0.04068591
   0.41434782 -0.01020809  0.43676009 -0.03676911  0.45467787 -0.05496471
   0.47747858 -0.0706012   0.49878218 -0.05196422  0.48604551 -0.02940329
   0.4782444  -0.0176112   0.45136324 -0.00797689]]
initial [[1 0 0.88305 -0.21996 1.0 0.36373 0.8240299999999999 0.19205999999999998
  0.8508600000000001 0.05901 0.9055799999999999 -0.04292 0.85193 0

KeyboardInterrupt: 

In [112]:
#creating a final array from the dataframe
posterior = pd.DataFrame(posterior)

#check which point belongs to which cluster
clusters = [posterior.iloc[0] < posterior.iloc[1]]
#add a new row to the data frame with value 1 or 0 if point (represented by point) belongs to cluster 1 or 0 respectively
posterior_assigned = posterior.append(clusters,ignore_index=True)
cluster_df = pd.DataFrame()
cluster_df['assigned1'] = posterior_assigned.T[2].astype(int)

#check which point belongs to which cluster
clusters = [posterior.iloc[0] > posterior.iloc[1]]
#add a new row to the data frame with value 1 or 0 if point (represented by point) belongs to cluster 1 or 0 respectively
posterior_assigned = posterior.append(clusters,ignore_index=True)
cluster_df['assigned2'] = posterior_assigned.T[2].astype(int)

#\s+ since dataset has one or more white spaces
df = pd.read_csv("Dataset.data",sep='\s+',header=None)
cluster_df['groundtruth'] = df[20].astype(int)

In [101]:
cluster_df

Unnamed: 0,assigned1,assigned2,groundtruth
0,1,0,0
1,0,1,1
2,0,1,1
3,0,1,1
4,1,0,0
...,...,...,...
7395,0,1,1
7396,0,1,1
7397,1,0,0
7398,1,0,0


In [103]:
#\s+ since dataset has one or more white spaces
df = pd.read_csv("ionosphere.data",header=None)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 35 columns):
0     351 non-null int64
1     351 non-null int64
2     351 non-null float64
3     351 non-null float64
4     351 non-null float64
5     351 non-null float64
6     351 non-null float64
7     351 non-null float64
8     351 non-null float64
9     351 non-null float64
10    351 non-null float64
11    351 non-null float64
12    351 non-null float64
13    351 non-null float64
14    351 non-null float64
15    351 non-null float64
16    351 non-null float64
17    351 non-null float64
18    351 non-null float64
19    351 non-null float64
20    351 non-null float64
21    351 non-null float64
22    351 non-null float64
23    351 non-null float64
24    351 non-null float64
25    351 non-null float64
26    351 non-null float64
27    351 non-null float64
28    351 non-null float64
29    351 non-null float64
30    351 non-null float64
31    351 non-null float64
32    351 non-null float64
33    35

In [104]:
X = df.to_numpy()
X = X[:,:-1]

mean_g = [i/len(df[df[34]=='g']) for i in df[df[34]=='g'].sum(axis=0).to_list()[:-1]]
mean_b = [i/len(df[df[34]=='b']) for i in df[df[34]=='b'].sum(axis=0).to_list()[:-1]]

mus = np.array([mean_g,mean_g])

In [105]:
posterior = EM(X,k)

original [[ 1.          0.          0.83442204  0.0858972   0.80170573  0.16723071
   0.71591711  0.20012378  0.62357124  0.22496222  0.54688533  0.21417987
   0.48527187  0.16627956  0.44523907  0.12209133  0.42215538  0.04068591
   0.41434782 -0.01020809  0.43676009 -0.03676911  0.45467787 -0.05496471
   0.47747858 -0.0706012   0.49878218 -0.05196422  0.48604551 -0.02940329
   0.4782444  -0.0176112   0.45136324 -0.00797689]
 [ 1.          0.          0.83442204  0.0858972   0.80170573  0.16723071
   0.71591711  0.20012378  0.62357124  0.22496222  0.54688533  0.21417987
   0.48527187  0.16627956  0.44523907  0.12209133  0.42215538  0.04068591
   0.41434782 -0.01020809  0.43676009 -0.03676911  0.45467787 -0.05496471
   0.47747858 -0.0706012   0.49878218 -0.05196422  0.48604551 -0.02940329
   0.4782444  -0.0176112   0.45136324 -0.00797689]]
initial [[0 0 -1.0 -1.0 1.0 1.0 1.0 -1.0 -1.0 1.0 1.0 -1.0 -1.0 -1.0 0.0 0.0 1.0
  1.0 -1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0 1.0 1.0 -1.0 1.0 -1.0 -1.0 1.

In [110]:
posterior = pd.DataFrame(posterior)
#check which point belongs to which cluster
clusters = [posterior.iloc[0] < posterior.iloc[1]]
#add a new row to the data frame with value 1 or 0 if point (represented by point) belongs to cluster 1 or 0 respectively
posterior_assigned = posterior.append(clusters,ignore_index=True)
cluster_df = pd.DataFrame()
cluster_df['assigned1'] = posterior_assigned.T[2].astype(int)

#check which point belongs to which cluster
clusters = [posterior.iloc[0] > posterior.iloc[1]]
#add a new row to the data frame with value 1 or 0 if point (represented by point) belongs to cluster 1 or 0 respectively
posterior_assigned = posterior.append(clusters,ignore_index=True)
cluster_df['assigned2'] = posterior_assigned.T[2].astype(int)

#\s+ since dataset has one or more white spaces
df = pd.read_csv("ionosphere.data",header=None)
cluster_df['groundtruth'] = df[20].astype(int)

In [111]:
cluster_df

Unnamed: 0,assigned1,assigned2,groundtruth
0,1,0,0
1,0,1,0
2,1,0,0
3,0,1,0
4,1,0,0
...,...,...,...
346,1,0,0
347,1,0,0
348,1,0,0
349,1,0,0
