In [22]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import homogeneity_score, completeness_score, \
v_measure_score, adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
%matplotlib inline

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

np.random.seed(123)

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
 
df = pd.read_csv("Live_Preprocessed_No_Scaling.csv")
df.shape

(7050, 12)

In [24]:
X = df.copy(deep=True)

In [26]:
from sklearn import preprocessing
X = pd.DataFrame(preprocessing.normalize(X))
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,0.0,0.0,0.087832,0.548096,0.488777,0.533205,0.382658,0.121761,0.060881,0.060881,0.0
1,0.0,0.195467,0.0,0.0,0.0,0.0,0.98071,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.107685,0.588826,0.437247,0.573206,0.332858,0.074641,0.074641,0.0,0.0
3,0.0,0.207327,0.0,0.0,0.0,0.0,0.978272,0.0,0.0,0.0,0.0,0.0
4,0.0,0.169916,0.0,0.0,0.0,0.0,0.904464,0.391246,0.0,0.0,0.0,0.0


In [27]:
gmm = GaussianMixture(n_components=2)
cluster = gmm.fit_predict(X)

In [28]:
print(gmm.means_)
print('\n')
print(gmm.covariances_)

[[0.00000000e+00 7.53408750e-02 7.76706101e-03 5.39183275e-02
  4.77218217e-01 2.94268125e-01 6.62079526e-01 2.67755558e-01
  1.08937799e-01 6.01941950e-02 2.16194275e-02 1.27843844e-02]
 [3.33888676e-03 2.01481138e-01 1.08511741e-02 8.43042823e-02
  2.27643838e-01 7.39400593e-02 8.64909091e-01 3.66629225e-02
  2.19801897e-04 4.35482640e-05 0.00000000e+00 0.00000000e+00]]


[[[ 1.00000000e-06  0.00000000e+00  0.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  9.04547954e-03 -5.85177173e-04 -4.06225397e-03
   -1.21903146e-02 -1.25879259e-02  1.23966457e-02 -7.58592665e-03
    3.22009247e-03 -2.08102202e-03 -1.05811318e-04 -5.62926573e-04]
  [ 0.00000000e+00 -5.85177173e-04  1.30147829e-03 -4.18786939e-04
   -5.41151496e-04 -1.71519019e-03  1.27638170e-03 -1.15541601e-03
   -2.18823199e-05 -1.55140358e-04  2.26138806e-04 -4.77577816e-05]
  [ 0.00000000e+0

In [29]:
#Add the cluster vector to our DataFrame, X
X["Cluster"] = cluster

In [30]:
#plotX is a DataFrame containing 5000 values sampled randomly from X
# plotX = pd.DataFrame(np.array(X.sample(5000)))
plotX = pd.DataFrame(np.array(X))

#Rename plotX's columns since it was briefly converted to an np.array above
plotX.columns = X.columns

In [31]:
#PCA with two principal components
pca_2d = PCA(n_components=2)

In [32]:
#This DataFrame contains the two principal components that will be used
#for the 2-D visualization mentioned above
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX.drop(["Cluster"], axis=1)))

In [33]:
#"PC1_2d" means: 'The first principal component of the components created for 2-D visualization, by PCA.'
#And "PC2_2d" means: 'The second principal component of the components created for 2-D visualization, by PCA.'
PCs_2d.columns = ["PC1_2d", "PC2_2d"]

In [34]:
plotX = pd.concat([plotX,PCs_2d], axis=1, join='inner')


In [35]:
plotX["dummy"] = 0


In [36]:
#Note that all of the DataFrames below are sub-DataFrames of 'plotX'.
#This is because we intend to plot the values contained within each of these DataFrames.

cluster0 = plotX[plotX["Cluster"] == 0]
cluster1 = plotX[plotX["Cluster"] == 1]
# cluster2 = plotX[plotX["Cluster"] == 2]

In [37]:
#Instructions for building the 2-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_2d"],
                    y = cluster0["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_2d"],
                    y = cluster1["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
# trace3 = go.Scatter(
#                     x = cluster2["PC1_2d"],
#                     y = cluster2["PC2_2d"],
#                     mode = "markers",
#                     name = "Cluster 2",
#                     marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
#                     text = None)

# data = [trace1, trace2, trace3]
data = [trace1, trace2]

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [38]:
from sklearn.metrics import silhouette_score
silhouette_score(df, cluster)

0.3801176895756888