In [None]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Format every float shown in pandas output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
# With ast_node_interactivity = "all", IPython echoes the value of every top-level
# expression statement in a cell, not only the last one. This is why cells below
# can show several results without calling display() explicitly.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# NOTE(review): display and HTML are not referenced in the cells visible here;
# kept in case later cells rely on them.
from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Load the cereal dataset from the local Kaggle export (path is relative to the
# notebook's working directory).
cereal_csv = 'Kaggle_Data/cereal.csv'
cereal_data = pd.read_csv(cereal_csv)

# Preview the first rows to confirm the file parsed as expected.
cereal_data.head()

In [None]:
# Number of clusters requested from K-means further down.
Kval = 8

# Count of missing values per column (isnull is pandas' alias for isna).
cereal_data.isnull().sum()


In [None]:
# Import the visualization libraries and switch plotly to offline notebook mode.
# (No function is defined here; this cell only imports and initialises.)
import plotly
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): the wildcard import pulls every plotly.graph_objs name into the
# notebook namespace; the visible cells only use the explicit `go` alias below.
from plotly.graph_objs import *
# NOTE(review): `tools` and `sns` are not referenced in the cells visible here.
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Pairwise correlations between the numeric columns of the raw data.
# The frame also carries text columns (e.g. 'name', 'mfr'); since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so the
# numeric columns are selected explicitly. select_dtypes works on old and new
# pandas alike, unlike the numeric_only keyword.
correl = cereal_data.select_dtypes(include='number').corr()

# In a plotly Heatmap, z[i][j] is drawn at (x[j], y[i]): columns of the matrix
# belong on x and its index on y. The matrix is symmetric, so the picture is
# unchanged, but the labels are now bound to the right axes.
trace = go.Heatmap(z=correl.values,
                   x=correl.columns.values,
                   y=correl.index.values)
data = [trace]
plotly.offline.iplot(data, filename='basic-heatmap')

In [None]:
# List the available column labels before choosing the clustering features.
cereal_data.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns used as clustering features.
cols = [
    'calories', 'protein', 'fat', 'sodium', 'fiber',
    'carbo', 'sugars', 'potass', 'vitamins', 'shelf', 'weight', 'cups',
    'rating',
]

# Standardise each feature (StandardScaler centres and scales per column) and
# wrap the resulting array back into a DataFrame that keeps the original
# column labels and row index.
feature_frame = cereal_data[cols]
sc = StandardScaler()
scaled_values = sc.fit_transform(feature_frame)
subset_data = pd.DataFrame(
    scaled_values,
    columns=feature_frame.columns,
    index=cereal_data.index,
)

In [None]:
# Preview the standardised feature matrix.
subset_data.head()

In [None]:
from sklearn.cluster import KMeans

# K-means configuration: Kval clusters, random initial centres, 10 restarts,
# and a fixed seed so the assignment is reproducible.
kmeans_params = dict(
    n_clusters=Kval,
    init='random',
    n_init=10,
    max_iter=500,
    tol=1e-04,
    random_state=50,
)
km = KMeans(**kmeans_params)

# Fit on the standardised features and keep one cluster label per row, aligned
# to the feature index. The single column is left with pandas' default label 0.
cluster_labels = km.fit_predict(subset_data)
data_km = pd.DataFrame(cluster_labels, index=subset_data.index)

In [None]:
# Preview the raw cluster assignments (single column labelled 0 at this point).
data_km.head()

In [None]:
# Name the K-means output column 'cluster' and attach descriptive cereal columns
# for plotting and inspection.
#
# The cell is written to be safely re-runnable:
#   * the rename only happens while the label column still carries its default
#     name 0, so a later integer-named column 0 is never mistaken for the labels;
#   * only the 'cluster' column is carried into the merge, so re-running does
#     not produce suffixed duplicates such as 'name_x' / 'name_y'.
# No in-place mutation is used; data_km is rebound to the new frame.
if 'cluster' not in data_km.columns:
    data_km = data_km.rename(columns={0: 'cluster'})

data_km = data_km[['cluster']].merge(
    cereal_data[['name', 'mfr', 'rating', 'carbo']],
    left_index=True,
    right_index=True,
)
data_km.head()

In [None]:
# Lift pandas' row-display limit so the whole table below is rendered.
# This is a global option and stays in effect for the cells that follow.
pd.options.display.max_rows = None

# Every cereal, grouped visually by its assigned cluster.
data_km.sort_values('cluster')

In [None]:
# Scatter of rating vs. carbo with one trace per cluster, so each cluster gets
# its own colour and legend entry; hovering a marker shows the cereal name.
plot_data = []
# Iterate the cluster ids in ascending order: a bare set has no guaranteed
# iteration order, which would leave the legend/colour order to chance.
for clus in sorted(set(data_km['cluster'])):
    df = data_km[data_km['cluster'] == clus]
    plot_data.append(go.Scatter(x=df['rating'], y=df['carbo'], text=df["name"],
                                name='cluster ' + str(clus), mode='markers'))

layout = go.Layout(xaxis=dict(title='rating'), yaxis=dict(title='carbo'),
                   title='Clustering')
fig = go.Figure(data=plot_data, layout=layout)
plotly.offline.iplot(fig)
    
    

# plt.plot(data_km['cluster'],data_km['rating'], 'ro', alpha = 0.5)
# for i in range(len(data_km)):
#     plt.text(data_km.loc[i]['cluster'], data_km.loc[i]['rating'], str(i)) # cereal_data['name']) # 

# plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto two principal components.
pca = PCA(n_components=2)
pca_scores = pca.fit_transform(subset_data)

# Keep the row index so the scores can be joined back to the cluster table;
# the two columns get pandas' default labels 0 and 1.
data_pca = pd.DataFrame(pca_scores, index=subset_data.index)

In [None]:
# Attach the two PCA score columns (labelled 0 and 1) to the cluster table.
# Any PCA columns left over from a previous run of this cell are dropped first;
# otherwise a re-run would merge overlapping labels and yield '0_x' / '0_y'
# style columns, breaking the df[0] / df[1] lookups used for plotting.
# NOTE(review): assumes the cluster-label column has already been renamed to
# 'cluster', so no integer-labelled column other than PCA output exists here.
data_km = (
    data_km
    .drop(columns=list(data_pca.columns), errors='ignore')
    .merge(data_pca, left_index=True, right_index=True)
)
data_km.head()

In [None]:
# Scatter of the two PCA components (columns 0 and 1) with one trace per
# cluster; hovering a marker shows the cereal name.
plot_data = []
# Iterate the cluster ids in ascending order: a bare set has no guaranteed
# iteration order, which would leave the legend/colour order to chance.
for clus in sorted(set(data_km['cluster'])):
    df = data_km[data_km['cluster'] == clus]
    plot_data.append(go.Scatter(x=df[0], y=df[1], text=df["name"],
                                name='cluster ' + str(clus), mode='markers'))

layout = go.Layout(xaxis=dict(title='PCA First'), yaxis=dict(title='PCA Second'),
                   title='Clustering')
fig = go.Figure(data=plot_data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
# Elbow method: fit K-means for a range of cluster counts and plot the inertia
# (within-cluster sum of squared distances) against k.

# Candidate cluster counts, defined once and shared by the loop and the plot.
# K-means needs at least as many samples as clusters, so the range is capped
# at the number of rows.
k_values = range(2, min(30, len(subset_data) + 1))

distortions = []
for k in k_values:
    # A loop-local name is used so the Kval-cluster model held in `km` is not
    # overwritten by the last model fitted here.
    km_k = KMeans(n_clusters=k,
                  # k-means++ spreads the initial centres out: each new centre
                  # is sampled with probability proportional to its squared
                  # distance from the nearest centre already chosen.
                  init='k-means++',
                  n_init=10,
                  max_iter=500,
                  # tol is left at scikit-learn's default.
                  random_state=50)
    km_k.fit(subset_data)
    distortions.append(km_k.inertia_)

plt.plot(k_values, distortions, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.show()