In [1]:
import pandas as pd
import numpy as np

In [2]:
cereals = pd.read_csv("Cereals.csv")
cereals.head()

Unnamed: 0,name,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100%_Bran,70,4,1,130,10.0,5.0,6.0,280.0,25,3,1.0,0.33,68.402973
1,100%_Natural_Bran,120,3,5,15,2.0,8.0,8.0,135.0,0,3,1.0,1.0,33.983679
2,All-Bran,70,4,1,260,9.0,7.0,5.0,320.0,25,3,1.0,0.33,59.425505
3,All-Bran_with_Extra_Fiber,50,4,0,140,14.0,8.0,0.0,330.0,25,3,1.0,0.5,93.704912
4,Almond_Delight,110,2,2,200,1.0,14.0,8.0,,25,3,1.0,0.75,34.384843


## Dropping less relevant columns

In [3]:
cereals.drop(['shelf','weight','cups','rating'], axis=1, inplace=True)
cereals.head()

Unnamed: 0,name,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins
0,100%_Bran,70,4,1,130,10.0,5.0,6.0,280.0,25
1,100%_Natural_Bran,120,3,5,15,2.0,8.0,8.0,135.0,0
2,All-Bran,70,4,1,260,9.0,7.0,5.0,320.0,25
3,All-Bran_with_Extra_Fiber,50,4,0,140,14.0,8.0,0.0,330.0,25
4,Almond_Delight,110,2,2,200,1.0,14.0,8.0,,25


In [4]:
cereals.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins
count,77.0,77.0,77.0,77.0,77.0,76.0,76.0,75.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.802632,7.026316,98.666667,28.246753
std,19.484119,1.09479,1.006473,83.832295,2.383364,3.907326,4.378656,70.410636,22.342523
min,50.0,1.0,0.0,0.0,0.0,5.0,0.0,15.0,0.0
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,42.5,25.0
50%,110.0,3.0,1.0,180.0,2.0,14.5,7.0,90.0,25.0
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0,25.0
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0


In [5]:
cereals.isnull().sum()

name        0
calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       1
sugars      1
potass      2
vitamins    0
dtype: int64

## Decoupling name label

In [6]:
labels = cereals['name']
cereals.drop(['name'], axis=1,inplace=True)

## Imputation

In [7]:
from sklearn.preprocessing import Imputer
mean_imputer = Imputer()
mean_imputer.fit(cereals)
cereals = pd.DataFrame(mean_imputer.transform(cereals),
                       columns=cereals.columns)

In [8]:
cereals.isnull().sum()

calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       0
sugars      0
potass      0
vitamins    0
dtype: int64

## Standardization

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(cereals)
cereals = pd.DataFrame(scaler.transform(cereals),
                       columns=cereals.columns)

In [10]:
cereals.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,-1.258012e-16,5.767392e-17,1.013349e-16,-5.1906530000000004e-17,1.463476e-16,-4.527403e-16,-2.144749e-16,-4.758099e-17,5.1906530000000004e-17
std,1.006557,1.006557,1.006557,1.006557,1.006557,1.006557,1.006557,1.006557,1.006557
min,-2.938605,-1.420902,-1.013072,-1.917189,-0.9088244,-2.542013,-1.625929,-1.212115,-1.27255
25%,-0.3555846,-0.5014948,-1.013072,-0.3563056,-0.486498,-0.7267769,-0.931712,-0.7774919,-0.1462701
50%,0.1610194,0.4179123,-0.01298811,0.2440343,-0.06417167,-4.606439e-16,-0.006089621,-0.1255577,-0.1462701
75%,0.1610194,0.4179123,0.9870962,0.6042382,0.3581547,0.5698204,0.9195328,0.3090651,-0.1462701
max,2.74404,3.176134,3.987349,1.924986,5.003745,2.125737,1.845155,3.351425,3.23257


## Agglomerative Clustering
**Parameter description**

n_clusters : The number of clusters to find.

linkage : {“ward”, “complete”, “average”}

ward minimizes the variance of the clusters being merged.

complete uses the maximum distances between all observations of the two sets.

average uses the average of the distances of each observation of the two sets.
affinity : {“euclidean”, “l1”, “l2”, “manhattan”, “cosine”}

Metric used to compute the linkage.



In [11]:
from scipy.cluster.hierarchy import linkage, dendrogram

linkage_matrix = linkage(cereals, method='ward', metric='euclidean')

In [12]:
import matplotlib.pyplot as plt
%matplotlib notebook

dendrogram(linkage_matrix, labels=labels.as_matrix())
plt.tight_layout()
plt.show()

  after removing the cwd from sys.path.


<IPython.core.display.Javascript object>

## Implementing 6 clusters

In [13]:
from sklearn.cluster import AgglomerativeClustering

clust = AgglomerativeClustering(n_clusters=6,
                                affinity='euclidean',
                                linkage='ward')

cluster_predictions = clust.fit_predict(cereals)

result = pd.DataFrame({'Label':labels,
                       'Cluster':cluster_predictions})
result.head()

Unnamed: 0,Label,Cluster
0,100%_Bran,3
1,100%_Natural_Bran,2
2,All-Bran,3
3,All-Bran_with_Extra_Fiber,3
4,Almond_Delight,2


## K-Means Clustering
**Parameter description**

n_clusters : The number of clusters to find.

tol : Relative tolerance with regards to inertia to declare convergence

n_init : Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia.

max_iter : max iterations of recomputing new cluster centroids

n_jobs : The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel.

In [14]:
from sklearn.cluster import KMeans
wss = []
for k in range(2,15):
    km = KMeans(n_clusters=k)
    km.fit(cereals)
    wss.append(km.inertia_)

In [15]:
%matplotlib notebook
plt.plot(range(2,15), wss, 'bx--')
plt.xlabel('k')
plt.ylabel('wss')
plt.show()

<IPython.core.display.Javascript object>

## Kmeans with 6 clusters

In [16]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=6, random_state=4545, n_init=50)
km.fit(cereals)

kmeans_clusters = km.predict(cereals)

result = pd.DataFrame({"Label":labels, "KMeans Cluster":kmeans_clusters})
result.head()

Unnamed: 0,Label,KMeans Cluster
0,100%_Bran,3
1,100%_Natural_Bran,2
2,All-Bran,3
3,All-Bran_with_Extra_Fiber,3
4,Almond_Delight,1


## Cluster characteristics 

In [17]:
cluster_characteristics = pd.DataFrame(scaler.inverse_transform(km.cluster_centers_),
             columns=cereals.columns)

cluster_characteristics.head()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins
0,103.333333,2.722222,0.5,226.666667,1.777778,18.277778,3.388889,77.222222,25.0
1,110.909091,1.545455,1.045455,170.0,0.590909,12.5,11.318182,49.712121,25.0
2,122.222222,3.333333,2.111111,144.444444,3.15,13.766813,8.945906,149.722222,22.222222
3,63.333333,4.0,0.666667,176.666667,11.0,6.666667,3.666667,310.0,25.0
4,116.666667,2.666667,0.833333,208.333333,1.833333,18.166667,6.333333,95.833333,100.0


## Writing files

In [18]:
result.to_csv("KMeans_Clustering_Output.csv")
cluster_characteristics.to_csv("Cluster_Characteristics.csv")