In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the glass dataset from the relative input directory and preview its first rows.
df= pd.read_csv('../input/glass/glass.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

There are 214 data points and 10 columns in the dataset.

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

There are no null values present in the data.

In [None]:
from scipy.cluster.hierarchy import linkage,cophenet,dendrogram

In [None]:
### Checking the number of classes in the actual data
# One row per distinct value of the `Type` column, with how many samples carry it.
df['Type'].value_counts()

In [None]:
# Feature matrix for K-Means: every column except the known `Type` label.
df1 = df.drop(columns='Type')

In [None]:
# Preview the feature-only frame.
df1.head()

In [None]:
#### Standardizing the data before clustering
# `zscore` is applied column by column, so each feature is rescaled independently
# and replaces the unscaled values held in df1.
from scipy.stats import zscore
df1=df1.apply(zscore)

###### KMeans clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit K-Means for k = 1..14 and record the within-cluster sum of squares
# (`inertia_`) of each fit; the values are plotted as an elbow curve below.
cluster_no = range(1,15)
wcss=[]
for no in cluster_no:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto', so relying on the default gives different inertias (and a
    # FutureWarning on some releases) depending on the installed version.
    km = KMeans(n_clusters=no, n_init=10, random_state=1)
    km.fit(df1)
    wcss.append(km.inertia_)
    

In [None]:
## Elbow curve to identify the appropriate number of clusters

In [None]:
# Elbow curve: inertia against the number of clusters. Axis labels and a title
# are set so the figure can be read on its own; plt.show() keeps the Line2D
# repr out of the cell output.
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(cluster_no, wcss, marker='o')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Within-cluster sum of squares')
ax.set_title('Elbow curve')
plt.show()

As seen above, the optimum number of clusters is 6, hence we can do the clustering using 6 as the number of clusters.

In [None]:
# Final K-Means model with the 6 clusters chosen from the elbow curve.
# n_init is pinned explicitly because scikit-learn's default changed from 10 to
# 'auto' between releases, which would otherwise alter the resulting clusters.
km = KMeans(n_clusters=6, n_init=10, random_state=1)
km.fit(df1)

In [None]:
# Cluster index assigned to each row by the fitted 6-cluster model.
km.labels_

In [None]:
# Store the K-Means cluster index next to the standardized features.
# NOTE(review): this adds a column to the same frame that was passed to km.fit,
# so re-running the fitting cells after this one would cluster on `Class` too.
df1['Class']= km.labels_

In [None]:
# Preview the standardized features together with the new `Class` column.
df1.head()

In [None]:
# Number of rows that fell into each K-Means cluster.
df1['Class'].value_counts()

In [None]:
 # plot of the clusters using two features
# Both panels are drawn from the raw measurements in `df` so their axes share
# the same units; df1 holds z-scores, which made the two panels incomparable.
# The axes returned by plt.subplots are used directly instead of being
# re-requested through plt.subplot.
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(10,8))

sns.scatterplot(x='Ca', y='Fe', hue='Type', style='Type', data=df, ax=ax1)
ax1.set_title('Original Classes')

# Attach the K-Means cluster index to a copy of the raw frame for plotting only.
kmeans_plot_data = df.assign(Class=df1['Class'])
sns.scatterplot(x='Ca', y='Fe', hue='Class', style='Class', data=kmeans_plot_data, ax=ax2)
ax2.set_title('Predicted Classes')
plt.show()

###### Agglomerative Clustering

In [None]:
# Fresh standardized copy of the features (label column removed, each column
# z-scored) for hierarchical clustering.
df2 = df.drop(columns='Type').apply(zscore)

In [None]:
from scipy.spatial.distance import pdist

###### Calculating the Cophenetic Correlation Coefficient

In [None]:
# Complete linkage: build the hierarchy, then compare its cophenetic distances
# with the original pairwise distances. `c` is the resulting correlation.
Z = linkage(df2, method='complete')
c, coph_dists = cophenet(Z , pdist(df2))
c

In [None]:
# Single linkage: same comparison of cophenetic and original pairwise distances.
Z = linkage(df2, method='single')
c, coph_dists = cophenet(Z , pdist(df2))
c

In [None]:
# Ward linkage: same comparison of cophenetic and original pairwise distances.
Z = linkage(df2, method='ward')
c, coph_dists = cophenet(Z , pdist(df2))
c

In [None]:
# Average linkage: same comparison of cophenetic and original pairwise distances.
Z = linkage(df2, method='average')
c, coph_dists = cophenet(Z , pdist(df2))
c

Average linkage is better (it has the highest cophenetic correlation coefficient of the four methods).

###### Plotting the Dendrogram

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
# Dendrogram of the average-linkage hierarchy. The linkage method is named in
# the title so this figure can be told apart from the Ward dendrogram.
fig, ax = plt.subplots(figsize=(10,10))
merg = linkage(df2, method='average')
dendrogram(merg, leaf_rotation=90, ax=ax)
ax.set_title('Dendrogram (average linkage)')
ax.set_xlabel('Data Points')
ax.set_ylabel('Euclidean Distances')
plt.show()

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
# Dendrogram of the Ward-linkage hierarchy. The linkage method is named in the
# title so this figure can be told apart from the average-linkage dendrogram.
fig, ax = plt.subplots(figsize=(10,10))
merg = linkage(df2, method='ward')
dendrogram(merg, leaf_rotation=90, ax=ax)
ax.set_title('Dendrogram (Ward linkage)')
ax.set_xlabel('Data Points')
ax.set_ylabel('Euclidean Distances')
plt.show()

The average method scores better, but here Ward gives a dendrogram that is easier to interpret.

If we draw a horizontal line at 15 on the y-axis, the optimal number of clusters is 6; hence we build a hierarchical model with 6 clusters.

In [None]:
from sklearn.cluster import AgglomerativeClustering


In [None]:
# Ward-linkage agglomerative clustering with the 6 clusters read off the dendrogram.
# The `affinity` keyword is not passed: scikit-learn deprecated it in favour of
# `metric` and later removed it, and Ward linkage only works with the Euclidean
# metric, which is already the default.
ac = AgglomerativeClustering(n_clusters=6, linkage='ward')
ac.fit(df2)

In [None]:
# Store the agglomerative cluster index next to the standardized features.
df2['label']=ac.labels_

In [None]:
# Preview the standardized features together with the new `label` column.
df2.head()

In [None]:
# Number of rows that fell into each agglomerative cluster.
df2['label'].value_counts()

In [None]:
 # plot of the clusters using two features
# Both panels are drawn from the raw measurements in `df` so their axes share
# the same units; df2 holds z-scores, which made the two panels incomparable.
# The axes returned by plt.subplots are used directly instead of being
# re-requested through plt.subplot.
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(10,8))

sns.scatterplot(x='Ca', y='Fe', hue='Type', style='Type', data=df, ax=ax1)
ax1.set_title('Original Classes')

# Attach the agglomerative cluster index to a copy of the raw frame for plotting only.
hierarchical_plot_data = df.assign(label=df2['label'])
sns.scatterplot(x='Ca', y='Fe', hue='label', style='label', data=hierarchical_plot_data, ax=ax2)
ax2.set_title('Predicted Classes')
plt.show()

In [None]:
# Mg vs Al coloured by the true type, the K-Means cluster and the hierarchical
# cluster. All three figures use the raw measurements from `df` so their axes
# are directly comparable; df1 and df2 hold z-scores.
fig, ax = plt.subplots()
sns.scatterplot(x='Mg', y='Al', hue='Type', style='Type', data=df, ax=ax)
ax.set_title('Original Classes')
plt.show()

fig, ax = plt.subplots()
sns.scatterplot(x='Mg', y='Al', hue='Class', style='Class',
                data=df.assign(Class=df1['Class']), ax=ax)
ax.set_title('K-Means Classes')
plt.show()

fig, ax = plt.subplots()
sns.scatterplot(x='Mg', y='Al', hue='label', style='label',
                data=df.assign(label=df2['label']), ax=ax)
ax.set_title('Hierarchical Classes')
plt.show()

In [None]:
# Print the size of every group for the true types and for both clusterings,
# with a dashed rule between consecutive sections.
summaries = [
    ('Original Data Classes:', df.Type),
    ('K-Means Predicted Data Classes:', df1.Class),
    ('Hierarchical Predicted Data Classes:', df2.label),
]
for position, (heading, assignments) in enumerate(summaries):
    if position > 0:
        print('-' * 30)
    print(heading)
    print(assignments.value_counts())

In [None]:
###### Calculating cohen_kappa_score to check the aggrement

In [None]:
#### For KMeans Label
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(df['Type'],df1['Class'] )

In [None]:
##### For Agglomerative Label
cohen_kappa_score(df['Type'],df2['label'])

In [None]:
# Standardized features without the K-Means cluster column, used as the input
# for the silhouette scores. The column is named through `columns=` because
# pandas no longer accepts `axis` as a positional argument to drop().
df3=df1.drop(columns='Class')
df3.head()

In [None]:
##### Finding Silhouette Score 

In [None]:
# Silhouette score of the K-Means assignment, measured on the standardized
# features in df3.
from sklearn.metrics import silhouette_score
silhouette_score ( df3 , df1['Class']  )

In [None]:
# Silhouette score of the agglomerative assignment on the same features (df3).
silhouette_score ( df3 , df2['label']  )

A high silhouette score indicates that the objects are well matched to their own cluster
and poorly matched to neighboring clusters.
Hence we can say that K-Means is performing better than agglomerative clustering here.

##### Building models using KMeans outcomes

In [None]:
## Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,roc_auc_score
# The tree is seeded like the other stochastic steps in this notebook so that
# the reported accuracy is the same on every run.
dt = DecisionTreeClassifier(random_state=1)


In [None]:
from sklearn.model_selection import train_test_split

# Predictors are the standardized features; the target is the K-Means cluster
# index. 30% of the rows are held out for testing with a fixed seed.
x = df1.drop(columns='Class')
y = df1['Class']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)


In [None]:
# Train the decision tree on the training split and report accuracy on the
# held-out split. Only the hard predictions are needed for accuracy, so no
# class probabilities are computed.
dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)
print('accuracy_score fot test:',accuracy_score(y_test,y_pred))


In [None]:
## KNN model

from sklearn.neighbors import KNeighborsClassifier
# Default k-nearest-neighbours classifier trained on the training split and
# scored on the held-out split. Only the hard predictions are needed for
# accuracy, so no class probabilities are computed.
kn = KNeighborsClassifier()
kn.fit(x_train,y_train)
y_pred=kn.predict(x_test)
print('accuracy_score fot test:',accuracy_score(y_test,y_pred))


In [None]:
## Logistic regression

from sklearn.linear_model import LogisticRegression
# Default logistic regression trained on the training split and scored on the
# held-out split. Only the hard predictions are needed for accuracy, so no
# class probabilities are computed.
lr = LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
print('accuracy_score fot test:',accuracy_score(y_test,y_pred))


In [None]:
## SVC

from sklearn.svm import SVC
# Support-vector classifier trained on the training split and scored on the
# held-out split. Only the hard predictions are needed for accuracy, so
# predict_proba is not called.
svc= SVC(probability=True)
svc.fit(x_train,y_train)
y_pred=svc.predict(x_test)
print('accuracy_score fot test:',accuracy_score(y_test,y_pred))


From the above models we can see that the best model is Logistic Regression and the worst performing model is SVC

In [None]:
###### Using principal components (PCA) for model building

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Feature columns only (the `Type` label is removed) as the input for PCA.
# The preview is the last expression of the cell so that it is actually displayed.
df4=df.drop('Type',axis=1)
df4.head()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the feature columns before the eigen-decomposition and PCA.
X_std = StandardScaler().fit_transform(df4)

In [None]:
# covariance matrix
# np.cov treats each row as one variable, hence the transpose of the
# (samples x features) array.
cov_matrix = np.cov(X_std.T)
print('Covariance Matrix \n', cov_matrix)

In [None]:
# Eigen-decomposition of the covariance matrix. np.linalg.eig does not
# guarantee any ordering of the eigenvalues; they are sorted when the
# explained variance is computed.
eig_values,eig_vect = np.linalg.eig(cov_matrix)

In [None]:
# Show the eigenvectors (one per column) and their eigenvalues.
print('Eigen Vectors \n', eig_vect)
print('\n Eigen Values \n', eig_values)

In [None]:
# Percentage of the total variance carried by each eigenvalue, largest first,
# and the running total used to decide how many components to keep.
tot = sum(eig_values)
var_exp = []
for eigen_value in sorted(eig_values, reverse=True):
    var_exp.append((eigen_value / tot) * 100)
cum_var = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var)

From the above we can see that the first 6 principal components explain 95% of the variability in the data, hence we can use only 6 principal components instead of all the features.

In [None]:
# Project the standardized features onto 6 principal components.
pca1 = PCA(n_components=6).fit_transform(X_std)

In [None]:
## KNN model

from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier()
kn.fit(pca1,y)
y_pred=kn.predict(pca1)
print('accuracy_score fot test:',accuracy_score(y,y_pred))


In [None]:
# Logistic regression
lr = LogisticRegression()
lr.fit(pca1,y)
y_pred=lr.predict(pca1)
print('accuracy_score fot test:',accuracy_score(y,y_pred))

In [None]:
## Decision Tree
dt =DecisionTreeClassifier()
dt.fit(pca1,y)
y_pred=dt.predict(pca1)
print('accuracy_score fot test:',accuracy_score(y,y_pred))

In [None]:
## SVM
svc= SVC(probability=True)
svc.fit(pca1,y)
y_pred=svc.predict(pca1)
print('accuracy_score fot test:',accuracy_score(y,y_pred))



From the above accuracy scores we can see that the decision tree works really well with PCA and has outperformed logistic regression, but there might be an overfitting issue.

##### Performing modeling using the labels given by Agglomerative clustering

In [None]:
# Preview the standardized features with the agglomerative `label` column.
df2.head()

In [None]:
# Predictors are the standardized features; the target is the agglomerative
# cluster index.
x = df2.drop(columns='label')
y = df2['label']

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)


In [None]:
## KNN model

from sklearn.neighbors import KNeighborsClassifier
# Default k-nearest-neighbours classifier trained on the training split and
# scored on the held-out split. Only the hard predictions are needed for
# accuracy, so no class probabilities are computed.
kn = KNeighborsClassifier()
kn.fit(x_train,y_train)
y_pred=kn.predict(x_test)
print('accuracy_score fot test:',accuracy_score(y_test,y_pred))


In [None]:
## Logistic model

# Default logistic regression trained on the training split and scored on the
# held-out split. Only the hard predictions are needed for accuracy, so no
# class probabilities are computed.
lr = LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
print('accuracy_score fot test:',accuracy_score(y_test,y_pred))


In [None]:
## Decision tree

# The tree is seeded so that the reported accuracy is the same on every run.
# Only the hard predictions are needed for accuracy, so no class probabilities
# are computed.
dt =DecisionTreeClassifier(random_state=1)
dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)
print('accuracy_score fot test:',accuracy_score(y_test,y_pred))


In [None]:
## SVM

# Support-vector classifier fitted on the training split; accuracy is reported
# on the held-out split.
svc = SVC(probability=True).fit(x_train, y_train)
y_pred = svc.predict(x_test)
print('accuracy_score fot test:', accuracy_score(y_test, y_pred))


With the labels predicted by hierarchical clustering, logistic regression also performs well.

In [None]:
### With PCA

In [None]:
# Fit the scaler on the training split only and apply the same transform to the
# test split.
# NOTE(review): x_train/x_test were split from df2, which was already z-scored,
# and they are overwritten here with the scaler's output, so re-running the
# earlier model cells after this one would use the rescaled data.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Inspect the rescaled training features.
x_train

In [None]:
# Keep 6 principal components, the same number used for the K-Means-label
# models above. With no n_components, PCA keeps every component and therefore
# only rotates the data without reducing its dimensionality.
# The projection is learned from the training split and then applied to the
# test split.
pca2 = PCA(n_components=6)
x_train_2 = pca2.fit_transform(x_train)
x_test_2 = pca2.transform(x_test)

In [None]:
## Decision tree
# Seeded so that the reported accuracy is the same on every run.
dt=DecisionTreeClassifier(random_state=1)
dt.fit(x_train_2,y_train)
y_pred = dt.predict(x_test_2)
print("Accuracy Score:",accuracy_score(y_test, y_pred) )

In [None]:
## SVM

# Support-vector classifier on the PCA-projected training split, scored on the
# projected test split.
svc = SVC().fit(x_train_2, y_train)
y_pred = svc.predict(x_test_2)
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
## logistic
# Logistic regression on the PCA-projected training split, scored on the
# projected test split.
lr = LogisticRegression().fit(x_train_2, y_train)
y_pred = lr.predict(x_test_2)
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
## KNN
# k-nearest-neighbours on the PCA-projected training split, scored on the
# projected test split.
kn = KNeighborsClassifier().fit(x_train_2, y_train)
y_pred = kn.predict(x_test_2)
print("Accuracy Score:", accuracy_score(y_test, y_pred))

With PCA, logistic regression also seems to work better than the other models.

##### Overall observations

- The clustering done by K-Means seems to be better than hierarchical clustering, and it is also easy to interpret and to find the number of clusters.
- Analysing the number of clusters with a dendrogram becomes difficult as the number of observations increases.
- Overall, the logistic regression model performs better than all the other models, both with and without PCA.
- The decision tree performs well with PCA using the K-Means labels, but it might have an overfitting issue.

In [None]:
# Re-import pandas and load the same CSV into a second variable.
# NOTE(review): `glass` is not read anywhere in the visible notebook and the
# data is already held in `df`; this cell looks like a leftover. Confirm
# before removing.
import pandas as pd
glass = pd.read_csv("../input/glass/glass.csv")