# K-Means and Hierarchical Clustering Implementation

* Clustering algorithms are used for unlabelled datasets.
* This is an implementation example of clustering algorithms. We'll use the K-Means and Hierarchical clustering algorithms to separate the cancer data by "perimeter_mean" and "area_mean".

## Index of contents

* [DATA EXPLORATION](#1)
* [K-MEANS CLUSTERING](#2)
* [HIERARCHICAL CLUSTERING](#3)

In [None]:
#1. Choose 3 features
#2. Plot them to see how they are distributed with respect to the label (diagnosis)
#3. Fit an appropriate clustering model on the features
#4. For each sample, replace the 3 features with the centroid of the cluster the sample belongs to
#5. Fit a classifier with the original features and with the features modified by k-means
#6. Compare the performance difference

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble




<a id="1"></a> 
**DATA EXPLORATION**

In [None]:
# Load the dataset CSV from the input directory into a pandas DataFrame.
data = pd.read_csv("../input/data.csv")

In [None]:
data

In [None]:
data.head(5)

In [None]:
data.columns

In [None]:
data.info()

In [None]:
# Remove, in place, the two columns that carry no information for the
# analysis: the row identifier ("id") and the NaN column ("Unnamed: 32").
data.drop(columns = ["Unnamed: 32", "id"], inplace = True)
data.head()

In [None]:
data.info()

In [None]:
# Count how many samples carry each diagnosis label.
data["diagnosis"].value_counts()

# The dataset has 357 samples labelled B and 212 samples labelled M.

In [None]:
# Clustering is unsupervised, so the diagnosis label is left out of the
# feature frame; the clusters found later play the role of the labels.

dataWithoutLabels = data.drop(columns = ["diagnosis"])
dataWithoutLabels.head()

In [None]:
dataWithoutLabels.info()

In [None]:
# perimeter_mean and area_mean are the two features plotted here and used for
# the 2-D cluster views below. Before clustering, look at how they are
# distributed for each diagnosis label.

sns.pairplot(data.loc[:,['perimeter_mean','area_mean', 'diagnosis']], hue = "diagnosis", height = 5)
plt.show()

In [None]:
# The same two features without the diagnosis colouring: this is the view a
# clustering algorithm gets, since it never sees the labels.

plt.figure(figsize = (10, 10))
plt.scatter(dataWithoutLabels["perimeter_mean"], dataWithoutLabels["area_mean"])
plt.xlabel('perimeter_mean')
plt.ylabel('area_mean')
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples,silhouette_score

# Elbow method: fit KMeans for each candidate k on every column of
# dataWithoutLabels and record the inertia (within cluster sum of squares).
# The candidate range is defined once so the fitting loop and the plot's
# x-axis cannot drift apart, and random_state pins the centroid
# initialisation so the curve is the same on every run.
elbow_k_values = range(1, 15)
wcss = [] # within cluster sum of squares

for k in elbow_k_values:
    kmeansForLoop = KMeans(n_clusters = k, random_state = 42)
    kmeansForLoop.fit(dataWithoutLabels)
    wcss.append(kmeansForLoop.inertia_)

plt.figure(figsize = (10, 10))
plt.plot(elbow_k_values, wcss)
plt.xlabel("K value")
plt.ylabel("WCSS")
plt.show()

In [None]:
#centroids.shape

In [None]:
%matplotlib inline
for i,k in enumerate([2,5,7,10,14]):
    fig, ax = plt.subplots(1,2,figsize=(15,5))
    
    # Run the kmeans algorithm
    km = KMeans(n_clusters=k)
    y_predict = km.fit_predict(dataWithoutLabels)
    centroids  = km.cluster_centers_
    
    
    y_ticks = []
    y_lower = y_upper = 0
    silhouette_vals = silhouette_samples(dataWithoutLabels,y_predict)
    for i,cluster in enumerate(np.unique(y_predict)):
        cluster_silhouette_vals = silhouette_vals[y_predict ==cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)

        ax[0].barh(range(y_lower,y_upper),
        cluster_silhouette_vals,height =1)   
        ax[0].text(-0.03,(y_lower+y_upper)/2,str(i+1))
        y_lower += len(cluster_silhouette_vals)       
        # Get the average silhouette score    
        avg_score = np.mean(silhouette_vals)
        ax[0].axvline(avg_score,linestyle ='--',linewidth =2,color = 'green')
        ax[0].set_yticks([])
        ax[0].set_xlim([-0.1, 1])
        ax[0].set_xlabel('Silhouette coefficient values')
        ax[0].set_ylabel('Cluster labels')
        ax[0].set_title('Silhouette plot for the various clusters');


        # scatter plot of data colored with labels

        ax[1].scatter(dataWithoutLabels['perimeter_mean'],dataWithoutLabels['area_mean'] , c = y_predict)    
        ax[1].scatter(centroids[:,0],centroids[:,1],marker = '*' , c= 'r',s =250);
        ax[1].set_xlabel('Eruption time in mins')
        ax[1].set_ylabel('Waiting time to next eruption')
        ax[1].set_title('Visualization of clustered data', y=1.02)

        plt.suptitle(f' Silhouette analysis using k = {k}',fontsize=16,fontweight = 'semibold')

In [None]:
from sklearn.metrics import silhouette_samples,silhouette_score
# Recompute the per-sample silhouette coefficients for the cluster labels
# currently held in y_predict (whatever the most recently executed
# clustering cell left behind).
silhouette_vals = silhouette_samples(dataWithoutLabels,y_predict)


In [None]:
dataWithoutLabels.shape

In [None]:
silhouette_vals.shape

In [None]:
dataWithoutLabels.columns

In [None]:
import matplotlib.pyplot as plt



In [None]:
# Cluster on the two plotted features only, with k = 2
# (presumably the elbow read off the WCSS plot - confirm against the plot).

# .copy() makes the two-column frame independent of `data`, so adding the
# "type" column below is never treated as a write into a slice of `data`
# (avoids pandas' chained-assignment warning).
dataWithoutLabels = data.loc[:,['perimeter_mean','area_mean']].copy()
# random_state pins the centroid initialisation so the cluster ids
# (0 / 1) are the same on every run.
kmeans = KMeans(n_clusters = 2, random_state = 42)
clusters = kmeans.fit_predict(dataWithoutLabels)
dataWithoutLabels["type"] = clusters
dataWithoutLabels["type"].unique()

In [None]:
# Draw the k = 2 result: one scatter call per cluster id, each in its own colour.

plt.figure(figsize = (15, 10))
for cluster_id, colour in ((0, "red"), (1, "green")):
    members = dataWithoutLabels[dataWithoutLabels["type"] == cluster_id]
    plt.scatter(members["perimeter_mean"], members["area_mean"], color = colour)
plt.xlabel('perimeter_mean')
plt.ylabel('area_mean')
plt.show()

In [None]:
# Samples coloured by their k-means cluster, with the fitted cluster
# centroids overlaid in red. `kmeans` here must be the model fitted on the
# two-column frame, so centroid columns 0 and 1 are perimeter_mean and area_mean.

plt.figure(figsize = (15, 10))
plt.scatter(dataWithoutLabels["perimeter_mean"], dataWithoutLabels["area_mean"], c = clusters, alpha = 0.5)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], color = "red", alpha = 1)
plt.xlabel('perimeter_mean')
plt.ylabel('area_mean')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Cluster all features (everything except the diagnosis) after
# standardising them, then cross-tabulate the cluster ids against the true
# diagnosis to see how well the two clusters line up with the labels.
dataWithoutDiagnosis = data.drop(["diagnosis"], axis = 1)
scalar = StandardScaler()
# random_state pins the centroid initialisation: `labels` is stored in `df`
# and used further down as the classifier input, so it should not change
# from one run to the next.
kmeans = KMeans(n_clusters = 2, random_state = 42)
pipe = make_pipeline(scalar, kmeans)
pipe.fit(dataWithoutDiagnosis)
labels = pipe.predict(dataWithoutDiagnosis)
df = pd.DataFrame({'labels': labels, "diagnosis" : data['diagnosis']})
ct = pd.crosstab(df['labels'], df['diagnosis'])
print(ct)

In [None]:
#dataWithoutTypes.info()

In [None]:
# Strip the k-means cluster id again so hierarchical clustering starts
# from the plain feature columns.
dataWithoutTypes = dataWithoutLabels.drop(columns = ["type"])
dataWithoutTypes.head()

In [None]:
from scipy.cluster.hierarchy import linkage,dendrogram
# Build the hierarchical merge tree with Ward linkage and draw it as a
# dendrogram; leaf labels are rotated 90 degrees.
merg = linkage(dataWithoutTypes, method = "ward")
dendrogram(merg, leaf_rotation = 90)
plt.xlabel("data points")
plt.ylabel("euclidean distance")
plt.show()


In [None]:
from sklearn.cluster import AgglomerativeClustering
# Ward linkage only works with Euclidean distances, which is also the
# default, so no distance argument is passed. The old `affinity` keyword was
# deprecated in favour of `metric` and is rejected by current scikit-learn
# releases; omitting it works on old and new versions alike.
hc = AgglomerativeClustering(n_clusters = 2, linkage = "ward")
# Fit on the two feature columns explicitly so that re-running this cell
# does not feed the previously assigned "label" column back into the model.
cluster = hc.fit_predict(dataWithoutTypes[["perimeter_mean", "area_mean"]])
dataWithoutTypes["label"] = cluster

In [None]:
# Size of each hierarchical cluster.
dataWithoutTypes["label"].value_counts()

In [None]:
# Hierarchical clustering result: one scatter call per cluster label,
# each drawn in its own colour.

plt.figure(figsize = (15, 10))
for hc_label, colour in ((0, "red"), (1, "blue")):
    group = dataWithoutTypes[dataWithoutTypes.label == hc_label]
    plt.scatter(group["perimeter_mean"], group["area_mean"], color = colour)
plt.xlabel("perimeter_mean")
plt.ylabel("area_mean")
plt.show()

In [None]:
# Split df into target (last column) and features (all other columns).
# NOTE(review): df is built from the k-means pipeline output and has only
# the columns 'labels' and 'diagnosis', so X is the single cluster-label
# column and y is the diagnosis. Every classifier below is therefore trained
# on one binary feature - confirm this is intended rather than the original
# measurement columns.
y= df.iloc[:,-1] #class variable
X = df.iloc[:,:-1]


In [None]:
# Build a shuffled 5-fold splitter and print the size of the train/test
# partition produced for every fold.
# NOTE(review): 'kf' is not passed to cross_val_score() anywhere in the
# visible cells, which all use an integer cv instead.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# split() yields one (train indices, test indices) pair per fold.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

In [None]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. stratify=y keeps the class
# proportions of y the same in both partitions; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)


In [None]:
# Show the class balance of the full target and of each partition.
for heading, target in (
    ('Total count for each class:\n', y),
    ("\nCount of each class in train data:\n", y_train),
    ("\nCount of each class in test data:\n", y_test),
):
    print(heading, target.value_counts())

In [None]:
# Function to plot ROC curve and classification score which will be used for each model

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

def plot_roc(fpr,tpr):
    """Plot a ROC curve together with the diagonal reference line.

    Parameters
    ----------
    fpr : array-like
        False positive rates, used as x coordinates.
    tpr : array-like
        True positive rates, used as y coordinates.

    The figure is shown immediately; nothing is returned.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, label='ROC', color='green')
    # dashed diagonal from (0, 0) to (1, 1) as a visual baseline
    plt.plot(diagonal, diagonal, linestyle='--', color='yellow')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()

def clf_score(clf):
    """Evaluate a fitted classifier on the held-out test split.

    Reads the notebook-level ``X_test`` and ``y_test``. Prints the ROC AUC,
    plots the ROC curve via ``plot_roc`` and prints the classification
    report for the test predictions.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict``, ``predict_proba`` and ``classes_``.

    Returns
    -------
    float
        ROC AUC computed from the probabilities of the second class.
    """
    # Column 1 of predict_proba holds the probability of clf.classes_[1], so
    # that class must also be the positive label when tracing the ROC curve.
    # A hard-coded label that does not occur in y_test would leave the curve
    # without any positive samples.
    positive_class = clf.classes_[1]
    prob = clf.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, prob)
    print('AUC: %.2f' % auc)

    fpr, tpr, _ = roc_curve(y_test, prob, pos_label=positive_class)
    plot_roc(fpr, tpr)

    predicted = clf.predict(X_test)
    print(classification_report(y_test, predicted))
    return auc

# Logistic Regression

In [None]:
# Logistic Regression
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression #import the package
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Manual sweep over the inverse regularisation strength C: for every value,
# fit an L2-penalised logistic regression and print the classification
# report on both the training and the test split.
num_C = [0.001, 0.01, 0.1, 1, 10, 100]
for c_value in num_C:
    clf = LogisticRegression(penalty='l2', C=c_value, random_state=0)
    clf.fit(X_train, y_train)
    print('C:', c_value)
    train_report = classification_report(y_train, clf.predict(X_train))
    print('Training metric:\n' + train_report)
    test_report = classification_report(y_test, clf.predict(X_test))
    print('Test metric:\n' + test_report)
    print('')

In [None]:
# Hyperparameter tuning: 3-fold cross-validated grid search over C for an
# L2-penalised (ridge) logistic regression, ranked by ROC AUC.

grid={"C":np.logspace(-3,3,7), "penalty":["l2"]}  # l2 ridge

lsr = LogisticRegression()
clf_lsr_cv = GridSearchCV(lsr,grid,cv=3,scoring='roc_auc')
clf_lsr_cv.fit(X_train,y_train)

# Print the optimum hyperparameter values and their score. best_score_ is
# the mean cross-validated value of the `scoring` metric, i.e. ROC AUC here,
# so it is labelled as such rather than as accuracy.
print("tuned hyperparameters :(best parameters) ",clf_lsr_cv.best_params_)
print("best cross-validated roc_auc :",clf_lsr_cv.best_score_)

# Classification reports of the refitted best estimator on both splits.
print('Training metric:\n'+ classification_report(y_train, clf_lsr_cv.best_estimator_.predict(X_train)))
print('Test metric:\n'+ classification_report(y_test, clf_lsr_cv.best_estimator_.predict(X_test)))

In [None]:
# Fit the final logistic regression and report AUC, ROC curve and
# classification report on the test split.
# NOTE(review): C=0.01 is hard-coded here rather than taken from
# clf_lsr_cv.best_params_ - confirm it matches the grid-search result.

lsr_best = LogisticRegression(penalty='l2',C=0.01,random_state = 0)
lsr_clf = lsr_best.fit(X_train,y_train)
clf_score(lsr_clf)

# KNN

In [None]:
#K-Nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection  import cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error

# Candidate neighbour counts: odd integers 1, 3, ..., 19, so that a majority
# vote between two classes cannot end in a tie.
k_range = np.arange(1, 20, 2)
scores = [] # will hold the mean cross-validation score for each k
k_range

In [None]:
# Finding the best k with cross-validation on the training split.
# cv=3 in cross_val_score sets the number of folds.

# Start from an empty list every time this cell runs; appending to the list
# created in an earlier cell would make `scores` longer than `k_range` on a
# re-run and break the plots that pair the two.
scores = []
for k in k_range:
    # No explicit fit is needed here: cross_val_score fits its own copies of
    # the estimator on each training fold.
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn_clf, X_train, y_train, cv=3, n_jobs = -1)
    scores.append(score.mean())

# Error rate per k, i.e. 1 - mean cross-validation score. The name `mse` is
# kept because later cells refer to it.
mse = [1-x for x in scores]

In [None]:
print(mse)

In [None]:
# Plot error and cross-validation accuracy against k to pick the best k.
# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally, so lineplot(k_range, mse) fails there, while the keyword form
# works on old and new versions.

plt.figure(figsize=(20,8))
plt.subplot(121)
sns.lineplot(x=k_range,y=mse,markers=True,dashes=False)
plt.xlabel("Value of K")
plt.ylabel("Mean Squared Error")
plt.subplot(122)
sns.lineplot(x=k_range,y=scores,markers=True,dashes=False)
plt.xlabel("Value of K")
plt.ylabel("Cross Validation Accuracy")

plt.show()

In [None]:
# Fit the final KNN model on the training split.
# NOTE(review): n_neighbors=3 is hard-coded - confirm it is the k selected
# from the cross-validation plot above.

knn = KNeighborsClassifier(n_neighbors=3)

knn_clf = knn.fit(X_train,y_train)

In [None]:
# Checking AUC 

clf_score(knn_clf)


# Decision Tree

In [None]:
from sklearn import tree
from pprint import pprint

In [None]:
# For every candidate tree depth 1..19, record the mean score of a 5-fold
# cross-validation on the training split.

dep_rng = list(range(1, 20))
depth_score = [
    cross_val_score(
        estimator=tree.DecisionTreeClassifier(max_depth=depth),
        X=X_train,
        y=y_train,
        cv=5,
        n_jobs=-1,
    ).mean()
    for depth in dep_rng
]
print(depth_score)

In [None]:
# Plot mean cross-validation score against tree depth.
# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally, so the positional form fails there.

plt.figure(figsize=(8,6))
sns.lineplot(x=dep_rng,y=depth_score,markers=True,dashes=False)
plt.xlabel("Depth")
plt.ylabel("Cross Validation Accuracy")

plt.show()

In [None]:
# Fit a decision tree limited to depth 5 on the training split.
# NOTE(review): the depth is hard-coded - confirm it matches the depth plot.

dt = tree.DecisionTreeClassifier(max_depth = 5)
dt_clf = dt.fit(X_train,y_train)

# Print AUC, plot the ROC curve and print the test classification report.
clf_score(dt_clf)

# Random Forest

In [None]:
#Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Using grid search cv (5 folds, ranked by ROC AUC) to find the best
# number of trees and maximum depth for the random forest.

param = {'n_estimators': [50, 60, 30], 'max_depth': [5,4, 3]}
rfc = RandomForestClassifier()
clf_rfc_cv = GridSearchCV(rfc, param, cv=5,scoring='roc_auc', n_jobs=-1)
clf_rfc_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",clf_rfc_cv.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it is labelled as such rather than as accuracy.
print("best cross-validated roc_auc :",clf_rfc_cv.best_score_)
print('Training metric:\n'+ classification_report(y_train, clf_rfc_cv.best_estimator_.predict(X_train)))
print('Test metric:\n'+ classification_report(y_test, clf_rfc_cv.best_estimator_.predict(X_test)))


In [None]:
from sklearn.ensemble import RandomForestClassifier


# Fit the final random forest on the training split.
# NOTE(review): max_depth and n_estimators are hard-coded - confirm they
# match clf_rfc_cv.best_params_.
rf = RandomForestClassifier(max_depth=5, n_estimators=30)
RFC_clf = rf.fit(X_train,y_train)

# Classification reports on the training and the test split.
print('Training metric:\n'+ classification_report(y_train, rf.predict(X_train)))
print('Test metric:\n'+ classification_report(y_test, rf.predict(X_test)))



In [None]:
#Plotting ROC for the fitted random forest (RFC_clf), not the decision tree
clf_score(RFC_clf)

In [None]:
#import libraries

from xgboost import XGBClassifier
from scipy import stats

In [None]:
# Using grid search cv (3 folds, ranked by ROC AUC) to find the best
# number of boosting rounds and maximum tree depth.
# NOTE(review): y_train holds the string diagnosis labels; recent XGBoost
# releases may require integer-encoded classes - verify against the
# installed version.

xgbst = XGBClassifier()

param_xgb = {'n_estimators': [50,60],
             'max_depth': [5, 7]}

clf_xgb_cv = GridSearchCV(xgbst, param_xgb, cv=3,scoring='roc_auc', n_jobs=-1)
clf_xgb_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",clf_xgb_cv.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# i.e. ROC AUC here, so it is labelled as such rather than as accuracy.
print("best cross-validated roc_auc :",clf_xgb_cv.best_score_)

print('Training metric:\n'+ classification_report(y_train, clf_xgb_cv.best_estimator_.predict(X_train)))
print('Test metric:\n'+ classification_report(y_test, clf_xgb_cv.best_estimator_.predict(X_test)))

In [None]:
from xgboost import XGBClassifier

# Fit the final XGBoost model on the training split.
# NOTE(review): n_estimators=150 and min_child_weight=3 are not among the
# values searched by the grid above - confirm where they come from.
xgbst = XGBClassifier(n_estimators=150,max_depth=5,min_child_weight=3)

xgb_clf = xgbst.fit(X_train,y_train)

# Classification reports on the training and the test split.
print('Training metric:\n'+ classification_report(y_train, xgb_clf.predict(X_train)))
print('Test metric:\n'+ classification_report(y_test, xgb_clf.predict(X_test)))


In [None]:
# Re-create and fit the XGBoost model with the same hard-coded settings as
# the previous cell.
clf = XGBClassifier(n_estimators=150,max_depth=5,min_child_weight=3)  # initialise the model with the chosen hyperparameters
clf.fit(X_train, y_train)

# Print AUC, plot the ROC curve and print the classification report for the test split.
clf_score(clf)

# Conclusion

All of the classification models achieved the same ROC AUC score of 0.89.
