# Assignment 2 

In this assignment, you will try to find groups of similar customers in the dataset included in the uploaded folder. The dataset contains information about credit card behaviour of customers.  


### 1. Importing required libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import  silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import seaborn as sns
from IPython.display import HTML, display
import tabulate

ModuleNotFoundError: No module named 'tabulate'

### 2. Reading the dataset

In [None]:
# Load the credit-card customer dataset; the CSV is expected next to the notebook.
data = pd.read_csv('CC GENERAL.csv')
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
data.info()
# Summary statistics, rendered as a rich table instead of plain text.
display(data.describe())
data.head()

### 3. Basic pre-processing

In [None]:
# Drop CUST_ID before clustering (presumably a per-customer identifier with no
# behavioural signal -- confirm against the dataset description).
# errors='ignore' keeps this cell re-runnable once the column is already gone.
data = data.drop(columns='CUST_ID', errors='ignore')
# Forward-fill missing values from the previous row. DataFrame.ffill() replaces
# the deprecated fillna(method='ffill'); the trailing bfill() covers NaNs in the
# first rows, which a forward fill alone cannot reach.
data = data.ffill().bfill()
# Show the cleaned frame (as the last expression so it is actually displayed).
data.head()

### Scale the data 

In [None]:
# Scale every feature to the [0, 1] range so that no single column dominates
# the distance computations used by the clustering algorithms below.
mms = MinMaxScaler()
# Build a fresh float DataFrame instead of assigning into `data[:]`: writing
# float values into integer-typed columns in place relies on silent upcasting,
# which recent pandas versions deprecate.
data = pd.DataFrame(mms.fit_transform(data), columns=data.columns, index=data.index)

In [None]:
# Preview the first five rows of the scaled frame.
data.head(n=5)

## 4. Finding groups

### Approach 1 [ 1.5 + 3.5 marks ] 

Plot the dendrograms for the above scaled dataset points. (hint: use scipy.cluster.hierarchy imported above). 
Plot using the following parameters:
1. ward
2. complete
3. average <br> 
Study the dendrograms and comment on the major differences between them.

In [None]:
# Ward linkage dendrogram, truncated to the top 5 levels of the tree.
# Height at which the tree is cut; the same value drives both the link colours
# and the dashed red line so that the coloured sub-trees match the chosen clusters.
cut_height = 35
Z = sch.linkage(data, method='ward')
fig, ax = plt.subplots(figsize=(15, 10))
sch.dendrogram(Z, truncate_mode='level', p=5, color_threshold=cut_height,
               leaf_rotation=90, leaf_font_size=10, ax=ax)
ax.axhline(y=cut_height, color='r', linestyle='--')
ax.set_title("Linkage: Ward")
ax.set_ylabel("Merge distance")
plt.show()

<b>Conclusion:</b> The optimal no. of clusters is <b> 2 </b> as per the above dendrogram for Ward Linkage. 

In [None]:
# Complete linkage dendrogram, truncated to the top 5 levels of the tree.
# Height at which the tree is cut; the same value drives both the link colours
# and the dashed red line so that the coloured sub-trees match the chosen clusters.
cut_height = 2.2
Z = sch.linkage(data, method='complete')
fig, ax = plt.subplots(figsize=(15, 10))
sch.dendrogram(Z, truncate_mode='level', p=5, color_threshold=cut_height,
               leaf_rotation=90, leaf_font_size=10, ax=ax)
ax.axhline(y=cut_height, color='r', linestyle='--')
ax.set_title("Linkage: Complete")
ax.set_ylabel("Merge distance")
plt.show()

<b>Conclusion:</b> The optimal no. of clusters is <b> 6 </b> as per the above dendrogram for Complete Linkage. 

In [None]:
# Average linkage dendrogram, truncated to the top 5 levels of the tree.
# Height at which the tree is cut; the same value drives both the link colours
# and the dashed red line so that the coloured sub-trees match the chosen clusters.
cut_height = 1.75
Z = sch.linkage(data, method='average')
fig, ax = plt.subplots(figsize=(15, 10))
sch.dendrogram(Z, truncate_mode='level', p=5, color_threshold=cut_height,
               leaf_rotation=90, leaf_font_size=10, ax=ax)
ax.axhline(y=cut_height, color='r', linestyle='--')
ax.set_title("Linkage: Average")
ax.set_ylabel("Merge distance")
plt.show()

<b>Conclusion:</b> The optimal no. of clusters is <b> 3 </b> as per the above dendrogram for Average Linkage. 

With the help of dendrograms obtained above, choose a suitable **k** for each linkage type. Experiment with different distance measures as mentioned below: <br>
1. Euclidean 
2. Manhattan 
3. Cosine <br> 
Calculate the cluster quality for each case and report your results in an organized, tabular format. The table should have the parameters used, cluster means and cluster quality.  

In [None]:
# clustering (hint: Use AgglomerativeClustering imported above)
# Number of clusters per linkage, as read off the dendrograms above.
options = {'ward': 2, 'complete': 6, 'average': 3}
distance_measures = ('euclidean', 'manhattan', 'cosine')

# scikit-learn 1.2 renamed the `affinity` keyword to `metric` and 1.4 removed
# `affinity` altogether, so use whichever keyword the installed version accepts.
metric_kwarg = 'metric' if 'metric' in AgglomerativeClustering().get_params() else 'affinity'

results = []
for linkage, no_of_clusters in options.items():
    for distance in distance_measures:
        # Ward linkage is only defined for Euclidean distance; record the other
        # combinations as missing (NaN keeps the score column numeric).
        if linkage == 'ward' and distance != 'euclidean':
            results.append((linkage, no_of_clusters, distance, np.nan))
            continue

        model = AgglomerativeClustering(n_clusters=no_of_clusters, linkage=linkage,
                                        **{metric_kwarg: distance})
        labels = model.fit_predict(data)
        # The silhouette is computed with silhouette_score's default metric for
        # every row, so scores are comparable across distance measures.
        silhouette_avg = silhouette_score(data, labels)
        results.append((linkage, no_of_clusters, distance, silhouette_avg))

df = pd.DataFrame(results, columns=['Linkage', 'No. of Clusters', 'Distance Measure', 'Silhouette Score'])
display(df)

<b>Conclusion:</b> As per the above table, we can conclude that <b>Euclidean</b> distance performs better for the Ward and Complete linkages, while <b>Manhattan</b> distance performs better for the Average linkage.

### Approach 2 [ 3 marks] 
Solve the same problem using a density based approach. Experiment with the following values of eps and minpts:<br> 
1. eps = 0.1, min_pts = 5
2. eps = 0.5, min_pts = 3
3. eps = 0.8, min_pts = 5 <br>
Analyze the results and comment on how the clustering changes as the above parameters are varied. Report the cluster quality for all the cases using the same measure as used for approach 1. Report results in a tabular format with parameters used, number of noise and core points and cluster quality. 

In [None]:
# clustering
# Run DBSCAN for each (eps, min_samples) pair and collect, per run, the number
# of clusters, noise points, core points and the silhouette score.
results = []
for eps, min_pts in ((0.1, 5), (0.5, 3), (0.8, 5)):
    db = DBSCAN(eps=eps, min_samples=min_pts).fit(data)
    labels = db.labels_

    distinct_labels = set(labels)
    # DBSCAN marks noise points with the label -1; it is not a cluster.
    n_clusters = len(distinct_labels - {-1})
    n_noise = int(np.sum(labels == -1))
    n_core = len(db.core_sample_indices_)

    # silhouette_score needs between 2 and n_samples - 1 distinct labels and
    # raises otherwise (e.g. when everything is noise or one single cluster).
    # Note that noise points (-1) are scored as if they formed their own group.
    if 1 < len(distinct_labels) < len(labels):
        score = silhouette_score(data, labels)
    else:
        score = np.nan

    results.append((eps, min_pts, n_clusters, n_noise, n_core, score))

df = pd.DataFrame(results, columns=['EPS', 'Min Samples', 'No. of Clusters',
                                    'No. of Noise Points', 'No. of Core Points',
                                    'Silhouette Score'])
display(df)

<b>Conclusion:</b> As per the above table, we can conclude that EPS <b>0.8</b> with Min Samples <b>5</b> performs best for the DBSCAN method, and it results in <b>4</b> clusters.

## 5. Visualization [ 2 marks ] 
To visualize the clusters, reduce the data to 2 dimensions using PCA. Make a scatterplot with different colours for each cluster obtained. Make one visualization each for approach 1 and 2 (the parameters which gave the best cluster quality for each). 

In [None]:
# visualization - project the scaled data onto its first two principal
# components; the resulting columns P1 and P2 are the scatterplot coordinates.
pca = PCA(n_components=2)
X_principal = pd.DataFrame(pca.fit_transform(data), columns=['P1', 'P2'])


### Hierarchical Clustering

In [None]:
# Hierarchical clustering
# Fit on the full scaled feature set (the data evaluated in approach 1) and use
# the PCA coordinates only for plotting. Fitting on the 2-D projection would
# show a different clustering from the one that was scored.
# The AgglomerativeClustering defaults (Ward linkage, Euclidean distance) with
# n_clusters=2 correspond to the Ward configuration in the results table.
hier_labels = AgglomerativeClustering(n_clusters=2).fit_predict(data)

fig, ax = plt.subplots()
ax.scatter(X_principal['P1'], X_principal['P2'], c=hier_labels, cmap=plt.cm.winter)
ax.set_title("Hierarchical clustering (2 clusters) in PCA space")
ax.set_xlabel("P1")
ax.set_ylabel("P2")
plt.show()

### DBScan Clustering

In [None]:
# DBSCAN
# Fit on the full scaled feature set (the data evaluated in approach 2) and use
# the PCA coordinates only for plotting. eps is a distance in feature space, so
# running DBSCAN on the 2-D projection with the same eps is a different
# clustering from the one reported in the table.
dbscan_labels = DBSCAN(eps=0.8, min_samples=5).fit_predict(data)

fig, ax = plt.subplots()
ax.scatter(X_principal['P1'], X_principal['P2'], c=dbscan_labels, cmap=plt.cm.winter)
ax.set_title("DBSCAN (eps=0.8, min_samples=5) in PCA space")
ax.set_xlabel("P1")
ax.set_ylabel("P2")
plt.show()