In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

from sklearn.decomposition import PCA
import sklearn.decomposition

import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

<div class="span5 alert alert-info">
<h2>Notebook details</h2>

<p> This notebook performs <b>clustering</b> for the <b>Mortgage customer segmentation</b> project.</p>
<p> The records are chosen using a train/test split. The loan amount is not considered as a feature. The salary and loan are used as bins.</p>
<p> Notes.</p>
<ol>
<li>Apply K Mean clustering algorithm to the data</li>
<li>Apply methods to choose best value of K</li> 
    <ul>
     <li>The Elbow Sum-of-Squares Method</li>
     <li>The Silhouette Method </li>
    </ul>
<li>Consider K Mean as baseline analysis </li>
<li>Apply test train split to reduce data size to one million records</li>
        
</ol>
</div>

In [None]:
pickle_file='df_selectdata_ML.sa'
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the data has been read.
with open(pickle_file, "rb") as pickle_handle:
    df_selectdata = pickle.load(pickle_handle)

In [None]:
# Number of rows in the loaded data set (df_selectdata.info() gives a per-column summary)
len(df_selectdata)

## Helper methods

In [None]:
# Restrict the data to a single state so the clusters of each state can be analysed on a smaller frame
def getDataFrameforState(inputframe,stateName='CA'):
    """Return the rows of ``inputframe`` for ``stateName`` whose ``Accepted`` value is positive.

    ``reset_index`` is called without ``drop``, so the previous index is kept
    as a leading ``index`` column and the result gets a fresh 0..n-1 index.
    """
    in_state = inputframe[inputframe.StateName==stateName]
    accepted_rows = in_state[in_state.Accepted>0]
    return accepted_rows.reset_index()

In [None]:
# Build an unfitted K-Means estimator for the requested number of clusters
def getKmeansForCluster(noOfCluster, randomstate=1):
    """Return a new ``KMeans`` with ``noOfCluster`` clusters, seeded with ``randomstate``."""
    return KMeans(random_state=randomstate, n_clusters=noOfCluster)

In [None]:
# Fit the estimator on the features and hand back the cluster label of every row
def getFitPredictForKMean(kMean,xcols):
    """Return the result of ``kMean.fit_predict(xcols)``."""
    return kMean.fit_predict(xcols)

In [None]:
# Project the feature matrix onto its principal components and attach the
# K-Means label of every row, so each row carries plot coordinates and a cluster.
def getPCADataFrame(noOfCluster,kmeans_val,xcols,componetNum=2,sourceframe=None):
    """Build a frame with the PCA coordinates and the cluster label of each row.

    Parameters
    ----------
    noOfCluster : int
        Not used by the computation; kept so existing calls keep working.
    kmeans_val : array-like
        Cluster label for every row of ``xcols``.
    xcols : array-like
        Feature matrix with one row per record.
    componetNum : optional
        Passed to ``PCA(n_components=...)``.
    sourceframe : DataFrame, optional
        Frame whose first column seeds the result. Defaults to the
        notebook-level ``df_filterdata``, which is what the function always
        used before this parameter existed.

    Returns
    -------
    DataFrame
        First column of ``sourceframe``, the component columns (``x``, ``y``,
        then ``pc3``, ``pc4``, ...) and ``cluster_label``.

    Raises
    ------
    ValueError
        If ``sourceframe`` and ``xcols`` do not have the same number of rows.
    """
    if sourceframe is None:
        sourceframe = df_filterdata

    pca = PCA(n_components = componetNum)
    # Work on plain ndarrays: np.matrix is deprecated and recent scikit-learn releases refuse it.
    components = np.asarray(pca.fit_transform(np.asarray(xcols)))
    if len(sourceframe) != components.shape[0]:
        raise ValueError('sourceframe has %d rows but xcols has %d rows'
                         % (len(sourceframe), components.shape[0]))

    # Name the columns after the number of components actually produced,
    # so a componetNum other than 2 no longer fails.
    component_names = ['x', 'y'] + ['pc' + str(i) for i in range(3, components.shape[1] + 1)]
    component_names = component_names[:components.shape[1]]

    df_clusters = sourceframe.iloc[:, [0]].copy()
    for position, name in enumerate(component_names):
        # Assign by position. Assigning a Series built on a fresh 0..n-1 index
        # aligns on index labels and scrambles the coordinates (or yields NaN)
        # whenever sourceframe carries a shuffled index.
        df_clusters[name] = components[:, position]
    df_clusters['cluster_label'] = kmeans_val

    return df_clusters

In [None]:
# Plot the average silhouette score obtained for each candidate number of clusters
def drawAverageSilhouetteScore(range_n_clusters,silhouette_avgscores):
    """Draw ``silhouette_avgscores`` against ``range_n_clusters`` on a new 6x6 inch figure."""
    fig, axis = plt.subplots(1,1,figsize=(6,6),dpi=100)
    axis.plot(range_n_clusters,silhouette_avgscores)
    axis.set_xlabel('$K$')
    axis.set_ylabel('Average Silhouette Score')
    axis.set_title('Average Silhouette Scores for KMeans clustering')

In [None]:
# Bar chart of cluster sizes: x axis is the cluster number, y axis the number of rows assigned to it
def drawClusterBar(noOfCluster,kmeans_val):
    """Draw how many points fall into each cluster label found in ``kmeans_val``.

    ``noOfCluster`` is only used in the figure title.
    """
    points_per_cluster = pd.Series(kmeans_val).value_counts().sort_index()

    fig, axis = plt.subplots(1,1,figsize=(6,6),dpi=100)
    points_per_cluster.plot(kind='bar', ax=axis)
    axis.set_ylabel('Number of Points')
    axis.set_xlabel('Cluster')
    axis.set_title('No of points for Clusters($K$ = '+str(noOfCluster)+')')

In [None]:
# Scatter plot of the rows in PCA space, coloured by cluster.
# Expects the frame produced by getPCADataFrame (columns x, y and cluster_label).
def drawClusterPlot(df_clusters):
    """Draw ``df_clusters`` as an ``x``/``y`` scatter with one hue per ``cluster_label``.

    ``fit_reg=False`` switches the regression line off, so ``lmplot`` is used
    purely as a faceted scatter plot.
    """
    # seaborn renamed lmplot's ``size`` argument to ``height``; the old name is no
    # longer accepted by current releases. ``legend_out`` is left at its default (True).
    grid = sns.lmplot(data=df_clusters, x='x', y='y', hue='cluster_label',
                      fit_reg=False, legend=True, height=10)
    grid.set_axis_labels("Component 1", "Component 2")

In [None]:
# Elbow plot of the PCA explained variance, used to pick the number of components
def drawElbowPCAplot(xcols, maxComponents=31):
    """Plot the explained variance of every principal component of ``xcols``.

    Parameters
    ----------
    xcols : array-like
        Feature matrix with one row per record.
    maxComponents : int or None, optional
        Right-hand limit of the x axis. The default of 31 reproduces the
        previously hard-coded view; ``None`` shows every fitted component.
    """
    pca = PCA()
    # Plain ndarray input: np.matrix is deprecated and recent scikit-learn releases refuse it.
    pca.fit(np.asarray(xcols))
    if maxComponents is None:
        maxComponents = len(pca.explained_variance_) - 1

    fig, axis = plt.subplots(1,1,figsize=(12,6),dpi=100)
    axis.plot(pca.explained_variance_)
    # The x axis is the component index, not the number of clusters K.
    axis.set_xlabel('Principal component index')
    axis.set_xticks(range(0, maxComponents + 2, 1))
    axis.set_xlim([0, maxComponents])
    axis.set_ylabel('Explained Variance')
    axis.set_title('Elbow Plot')

In [None]:
def getAllcolNameforDataframe(df):
    """Print every column label of ``df`` on its own line; returns nothing."""
    column_labels = list(df)
    for label in column_labels:
        print(label)

In [None]:
def getFilterDatasetForRowCount(df,noofRows,random=True, samplesize=0.5, randomstate=None):
    """Cap ``df`` at ``noofRows`` rows.

    Parameters
    ----------
    df : DataFrame
        Frame to reduce. Returned unchanged when it has at most ``noofRows`` rows.
    noofRows : int
        Maximum number of rows to return.
    random : bool, optional
        ``True`` shuffles the rows with ``train_test_split`` before the first
        ``noofRows`` are taken; ``False`` takes the leading rows as they are.
    samplesize : float, optional
        Used as both ``train_size`` and ``test_size`` of the split.
    randomstate : optional
        Seed forwarded to ``train_test_split``. ``None`` (default) keeps the
        previous unseeded behaviour; pass an int for a reproducible sample.

    Returns
    -------
    DataFrame
        In the random case the result always has a fresh 0..n-1 index, so
        label-based and positional access agree downstream.
    """
    if len(df) <= noofRows:
        return df
    if not random:
        return df.iloc[:noofRows,:]

    df1, df2 = train_test_split(df, shuffle=True, train_size=samplesize,
                                test_size=samplesize, random_state=randomstate)
    if len(df1) > noofRows:
        shuffled = df1
    else:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        shuffled = pd.concat([df1, df2], ignore_index=True)
    # Previously only the concatenated branch came back with a contiguous index.
    return shuffled.iloc[:noofRows,:].reset_index(drop=True)

### Processing Starts

In [None]:
# Cap the CA rows at one million: once after shuffling, once by taking the leading rows (the _NR frame)
noofRows=1000000
range_n_clusters = range(2,11)
df_filterdata=getFilterDatasetForRowCount(
    getDataFrameforState(df_selectdata,stateName='CA'),
    noofRows,
)
df_filterdata_NR=getFilterDatasetForRowCount(
    getDataFrameforState(df_selectdata,stateName='CA'),
    noofRows,
    random=False,
)

In [None]:
# Feature matrices shared by all later cells: column positions 8..95 of each frame.
# Plain ndarrays are used because np.matrix is deprecated by NumPy and
# recent scikit-learn releases refuse it as estimator input.
x_cols = np.asarray(df_filterdata.iloc[:,8:96])
x_cols_NR = np.asarray(df_filterdata_NR.iloc[:,8:96])

In [None]:
# For each frame print: row count, number of distinct index labels, rows in its feature matrix
for frame, features in ((df_filterdata, x_cols), (df_filterdata_NR, x_cols_NR)):
    print(len(frame))
    print(len(frame.index.unique()))
    print(len(features))

## K - Mean clustering

#### Elbow Method for K 
We can plot SS vs. $K$ and choose the *elbow point* in the plot as the best value for $K$. The elbow point is the point at which the plot starts descending much more slowly. 

In [None]:
# Fit K-Means for every candidate K and record the inertia (sum of squares) for the elbow plot
ss = []
for k in range_n_clusters:
    # The second argument of getKmeansForCluster is the random seed, not the data:
    # passing x_cols there made KMeans receive the feature matrix as random_state.
    kmeans = getKmeansForCluster(k)
    kmeans.fit(x_cols)
    ss.append(kmeans.inertia_)

In [None]:
# Largest and smallest sum of squares over the tested K values
print(max(ss), min(ss), sep='\n')

In [None]:
# Elbow plot: sum of squares against the number of clusters
fig, axis = plt.subplots(1,1,figsize=(8,8),dpi=100)
axis.plot(range_n_clusters, ss, 'ro-', linewidth = 1.0)
axis.set_xlim([1,12])
axis.set_xlabel('K')
# Take the limits from the values just computed; the previous literals were copied
# from one earlier run and no longer match once the sampled data changes.
axis.set_ylim([min(ss),max(ss)])
axis.set_ylabel('Sum of Squares(SS)')
axis.set_title('Elbow Method (2-10)');

#### Silhouette Method for K 
<pre>

| Range       | Interpretation                                |
|-------------|-----------------------------------------------|
| 0.71 - 1.0  | A strong structure has been found.            |
| 0.51 - 0.7  | A reasonable structure has been found.        |
| 0.26 - 0.5  | The structure is weak and could be artificial.|
| < 0.25      | No substantial structure has been found.      |

</pre>

In [None]:

silhouette_avgscores = []
# cluster labels per K, reused by the visualisation cells below
kmeans_predict_col={}
for n_clusters in range_n_clusters:
    # getKmeansForCluster seeds KMeans with its default random state (1),
    # so the labels are reproducible between runs.
    cluster_labels = getFitPredictForKMean(getKmeansForCluster(n_clusters),x_cols)
    kmeans_predict_col[n_clusters]=cluster_labels

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.
    # sample_size avoids a memory error on the full data; random_state fixes
    # which rows are sampled so the reported score does not drift between runs.
    silhouette_avg = silhouette_score(x_cols, cluster_labels,sample_size=50000,random_state=1)
    silhouette_avgscores.append(silhouette_avg)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

In [None]:
# Plot the average silhouette scores collected above against the tested cluster counts
drawAverageSilhouetteScore(range_n_clusters,silhouette_avgscores)

### Visualize cluster for range

In [None]:
# For every K: bar chart of the cluster sizes, then the clusters drawn in PCA space
for noofcluster in range_n_clusters:
    # reuse the labels cached by the silhouette cell, clustering again only when K is missing
    if noofcluster not in kmeans_predict_col:
        kmeans_val = getFitPredictForKMean(getKmeansForCluster(noofcluster),x_cols)
    else:
        kmeans_val = kmeans_predict_col[noofcluster]
    drawClusterBar(noofcluster,kmeans_val)
    df_pca_clusters = getPCADataFrame(noofcluster,kmeans_val,x_cols)
    drawClusterPlot(df_pca_clusters)

In [None]:
# Explained variance per principal component of the randomly sampled feature matrix
drawElbowPCAplot(x_cols)

### Choose the best Kmean and display the data frame with cluster

In [None]:
# Show the chosen K for the randomly sampled data
noofcluster=3
# Look the labels up in the cache of the random sample. The previous test used
# kmeans_predict_col_NR, which is only created further down, so a fresh
# Run All stopped here with a NameError.
if noofcluster in kmeans_predict_col:
    kmeans_val=kmeans_predict_col[noofcluster]
else:
    kmeans_val = getFitPredictForKMean(getKmeansForCluster(noofcluster),x_cols)
drawClusterBar(noofcluster,kmeans_val)
drawClusterPlot(getPCADataFrame(noofcluster,kmeans_val,x_cols))

# Not random records

In [None]:
# Fit K-Means on the non-random rows for every candidate K and record the inertia for the elbow plot
ss_NR = []
for k in range_n_clusters:
    # The second argument of getKmeansForCluster is the random seed, not the data:
    # passing x_cols_NR there made KMeans receive the feature matrix as random_state.
    kmeans = getKmeansForCluster(k)
    kmeans.fit(x_cols_NR)
    ss_NR.append(kmeans.inertia_)

In [None]:
# Largest and smallest sum of squares over the tested K values (non-random rows)
print(max(ss_NR), min(ss_NR), sep='\n')

In [None]:
# Elbow plot for the non-random rows: sum of squares against the number of clusters
fig, axis = plt.subplots(1,1,figsize=(8,8),dpi=100)
axis.plot(range_n_clusters, ss_NR, 'ro-', linewidth = 1.0)
axis.set_xlim([1,12])
axis.set_xlabel('K')
# Take the limits from ss_NR itself; the previous literals were the ones used
# for the randomly sampled data and need not contain this curve.
axis.set_ylim([min(ss_NR),max(ss_NR)])
axis.set_ylabel('Sum of Squares(SS)')
axis.set_title('Elbow Method (2-10)');

#### Silhouette Method for K 
<pre>

| Range       | Interpretation                                |
|-------------|-----------------------------------------------|
| 0.71 - 1.0  | A strong structure has been found.            |
| 0.51 - 0.7  | A reasonable structure has been found.        |
| 0.26 - 0.5  | The structure is weak and could be artificial.|
| < 0.25      | No substantial structure has been found.      |

</pre>

In [None]:
silhouette_avgscores_NR = []
# cluster labels per K for the non-random rows, reused by the visualisation cells below
kmeans_predict_col_NR = {}
for n_clusters in range_n_clusters:
    # Cluster the non-random feature matrix. This cell used to fit and score
    # x_cols, so it silently repeated the analysis of the random sample.
    # getKmeansForCluster seeds KMeans with its default random state (1).
    cluster_labels = getFitPredictForKMean(getKmeansForCluster(n_clusters),x_cols_NR)
    kmeans_predict_col_NR[n_clusters]=cluster_labels

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.
    # sample_size avoids a memory error on the full data; random_state fixes
    # which rows are sampled so the reported score does not drift between runs.
    silhouette_avg = silhouette_score(x_cols_NR, cluster_labels,sample_size=50000,random_state=1)
    silhouette_avgscores_NR.append(silhouette_avg)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

In [None]:
# Plot the average silhouette scores collected in silhouette_avgscores_NR against the tested cluster counts
drawAverageSilhouetteScore(range_n_clusters,silhouette_avgscores_NR)

### Visualize cluster for range

In [None]:
# For every K on the non-random rows: bar chart of the cluster sizes, then the clusters in PCA space
for noofcluster in range_n_clusters:
    # reuse the labels cached by the silhouette cell, clustering again only when K is missing
    if noofcluster not in kmeans_predict_col_NR:
        kmeans_val = getFitPredictForKMean(getKmeansForCluster(noofcluster),x_cols_NR)
    else:
        kmeans_val = kmeans_predict_col_NR[noofcluster]
    drawClusterBar(noofcluster,kmeans_val)
    df_pca_clusters = getPCADataFrame(noofcluster,kmeans_val,x_cols_NR)
    drawClusterPlot(df_pca_clusters)

In [None]:
# Explained variance per principal component of the non-random feature matrix
drawElbowPCAplot(x_cols_NR)

### Choose the best Kmean and display the data frame with cluster

In [None]:
# Show the chosen K for the non-random rows
noofcluster=3
if noofcluster in kmeans_predict_col_NR:
    kmeans_val=kmeans_predict_col_NR[noofcluster]
else:
    kmeans_val = getFitPredictForKMean(getKmeansForCluster(noofcluster),x_cols_NR)
drawClusterBar(noofcluster,kmeans_val)
# Project the same matrix the labels were computed on; x_cols belongs to the
# random sample, so its PCA coordinates do not correspond to these labels.
drawClusterPlot(getPCADataFrame(noofcluster,kmeans_val,x_cols_NR))

In [None]:
### Rough work below