In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

from sklearn.decomposition import PCA
import sklearn.decomposition

from sklearn.cluster import AffinityPropagation, SpectralClustering, AgglomerativeClustering, DBSCAN

import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

<div class="span5 alert alert-info">
<h2>Notebook details</h2>

<p> This notebook is for <b>Clustering</b> for <b>Mortagage custome segementation</b> project.</p>
<p> The records are choosen by test and train. The loan amount is considerd as one feature. The salary and loan are used as amount</p>
<p> Notes.</p>
<ol>
<li>Apply K Mean clustering algorithm to the data</li>
<li>Apply methods to choose best value of K</li> 
    <ul>
     <li>The Elbow Sum-of-Squares Method</li>
     <li>The Silhouette Method </li>
    </ul>
<li>Consider K Mean as baseline analysis </li>
<li>Apply test train split to reduce data size to one million records</li>
        
</ol>
</div>

## Helper methods

In [2]:
# To reduce the size of data set and allow the code to analyze cluster in each state the data frame filtered by State
def getDataFrameforState(inputframe,stateName='CA'):
    df=inputframe[inputframe.StateName==stateName]
    df=df[df.Accepted>0]
    #df = df.reset_index()
    return df

In [3]:
# get algo for no of cluster 
def getAlgoForCluster(algoType, noOfCluster, randomstate=10):
    if(algoType.lower()=='kmean'):
        kmean = KMeans(n_clusters=noOfCluster, random_state=randomstate)
    elif(algoType.lower()=='spectcluster'):
        return SpectralClustering(n_clusters=noOfCluster)
    elif(algoType.lower()=='affipropogation'):
        return  AffinityPropagation(damping=noOfCluster)
    elif(algoType.lower()=='algocluster'):
        return AgglomerativeClustering(n_clusters=noOfCluster)
    elif(algoType.lower()=='dbscan'):
        return DBSCAN(min_samples=noOfCluster)
    else :
        return None
        
    return kmean 

In [4]:
# get the fit predcit for Algo
def getFitPredictForAlgo(kMean,xcols):
    algo_val =kMean.fit_predict(xcols)
    return algo_val

In [5]:
# get the data frame  with PCA with component dimension and K mean for 
#one component as all x cols and another component as the unique value
# this provides the clutser for each row in data frame 
def getPCADataFrame(df,noOfCluster,algo_val,xcols,componetNum=2):
    pca = PCA(n_components = componetNum)
    matrix = np.matrix(pca.fit_transform(xcols))
    df_pca_matrix = pd.DataFrame(matrix)
    df_pca_matrix.columns = ['x','y']

    df_clusters = pd.DataFrame(df.iloc[:,0])
    #df_clusters['x'], df_clusters['y'] = df_pca_matrix['x'], df_pca_matrix['y']
    #df_clusters['cluster_label'] = algo_val
    #df_clusters['x']  = np.NAN
    df_clusters['x']  = df_pca_matrix['x'].values
    #df_clusters['y']  = np.NAN
    df_clusters['y']  = df_pca_matrix['y'].values
    df_clusters['cluster_label'] = algo_val

    return df_clusters

In [6]:
# method to draw average silhouette score as graph for ranges of clusterand avg score calculated
def drawAverageSilhouetteScore(range_n_clusters,silhouette_avgscores,algo_Name):
        fig, axis = plt.subplots(1,1,figsize=(6,6),dpi=100)
        _ = plt.plot(range_n_clusters,silhouette_avgscores)
        _ = plt.xlabel('$K$')
        _ = plt.ylabel('Average Silhouette Score')
        _ = plt.title('Average Silhouette Scores for '+algo_Name)

In [7]:
# draw bar graph for number of rows for each cluster X axis(cluster number), Y axis(no of rows in each cluster)
def drawClusterBar(noOfCluster,algo_val):
    cluster_algo_val = pd.Series(algo_val).value_counts().sort_index()
    #print(cluster_algo_val5)

    fig, axis = plt.subplots(1,1,figsize=(6,6),dpi=100)
    _ = cluster_algo_val.plot(kind='bar')
    _ = plt.ylabel('Number of Points')
    _ = plt.xlabel('Cluster')
    _ = plt.title('No of points for Clusters($K$ = '+str(noOfCluster)+')')

In [8]:
# draw cluster point on graph for each cluster . depends on data frame created PCA
def drawClusterPlot(df_clusters):
    axis = sns.lmplot(data=df_clusters, x='x', y='y', hue='cluster_label', 
                   fit_reg=False, legend=True, legend_out=True,size=10)
    _ = axis.set_axis_labels("Component 1", "Component 2")

In [9]:
# draw elbow plot to get the best component value for PCA
def drawElbowPCAplot(xcols):
    pca = sklearn.decomposition.PCA()
    pca.fit(xcols)
    fig, axis = plt.subplots(1,1,figsize=(12,6),dpi=100)
    _ = plt.plot(pca.explained_variance_)
    _ = plt.xlabel('$K$')
    _ = plt.xticks(range(0,33,1))
    _ = plt.xlim([0,31])
    _ = plt.ylabel('Explained Variance')
    _ = plt.title('Elbow Plot')

In [10]:
def getAllcolNameforDataframe(df):
    for col in df:
        print("'"+str(col)+"',")

In [11]:
def getAllColSum(df):
    for col in df:
        col_sum=sum(df[col])
        col_n=len(df[col])
        print('Sum of col : '+ col +' is '+str(col_sum) + ' total of col is ' +str(col_n))

In [12]:
def compareColForDataFrame(df1,df2):
    for col in df1:
        col_sum1=sum(df1[col])
        #col_n1=len(df1[col])
        #print('Sum of col : '+ col +' is '+str(col_sum) + ' total of col is ' +str(col_n))
        col_sum2=sum(df2[col])
        #col_n2=len(df2[col])
        diff =col_sum1-col_sum2
        tot=col_sum1+col_sum2
        print('Sum of col  : '+ col +' is '+str(col_sum1) + ' vs ' +str(col_sum2) +' diff ='+str(diff)+ ' ratio col1 :'+str((col_sum1/tot)))
    

In [13]:
def getFilterDatasetForRowCount(df,noofRows,random=True, samplesize=0.5):
    if(len(df)>noofRows):
        if(random):
            df1,df2=train_test_split(df, shuffle=True,train_size=samplesize,test_size=samplesize)
            if(len(df1)>noofRows):
                df=df1.iloc[:noofRows,:]
            else:
                 df=(df1.append(df2,ignore_index=True)).iloc[:noofRows,:]
        else:
            df=df.iloc[:noofRows,:]
    
    return df

In [14]:
# Method to convert the category column into dummy columns 
def AddDummyColumnsToDataFrame(dfinput,colname,removeOrgColumn=False,removelastdummy=False):
    print('Add {}'.format(colname))
    temp =pd.get_dummies(dfinput[colname])
    # remove one column from dummies with least value.
  
    if removelastdummy:
        t=dfinput.groupby(colname).count().state
        col_name=((t[t.values==t.min()]).index).get_values()[0]
        if col_name in temp.columns:
            print('removed column {}'.format(col_name))
            temp=temp.drop([col_name], axis=1)
    
    # remove the main column after extracting dummy
    if removeOrgColumn:
        if colname in dfinput.columns:
            print('removed column {}'.format(colname))
            dfinput =dfinput.drop([colname], axis=1)
    else:
        print('left column {} in dataframe'.format(colname))
        
        
    for col in temp:
        temp.rename(columns={col: colname+'_'+str(col)}, inplace=True)
    
    return  pd.concat([dfinput,temp], axis=1,ignore_index=False)

### Processing Starts

In [15]:
pickle_file='pickle_selectdata_ML_All_Col_CA.sa'
df_filterdata = pickle.load( open( pickle_file, "rb" ) )
#df_final_months.info()
#df_final_months.head()

In [16]:
#df_selectdata.info()
len(df_filterdata)

3179129

In [17]:
df_filterdata.head()

Unnamed: 0,index,Year,PropertyType,LoanPurpose,Occupancy,LoanAmount,ActionType,MSA,StateCode,CountyCode,...,LonAmt_100_150,LonAmt_150_200,LonAmt_200_250,LonAmt_250_300,LonAmt_300_350,LonAmt_350_400,LonAmt_400_450,LonAmt_450_500,LonAmt_500_5500,LonAmt_5500_999999
280124,280416,2015,1,3.0,2.0,380.0,1.0,11244.0,6.0,59.0,...,0,0,0,0,0,1,0,0,0,0
280125,280417,2015,1,1.0,1.0,404.0,1.0,11244.0,6.0,59.0,...,0,0,0,0,0,0,1,0,0,0
280126,280418,2015,1,3.0,1.0,415.0,1.0,11244.0,6.0,59.0,...,0,0,0,0,0,0,1,0,0,0
280127,280419,2015,1,3.0,1.0,325.0,1.0,11244.0,6.0,59.0,...,0,0,0,0,1,0,0,0,0,0
280128,280420,2015,1,3.0,1.0,322.0,1.0,11244.0,6.0,59.0,...,0,0,0,0,1,0,0,0,0,0


In [18]:
# 1 million rows extracted for state to process
noofRows=100#000
range_n_clusters = range(2,11)
df_filterdata=getFilterDatasetForRowCount(getDataFrameforState(df_filterdata,'CA'),noofRows)
#df_filterdata_NR=getFilterDatasetForRowCount(getDataFrameforState(df_selectdata,'CA'),noofRows,False)

In [19]:
#df_filterdata.info()

In [20]:
# Convert category columns to dummy columns=
categoryColumns=['CountyCode']
for col in categoryColumns:
    print(col)
    df_filterdata=AddDummyColumnsToDataFrame(df_filterdata,col)

CountyCode
Add CountyCode
left column CountyCode in dataframe


In [21]:
# create common x for all further processing for all feature columns 
x_cols = np.matrix(df_filterdata.iloc[:,69:])

In [22]:
print(len(df_filterdata))
print (len(df_filterdata.index.unique()))
print(len(x_cols))

100
100
100


## Other Clustering Algorithms

k-means is only one of a ton of clustering algorithms. Below is a brief description of several clustering algorithms, and the table provides references to the other clustering algorithms in scikit-learn. 

* **Affinity Propagation** does not require the number of clusters $K$ to be known in advance! AP uses a "message passing" paradigm to cluster points based on their similarity. 

* **Spectral Clustering** uses the eigenvalues of a similarity matrix to reduce the dimensionality of the data before clustering in a lower dimensional space. This is tangentially similar to what we did to visualize k-means clusters using PCA. The number of clusters must be known a priori.

* **Ward's Method** applies to hierarchical clustering. Hierarchical clustering algorithms take a set of data and successively divide the observations into more and more clusters at each layer of the hierarchy. Ward's method is used to determine when two clusters in the hierarchy should be combined into one. It is basically an extension of hierarchical clustering. Hierarchical clustering is *divisive*, that is, all observations are part of the same cluster at first, and at each successive iteration, the clusters are made smaller and smaller. With hierarchical clustering, a hierarchy is constructed, and there is not really the concept of "number of clusters." The number of clusters simply determines how low or how high in the hierarchy we reference and can be determined empirically or by looking at the [dendogram](https://docs.scipy.org/doc/scipy-0.18.1/reference/generated/scipy.cluster.hierarchy.dendrogram.html).

* **Agglomerative Clustering** is similar to hierarchical clustering but but is not divisive, it is *agglomerative*. That is, every observation is placed into its own cluster and at each iteration or level or the hierarchy, observations are merged into fewer and fewer clusters until convergence. Similar to hierarchical clustering, the constructed hierarchy contains all possible numbers of clusters and it is up to the analyst to pick the number by reviewing statistics or the dendogram.

* **DBSCAN** is based on point density rather than distance. It groups together points with many nearby neighbors. DBSCAN is one of the most cited algorithms in the literature. It does not require knowing the number of clusters a priori, but does require specifying the neighborhood size.

### Clustering Algorithms in Scikit-learn
<table border="1">
<colgroup>
<col width="15%" />
<col width="16%" />
<col width="20%" />
<col width="27%" />
<col width="22%" />
</colgroup>
<thead valign="bottom">
<tr><th>Method name</th>
<th>Parameters</th>
<th>Scalability</th>
<th>Use Case</th>
<th>Geometry (metric used)</th>
</tr>
</thead>
<tbody valign="top">
<tr><td>K-Means</span></a></td>
<td>number of clusters</td>
<td>Very large<span class="pre">n_samples</span>, medium <span class="pre">n_clusters</span> with
MiniBatch code</td>
<td>General-purpose, even cluster size, flat geometry, not too many clusters</td>
<td>Distances between points</td>
</tr>
<tr><td>Affinity propagation</td>
<td>damping, sample preference</td>
<td>Not scalable with n_samples</td>
<td>Many clusters, uneven cluster size, non-flat geometry</td>
<td>Graph distance (e.g. nearest-neighbor graph)</td>
</tr>
<tr><td>Mean-shift</td>
<td>bandwidth</td>
<td>Not scalable with <span class="pre">n_samples</span></td>
<td>Many clusters, uneven cluster size, non-flat geometry</td>
<td>Distances between points</td>
</tr>
<tr><td>Spectral clustering</td>
<td>number of clusters</td>
<td>Medium <span class="pre">n_samples</span>, small <span class="pre">n_clusters</span></td>
<td>Few clusters, even cluster size, non-flat geometry</td>
<td>Graph distance (e.g. nearest-neighbor graph)</td>
</tr>
<tr><td>Ward hierarchical clustering</td>
<td>number of clusters</td>
<td>Large <span class="pre">n_samples</span> and <span class="pre">n_clusters</span></td>
<td>Many clusters, possibly connectivity constraints</td>
<td>Distances between points</td>
</tr>
<tr><td>Agglomerative clustering</td>
<td>number of clusters, linkage type, distance</td>
<td>Large <span class="pre">n_samples</span> and <span class="pre">n_clusters</span></td>
<td>Many clusters, possibly connectivity constraints, non Euclidean
distances</td>
<td>Any pairwise distance</td>
</tr>
<tr><td>DBSCAN</td>
<td>neighborhood size</td>
<td>Very large <span class="pre">n_samples</span>, medium <span class="pre">n_clusters</span></td>
<td>Non-flat geometry, uneven cluster sizes</td>
<td>Distances between nearest points</td>
</tr>
<tr><td>Gaussian mixtures</td>
<td>many</td>
<td>Not scalable</td>
<td>Flat geometry, good for density estimation</td>
<td>Mahalanobis distances to  centers</td>
</tr>
<tr><td>Birch</td>
<td>branching factor, threshold, optional global clusterer.</td>
<td>Large <span class="pre">n_clusters</span> and <span class="pre">n_samples</span></td>
<td>Large dataset, outlier removal, data reduction.</td>
<td>Euclidean distance between points</td>
</tr>
</tbody>
</table>
Source: http://scikit-learn.org/stable/modules/clustering.html

#### Silhouette Method for K 
<pre>

| Range       | Interpretation                                |
|-------------|-----------------------------------------------|
| 0.71 - 1.0  | A strong structure has been found.            |
| 0.51 - 0.7  | A reasonable structure has been found.        |
| 0.26 - 0.5  | The structure is weak and could be artificial.|
| < 0.25      | No substantial structure has been found.      |

</pre>

### Agglomerative Clustering

### Visualize cluster for range

### DBSCAN

In [23]:
#DBSCAN
print(str(datetime.datetime.now()))
#find the best value for min_samples parameter. 
range_min_samples = [1, 2, 3]
dbscan_predict_col={}
dbscan_silh_score={}
best_score = 0.0
for min_samples in range_min_samples:
    db =  getAlgoForCluster('dbscan', min_samples)#DBSCAN(min_samples=min_samples)
    #labels = db.fit_predict(x_cols)
    labels = getFitPredictForAlgo(db,x_cols)
    if min_samples not in dbscan_predict_col:
        dbscan_predict_col[min_samples]=labels
    silhouette_avg = silhouette_score(x_cols, labels, random_state=10)
    if min_samples not in dbscan_silh_score:
        dbscan_silh_score[min_samples]=silhouette_avg
            
    print("For min_samples =", min_samples,
          "The average silhouette_score is :", silhouette_avg)
    if silhouette_avg > best_score:
        best_score = silhouette_avg
        best_min_samples = min_samples
        
print('Best min_samples parameter:',best_min_samples)
print ('Best DBSCAN score is', best_score)
print(str(datetime.datetime.now()))

2018-07-13 18:35:48.973103
For min_samples = 1 The average silhouette_score is : 0.02
For min_samples = 2 The average silhouette_score is : -0.06118004572025914


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [None]:
# draw graph for all above silhouette_score
drawAverageSilhouetteScore(range_min_samples,dbscan_silh_score.values(),'DB SCAN' )

In [None]:
for noofcluster in range_min_samples:
    if noofcluster in dbscan_predict_col:
        algo_val=dbscan_predict_col[noofcluster]
    else:
        algo_val = getFitPredictForAlgo(getAlgoForCluster('dbscan', min_samples),x_cols)
    drawClusterBar(min_samples,algo_val)
    df_PCA=getPCADataFrame(df_filterdata,min_samples,algo_val,x_cols)
    drawClusterPlot(df_PCA)
    df_merge_w_cluster=pd.merge(df_filterdata, df_PCA,
                                left_on = 'index', right_on = 'index',how='inner')
    # save file for all clusters
    picklefilename='pickle_Dbscan_All_Col_CA_Cluster_'+str(noofcluster)+'.sa'
    print(picklefilename)
    # create pickle file for further use 
    pickle.dump(df_merge_w_cluster,open(picklefilename,'wb'), protocol=4)

In [None]:
%reset


In [None]:
### old below

In [None]:
print(len(df_filterdata_male))
print(len(df_filterdata_Notmale))

In [None]:
print(len(df_filterdata))
print (len(df_filterdata.index.unique()))
print(len(x_cols))

#range_n_clusters = range(2,14)
# print(len(df_filterdata_NR))
# print (len(df_filterdata_NR.index.unique()))
# print(len(x_cols_NR))

## K - Mean clustering

#### Elbow Method for K 
We can plot SS vs. $K$ and choose the *elbow point* in the plot as the best value for $K$. The elbow point is the point at which the plot starts descending much more slowly. 

In [None]:
# calculate the K means for range to find out best K value for elbow method
print(str(datetime.datetime.now()))
ss = []
kmeans_predict_col={}
for k in range_n_clusters:
    kmeans = getAlgoForCluster(k)
    #kmeans.fit(x_cols)
    cluster_labels = getFitPredictForKMean(kmeans,x_cols)
    if k not in kmeans_predict_col:
        kmeans_predict_col[k]=cluster_labels
    #print('k='+str(k))
    #print('inertia '+str(kmeans.inertia_))
    ss.append(kmeans.inertia_)
print(str(datetime.datetime.now()))

In [None]:
print(max(ss))
print(min(ss))

In [None]:
fig, axis = plt.subplots(1,1,figsize=(8,8),dpi=100)
_ = plt.plot(range_n_clusters, ss, 'ro-', linewidth = 1.0)
_ = plt.xlim([1,15])
_ = plt.xlabel('K')
_ = plt.ylim([714603621,7511630524])
_ = plt.ylabel('Sum of Squares(SS)')
_ = plt.title('Elbow Method (2-10)')

#### Silhouette Method for K 
<pre>

| Range       | Interpretation                                |
|-------------|-----------------------------------------------|
| 0.71 - 1.0  | A strong structure has been found.            |
| 0.51 - 0.7  | A reasonable structure has been found.        |
| 0.26 - 0.5  | The structure is weak and could be artificial.|
| < 0.25      | No substantial structure has been found.      |

</pre>

In [None]:
print(str(datetime.datetime.now()))
silhouette_avgscores = []

for n_clusters in range_n_clusters:
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
#     cluster_labels = getFitPredictForKMean(getAlgoForCluster(n_clusters),x_cols)
    if n_clusters not in kmeans_predict_col:
        cluster_labels = getFitPredictForKMean(getAlgoForCluster(n_clusters),x_cols)
        kmeans_predict_col[n_clusters]=cluster_labels
    else:
        cluster_labels =kmeans_predict_col[n_clusters]
        

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    # The sample size is used to avoid memeory error
    silhouette_avg = silhouette_score(x_cols, cluster_labels,sample_size=50000)
    silhouette_avgscores.append(silhouette_avg)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
# commented for memory issue
   # Create a subplot with 1 row and 2 columns

#     fig, ax1 = plt.subplots(1, 1)
#     _ = fig.set_size_inches(9, 7)

#     # The silhouette coefficient can range from -1, 1 but in this example all
#     # lie within [-0.1, 1]
#     _ = ax1.set_xlim([-0.1, 1])
#     # The (n_clusters+1)*10 is for inserting blank space between silhouette
#     # plots of individual clusters, to demarcate them clearly.
#     _ = ax1.set_ylim([0, len(x_cols) + (n_clusters + 1) * 10])
#     # Compute the silhouette scores for each sample
#     sample_silhouette_values = silhouette_samples(x_cols, cluster_labels)

#     y_lower = 10
#     for i in range(n_clusters):
#         # Aggregate the silhouette scores for samples belonging to
#         # cluster i, and sort them
#         ith_cluster_silhouette_values = \
#             sample_silhouette_values[cluster_labels == i]

#         ith_cluster_silhouette_values.sort()

#         size_cluster_i = ith_cluster_silhouette_values.shape[0]
#         y_upper = y_lower + size_cluster_i

#         #color = cm.spectral(float(i) / n_clusters)
#         color = plt.cm.Spectral(float(i) / n_clusters)
#         _ = ax1.fill_betweenx(np.arange(y_lower, y_upper),
#                           0, ith_cluster_silhouette_values,
#                           facecolor=color, edgecolor=color, alpha=0.7)

#         # Label the silhouette plots with their cluster numbers at the middle
#         _ = ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

#         # Compute the new y_lower for next plot
#         y_lower = y_upper + 10  # 10 for the 0 samples

#     _ = ax1.set_title(("The Silhouette plot for KMeans clustering with n_clusters = %d" % n_clusters), fontsize=14, fontweight='bold')
#     _ = ax1.set_xlabel("The silhouette coefficient values")
#     _ = ax1.set_ylabel("Cluster label")

#     # The vertical line for average silhouette score of all the values
#     _ = ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

#     _ = ax1.set_yticks([])  # Clear the yaxis labels / ticks
#     _ = ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

print(str(datetime.datetime.now()))

In [None]:
# draw graph for all above silhouette_score
drawAverageSilhouetteScore(range_n_clusters,silhouette_avgscores)

### Visualize cluster for range

In [None]:
for noofcluster in range_n_clusters:#[5,6,8,9,11]:#:range_n_clusters:
    if noofcluster in kmeans_predict_col:
        algo_val=kmeans_predict_col[noofcluster]
    else:
        algo_val = getFitPredictForKMean(getAlgoForCluster(noofcluster),x_cols)
    drawClusterBar(noofcluster,algo_val)
    df_PCA=getPCADataFrame(noofcluster,algo_val,x_cols)
    drawClusterPlot(df_PCA)

In [None]:
drawElbowPCAplot(x_cols)

### Choose the best Kmean and display the data frame with cluster

In [None]:
noofcluster=3
if noofcluster in kmeans_predict_col:
    algo_val=kmeans_predict_col[noofcluster]
else:
    algo_val = getFitPredictForKMean(getAlgoForCluster(noofcluster),x_cols)
drawClusterBar(noofcluster,algo_val)
df_PCA=getPCADataFrame(noofcluster,algo_val,x_cols)
drawClusterPlot(df_PCA)

In [None]:
v=df_PCA.groupby('cluster_label').count()
fig, axis = plt.subplots(1,1,figsize=(6,6),dpi=100)
_ = v.x.plot(kind='bar')
_ = plt.ylabel('Number of Points')
_ = plt.xlabel('Cluster')
_ = plt.title('No of points for Clusters($K$ = '+str(3)+')')

In [None]:
custer_num=1
df_merge_w_cluster=pd.merge(df_filterdata, (df_PCA[df_PCA.cluster_label==custer_num]), left_on = 'index', right_on = 'index',how='inner')

In [None]:
#df_merge_w_cluster.head(2)
#df_merge_w_cluster[df_merge_w_cluster['cluster_label'].isna()]
print(len(df_merge_w_cluster))


In [None]:
df_merge_w_cluster.iloc[:,60:75].describe()

In [None]:
df_merge_w_cluster.iloc[:,75:].describe()

In [None]:
df_merge_w_cluster.iloc[:,70:75].describe()

In [None]:
df_merge_w_cluster.iloc[:,75:80].describe()

In [None]:
df_merge_w_cluster.iloc[:,80:85].describe()

In [None]:
getAllcolNameforDataframe(df_merge_w_cluster.iloc[:,60:85])

In [None]:
getAllColSum(df_merge_w_cluster.iloc[:,60:])

In [None]:
custer_num=2
df_merge_w_cluster2=pd.merge(df_filterdata, (df_PCA[df_PCA.cluster_label==custer_num]), left_on = 'index', right_on = 'index',how='inner')

In [None]:
compareColForDataFrame(df_merge_w_cluster.iloc[:,60:],df_merge_w_cluster2.iloc[:,60:])

In [None]:
%reset

In [None]:
getAllcolNameforDataframe(df_filterdata.iloc[:,60:85])