In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
#Initialization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Install the `graphviz` Python package (pip) and the system Graphviz package
# (apt); `graphviz.Source` is used by plot_decision_tree to render trees.
# NOTE(review): versions are not pinned, and `%pip` rather than `!pip` would
# target the running kernel's environment — consider switching.
!pip install graphviz
!apt-get -qq install -y graphviz

In [None]:
from sklearn import tree
import graphviz 

def plot_decision_tree(model, columns):
    """Render a fitted decision tree as a Graphviz graph.

    Parameters
    ----------
    model : fitted tree estimator accepted by ``tree.export_graphviz``.
    columns : feature names, forwarded to the exporter as ``feature_names``.

    Returns
    -------
    graphviz.Source
        Graph built from the exported DOT text; class labels are shown as
        'Malignant' and 'Benign', in that order.
    """
    export_options = dict(
        out_file=None,  # return the DOT text instead of writing a file
        feature_names=columns,
        class_names=['Malignant', 'Benign'],
        filled=False,
        rounded=True,
        special_characters=False,
    )
    return graphviz.Source(tree.export_graphviz(model, **export_options))
  
    
def plot_importances_features(model, columns):
    """Plot a model's feature importances as a horizontal bar chart, largest first.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    columns : feature names aligned with ``model.feature_importances_``. Any
        sequence is accepted (pandas Index, list, tuple, ndarray).

    Side effects
    ------------
    Sets the global matplotlib default figure size to (8, 12), switches seaborn
    to the 'whitegrid' style and shows the figure.
    """
    importances = np.asarray(model.feature_importances_)
    indices = np.argsort(importances)[::-1]  # positions sorted by decreasing importance
    # np.asarray (rather than ``columns.values``) lets plain lists/tuples work too.
    names = np.asarray(columns)
    feat_imp = pd.DataFrame({'Feature': names[indices],
                             'Feature ranking': importances[indices]})
    plt.rcParams['figure.figsize']=(8,12)
    sns.set_style('whitegrid')
    ax = sns.barplot(x='Feature ranking', y='Feature', data=feat_imp)
    ax.set(xlabel='Feature ranking')
    plt.show()
    
def getCenter(D, clusters, beta=1):
    """Summarise flat clusters given a pairwise distance matrix.

    Parameters
    ----------
    D : (n, n) array-like
        Square matrix of pairwise distances between the n observations.
    clusters : sequence of length n
        Cluster label of every observation. Labels may be any sortable values;
        they do not have to be the contiguous 1..k produced by ``fcluster``.
    beta : float, default 1
        Sharpness of the exponential kernel used to pick the central point.

    Returns
    -------
    contr : list of float
        Share of observations in each cluster, in percent, rounded to 2 decimals.
    err : float
        Sum over clusters of the distances from the cluster medoid (member with
        the smallest mean distance to the other members) to the cluster members.
    centers : list of int
        For each cluster, the index (row of ``D``) of its most central member:
        the one maximising ``sum(exp(-beta * d / std))`` over within-cluster
        distances ``d``.

    Clusters are reported in ascending label order.
    """
    D = np.asarray(D)
    clusters = np.asarray(clusters)
    err = 0.0
    centers = []
    contr = []
    for label in np.unique(clusters):
        id_pts = np.flatnonzero(clusters == label)  # global ids of this cluster
        sub_ms = D[np.ix_(id_pts, id_pts)]  # within-cluster distance matrix
        # The argmin is a position *inside* sub_ms, so the row must be taken
        # from sub_ms as well (indexing D with it would pick an unrelated row).
        medoid = np.argmin(np.mean(sub_ms, axis=0))
        err = err + np.sum(sub_ms[medoid, :])
        spread = sub_ms.std()
        if spread > 0:
            index = np.exp(-beta * sub_ms / spread).sum(axis=1).argmax()
        else:
            # Single-member cluster or all members coincide: every member is
            # equally central, and dividing by a zero spread would yield NaN.
            index = 0
        centers.append(int(id_pts[index]))
        contr.append(float("{0:.2f}".format((len(sub_ms) * 100) / len(D))))
    return contr, err, centers


**Explore the Data and Browse through its columns**

In [None]:
# Load the dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv('../input/data.csv')
df.head()

In [None]:
# Display the column labels of the loaded frame.
df.columns


Note that the column 'diagnosis' is the actual result and will be used later for comparison with the clustering results. Hence, it is the target column, or 'Y'.

In [None]:
# Per-column dtype and non-null count
df.info()

In [None]:
# (rows, columns) of the raw frame
df.shape

* 33 columns and 569 rows in total
* 'Unnamed: 32' has null data and we will remove it from 'df'
* 'id' is not required in the data processing, hence it's removed 

In [None]:
# Drop the null-valued 'Unnamed: 32' column and the 'id' identifier.
# Re-assigning the result (instead of inplace=True) together with
# errors='ignore' keeps the cell idempotent: re-running it does not raise a
# KeyError for columns that were already removed.
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')
print("Some error occured" if 'Unnamed: 32' in df.columns else f"Successfully removed 'Unnamed: 32'\nCOLUMNS: {df.columns}")

In [None]:
df.describe()  # summary statistics of the numerical columns

In [None]:
df.describe(include=['O'])  # summary of the object-dtype columns

In [None]:
# Count fully duplicated rows (the 'id' column was dropped in an earlier cell,
# so it is not part of the comparison)
df.duplicated().sum()

In [None]:
# Plot styling: use seaborn's 'whitegrid' style
sns.set_style('whitegrid')

In [None]:
# Class balance of the target column.
# Pass the column by keyword: a positional Series is not interpreted as `x`
# by every seaborn version.
sns.countplot(x='diagnosis', data=df, label="Count")
# Look the counts up by label instead of unpacking value_counts(), whose order
# depends on which class happens to be more frequent.
diagnosis_counts = df['diagnosis'].value_counts()
B, M = diagnosis_counts['B'], diagnosis_counts['M']

print('Number of Benign\t:\t ',B)
print('Number of Malignant\t:\t ',M)
print('Percentage Benign\t:\t % 2.2f %%' % (B/(B+M)*100))
print('Percentage Malignant\t:\t % 2.2f %%' % (M/(B+M)*100))

We study the correlation dividing the features into three groups: (The mean, standard error and "worst" or largest )

In [None]:
# Separation of feature and target columns.
# Encode the target as Malignant ('M') -> 0 and Benign ('B') -> 1 and keep it in
# its own single-column dataframe `y`; the integer codes make it easy to compare
# against the results obtained with the different clustering methods.
if 'diagnosis' in df.columns:  # guard: the cell can be re-run without a KeyError
    y = df['diagnosis'].map({'M': 0, 'B': 1}).to_frame()
    df = df.drop(columns='diagnosis')

### Scaling
We need to scale the data before applying the algorithms, because features measured in different units would otherwise carry different weights and distort the final result of the clustering. For example, the features might be 'Height (m)' and 'Weight (pounds)' of humans. The model only sees the numbers and does not care about units; clustering in particular is sensitive to the scale of the data, because it relies on distances between observations.

In [None]:
# Standardise every feature with scale() from sklearn.preprocessing.
# `columns` pins the expected feature schema (order matters: the later cells
# slice it into the mean / se / worst groups by position).
columns = ['radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']
# Fail fast if the data does not match the schema: the names are assigned by
# position, so a drifted column order would silently mislabel the features.
assert list(df.columns) == columns, "feature columns differ from the expected schema"
# Keep df's index so that later pd.concat([... , y], axis=1) aligns row by row.
df_scaled = pd.DataFrame(preprocessing.scale(df), index=df.index)
X = df_scaled.copy()
df_scaled.columns = columns
df_scaled.head()

In [None]:
print(df_scaled.shape)
# The scaled data holds 30 feature columns; they come in three consecutive
# groups of ten: mean, standard error (se) and worst.
df_columns = df_scaled.columns

columns_mean, columns_se, columns_worst = (
    df_columns[start:start + 10] for start in (0, 10, 20)
)

for group_label, group_columns in (("columns_mean : ", columns_mean),
                                   ("columns_se : ", columns_se),
                                   ("columns_worst : ", columns_worst)):
    print(group_label, group_columns)

# One sub-frame per feature group
features_mean = df_scaled.loc[:, columns_mean]
features_se = df_scaled.loc[:, columns_se]
features_worst = df_scaled.loc[:, columns_worst]

In [None]:
# Pairwise plots of the "mean" feature group, coloured by diagnosis, with KDE
# curves on the diagonal.
# NOTE(review): `shade` is deprecated in recent seaborn releases in favour of
# `fill` — confirm against the installed seaborn version.
sns.pairplot( pd.concat([features_mean,y], axis=1),  hue='diagnosis', diag_kind="kde",diag_kws=dict(shade=True))

In [None]:
# Correlation heatmaps of the three feature groups, side by side in the order
# mean, worst, se; only the last panel draws the colour bar.
plt.figure(figsize=(22,5))
heatmap_panels = [(features_mean, False), (features_worst, False), (features_se, True)]
for panel, (features, show_cbar) in enumerate(heatmap_panels, start=1):
    plt.subplot(1, 3, panel)
    sns.heatmap(features.corr(), cbar=show_cbar, square=True, annot=True,
                fmt='.2f', annot_kws={'size': 8}, cmap='coolwarm')

**radius, area and perimeter** (mean, worst, and standard error) are closely correlated with each other; the same holds for the characteristics **compactness, concave points and concavity**.

In [None]:
# SWARMPLOT: every scaled feature value, split by diagnosis
# Reshape to long form: one row per (observation, feature) pair.
data = (
    pd.concat([df_scaled, y], axis=1)
      .melt(id_vars="diagnosis", var_name="features", value_name='value')
)
plt.figure(figsize=(23,8))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
plt.tight_layout()

### Feature engineering
Among the most correlated features (see correlation plot), we select the ones that are most discriminating (see swarmplot). The following choices were therefore made:

* from the correlated group [compactness_se, concavity_se, concave points_se], we select **concavity_se**.
* from the correlated group [compactness_worst, concavity_worst, concave points_worst], we select **concave points_worst**.
* from the correlated group [concavity_mean, compactness_mean, concave points_mean], we select **concave points_mean**.
* from the correlated group [area_worst, perimeter_worst, radius_worst], we select **radius_worst**.
* from the correlated group [perimeter_mean, area_mean, radius_mean], we select **area_mean**.
* from the correlated group [area_se, perimeter_se, radius_se], we select **area_se**.

In [None]:
# Feature subset kept after the correlation analysis: the six representatives
# chosen from the correlated groups, followed by the texture, smoothness,
# symmetry and fractal-dimension features in their mean / se / worst variants.
selected_features = ['concavity_se',
'concave points_worst',
'concave points_mean',
'radius_worst',
'area_mean',
'area_se',
'texture_mean','texture_se','texture_worst',
'smoothness_mean','smoothness_se','smoothness_worst',
'symmetry_mean','symmetry_se','symmetry_worst',
'fractal_dimension_mean','fractal_dimension_se','fractal_dimension_worst']
df_selected_features = df_scaled[selected_features]

## Feature Reduction

In [None]:
# Rank the selected features by their mutual information with the diagnosis.
# The target is a discrete class label, so mutual_info_classif is used (the
# regression variant treats the target as continuous). The target is passed as
# a 1-D Series, and random_state is fixed so the ranking is reproducible.
plt.figure(figsize= [10,6])
mi_scores = pd.Series(
    mutual_info_classif(df_selected_features, y['diagnosis'], random_state=42),
    index=df_selected_features.columns,
)
mi_scores.sort_values(ascending=True).plot(kind="barh")
plt.title("Feature importances", fontsize= 20)
plt.yticks(fontsize= 12)

The most important features of the reduced dataset are (total 7 columns) : 
* 'radius_worst'
* 'concave points_mean'
* 'concave points_worst'
* 'area_mean'
* 'area_se'
* 'concavity_se'
* 'texture_worst'

We then select the above features for further processing

In [None]:
# Keep only the seven features listed above; this frame is the input to both
# clustering techniques below.
df_reduced_features = df_scaled[['radius_worst', 'concave points_mean', 'concave points_worst', 'area_mean', 'area_se', 'concavity_se', 'texture_worst']]

# Clustering
Clustering (grouping a list of observations into various buckets) can be based on various factors. We try to cluster according to two clustering techniques : 
* Hierarchical based clusters (Distance) 
* k-means clusters (Centroid based)

## Hierarchical Clustering
Standard implementation is a bottom up approach where each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy where the hierarchy is defined by the distance between the clusters. 

### Linkage functions
Selecting the best linkage

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
D = df_reduced_features.values

# Linkage criteria to compare, one dendrogram each on a 2 x 3 grid.
# ward = similarity of two clusters is based on the increase in squared error
# when the two clusters are merged.
methods = ['single','complete','average','weighted','median','ward']

plt.figure(figsize=(25, 8))
for position, method in enumerate(methods, start=1):
    plt.subplot(2, 3, position)
    Z = linkage(D, method=method)  # hierarchical/agglomerative clustering
    dendro = dendrogram(
        Z,
        truncate_mode='lastp',
        p=50,
        distance_sort='descending',
        leaf_rotation=90.,
        leaf_font_size=11.,
    )
    plt.title(method)

plt.tight_layout()

- As evidenced by the plot, among the various linkage functions tried, the **ward** method was the most suitable, as it produced distinct, well-separated clusters.
- Ward suggests 2 clusters by default (different color)

In [None]:
# Ward linkage on the reduced features.
# ward = similarity of two clusters is based on the increase in squared error
# when the two clusters are merged.
D = df_reduced_features.values
Z = linkage(D, method='ward', metric='euclidean')

sns.set_style('whitegrid')
plt.figure(figsize=(15, 7))
dendrogram(
    Z,
    truncate_mode='lastp',  # truncated output for better rep
    p=50,
    show_contracted=True,
    distance_sort='descending',
    leaf_rotation=90.,
    leaf_font_size=11.,
)

plt.tight_layout()

In [None]:
from sklearn.decomposition import PCA # Principal Component Analysis Module
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import fcluster  # simple clustering

# 2-D PCA projection of the reduced features, used only for plotting.
pca_2d = PCA(n_components=2)
X = pca_2d.fit_transform(D)
# Full pairwise Euclidean distance matrix in square form, as getCenter expects.
Y = squareform(pdist(D, 'euclidean'))
k = 2
# Cut the linkage tree Z into at most k flat clusters.
clusters = fcluster(Z, k, criterion='maxclust')
contr, err, centers = getCenter(Y, clusters)
print('centroid: ',centers,'\t %items ',contr)

plt.figure(figsize=(10, 8))
plt.scatter(X[:,0], X[:,1], c=clusters, cmap='prism')  # plot points with cluster dependent colors
# Index with the whole `centers` list so the plot follows k instead of assuming
# exactly two clusters. No cmap here: with a fixed colour it is ignored (and
# triggers a Matplotlib warning).
plt.scatter(X[centers, 0],  # all x
            X[centers, 1],  # all y
            c='black',
            marker='x',
            s=50,
            label="centroid")  # plot centroids
plt.legend()

plt.show()

**Truncating Dendrogram**

In [None]:
# Compare the cluster centres with the mean feature vector of each diagnosis
# class (0 = malignant, 1 = benign), both projected with the fitted 2-D PCA.
df_y_features = pd.concat([df_reduced_features,y], axis=1)
mean_pca_M = pca_2d.transform(df_reduced_features[ df_y_features['diagnosis']==0].mean().values.reshape(1,-1))
mean_pca_B = pca_2d.transform(df_reduced_features[ df_y_features['diagnosis']==1].mean().values.reshape(1,-1))
class_means = np.vstack([mean_pca_M, mean_pca_B])  # one row per class

plt.figure(figsize=(10, 8))
plt.scatter(X[:,0], X[:,1], c=clusters, cmap='prism')  # plot points with cluster dependent colors
# The two marker layers below use a fixed colour, so no cmap is passed (it
# would be ignored and trigger a Matplotlib warning).
plt.scatter(class_means[:, 0], class_means[:, 1], c='blue', marker='+', s=50,
            label="centroid diagnosis class (b,m)")  # mean of each diagnosis class
# Index with the whole `centers` list rather than assuming exactly two clusters.
plt.scatter(X[centers, 0], X[centers, 1], c='black', marker='x', s=50,
            label="centroid cluster")  # central point of each cluster
plt.legend()
plt.show()

It is interesting to note how the centroids of the clusters fall very close to the average values of the two classes of tumors (benign and malignant). Therefore, even if we did not have a labeled dataset (with well-defined classes B and M), we would still be able to determine, with good probability, the class each element of the dataset belongs to through an unsupervised clustering process.

Usually the distance cutoff is set at 70% max distance for ward [refer docs](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.dendrogram.html)

### K-Means Clustering

k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.

For K-means clustering, the main question is again how many clusters to use, and for this we use the Elbow method. It looks for the number of clusters at which the decrease in the within-cluster sum of squares (WCSS) slows down most sharply, i.e. the "strongest elbow" of the curve plotted below.

In [None]:
from sklearn.cluster import KMeans

D = df_reduced_features.values

# Within-cluster sum of squares (KMeans inertia) for 1 to 9 clusters.
cluster_counts = range(1, 10)
wcss = [
    KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(D).inertia_
    for n_clusters in cluster_counts
]

plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

The steepest change occurs when the number of clusters is 2, followed by 3. Hence the number of clusters according to the elbow plot is 2 or 3.

However, this analysis is often not enough and we must try different methods like silhouette score to get a more concrete answer. The silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters and thus provides a way to assess parameters like number of clusters visually.
Silhouette coefficients (as these values are referred to as) near +1 indicate that the sample is far away from the neighboring clusters. A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters and negative values indicate that those samples might have been assigned to the wrong cluster.

In [None]:
from sklearn.metrics import silhouette_score

# Mean silhouette score for 2 to 7 clusters.
# The scores are computed on D, the reduced feature matrix the elbow plot was
# built on; the variable X still holds the 2-D PCA projection from the
# hierarchical-clustering section at this point and must not be used here.
# random_state is fixed so the curve is reproducible.
sse = []  # rows of [number of clusters, silhouette score]
for n_k in range(2, 8):  # separate loop name: do not clobber the global `k`
    kmeans = KMeans(n_clusters=n_k, random_state=42).fit(D)
    sse.append([n_k, silhouette_score(D, kmeans.labels_)])

silhouette_curve = pd.DataFrame(sse)  # build the frame once, plot its two columns
plt.plot(silhouette_curve[0], silhouette_curve[1])
plt.title('Silhouette Analysis')
plt.xlabel('Number of clusters')
plt.ylabel('silhouette_score')
plt.show();

Typically we select the number of clusters with the maximum silhouette_score. In our case this is '2', which matches the previous observations.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

X = df_reduced_features.values

range_n_clusters = [2, 3, 4]

# 2-D PCA projection used only to draw the clusters; KMeans itself is fitted on
# the full reduced feature matrix X.
pca_2d = PCA(n_components=2)
pca_2d_r = pca_2d.fit_transform(X)

for n_clusters in range_n_clusters:
    # One figure with a single axes per cluster count
    fig, ax2 = plt.subplots(1, 1)
    fig.set_size_inches(18, 5)

    clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=9000)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Scatter of the PCA-projected samples, coloured by assigned cluster.
    # `cm.spectral` no longer exists in current Matplotlib; `nipy_spectral`
    # is the same colormap under its current name.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(pca_2d_r[:, 0], pca_2d_r[:, 1], marker='.', s=90, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Project the centres into the PCA plane and draw white circles there
    centers = pca_2d.transform(centers)
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=250, edgecolor='k')

    # Write each cluster's number inside its circle
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=100, edgecolor='k')

    ax2.set_title(f"Silhouette analysis for KMeans clustering on sample data with n_clusters = {n_clusters}")
    ax2.set_xlabel("PC1")
    ax2.set_ylabel("PC2")

    plt.show()
