In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print them
input_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm
import seaborn as sns; sns.set()
import scipy

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.metrics.cluster import adjusted_rand_score

## 1. Exploratory Data Analysis

The dataset GSE45827 was taken from Kaggle (https://www.kaggle.com/brunogrisci/breast-cancer-gene-expression-cumida). It contains gene expression data from breast cancer samples curated by the Curated Microarray Database, CuMiDa (http://sbcb.inf.ufrgs.br/cumida#). CuMiDa is a repository for machine learning that contains 78 handpicked cancer microarray datasets, selected from 30,000 studies in the Gene Expression Omnibus (GEO), a public functional genomics data repository (https://www.ncbi.nlm.nih.gov/geo/). The data is already pre-processed and normalized, ready to be used for experimental ML approaches.

In [None]:
# load dataset and explore the first rows
# The path points into the Kaggle input directory listed above; adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/breast-cancer-gene-expression-cumida/Breast_GSE45827.csv')
df.head()

In [None]:
# report the dimensions of the loaded table
nRow = len(df.index)
nCol = len(df.columns)
print(f'There are {nRow} rows and {nCol} columns in the breast cancer data set')

In [None]:
# check for missing values in dataset
df.isnull().sum()

In [None]:
# compare the number of sample ids with the number of distinct ids to spot duplicates
total_ids = df['samples'].count()
unique_ids = df['samples'].nunique()
print(f"The total ids are {total_ids}, from those the unique ids are {unique_ids} ")

In [None]:
# check for label distribution
label_count = df['type'].value_counts()
label_count

In [None]:
# bar chart showing how many samples carry each label
fig, ax = plt.subplots(figsize=(7, 5))
df['type'].value_counts().plot(kind='bar', ax=ax)
plt.xticks(rotation=45)
ax.set_ylabel('Number of occurences', fontsize=12, fontweight='bold')
ax.set_xlabel('Sample type', fontsize=12, fontweight='bold')
ax.set_title('Distribution of label types in breast cancer data', fontsize=14, fontweight='bold')

## 2. Data Preparation for Clustering

In [None]:
# assign labels to variable y
y = df['type']
y

In [None]:
# select feature data for clustering
# Everything from the third column onward is taken as the feature matrix (as a NumPy array).
# NOTE(review): presumably the two skipped columns are 'samples' and 'type' — confirm against the CSV header.
data = df.iloc[:,2:].values
data

Before applying clustering, we scale our data such that each feature has unit variance. This is necessary because fitting algorithms highly depend on the scaling of the features. Here we use the StandardScaler module for scaling the features individually. StandardScaler subtracts the mean from each feature and then scales to unit variance.

In [None]:
scaler = StandardScaler()

In [None]:
scaled_data = scaler.fit_transform(data)

In [None]:
scaled_data

## 3. Clustering Approaches

In [None]:
### k-Means Clustering

Although the number of classes (6, for the cancer subtypes) is known for this dataset, we calculate and plot the cluster errors to check whether 6 is really the optimal value for k.

In [None]:
# Calculate the cluster error (inertia) for every k from 1 to 19
cluster_range = range(1, 20)
cluster_errors = []
for num_clusters in cluster_range:
    # A fixed seed keeps the elbow curve reproducible between runs
    clusters = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    clusters.fit(scaled_data)
    # inertia_ is the sum of squared distances of the samples to their closest centroid
    cluster_errors.append(clusters.inertia_)
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df

The total sum of squared distances of every data point from its respective centroid is also called inertia. Let us print the inertia value for all k values. The k at which the inertia stops dropping significantly (elbow method) is taken as the best k.

In [None]:
# Elbow plot: cluster error as a function of the number of clusters
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 6))
elbow_ax.plot(clusters_df['num_clusters'], clusters_df['cluster_errors'], marker="o")
elbow_ax.set_xlabel('Number of clusters', fontsize=12, fontweight='bold')
elbow_ax.set_ylabel('Cluster error', fontsize=12, fontweight='bold')
elbow_ax.set_title('Elbow plot for determining number of clusters', fontsize=14, fontweight='bold')
elbow_fig.savefig('elbowplot.png')

In [None]:
# instantiate KMeans object
km = KMeans(n_clusters=6, random_state=0)

In [None]:
# predict the cluster labels
labels = km.fit_predict(scaled_data)

In [None]:
km.cluster_centers_.shape

In [None]:
centroids = km.cluster_centers_
print(centroids)

In [None]:
# print cluster labels
print(labels)

In [None]:
## build a one-column dataframe of the k-means cluster ids, stored as a categorical variable
df_labels = pd.DataFrame({'label': km.labels_}).astype({'label': 'category'})

In [None]:
# Joining the label dataframe with the original data frame. 
df_labeled = df.join(df_labels)
df_labeled.head()

In [None]:
df_labeled['label'].value_counts()

### Evaluate k-means Clustering

There are several ways to evaluate a clustering. In the following, the cluster error, the silhouette plot and score, as well as the accuracy are calculated. Since the data set already has labels assigned to it, it is possible to calculate the number of correct cluster assignments.
Calculation of cluster error

In [None]:
print('Distortion: %.2f' % km.inertia_)

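Calculation of accuracy score. Since the k-means algorithm doesn't have any knowledge of the true labels, its cluster ids are arbitrary. Each cluster therefore has to be mapped to a true label first (here: the most common true label within the cluster) before comparing to the true labels.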

In [None]:
def find_permutation(n_clusters, real_labels, labels):
    """Map every cluster id to the most common true label among its members.

    Parameters
    ----------
    n_clusters : int
        Number of cluster ids to map; ids are expected to be 0 .. n_clusters - 1.
    real_labels : array-like
        True label of every sample (strings or numbers).
    labels : array-like
        Cluster id assigned to every sample, aligned with ``real_labels``.

    Returns
    -------
    list
        Entry ``i`` is the most frequent true label in cluster ``i``. Ties go to
        the smallest label in sort order. A cluster without any member gets ``None``.
    """
    real_labels = np.asarray(real_labels)
    labels = np.asarray(labels)
    permutation = []
    for cluster_id in range(n_clusters):
        members = real_labels[labels == cluster_id]
        if members.size == 0:
            # No sample carries this id, so the entry is never looked up; keep the slot
            # so that list positions still line up with cluster ids.
            permutation.append(None)
            continue
        # np.unique handles string labels and does not depend on the return shape of
        # scipy.stats.mode, which differs between SciPy versions.
        values, counts = np.unique(members, return_counts=True)
        permutation.append(values[np.argmax(counts)])
    return permutation

In [None]:
permutation = find_permutation(6, y, km.labels_)
print(permutation)

In [None]:
new_labels = [ permutation[label] for label in km.labels_]   # permute the labels
print("Accuracy score is", accuracy_score(y, new_labels))

In [None]:
# plot confusion matrix
# Fix the class order explicitly and reuse it for the tick labels. `permutation` is in
# cluster-id order (and may repeat a class), so it does not match the matrix layout.
class_names = np.unique(y)
mat = confusion_matrix(y, new_labels, labels=class_names)
# mat[i, j] counts samples of true class i predicted as class j; transposing puts
# the true label on the x axis and the predicted label on the y axis.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('true label')
plt.ylabel('predicted label');
plt.savefig('confustion_matrix_1')

Create silhouette plot and calculate silhouette score

In [None]:
# create silhouette plot for the k-means clustering of the scaled data
silhouette_vals = silhouette_samples(scaled_data, labels, metric='euclidean')
cluster_labels = np.unique(labels)
n_clusters = len(cluster_labels)

yticks = []
bar_start = 0
for position, cluster_id in enumerate(cluster_labels):
    # the sorted coefficients of one cluster form one contiguous band of bars
    band = np.sort(silhouette_vals[labels == cluster_id])
    bar_stop = bar_start + len(band)
    plt.barh(range(bar_start, bar_stop), band,
             height=1.0, edgecolor='none',
             color=cm.jet(float(position) / n_clusters))
    # put the tick for this cluster in the middle of its band
    yticks.append((bar_start + bar_stop) / 2.)
    bar_start = bar_stop

# dashed red line marks the mean silhouette coefficient over all samples
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.savefig('silhoutte_plot_1.png')

In [None]:
# Score the clustering in the same feature space it was fitted on (the scaled data),
# consistent with the silhouette plot above.
kmeansSilhouette_Score = metrics.silhouette_score(scaled_data, labels, metric='euclidean')

In [None]:
print(kmeansSilhouette_Score)

In [None]:
# Agreement between the true sample types and the k-means cluster ids.
# Note: adjusted_rand_score is the *adjusted* Rand index, although the message below says "Rand index".
rand_index = adjusted_rand_score(labels_true = y, labels_pred = labels)
print('The Rand index is', round(rand_index, 2))

### Apply PCA to data before clustering

In [None]:
# Fit a full PCA on the scaled data and plot the running total of the explained
# variance ratio, to see how many components are needed to explain most of the variance
pca_plot = PCA().fit(scaled_data)
cumulative_variance = pca_plot.explained_variance_ratio_.cumsum()
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In order to explain 95% of the variance in the data, one would need about 100 principal components.

### Apply t-SNE

PCA didn't seem to be a good approach, as it would need 100 components to explain most of the variance in the data. Try t-SNE on the original data instead.

In [None]:
tsne = TSNE(random_state=0)

In [None]:
tsne_result = tsne.fit_transform(data)

In [None]:
xi = tsne_result[:, 0]
yi = tsne_result[:, 1]

In [None]:
# scatter plot of the 2-D t-SNE embedding, coloured by the true sample type
tsne_fig, tsne_ax = plt.subplots(figsize=(16, 10))
sns.scatterplot(x=xi, y=yi, hue=y, legend="full", alpha=1, ax=tsne_ax)
tsne_fig.savefig('t-SNE_plot.png')

Try t-SNE on scaled data.

In [None]:
tsne_scaled = TSNE(random_state=0)

In [None]:
# Use the dedicated estimator created for the scaled data instead of re-fitting `tsne`,
# which would overwrite the state of the model fitted on the original data.
tsne_result_scaled = tsne_scaled.fit_transform(scaled_data)

In [None]:
xi_scaled = tsne_result_scaled[:, 0]
yi_scaled = tsne_result_scaled[:, 1]

In [None]:
# scatter plot of the t-SNE embedding of the scaled data, coloured by the true sample type
_, scaled_ax = plt.subplots(figsize=(16, 10))
sns.scatterplot(x=xi_scaled, y=yi_scaled, hue=y, legend="full", alpha=1, ax=scaled_ax)

t-SNE does better on original data than on scaled data.

Run the k-means algorithm on the t-SNE embedding of the original data, since t-SNE on the scaled data visually doesn't separate the clusters as well as t-SNE on the original data.

In [None]:
km_tsne = KMeans(n_clusters = 6, random_state=0)

In [None]:
# predict the cluster labels
labels_tsne = km_tsne.fit_predict(tsne_result)

In [None]:
labels_tsne.size

In [None]:
labels_tsne

In [None]:
## build a one-column dataframe of the t-SNE/k-means cluster ids, stored as a categorical variable
df_labels_tsne = pd.DataFrame({'label': km_tsne.labels_}).astype({'label': 'category'})
df_labels_tsne.head()

In [None]:
df_labels_tsne['label'].value_counts()

In [None]:
# silhouette plot for k-means on the t-SNE embedding
cluster_labels = np.unique(labels_tsne)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(tsne_result,
                                      labels_tsne,
                                      metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    # Group by the t-SNE cluster assignment (labels_tsne); masking with `labels` would
    # group the bars by the earlier k-means run on the scaled data instead.
    c_silhouette_vals = np.sort(silhouette_vals[labels_tsne == c])
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=color)
    # put the tick for this cluster in the middle of its band of bars
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)
# dashed red line marks the mean silhouette coefficient over all samples
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg,
            color="red",
            linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.savefig('silhoutte_plot_2.png')

In [None]:
kmeansSilhouette_Score = metrics.silhouette_score(tsne_result, labels_tsne, metric='euclidean')
kmeansSilhouette_Score

In [None]:
permutation = find_permutation(6, y, km_tsne.labels_)
print(permutation)

In [None]:
new_labels = [ permutation[label] for label in km_tsne.labels_]   # permute the labels
print("Accuracy score is", accuracy_score(y, new_labels))

This dataset has "ground truth" sample type labels available. We can use these to assess our cluster labels a bit more rigorously using the adjusted Rand index. This index measures the similarity between two sets of categorical labels (e.g., our sample type labels and cluster labels). A value of 1 means the two labelings are identical, a value close to 0 means the level of similarity expected by random chance, and values below 0 indicate less agreement than expected by chance.

In [None]:
rand_index = adjusted_rand_score(labels_true = y, labels_pred = labels_tsne)
print('The Rand index is', round(rand_index, 2))

### UMAP

In [None]:
import umap
# NOTE(review): nothing visible in this notebook references numba.targets. Newer numba
# releases appear to no longer ship this module, so this import may raise on a current
# environment — confirm, and remove the line if it is not needed.
import numba.targets

In [None]:
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(data)

In [None]:
# scatter plot of the 2-D UMAP embedding, coloured by the true sample type
umap_fig, umap_ax = plt.subplots(figsize=(16, 10))
umap_x = clusterable_embedding[:, 0]
umap_y = clusterable_embedding[:, 1]
sns.scatterplot(x=umap_x, y=umap_y, hue=y, legend="full", alpha=1, ax=umap_ax)
umap_fig.savefig('UMAP_plot.png')

Perform k-means clustering after UMAP embedding

In [None]:
# Fixed seed, like the other KMeans instances in this notebook, so the UMAP-based
# clustering and the metrics derived from it are reproducible.
km_umap = KMeans(n_clusters = 6, random_state=0)

In [None]:
# predict the cluster labels
labels_umap = km_umap.fit_predict(clusterable_embedding)

In [None]:
# silhouette plot for k-means on the UMAP embedding
cluster_labels = np.unique(labels_umap)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(clusterable_embedding,
                                      labels_umap,
                                      metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    # Group by the UMAP cluster assignment (labels_umap); masking with `labels` would
    # group the bars by the earlier k-means run on the scaled data instead.
    c_silhouette_vals = np.sort(silhouette_vals[labels_umap == c])
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=color)
    # put the tick for this cluster in the middle of its band of bars
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)
# dashed red line marks the mean silhouette coefficient over all samples
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg,
            color="red",
            linestyle="--")
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.savefig('silhoutte_plot_3.png')

In [None]:
kmeansSilhouette_Score = metrics.silhouette_score(clusterable_embedding, labels_umap, metric='euclidean')
kmeansSilhouette_Score

In [None]:
permutation = find_permutation(6, y, km_umap.labels_)
print(permutation)

In [None]:
new_labels = [ permutation[label] for label in km_umap.labels_]   # permute the labels
print("Accuracy score is", accuracy_score(y, new_labels))

In [None]:
rand_index = adjusted_rand_score(labels_true = y, labels_pred = labels_umap)
print('The Rand index is', round(rand_index, 2))