In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# try:
#      from rich import print
# except ImportError as e:
#     !pip install rich
#     from rich import print
    
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data import and cleaning

In [None]:
# Load the NCI dataset CSV from the Kaggle input directory into the DataFrame `nci`.
nci=pd.read_csv("/kaggle/input/nci-data/d_unsupervised_nci_unique_prep.csv")

In [None]:
# Quick sanity check of the loaded frame: (rows, columns), then the column labels.
for summary in (nci.shape, nci.columns):
    print(summary)

In [None]:
# Profile the fraction of missing values per column and per row, plot both
# profiles, and drop every row/column whose NaN fraction exceeds the threshold.
threshhold=0.3

# Vectorised NaN fractions, both computed on the frame as it is before any
# dropping. Everything is addressed by label, so the cell stays correct when the
# row index is not a gap-free 0..n-1 range (e.g. when it is re-run after rows
# have already been removed).
nan_mask=nci.isna()
nan_fraction_by_axis={"columns":nan_mask.mean(axis=0),"rows":nan_mask.mean(axis=1)}

for cr,nan_fraction in nan_fraction_by_axis.items():
    if not nan_fraction.empty:
        worst=nan_fraction.max()
        # on ties, report the largest label among the worst offenders
        worst_label=nan_fraction.index[(nan_fraction==worst).to_numpy()].max()
        print("the largest percent of NaN's for %s is %.3f which is %s"%(cr,worst,worst_label))
    fig=plt.figure(figsize=(10,4))
    # x-axis is the position of the row/column, y-axis its NaN fraction
    _=plt.plot(nan_fraction.to_numpy(),"-*")
    _=plt.title("the percentage of NaN's for %s"%cr)
    _=plt.ylabel("%")
    _=plt.xlabel("%s index"%cr)

col_fraction=nan_fraction_by_axis["columns"]
row_fraction=nan_fraction_by_axis["rows"]
to_be_deleted_cols=col_fraction.index[(col_fraction>threshhold).to_numpy()].to_list()
to_be_deleted_rows=row_fraction.index[(row_fraction>threshhold).to_numpy()].to_list()

print("the rows to be discarded:{}".format(to_be_deleted_rows))
print("the cols to be discarded:{}".format(to_be_deleted_cols))

#delete the rows/cols having percentage of missing values higher than threshold
nci.drop(index=to_be_deleted_rows,columns=to_be_deleted_cols,inplace=True)

In [None]:
# Preview the first ten rows of the frame after the missing-value pruning above.
nci.head(10)

In [None]:
# Remove, in place, every row whose "Weight" is exactly zero.
zero_weight_rows=nci.index[(nci["Weight"]==0).to_numpy()]
nci.drop(index=zero_weight_rows,inplace=True)

In [None]:
# Drop the chemical structure symbol column ("smiles").
# errors="ignore" turns this into a no-op when the column is already gone
# (e.g. when the cell is re-run) without hiding unrelated failures the way a
# bare `except: pass` would.
nci.drop(columns="smiles",inplace=True,errors="ignore")

In [None]:
# List the columns whose dtype is neither float64 nor int64, together with their dtypes.
numeric_dtype_names=["float64","int64"]
is_non_numeric=nci.dtypes.apply(lambda dt: dt not in numeric_dtype_names)
nci[nci.columns[is_non_numeric]].dtypes

In [None]:
def isfloat(x):
    """Return True if the string `x` looks like a plain decimal number.

    At most one leading "-" is stripped, every "." is removed, and the
    remainder must consist of numeric characters only. Consequently:

    * scientific notation such as "1e-05" is NOT recognised (returns False);
    * strings with several dots such as "1.2.3" are accepted (returns True);
    * the empty string and a lone "-" return False.
    """
    unsigned = x[1:] if x.startswith("-") else x
    return unsigned.replace(".", "").isnumeric()

# Flag every PEOE_PC- entry whose string form is not accepted by isfloat.
A=nci["PEOE_PC-"].astype(str).apply(isfloat)

# Show the raw values of the entries that were flagged as non-numeric.
print("These are values of PEOE_PC- that are not numeric:")
nci.loc[A[~A].index,"PEOE_PC-"]

It's interesting to note that the first three entries of the above list do have a numeric type, yet `astype(str)` renders them in scientific notation, which the `isfloat` check treats as a non-numeric string. The explicit comparison is made in the following cell.

In [None]:
# For two sample rows, show the raw value, its string form, and the isfloat verdict.
suspect_rows=[1957,2209]
raw_values=nci.loc[suspect_rows,"PEOE_PC-"]
as_text=raw_values.astype(str)
print(raw_values)
print(as_text)
print(as_text.apply(isfloat))

In [None]:
#drop the line with "PEOE_PC-"="#NAME?"
keep=nci["PEOE_PC-"]!="#NAME?"
ss=nci.loc[keep,"PEOE_PC-"].astype("float")
# Boolean-mask selection plus .copy() yields an independent frame, so the
# column can be replaced below without chained-assignment warnings.
nci=nci.loc[keep].copy()
print("shape after droping the line with PEOE_PC- = #Name:{}".format(nci.shape))
#change datatype for "PEOE_PC-": replace the whole column, because writing the
#values back through .loc may keep the existing object dtype
nci["PEOE_PC-"]=ss

#drop too large negative values for "PEOE_PC-" (rows where the value is NaN
#fail the comparison and are dropped as well):
nci=nci[nci["PEOE_PC-"]>=-10].copy()
print("shape after droping large negative values:{}".format(nci.shape))
#list all non-float columns to check now "PEOE_PC-" column has float dtype
print("datatypes for columns that are not floattype:")
nci.loc[:, nci.columns[nci.dtypes!="float64"] ].dtypes

In [None]:
#fill missing values with 0 and record the corresponding positions
# nan_df is always defined (all False when nothing is missing), so it exists
# regardless of the data; the fill rebinds `nci` instead of mutating a frame
# that may be a filtered slice.
nan_df=nci.isna()
nci=nci.replace(np.nan,0)
display(nci.head())
len(nci)

# Unsupervised Learning(UL): 'CCRF-CEM':'T-47D' as features

In [None]:
# Show the remaining column labels; the feature range used below is selected from these.
print(nci.columns)

In [None]:
# Feature set for the unsupervised analysis: every column from 'CCRF-CEM'
# through 'T-47D' (label slicing includes both end points).
feature_block=nci.loc[:,'CCRF-CEM':'T-47D']
UL_features=feature_block.columns
print(UL_features)

## Reorder features according to correlation

In [None]:

# Work on a copy of the selected feature columns and compute their correlations.
data1=nci[UL_features].copy()
CorrMatrix=data1.corr()

# Zero the diagonal so a feature's self-correlation never counts as its strongest link.
for pos in range(len(CorrMatrix)):
    CorrMatrix.iat[pos,pos]=0

# Rank the features by their strongest absolute correlation (descending). Walking
# that ranking, list each feature followed by its most-correlated partner,
# skipping names that have already been placed.
abs_corr=CorrMatrix.abs()
strongest_partner=abs_corr.idxmax()
ranking=abs_corr.max().sort_values(ascending=False).index
new_c=[]
for feature,partner in strongest_partner[ranking].items():
    for name in (feature,partner):
        if name not in new_c:
            new_c.append(name)

print(pd.Index(new_c))
print(len(new_c))

# Re-order the columns accordingly and recompute the correlation matrix
# (this time with its diagonal left untouched).
data1=data1.reindex(columns=new_c)
CorrMatrix=data1.corr()

## plot correlation

In [None]:
f, ax = plt.subplots(figsize=(11, 15))
#crop a submatrix to plot:
row_start=0; col_start=0; size_matrix=10;
sub_corrmat_to_plot=CorrMatrix.iloc[row_start:(row_start+size_matrix),col_start:(col_start+size_matrix)]
# All-False mask: every cell of the sub-matrix is drawn. The builtin `bool` is
# used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(sub_corrmat_to_plot, dtype=bool)
# to hide the upper triangle instead: mask[np.triu_indices_from(mask)]= True


# Annotated heatmap of the cropped correlation matrix on a fixed [-1, 1] colour scale.
heatmap = sns.heatmap(sub_corrmat_to_plot,
                      mask = mask,
                      square = True,
                      linewidths = .5,
                      cmap = 'coolwarm',
                      cbar_kws = {'shrink': .4,
                                'ticks' : [-1, -.5, 0, 0.5, 1]},
                      ax=ax,
                      vmin = -1,
                      vmax = 1,
                      annot = True,
                      annot_kws = {'size': 12})

#add the column names as labels
ax.set_yticklabels(sub_corrmat_to_plot.columns, rotation = 0)
ax.set_xticklabels(sub_corrmat_to_plot.columns)
ax.xaxis.set_label_position('top') 

# Pairwise view of the columns of the cropped matrix.
# NOTE(review): the grid is built from the correlation values themselves, not
# from the underlying samples in data1 -- confirm that this is intended.
grids = sns.PairGrid(sub_corrmat_to_plot)
grids.map_diag(sns.histplot)
grids.map_upper(sns.kdeplot)
grids.map_lower(plt.scatter)

# sns.set_style({'xtick.bottom': False}, {'ytick.left': True})

# PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

# Standardise every feature before PCA.
scaler = preprocessing.StandardScaler()

data_scaled = scaler.fit_transform(data1)
# Keep data1's row labels so the scaled rows can still be matched to the
# original samples (the default would renumber them 0..n-1).
data_scaled_df=pd.DataFrame(data_scaled,columns=data1.columns,index=data1.index)

data_to_use_in_PCA=data_scaled_df

# Project onto the leading principal components.
n_components=15
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(data_to_use_in_PCA)
x_pca_transformed = pd.DataFrame(data = principalComponents)

# Report and plot the share of variance captured by each component.
print("explained variance ratio:",pca.explained_variance_ratio_)
print("sum of explained variance ratio",sum(pca.explained_variance_ratio_))
plt.bar(range(1,n_components+1),pca.explained_variance_ratio_)
plt.xlabel("nth component")
plt.ylabel("explained variance ratio")

#  Kmeans

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

# Elbow analysis on the unscaled features. For each k two quantities are
# recorded: the inertia reported by KMeans, and the "distortion", i.e. the mean
# Euclidean distance of a sample to its nearest cluster centre.
K=range(1,15)
inertia={}
distortions=[]


for k in K:
  # fixed seed and explicit n_init so the curves are reproducible across runs
  kmeans = KMeans(n_clusters=k,n_init=10,random_state=0).fit(data1)
  inertia[k] = kmeans.inertia_
  nearest_centre_dist = cdist(data1, kmeans.cluster_centers_,'euclidean').min(axis=1)
  distortions.append(nearest_centre_dist.sum() / data1.shape[0])

fig, ax1 = plt.subplots()
ax1.plot(K,list(inertia.values()),"bx-")
# the x label goes on ax1: the twin axes created below hide their own x-axis
ax1.set_xlabel("k")
ax1.set_ylabel("Inertia")
# dashed marker arrows; their coordinates are hand-tuned for this dataset
ax1.arrow(4,0.85*1e6,-2,0, color='blue',linestyle='dashed',head_width=0.02*1e6, head_length=0.3)
ax2 = ax1.twinx() 
ax2.plot(K,distortions,"r*-")
ax2.arrow(10,6.6,3,0,color='red',linestyle='dashed',head_width=0.05, head_length=0.3)
ax2.set_ylabel('Distortions')
_=ax2.set_title('The Elbow Method using Inertia: unscaled data')

In [None]:
#using scaled data data_scaled_df to repeat the Kmeans calculation
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

K=range(1,15)
inertia={}
distortions=[]

# make the cell re-runnable: remove a "cluster" column if one is present;
# errors="ignore" only tolerates the missing column, nothing else
data_scaled_df.drop(columns="cluster",inplace=True,errors="ignore")

# For each k record the inertia reported by KMeans and the "distortion", i.e.
# the mean Euclidean distance of a sample to its nearest cluster centre.
for k in K:
  # fixed seed and explicit n_init so the curves are reproducible across runs
  kmeans = KMeans(n_clusters=k,n_init=10,random_state=0).fit(data_scaled_df)
  inertia[k] = kmeans.inertia_
  nearest_centre_dist = cdist(data_scaled_df, kmeans.cluster_centers_,'euclidean').min(axis=1)
  distortions.append(nearest_centre_dist.sum() / data_scaled_df.shape[0])

fig, ax1 = plt.subplots()
ax1.plot(K,list(inertia.values()),"bx-")
# the x label goes on ax1: the twin axes created below hide their own x-axis
ax1.set_xlabel("k")
ax1.set_ylabel("Inertia")
# dashed marker arrows; their coordinates are hand-tuned for this dataset
ax1.arrow(4,1e6,-2,0, color='blue',linestyle='dashed',head_width=0.02*1e6, head_length=0.3)
ax2 = ax1.twinx() 
ax2.plot(K,distortions,"r*-")
ax2.arrow(10,7.3,3,0,color='red',linestyle='dashed',head_width=0.05, head_length=0.3)
ax2.set_ylabel('Distortions')
_=ax2.set_title('The Elbow Method using Inertia: scaled data')

# 2-classes clustering

In [None]:
def projectMax3dim(centerPositions):
    """Find the two coordinate planes with the largest projected centre separation.

    For every pair of cluster centres and every pair of feature axes (i, j)
    with i > j, the difference vector of the two centres is projected onto
    the (i, j) plane and the Euclidean length of that projection is measured.

    Parameters
    ----------
    centerPositions : indexable collection of equally long 1-D numpy arrays,
        one per cluster centre (e.g. the rows of a 2-D array).

    Returns
    -------
    list of two tuples ``(length, i, j)`` in descending order. A slot that no
    candidate filled keeps the placeholder ``(0, 0, 0)``.
    """
    top_two = [(0, 0, 0), (0, 0, 0)]
    n_centers = len(centerPositions)
    for hi in range(1, n_centers):
        for lo in range(hi):
            delta = centerPositions[hi] - centerPositions[lo]
            for i in range(1, len(delta)):
                for j in range(i):
                    candidate = (np.linalg.norm(np.array([delta[i], delta[j]])), i, j)
                    # keep only the two largest entries seen so far
                    top_two = sorted(top_two + [candidate], reverse=True)[:2]
    return top_two

The elbow method calculations indicate we should choose the cluster number to be 2.

Now let's run the 2-class K-means clustering and plot the labeled data in the 
three dimensions of the feature space selected by the function `projectMax3dim`. 

In [None]:
# 2-classes clustering:
# make the cell re-runnable: remove a stale "cluster" column if one is present;
# errors="ignore" only tolerates the missing column, nothing else
data1.drop(columns="cluster",inplace=True,errors="ignore")
data_scaled_df.drop(columns="cluster",inplace=True,errors="ignore")


data_to_use=data1.copy()  #choose whether to use the rescaled data_scaled_df or the original data1

# fixed seed and explicit n_init so the labels and the plots are reproducible
kmeans = KMeans(n_clusters=2,n_init=10,random_state=0).fit(data_to_use)

centerPositions=kmeans.cluster_centers_
data_to_use["cluster"]=pd.Categorical(kmeans.labels_)

m=projectMax3dim(centerPositions)

print(m)

FeatureNames=data_to_use.columns
# Distinct feature indices spanned by the two planes: usually 3 (the planes
# share an axis), but it can be 2 or 4; sorted for a deterministic axis order.
m=sorted(set(m[0][1:]+m[1][1:]))


def _cluster_scatter2d(ix,iy):
    """2-D scatter of the clustered samples over the features at positions ix and iy."""
    return px.scatter(data_to_use, x=FeatureNames[ix], y=FeatureNames[iy],
                color="cluster",symbol='cluster',size_max=2)


# 3-D views need at least three distinct axes
if len(m)>=3:
  fig = px.scatter_3d(data_to_use, x=FeatureNames[m[0]], y=FeatureNames[m[1]], z=FeatureNames[m[2]],
                color="cluster",size_max=2)
  fig.show()
if len(m)>3:
  fig1 = px.scatter_3d(data_to_use, x=FeatureNames[m[1]], y=FeatureNames[m[2]], z=FeatureNames[m[3]],
                color="cluster",size_max=0.2)
  fig1.show()
# 2-D views over every pair among the first three selected axes
if len(m)>=2:
  fig2 = _cluster_scatter2d(m[0],m[1])
  fig2.show()
if len(m)>=3:
  fig3 = _cluster_scatter2d(m[0],m[2])
  fig3.show()
  fig4 = _cluster_scatter2d(m[1],m[2])
  fig4.show()

Now plot the labeled data in the subspace spanned by the first three principal components

In [None]:
def prepare_pca(n_components, data, kmeans_labels):
    """Project `data` onto its principal components and attach cluster labels.

    Parameters
    ----------
    n_components : number of principal components passed to ``PCA``.
    data : feature table accepted by ``PCA.fit_transform``. It should hold
        the feature columns only (no label column).
    kmeans_labels : one cluster label per row of `data`.

    Returns
    -------
    pandas.DataFrame with one column per component, named "P1", "P2", ...,
    plus a categorical "cluster" column holding `kmeans_labels`.
    """
    matrix = PCA(n_components=n_components).fit_transform(data)
    # Name the columns from the actual output width, so any number of
    # components works (not only up to three).
    names = ["P%d" % (pos + 1) for pos in range(matrix.shape[1])]
    df_matrix = pd.DataFrame(matrix, columns=names)
    df_matrix['cluster'] = pd.Categorical(kmeans_labels)
    return df_matrix

# PCA must see the feature columns only: data_to_use also carries the K-means
# "cluster" label column, which must not be fed into the projection.
pca_features = data_to_use.drop(columns="cluster",errors="ignore")
pca_df2 = prepare_pca(3,pca_features,kmeans.labels_)

# 3-D view over the first three principal components, coloured by cluster
fig = px.scatter_3d(pca_df2, x="P1", y="P2", z="P3",
                color="cluster",symbol='cluster',size_max=2)
fig.show()

# 2-D views over each pair of principal components
fig1 = px.scatter(pca_df2, x="P1", y="P2",
                color="cluster",symbol="cluster",size_max=2)
fig1.show()
fig2 = px.scatter(pca_df2, x="P1", y="P3",
                color="cluster",symbol="cluster",size_max=2)
fig2.show()
fig3 = px.scatter(pca_df2, x="P2", y="P3",
                color="cluster",symbol="cluster",size_max=2)
fig3.show()

It's amazing to note that in these **Kmeans-PCA-plots**, the P1-P2 and P1-P3 plots show nearly perfect boundaries between the two classes. 

# Discussion
At first look, this result is quite surprising. Let's recall some basic concepts. What PCA does is find the best axes (through a linear transformation in the feature space) so that the data projected onto these axes have as much variance as possible. On the other hand, what K-means tries to do is separate the data points into classes so that the total intra-class variance is minimized. If the number of classes is fixed, this is roughly equivalent to maximizing the inter-class variance (recall the [total variance theorem](https://en.wikipedia.org/wiki/Law_of_total_variance)). So the PCA axes are natural choices of directions along which the class centers are separated from each other as far as possible.

The above heuristic argument is supported by a [mathematical proof](http://ranger.uta.edu/~chqding/papers/KmeansPCA1.pdf) with some mathematical rigor. In this paper, it was shown that the principal components calculated by PCA (used for dimension reduction) are actually the continuous solution to the underlying discrete cluster membership indicator vectors in K-means (used for data clustering). This interesting finding and some of its subtleties have attracted a lot of [discussion](https://stats.stackexchange.com/questions/183236/what-is-the-relation-between-k-means-clustering-and-pca). 

One interesting point is that the 2-class separation in the P1-P2 plane is better (fewer "mislabeled" points across the boundary between the two classes) using the original dataset data1 than using the rescaled dataset data_scaled_df (not shown here; one can simply change the line "data_to_use=data1.copy()" two cells above and rerun to see the difference). At first sight this is a little counter-intuitive. Yet it is in agreement with the elbow curve calculations shown above, where the original dataset shows a clearer "elbow" for the 2-class K-means clustering. This fact can be understood in the following way. The "original" dataset carries the "natural differences" between the features, whereas the rescaling makes them all take zero mean and unit variance, which washes away those differences and hurts the 2-class separability of the samples. Another interesting point supporting this argument is the behavior of the total explained variance ratio (of the 15 principal components) calculated in the PCA section, i.e., it decreased from around 50% for the unscaled dataset to 44% for the scaled one. 


Another point we would like to mention is that our plotting scheme is complementary, yet roughly equivalent, to other people's work, e.g., this [example](https://medium.com/@dmitriy.kavyazin/principal-component-analysis-and-k-means-clustering-to-visualize-a-high-dimensional-dataset-577b2a7a5fe2). We run K-means and label the data points first, and then plot them in the principal component space. More often, people apply K-means directly to the PCA-transformed data, in which the feature columns are the principal components. 

Lastly, here is an open problem. Though we are able to effectively separate the two classes in the P1-P2 and P1-P3 planes, which are principal component planes calculated by PCA, how can we distinguish the different classes in practice? One way to answer this question is to plot the correlation matrix of the samples for each class separately and to discern some qualitative differences between the classes. A second way might be to apply this unsupervised clustering method to a labeled dataset and see how well those labels agree with our prediction, up to class ordering.  