In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# folder referenced by the later cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Importing the required libraries
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#Locating the data set and loading every spectrum file into one dataframe
os.chdir('/content/drive/My Drive/Colab Notebooks/OralCancerProject')

datapath = os.getcwd() + '/Data'

# One glob pattern per class folder; every *PRN file matched is one sample.
TypeList = [ttype+exttype for ttype in ["/Malignant*/", "/Premalignant*/", "/Normal*/"] for exttype in ["*PRN"]]

# Collect the single-column frames first and concatenate once at the end:
# calling pd.concat inside the loop re-copies the growing frame for every file.
frames = []

for ftype in TypeList:
  # glob yields paths in arbitrary, OS-dependent order; sorting keeps the
  # column (sample) order reproducible between runs.
  for file in sorted(glob.iglob(datapath + os.path.normpath(ftype))):
    # The file name without extension becomes the column (sample) name.
    colname = os.path.splitext(os.path.basename(file))[0]
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True. Only the second column (usecols=[1]) is kept and
    # the last line of each file is dropped (skipfooter=1).
    tempdf = pd.read_csv(file, sep = r'\s+', usecols = [1], engine = 'python',
                         skipfooter = 1, names = ('Wavelength', colname),
                         dtype = {'Wavelength': np.float64, colname: np.float64})
    frames.append(tempdf)

# Fail early with a clear message instead of handing an empty frame to the
# cells below.
if not frames:
  raise FileNotFoundError('No *PRN files found under ' + datapath)

# Final layout: one column per sample file.
df = pd.concat(frames, axis = 1)
  

In [None]:
#Assigning a numeric label to each sample (malignant, pre-malignant or normal)
# Position in this list is the numeric code: M -> 0, P -> 1, N -> 2
label_classes = ["M", "P", "N"]

# After the transpose every row is one sample, indexed by its file name
df = df.transpose()

# The first character of the sample name identifies its class; list.index
# raises ValueError for any name that does not start with M, P or N
df['Label'] = [label_classes.index(first_char) for first_char in df.index.str[0]]
df.head(10)

In [None]:
# distributing the dataset into malignant, premalignant, and normal
sample_names = df.index.to_numpy()

# Boolean row masks, one per class code stored in the Label column
mask_M = (df.Label == 0).to_numpy()
mask_PM = (df.Label == 1).to_numpy()
mask_N = (df.Label == 2).to_numpy()

# Sample counts per class, total sample count and number of feature columns
n_M = int(mask_M.sum())
n_PM = int(mask_PM.sum())
n_N = int(mask_N.sum())
n = n_M + n_PM + n_N
p = len(df.columns) - 1

#input attributes
# Column 0 of each X_* array is the sample name, the remaining columns are the
# features. Each class is stacked with the names selected by ITS OWN mask; the
# earlier version reused the class-0 names for all three classes.
feature_cols = df.columns != 'Label'
X_M = np.column_stack((sample_names[mask_M], df.loc[mask_M, feature_cols]))
X_PM = np.column_stack((sample_names[mask_PM], df.loc[mask_PM, feature_cols]))
X_N = np.column_stack((sample_names[mask_N], df.loc[mask_N, feature_cols]))

#output attributes
# Kept as 2-D column arrays (one row per sample), flattened only for y below
label_col = df.columns == 'Label'
y_M = df.loc[mask_M, label_col].values
y_PM = df.loc[mask_PM, label_col].values
y_N = df.loc[mask_N, label_col].values

#Instances
# Rows are ordered malignant, pre-malignant, normal in both X and y
X = np.concatenate((X_M, X_PM, X_N))
y = np.concatenate((y_M, y_PM, y_N)).flatten()

In [None]:
#Plotting ten randomly picked Malignant patient records on one figure
# (np.random.randint draws independently, so a record can be picked twice)
for i in np.random.randint(len(X_M), size = 10):
  # Skip column 0, which holds the sample name
  x_m = X_M[i, 1:]
  plt.plot(x_m)

# Title and axis labels are identical for every curve, so set them once
plt.title('Malignant')
plt.xlabel('Wavelength (λ)')
plt.ylabel('Frequency')

In [None]:
#Plotting ten randomly picked Pre-Malignant patient records on one figure
# (np.random.randint draws independently, so a record can be picked twice)
for i in np.random.randint(len(X_PM), size = 10):
  # Skip column 0, which holds the sample name
  x_pm = X_PM[i, 1:]
  plt.plot(x_pm)

# Title and axis labels are identical for every curve, so set them once
plt.title('Pre-Malignant')
plt.xlabel('Wavelength (λ)')
plt.ylabel('Frequency')

In [None]:
#Plotting ten randomly picked Normal patient records on one figure
# (np.random.randint draws independently, so a record can be picked twice)
for i in np.random.randint(len(X_N), size = 10):
  # Skip column 0, which holds the sample name
  x_n = X_N[i, 1:]
  plt.plot(x_n)

# Title and axis labels are identical for every curve, so set them once
plt.title('Normal')
plt.xlabel('Wavelength (λ)')
plt.ylabel('Frequency')

In [None]:
#Plotting the average patient record of each group
# Column 0 of every X_* array is the sample name, so only columns 1: are
# averaged. The name column makes the stacked arrays object-typed, hence the
# explicit cast to float before taking the column-wise mean. One vectorized
# mean per class replaces the per-column loop that grew arrays with np.append.
avg_M = X_M[:, 1:].astype(np.float64).mean(axis = 0)
avg_PM = X_PM[:, 1:].astype(np.float64).mean(axis = 0)
avg_N = X_N[:, 1:].astype(np.float64).mean(axis = 0)

# NOTE(review): the curves are drawn against column position, not against
# wavelength values - confirm the axis labels below are what is intended.
plt.plot(avg_M, label = 'Malignant')
plt.plot(avg_N, label = 'Normal')
plt.plot(avg_PM, label = 'Pre-Malignant')

plt.title('Average Patient record for Malignant, Pre-Malignant and Normal')
plt.xlabel('Wavelength')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
#Applying Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

# Column 0 of X is the sample name; everything after it is fed to PCA
spectra_only = X[:, 1:]

# Reduce every sample to 4 principal components
pca = PCA(n_components = 4)
X_pca = pca.fit_transform(spectra_only)

print("original shape: ", spectra_only.shape)
print("transformed shape:", X_pca.shape)

In [None]:
#PCA = pd.DataFrame(np.absolute(X_pca))


In [None]:
#plt.scatter(X[y == 0, 1: ], X_pca[:, 0])

The information along the least important principal axis or axes is removed,
leaving only the component(s) of the data with the highest variance.

In [None]:
# Fraction of the total variance explained by each fitted principal component
# (first, second, third and fourth direction)
pca.explained_variance_ratio_

In [None]:
#Visualization of the dimensional reduction: light color is the original data
#orange is the reduced/ projected version
# Map the 4-component scores back into the original feature space
X_new = pca.inverse_transform(X_pca)

# Column 0 of X is the sample name, so the features start at column 1. The
# earlier version plotted X[:, 0] (names) against X[:, 1] (feature 0), which
# does not match the reconstructed features 0 and 1 drawn on top of it.
X_orig = X[:, 1:].astype(np.float64)

plt.scatter(X_orig[:, 0], X_orig[:, 1], alpha=0.2)
plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.8)
plt.axis('equal');

These points are the projection of each data point along the directions with the
largest variance.

In [None]:
#Plotting the samples in the plane of principal components 1 and 3
colors = ['r' ,'g' ,'b']
names = ['Malignant', 'Pre-Malignant', 'Normal']
labels = [ 0, 1, 2]
plt.figure()

# The loop variable is called `label`, not `n`, so it does not overwrite the
# sample count `n` computed when the data set was split by class.
for label, c, name in zip(labels, colors, names):
  in_class = (y == label)
  # `color` applies the class colour; `cmap` expects a colormap and has no
  # effect without `c` data. The legend entry is attached to each scatter
  # directly so it cannot get out of step with the plotting order.
  plt.scatter(X_pca[in_class, 0], X_pca[in_class, 2], color = c, label = name)

plt.xlabel('Principle Component 1')
plt.ylabel('Principle Component 3')
plt.legend()
plt.show()

In [None]:
#Plotting the samples in the plane of principal components 1 and 2
colors = ['r' ,'g' ,'b']
names = ['Malignant', 'Pre-Malignant', 'Normal']
labels = [ 0, 1, 2]
plt.figure()

# The loop variable is called `label`, not `n`, so it does not overwrite the
# sample count `n` computed when the data set was split by class.
for label, c, name in zip(labels, colors, names):
  in_class = (y == label)
  # `color` applies the class colour; `cmap` expects a colormap and has no
  # effect without `c` data. The legend entry is attached to each scatter
  # directly so it cannot get out of step with the plotting order.
  plt.scatter(X_pca[in_class, 0], X_pca[in_class, 1], color = c, label = name)

plt.xlabel('Principle Component 1')
# Column index 1 of X_pca is the second component; the label used to say 3
plt.ylabel('Principle Component 2')
plt.legend()
plt.show()

In [None]:
#Plotting the samples in the plane of principal components 1 and 4
colors = ['r' ,'g' ,'b']
names = ['Malignant', 'Pre-Malignant', 'Normal']
labels = [ 0, 1, 2]
plt.figure()

# The loop variable is called `label`, not `n`, so it does not overwrite the
# sample count `n` computed when the data set was split by class.
for label, c, name in zip(labels, colors, names):
  in_class = (y == label)
  # `color` applies the class colour; `cmap` expects a colormap and has no
  # effect without `c` data. The legend entry is attached to each scatter
  # directly so it cannot get out of step with the plotting order.
  plt.scatter(X_pca[in_class, 0], X_pca[in_class, 3], color = c, label = name)

plt.xlabel('Principle Component 1')
# Column index 3 of X_pca is the fourth component; the label used to say 3
plt.ylabel('Principle Component 4')
plt.legend()
plt.show()

In [None]:
# NOTE(review): disabled experiment kept as a bare string literal instead of
# comments - nothing between the quotes is executed. Because the string is the
# cell's only expression, the notebook echoes it as the cell output; consider
# deleting the cell.
'''
for i in range(0, 3):
  plt.scatter(X_M[:, 1: ], X_pca[y == i, 0])
plt.legend(names)

'''