<a href="https://colab.research.google.com/github/rharris9/SYSM578/blob/main/RohanHarrisClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the data and linking the two datasets so that each sample carries the desired label

In [71]:
#import pandas and numpy
import pandas as pd
import numpy as np

#import standard classification tools from sklearn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import itertools

# Loading SVM problems
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.preprocessing import scale
import warnings

#import tools for ROC curves
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

#visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Notebook-wide settings: plotly theme and the fraction of samples held out for testing.
template = 'ggplot2'
test_size = 0.3

# Clinical table (comma-separated). The separator is a literal ',' -- the former
# '\,' was an invalid string escape that pandas treated as a regular expression,
# which forces the slower Python parser and can ignore quoted fields.
data = pd.read_csv('https://raw.githubusercontent.com/PineBiotech/omicslogic/master/TCGA_43_clinical.csv', sep=',', header=0)

# Tab-separated table from the LIHC_RSEM_42cases file (presumably RSEM expression values -- confirm).
rawdata = pd.read_csv('https://raw.githubusercontent.com/PineBiotech/omicslogic/master/LIHC_RSEM_42cases.txt', sep='\t', header=0)


#prepare data
# NOTE(review): `features` is taken from the first column of the clinical table
# (skipping its first row), yet it is later used as the index for the random
# forest's feature importances. TODO confirm it has one entry per column of X.
features = data.iloc[1:, 0].values
data = data.drop(columns=['Variable'])
dataT = np.transpose(data)

# NOTE(review): X is the whole `rawdata` table exactly as loaded (all rows, all
# columns), while y is the second remaining column of the clinical table.
# Nothing here joins the two tables on a sample identifier, so confirm that
# row i of X and entry i of y describe the same sample (and that X holds only
# numeric columns) before trusting any model fitted on them.
X = rawdata.iloc[:, 0:].values
y = data.iloc[:, 1].values

# Distinct labels found in y, in sorted order, and how many there are.
race = np.unique(y)
n_race = len(race)

race

array(['ASIAN', 'BLACK OR AFRICAN AMERICAN', 'WHITE'], dtype=object)

The first step for the random forest is to split the data into training and test sets

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.model_selection import train_test_split
# Hold out part of the samples for evaluation. The fraction is read from the
# notebook-level `test_size` setting instead of a second hard-coded 0.3, so the
# two values cannot drift apart; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0)

#finding out how many samples were from asian patients
print(len(y[y == 'ASIAN']))


In [None]:

# Number of entries in y labelled 'BLACK OR AFRICAN AMERICAN'
print((y == 'BLACK OR AFRICAN AMERICAN').sum())

In [None]:
# Number of entries in y labelled 'WHITE'
print((y == 'WHITE').sum())

In [None]:
# For each label: the share of that label's samples that ended up in the test split
for sample_type in race:
    in_test = len(y_test[y_test == sample_type])
    in_total = len(y[y == sample_type])
    print(sample_type, ": ", round(in_test / in_total, 2))

Random Forest

In [None]:
# Imports sit at the top of the cell so the classifier name is defined before
# it is used, without depending on an earlier cell having imported it.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#define Random forest model
# Built once (500 bootstrapped trees, quiet output). random_state fixes the
# bootstrap draws and feature sampling so reruns give the same forest.
model = RandomForestClassifier(n_estimators=500, bootstrap=True, verbose=False, random_state=0)

#fit model on training data
model.fit(X_train, y_train)

# One of the fitted trees (index 10); not used again in the cells shown here.
estimator = model.estimators_[10]

#predicting test data
predictions = model.predict(X_test)
print("Model accuracy = ", round(accuracy_score(y_test, predictions),2))


Confusion Matrix for Random Forest classification

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#prepare a confusion matrix
# labels=race pins the row/column order and keeps the matrix len(race) x len(race)
# even if a class is absent from y_test and predictions; without it the DataFrame
# below fails on a shape mismatch.
conf = confusion_matrix(y_test, predictions, labels=race)
new_conf = pd.DataFrame(conf, columns=race, index=race)

#plot heatmap of confusion matrix
# Rows of a confusion matrix are the true labels and columns the predicted ones.
fig, ax = plt.subplots(figsize=(13, 10))
sns.heatmap(new_conf, annot=True, ax=ax)
ax.set(xlabel="Predicted", ylabel="True", title="Random Forest confusion matrix");

Feature Importance for Random Forest

In [None]:
# Forest-level importance of each feature, plus its spread across the individual trees
importances = model.feature_importances_
std = np.asarray([tree.feature_importances_ for tree in model.estimators_]).std(axis=0)

# Label each importance with its feature name.
# NOTE(review): assumes `features` has exactly one entry per model feature -- TODO confirm.
forest_importances = pd.Series(importances, index=features)

# Keep only importances above 0.003, ordered from smallest to largest
forest_importances_top = forest_importances.loc[forest_importances.gt(0.003)].sort_values()

#plot forest_importances
forest_importances_top.plot.bar(figsize=(15, 10));

SVC Classification

In [None]:
# Train a linear SVM on the training split.
# NOTE(review): this rebinds `model` and `predictions`, replacing the random
# forest objects created earlier; rerun the forest cell before revisiting its
# feature-importance cell.
# random_state makes the solver's internal shuffling repeatable.
model = LinearSVC(C=1.0, random_state=0)

# Silence warnings for the fit only; a global filterwarnings('ignore') would
# keep hiding warnings in every cell executed afterwards.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    model.fit(X_train, y_train)

# after training we can predict labels for the test samples
predictions = model.predict(X_test)

# labels=race keeps the matrix len(race) x len(race), in the same order as
# `race`, even when a class is missing from the test split.
c_matrix = confusion_matrix(y_test, predictions, labels=race)
cr = classification_report(y_test,predictions, output_dict=True)
model_accuracy = accuracy_score(y_test, predictions)

SVC Results

In [None]:
# One row per report entry (classes first, then the summary rows).
clsf_report = pd.DataFrame(cr).transpose()

# Pick the per-class rows by label rather than by position: if a class is
# missing from the report, a positional slice of n_race rows would pull in the
# 'accuracy' summary row instead.
clsf_report1 = clsf_report.loc[clsf_report.index.isin(race)]

fig = px.bar(clsf_report1, barmode='group', template=template)
fig.update_layout(width=700, height=700, template=template, title="Classification Report")
fig.show()

PCA for SVC Results

In [None]:
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 

# NOTE(review): dataT is the transpose of the clinical table (`data`), so each
# PCA point is one row of dataT (one column of `data`), while `color=y` below
# has one entry per row of `data`. TODO confirm this is the intended input and
# that every column kept here is numeric.
dataPCA = dataT.iloc[:, 1:]

# Standardise each column to zero mean and unit variance before PCA.
scaled = StandardScaler()
scaled_data = scaled.fit_transform(dataPCA)

#Run PCA
n_components=3
pca = PCA(n_components) 
pca.fit(scaled_data) 
xpca = pca.transform(scaled_data)

# Axis captions: component number plus the share of variance it explains.
labels = {str(i): f'PC{i+1}: {pca.explained_variance_ratio_[i]*100:.2f}%' for i in range(n_components)}

# Displaying the PCA for Race
# `labels` must be the caption dict built above; the row index belongs in
# hover_name only.
figPCA2D = px.scatter(xpca, x=0, y=1, color=y,
                      labels=labels,
                      hover_name=dataPCA.index,
                      title="PCA for Race")
figPCA2D.update_layout(width=1000, height=800, template=template)
figPCA2D.show()

Confusion Matrix for SVC classification of Race

In [None]:
# Wrap the SVC confusion matrix in a frame labelled with the race categories on both axes
CM_df = pd.DataFrame(data=c_matrix, index=race, columns=race)

# Annotated heatmap: each cell shows its count, colour bar hidden
heat = go.Heatmap(
    z=CM_df,
    x=CM_df.index,
    y=CM_df.columns,
    text=CM_df,
    texttemplate="%{text}",
    textfont=dict(size=10),
    showscale=False,
)
fig = go.Figure(data=heat)
fig.update_layout(title="Confusion Matrix", template=template, width=700, height=700)
fig.show()