In [None]:
# Notebook written by Shamima Rashid, Nanyang Technological University, Singapore. June 2022.
# Adapted from: machinelearningmastery.com (Jason Brownlee). 
#Version Information:
#xgboost 1.5.1
#scipy 1.8.1

In [None]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#import sklearn
import seaborn as sns
from xgboost import XGBClassifier
import scipy
from pylab import savefig

# Turn off pandas' chained-assignment check (its default setting is 'warn').
# Comment this line out to see the
# 'A value is trying to be set on a slice of a copy of a dataframe' warning again.
pd.set_option("mode.chained_assignment", None)



In [None]:
# Cleaned data file with 54484 records.
# B-factors are not applicable because many of the structures are cryo-EM files.
fn1 = "Sites_final_SARSCoV2.csv"

In [None]:
### Candidate column sets to load from the data file. Every list ends with the
### 'Target' label column; the entries before it are the feature columns.
### Non-informative features (e.g. all-zero columns, high correlation) identified
### in the profile report were removed.

# Reduced set: 10 features + Target
fields_1 = [
    'Hydropathy', 'SS', 'RSA', 'PHI', 'PSI',
    'main_chain_rel',
    'CA_Up', 'CA_down', 'CA_PCB_Angle',
    'Residue_Depth',
    'Target',
]

# Intermediate set: 15 features + Target
fields_2 = [
    'Hydropathy', 'SS', 'RSA', 'PHI', 'PSI',
    'main_chain_rel', 'all_polar_rel',
    'CA_Up', 'CA_down', 'CA_PCB_Angle',
    'CA_Up.1', 'CA_down.1', 'CA_Count_r12',
    'Residue_Depth', 'CA_Depth',
    'Target',
]

# Largest set: 23 features + Target
fields_4 = [
    'Hydropathy', 'SS', 'RSA', 'PHI', 'PSI',
    'all_atoms_abs', 'all_atoms_rel',
    'side_chain_abs', 'side_chain_rel',
    'main_chain_abs', 'main_chain_rel',
    'non_polar_abs', 'non_polar_rel',
    'all_polar_abs', 'all_polar_rel',
    'CA_Up', 'CA_down', 'CA_PCB_Angle',
    'CA_Up.1', 'CA_down.1', 'CA_Count_r12',
    'Residue_Depth', 'CA_Depth',
    'Target',
]


In [None]:
# Read only the columns listed in fields_4 from the data file.
# (Pass usecols=fields_1 instead to load the 10-feature set.)
df = pd.read_csv(filepath_or_buffer=fn1, usecols=fields_4)

In [None]:
# Show the first ten rows of the loaded table.
# (df.profile_report() is left disabled here.)
df.head(n=10)

In [None]:
### Drop Clade 'O' for 4-Clade classification.
# Take an explicit copy of the filtered rows so that later column assignments on X
# operate on an independent DataFrame rather than on a slice of df.
X = df.loc[df['Target'] != 'O'].copy()


In [None]:
# Show the first five rows that remain after filtering.
X.head(n=5)

In [None]:
# Integer codes for the clade letters in 'Target' and for the letters in 'SS'.
Clade = {'G': 0, 'L': 1, 'S': 2, 'V': 3}
SS_Num = {"H": 1, "E": 2, "C": 3}

# Replace every letter with its code by direct dictionary lookup, so a value
# that is missing from the mapping raises KeyError.
X['Target'] = X['Target'].map(Clade.__getitem__)
X['SS'] = X['SS'].map(SS_Num.__getitem__)

In [None]:
# Selected feature names: every entry of fields_4 except the trailing 'Target'.
# (Slice fields_1 the same way when working with the 10-feature set.)
F_names = list(fields_4[:-1])

In [None]:
# Build the NumPy inputs for XGBoost training.
# The labels are extracted first because X is rebound to the feature matrix below.
Y = X['Target'].to_numpy()
X = X[F_names].to_numpy()

print("X - Type:", type(X), "Shape:", X.shape)
print("Y - Type:", type(Y), "Shape:", Y.shape)



In [None]:
# Bar chart of the number of samples per true label.
true_label_ax = sns.countplot(x=Y)
true_label_ax.set(title='True Label Distribution', ylabel='Count')

In [None]:
## Build a vector of randomly generated labels, one per entry of Y.
n = 4  # random labels are drawn from the integers 0 .. n-1
np.random.seed(1)  # fixed seed so the same labels are generated on every run
R = np.random.randint(n, size=len(Y))
print("Type:", type(R), "Shape:", R.shape)

In [None]:
# Bar chart of the number of samples per randomly assigned label.
random_label_ax = sns.countplot(x=R)
random_label_ax.set(title='Random Label Distribution', ylabel='Count')

In [None]:
# Test protocol.
# Prepare to evaluate the SARS_COV2 data with the actual Target labels and with randomly assigned labels.

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hold out 25% of the samples for testing, using train_test_split().
# The same options (including random_state=1) are used for both calls: the first
# pairs the features with the true labels, the second with the random labels.
split_options = {'test_size': 0.25, 'random_state': 1}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)
XR_train, XR_test, R_train, R_test = train_test_split(X, R, **split_options)

print(XR_train.shape)
print(XR_test.shape)

In [None]:
# Classifier for the true-label task. 'Target' is encoded as four classes (0-3),
# so the multiclass error metric 'merror' is requested; 'error' is XGBoost's
# binary classification error rate.
model = XGBClassifier(importance_type='gain', eval_metric='merror', use_label_encoder=False)

In [None]:
# Train on the true-label training split, then predict the held-out samples.
Y_hat = model.fit(X_train, Y_train).predict(X_test)

In [None]:
# Held-out accuracy of the true-label model, printed as a percentage.
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_hat)
print('Accuracy: %.2f' % (100 * accuracy))

In [None]:
# Control experiment: train a second classifier on the randomly assigned labels.
# The random labels also take four values, hence the multiclass metric 'merror'.
Rmodel = XGBClassifier(importance_type='gain', eval_metric='merror', use_label_encoder=False)
Rmodel.fit(XR_train, R_train)
# Predict with Rmodel itself. Calling model.predict here would score the
# true-label model against the random labels and leave Rmodel unevaluated.
R_hat = Rmodel.predict(XR_test)

In [None]:
# Held-out accuracy against the random labels, printed as a percentage.
# Note: this rebinds the `accuracy` variable.
accuracy = accuracy_score(y_true=R_test, y_pred=R_hat)
print('Accuracy: %.2f' % (100 * accuracy))

In [None]:
# Re-seed NumPy's global random number generator with 1.
np.random.seed(seed=1)

In [None]:
#### Average of n runs
# Repeat the true-label / random-label experiment n times and collect the
# held-out accuracy of each round.

n = 100            # number of rounds (rebinds the `n` used for the random labels)
ActualLabel = []   # per-round test accuracy of the true-label model
RandomLabel = []   # per-round test accuracy of the random-label model


for i in range(n):

    # No random_state is passed, so the two calls are not tied to a fixed split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
    XR_train, XR_test, R_train, R_test = train_test_split(X, R, test_size=0.25)

    # True-label model. Both label sets have four classes, so the multiclass
    # metric 'merror' is used instead of the binary 'error'.
    model = XGBClassifier(importance_type='gain', eval_metric='merror', use_label_encoder=False)
    model.fit(X_train, Y_train)
    Y_hat = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_hat)
    ActualLabel.append(accuracy)
    print('Round %0d:' % (i))
    print('True Label Accuracy: %.2f' % (accuracy*100))

    # Random-label control model.
    Rmodel = XGBClassifier(importance_type='gain', eval_metric='merror', use_label_encoder=False)
    Rmodel.fit(XR_train, R_train)
    # Predict with Rmodel itself; using model.predict here would score the
    # true-label model against the random labels and leave Rmodel unevaluated.
    R_hat = Rmodel.predict(XR_test)
    accuracy = accuracy_score(R_test, R_hat)
    RandomLabel.append(accuracy)
    print('Random Label Accuracy: %.2f' % (accuracy*100))
   


In [None]:
# Convert the per-round accuracy lists into NumPy arrays.
A1, R1 = np.asarray(ActualLabel), np.asarray(RandomLabel)

In [None]:
# Summary of the true-label accuracies. The mean is printed as a percentage;
# the standard deviation is printed unscaled (on the 0-1 accuracy scale).
A1_mean = A1.mean()
A1_std = A1.std()
print('True mean: %.2f; True s.t.d: %.3f' % (100 * A1_mean, A1_std))

In [None]:
# Summary of the random-label accuracies. The mean is printed as a percentage;
# the standard deviation is printed unscaled (on the 0-1 accuracy scale).
R1_mean = R1.mean()
R1_std = R1.std()
print('Random mean: %.2f; Random s.t.d: %.3f' % (100 * R1_mean, R1_std))

In [None]:
# Uncomment for feature selection
# def select_features(X_train, y_train, X_test):
#     #Code by Jason Brownlee, MachineLearningMastery.com
# 	# configure to select a subset of features
# 	fs = SelectFromModel(RandomForestClassifier(n_estimators=1000), max_features=15)
# 	# learn relationship from training data
# 	fs.fit(X_train, y_train)
# 	# transform train input data
# 	X_train_fs = fs.transform(X_train)
# 	# transform test input data
# 	X_test_fs = fs.transform(X_test)
# 	return X_train_fs, X_test_fs, fs

In [None]:
# X_train_fs, X_test_fs, fs = select_features(X_train, Y_train, X_test)
# XR_train_fs, XR_test_fs, Rfs = select_features(XR_train, R_train, XR_test)
# model=XGBClassifier(importance_type = 'gain')

# model.fit(X_train_fs, Y_train)
# Y_hat = model.predict(X_test_fs)


# accuracy = accuracy_score(Y_test, Y_hat)
# print('Accuracy: %.2f' % (accuracy*100))


# Rmodel=XGBClassifier(importance_type = 'gain')
# Rmodel.fit(XR_train_fs, R_train)
# R_hat = Rmodel.predict(XR_test_fs)