In [None]:
# Notebook written by Shamima Rashid, Nanyang Technological University, Singapore. June 2022.
# XGBoost portions were adapted from: machinelearningmastery.com (Jason Brownlee). 
#Version Information:
#xgboost 1.5.1
#scipy 1.8.1

In [None]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from xgboost import XGBClassifier
import scipy

In [None]:
# Input file: the cleaned data table (19911 records per the original note).
# Original author's reminder: update with B-factor and try.
fn1 = 'Sites_final_Influenza.csv'

In [None]:
### Column subsets read from the data file.
### Per the original notes, non-informative columns (e.g. all-zero or highly
### correlated ones) identified with the pandas profile report were removed.
### 'Target' is the last entry of each list; the other entries are the features.
fields_1 = [  # 16 features + Target
    'Hydropathy', 'SS', 'RSA', 'PHI', 'PSI',
    'main_chain_rel', 'all_polar_rel',
    'CA_Up', 'CA_down', 'CA_PCB_Angle',
    'CA_Up.1', 'CA_down.1', 'CA_Count_r12',
    'Residue_Depth', 'CA_Depth', 'B_Norm',
    'Target',
]

# fields_2 is fields_1 (same order) without these five columns: 11 features + Target.
fields_2 = [
    column for column in fields_1
    if column not in ('all_polar_rel', 'CA_Up.1', 'CA_down.1', 'CA_Count_r12', 'CA_Depth')
]
# fields_3 = ['Hydropathy', 'SS', 'RSA', 'PHI', 'PSI', 'main_chain_rel', 'CA_Up', 'CA_down', 'CA_PCB_Angle', 'Residue_Depth','B_Norm']
# fields_4 = ['Hydropathy','SS','RSA','PHI','PSI','all_atoms_abs','all_atoms_rel','side_chain_abs',
#             'side_chain_rel','main_chain_abs','main_chain_rel',	'non_polar_abs','non_polar_rel',
#             'all_polar_abs', 'all_polar_rel', 'CA_Up', 'CA_down', 'CA_PCB_Angle', 'CA_Up.1', 
#             'CA_down.1', 'CA_Count_r12','Residue_Depth','CA_Depth','B_Norm', 'Target']

In [None]:
# Load only the columns listed in fields_1, then shuffle the rows.
# The shuffle is seeded: without a fixed random_state the row order changes on every
# run, which would make the seeded train_test_split later in the notebook select
# different rows each time.
df = pd.read_csv(fn1, usecols=fields_1)
df = df.sample(frac=1, random_state=1)

In [None]:
# Lookup tables used to turn the string-valued columns into integer codes.
Subtype = dict(H1N1=0, H3N2=1)   # label in the 'Target' column -> class id
SS_Num = dict(H=1, E=2, C=3)     # code in the 'SS' column -> integer


### Feature Distribution and Importance

In [None]:
##Descriptive Statistics
#df.describe()
#df.head()

In [None]:
# Replace the string labels with their integer codes. An unknown label raises
# KeyError, so this cell cannot be run twice on the same frame.
target_codes = df['Target'].map(lambda label: Subtype[label])
ss_codes = df['SS'].map(lambda code: SS_Num[code])
df['Target'] = target_codes
df['SS'] = ss_codes

In [None]:
## Count plot of the encoded Target column (true labels)
sns.set_style('darkgrid')
target_ax = sns.countplot(data=df, x='Target', color='navy')
p = target_ax.set(title='Actual Subtypes of Influenza', ylabel='No. of Residues', xlabel='')


In [None]:
# Row counts for each encoded value of the 'SS' column.
sns.countplot(data=df, x='SS')

In [None]:
# Display labels for feature columns; the $...$ parts are math-text markup.
# Raw strings are required: in an ordinary string literal "\a" and "\b" are the BEL
# and backspace control characters, so "\alpha" / "\beta" would be silently corrupted.
# Columns missing from this dict have no custom label.
pretty = {'PHI': r'$\phi$', 'PSI': r'$\psi$', 'main_chain_rel': "Main Chain RSA",
          'all_polar_rel': "All Polar RSA", 'CA_Up': r'C$\alpha$ Up', 'CA_down': r'C$\alpha$ Down',
          'CA_PCB_Angle': r'C$\alpha$ PC$\beta$ Angle', 'CA_Up.1': r'C$\alpha$ Up.1',
          'CA_down.1': r'C$\alpha$ Down.1', 'CA_Count_r12': r'C$\alpha$ Count r12',
          'Residue_Depth': "Residue Depth", 'CA_Depth': r'C$\alpha$ Depth', 'B_Norm': "B Norm"}

In [None]:
## Per-feature distributions, drawn separately for the two Target classes
# Boolean row mask: True where the encoded Target is non-zero, False where it is 0.
idx_train = df['Target'].astype("bool").values

# Set the style before the axes are created so this cell does not depend on an
# earlier cell having already selected it.
sns.set_style('darkgrid')
fig, axes = plt.subplots(4,4,figsize=(20,15))
axes = axes.ravel()  # flatten the 4x4 grid so panels can be addressed by one index

# Axis labels for the panels. Raw strings keep the backslashes intact
# (in a normal literal "\a" and "\b" are control characters).
pretty = {'PHI': r'$\phi$', 'PSI': r'$\psi$', 'main_chain_rel': "Main Chain RSA",
          'all_polar_rel': "All Polar RSA", 'CA_Up': r'C$\alpha$ Up', 'CA_down': r'C$\alpha$ Down',
          'CA_PCB_Angle': r'C$\alpha$ PC$\beta$ Angle', 'CA_Up.1': r'C$\alpha$ Up.1',
          'CA_down.1': r'C$\alpha$ Down.1', 'CA_Count_r12': r'C$\alpha$ Count r12',
          'Residue_Depth': "Residue Depth", 'CA_Depth': r'C$\alpha$ Depth', 'B_Norm': "B Norm"}

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here so the figure is unchanged -- confirm against the installed seaborn version.
for i,name in enumerate(fields_1[:-1]): # every column of fields_1 except the last ('Target')
    value = df[name]
    sns.distplot(value[~idx_train],ax = axes[i], color='red')    # rows with Target == 0
    sns.distplot(value[idx_train],ax = axes[i], color = 'blue')  # rows with Target != 0
    if name in pretty:
        axes[i].set_xlabel(pretty[name],fontsize=12)

    if name == 'SS':
        # Show the original letter codes instead of their integer encoding.
        axes[i].set_xticks([1, 2, 3])
        axes[i].set_xticklabels(['H', 'E','C'])

# Figure-level decorations are applied once, after all panels are drawn; doing this
# inside the loop stacked one extra legend on the figure per panel.
fig.suptitle('Distribution of 16 Structural Properties for Influenza', fontsize = 22, fontweight='bold')
fig.legend(labels = ["H1N1","H3N2"],loc="upper right",fontsize=12)
fig.tight_layout()

In [None]:
#fig.savefig('Influenza_FeaturesDistPlot16.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
# Feature matrix and label vector as NumPy arrays.
# F_names is fields_2 without its last entry ('Target').
F_names = fields_2[:-1]

X = df[F_names].to_numpy()
Y = df['Target'].to_numpy()

for tag, array in (("X:", X), ("Y:", Y)):
    print(tag, array.shape)

In [None]:
# Random 0/1 labels, one per entry of Y.
# The NumPy global seed is fixed so the same labels are drawn on every run.
np.random.seed(1)
rand_list = np.random.random(size=(len(Y)))
# Draws below 0.5 become 0, all others become 1; R is a plain list of Python ints.
R = np.where(rand_list < 0.5, 0, 1).tolist()

In [None]:
# Subtype names for the random labels; used only for counting/plotting them.
Subtype_r = {0:"H1N1", 1:"H3N2"}
List = [Subtype_r[code] for code in R]

In [None]:
## Count plot of the randomly assigned labels
sns.set_style('darkgrid')
random_ax = sns.countplot(x=List, color='navy')
p = random_ax.set(title='Random Subtypes of Influenza', ylabel='No. of Residues', xlabel='')

In [None]:
## XGBoost feature importance: fit on all rows with the true labels
model = XGBClassifier(
    importance_type='gain',
    eval_metric='error',
    use_label_encoder=False,
)
model.fit(X, Y)

In [None]:
# Importance scores of the model currently bound to `model`, one per column of X.
importance = model.feature_importances_
for feature_idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (feature_idx, score))

In [None]:
# Refit a fresh classifier on the same features, this time with the random labels R.
# Note that this rebinds `model`.
model = XGBClassifier(
    importance_type='gain',
    eval_metric='error',
    use_label_encoder=False,
)
model.fit(X, R)

In [None]:
# Importance scores of the model currently bound to `model`.
importanceR = model.feature_importances_
for feature_idx, score in enumerate(importanceR):
    print('Feature: %0d, Score: %.5f' % (feature_idx, score))

# Overlay both sets of scores as semi-transparent horizontal bars on one axis.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
for scores, bar_color in ((importance, 'red'), (importanceR, 'blue')):
    ax.barh(F_names[:len(scores)], scores, alpha=0.4, color=bar_color)
ax.set_title('XGBoost Feature Gains for HA Type Detection', fontsize=14, fontweight='bold')
ax.legend(labels=["True Type", "Random Type"], loc="center right",
          bbox_to_anchor=(1.4, 0.5), fontsize=12)
plt.show()


In [None]:
#fig.savefig('HAGains_11Features.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
# Test protocol.
#Prepare for actual Target and Randomly assigned H1N1 & H3N2 Labels

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# 75/25 hold-out split, once paired with the true labels (Y) and once with the
# random labels (R). Both calls share the same seed.
split_options = {'test_size': 0.25, 'random_state': 1}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)
XR_train, XR_test, R_train, R_test = train_test_split(X, R, **split_options)

In [None]:
# Fresh, unfitted classifier for the hold-out evaluation on the true labels.
model = XGBClassifier(
    importance_type='gain',
    eval_metric='error',
    use_label_encoder=False,
)


In [None]:
# Train on the training portion, then predict labels for the held-out rows.
Y_hat = model.fit(X_train, Y_train).predict(X_test)

In [None]:
# Accuracy of Y_hat against the held-out true labels, reported as a percentage.
accuracy = accuracy_score(Y_test, Y_hat)
accuracy_pct = accuracy * 100
print('Accuracy: %.2f' % accuracy_pct)

In [None]:
# Baseline: train a separate classifier on the randomly assigned labels and predict
# the held-out rows with that same classifier.
Rmodel = XGBClassifier(importance_type='gain', eval_metric='error', use_label_encoder=False)
Rmodel.fit(XR_train, R_train)
# Predict with Rmodel, not with `model`: `model` was trained on the true labels, so
# using it here would leave the random-label classifier trained but never evaluated.
R_hat = Rmodel.predict(XR_test)

In [None]:
# Accuracy of R_hat against the held-out random labels, reported as a percentage.
accuracy = accuracy_score(R_test, R_hat)
accuracy_pct = accuracy * 100
print('Accuracy: %.2f' % accuracy_pct)

In [None]:
#### Average of n runs
# Repeats the 75/25 hold-out evaluation n times, both for the true labels (Y) and
# for the random labels (R), and records the test accuracy of every round.

n = 100
ActualLabel = []  # per-round accuracy (as a fraction) for the true labels
RandomLabel = []  # per-round accuracy (as a fraction) for the random labels


for i in range(n):
    # Seed each round with its index: every round gets a different partition, but
    # re-running the notebook reproduces the same sequence of partitions.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=i)
    XR_train, XR_test, R_train, R_test = train_test_split(X, R, test_size=0.25, random_state=i)

    # Classifier trained and evaluated on the true labels.
    model = XGBClassifier(importance_type='gain', eval_metric='error', use_label_encoder=False)
    model.fit(X_train, Y_train)
    Y_hat = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_hat)
    ActualLabel.append(accuracy)
    print('Round %0d :'% (i))
    print('True Label Accuracy: %.2f' % (accuracy*100))

    # Baseline classifier trained and evaluated on the random labels.
    Rmodel = XGBClassifier(importance_type='gain', eval_metric='error', use_label_encoder=False)
    Rmodel.fit(XR_train, R_train)
    # The prediction must come from Rmodel; predicting with `model` would score the
    # true-label classifier against random labels and ignore the baseline model.
    R_hat = Rmodel.predict(XR_test)
    accuracy = accuracy_score(R_test, R_hat)
    RandomLabel.append(accuracy)
    print('Random Label Accuracy: %.2f' % (accuracy*100))
    


In [None]:
# Per-round accuracies as NumPy arrays (true labels, random labels).
A1, R1 = (np.asarray(scores) for scores in (ActualLabel, RandomLabel))

In [None]:
# Mean and standard deviation of the true-label accuracies over all rounds.
# A1_mean and A1_std stay as fractions; both are scaled to percent when printed so
# the two numbers on the line share the same unit.
A1_mean = np.mean(A1)
A1_std = np.std(A1)
print('True mean: %.2f; True s.t.d: %.3f' %(A1_mean*100, A1_std*100))

In [None]:
# Mean and standard deviation of the random-label accuracies over all rounds.
# R1_mean and R1_std stay as fractions; both are scaled to percent when printed so
# the two numbers on the line share the same unit.
R1_mean = np.mean(R1)
R1_std = np.std(R1)
print('Random mean: %.2f; Random s.t.d: %.3f' %(R1_mean*100, R1_std*100))