In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Single seed value for the notebook.
seed = 42
# Seeds NumPy's global random number generator only; estimators that take
# their own random_state are not configured by this call.
np.random.seed(seed)


In [4]:
# Directory holding the three input CSV files, and the full path to each one.
BASE_PATH = 'data/ROSMAP/'
methy_path = os.path.join(BASE_PATH, 'methy.csv')
mirna_path = os.path.join(BASE_PATH, 'mirna.csv')
mrna_path = os.path.join(BASE_PATH, 'mrna.csv')

# Fail fast before any loading happens, and name every missing input so the
# reader does not have to guess which of the three files is absent.
# FileNotFoundError is still an Exception, so broad handlers keep working.
missing_paths = [
    path for path in (methy_path, mirna_path, mrna_path)
    if not os.path.exists(path)
]
if missing_paths:
    raise FileNotFoundError('File not found: ' + ', '.join(missing_paths))

In [5]:
# Read each input table, using the first CSV column as the row index.
methy_df, mirna_df, mrna_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (methy_path, mirna_path, mrna_path)
)
# Quick sanity check of the loaded table sizes.
print(methy_df.shape, mirna_df.shape, mrna_df.shape)

(351, 202) (351, 202) (351, 202)


In [7]:
# Pair the two tables row-by-row on their index.
# Without explicit keys, pd.merge joins on every column name the two frames
# share (both carry 'Label' and 'Split') and throws the index away, which
# produces a many-to-many match instead of one row per index entry.
# NOTE(review): assumes the row index identifies the sample in both files - confirm.
df_merged = pd.merge(
    methy_df,
    # 'Label' and 'Split' are kept once, from the left-hand table.
    mirna_df.drop(columns=['Label', 'Split']),
    how='inner',
    left_index=True,
    right_index=True,
    # Only applied to feature columns whose names collide between the tables.
    suffixes=('_methy', '_mirna'),
)

In [11]:
def get_train_test(df):
    """Split a table into train/test features and labels via its 'Split' column.

    Rows where ``Split == 1`` form the training set and rows where
    ``Split == 0`` form the test set; rows with any other value end up in
    neither.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a 'Split' column and a 'Label' column.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature frames have the
        'Split' and 'Label' columns removed; the label objects are the
        'Label' column of the matching rows.
    """
    in_train = df['Split'] == 1
    in_test = df['Split'] == 0
    non_feature_cols = ['Split', 'Label']

    X_tr = df.loc[in_train].drop(columns=non_feature_cols)
    X_te = df.loc[in_test].drop(columns=non_feature_cols)
    y_tr = df.loc[in_train, 'Label']
    y_te = df.loc[in_test, 'Label']
    return X_tr, X_te, y_tr, y_te

In [12]:
# Apply the same train/test split to each of the three tables.
methy_train_df, methy_test_df, methy_y_train, methy_y_test = get_train_test(methy_df)
mirna_train_df, mirna_test_df, mirna_y_train, mirna_y_test = get_train_test(mirna_df)
mrna_train_df, mrna_test_df, mrna_y_train, mrna_y_test = get_train_test(mrna_df)

# One output line per kind of object (train X, test X, train y, test y),
# each listing the three tables' shapes side by side.
shape_groups = (
    (methy_train_df, mirna_train_df, mrna_train_df),
    (methy_test_df, mirna_test_df, mrna_test_df),
    (methy_y_train, mirna_y_train, mrna_y_train),
    (methy_y_test, mirna_y_test, mrna_y_test),
)
for group in shape_groups:
    print(*(part.shape for part in group))


(245, 200) (245, 200) (245, 200)
(106, 200) (106, 200) (106, 200)
(245,) (245,) (245,)
(106,) (106,) (106,)


In [13]:
# Place the three feature tables side by side (column-wise concatenation).
train_blocks = [methy_train_df, mirna_train_df, mrna_train_df]
test_blocks = [methy_test_df, mirna_test_df, mrna_test_df]
train_combined_df = pd.concat(train_blocks, axis='columns')
test_combined_df = pd.concat(test_blocks, axis='columns')

print(train_combined_df.shape, test_combined_df.shape)

(245, 600) (106, 600)


In [17]:
# find correlated features (the caller removes them)
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    A column is flagged when the absolute value of its correlation with at
    least one column positioned before it in ``dataset`` is strictly greater
    than ``threshold``. The first column of a correlated pair is never flagged
    because of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table whose columns are compared via ``dataset.corr()``.
    threshold : float
        Absolute-correlation cut-off; the comparison is strict (``>``).

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    # Work on the whole matrix at once instead of one scalar lookup per pair.
    # NaN correlations compare as False, so they never flag a column.
    exceeds = corr_matrix.abs().to_numpy() > threshold
    # Keep only entries strictly below the diagonal: row i versus columns j < i.
    exceeds_earlier = np.tril(exceeds, k=-1)
    flagged_rows = exceeds_earlier.any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

# Flag redundant columns using the training table only.
corr_features = correlation(dataset=train_combined_df, threshold=0.80)
# correlation() already returns a set, so its length is the unique count.
print('correlated features: ', len(corr_features))

correlated features:  73


In [18]:
# Remove the flagged columns from both tables so they keep identical features.
dropped_columns = list(corr_features)
X_train = train_combined_df.drop(columns=dropped_columns)
X_test = test_combined_df.drop(columns=dropped_columns)

In [19]:
# Use the labels taken from the methy table as the target for the combined features.
# NOTE(review): presumes the mirna and mrna tables carry the same labels for the
# same rows - nothing here checks that; confirm against the data.
y_train, y_test = methy_y_train, methy_y_test


In [20]:
# Learn the scaling parameters from the training features only, then apply
# the same fitted transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [27]:
# Pass the notebook seed to the estimator explicitly: np.random.seed() only
# configures NumPy's global generator, not the classifier's own random_state.
model = XGBClassifier(random_state=seed)

# Hyper-parameter values explored by the grid search (3 x 3 x 3 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 7, 9],
    'learning_rate': [0.001, 0.005, 0.1]
}


In [28]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation on the training data.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
grid_result = grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [29]:
# Evaluate the best estimator found by the search on the held-out test rows.
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class report first, then the overall accuracy.
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_report)
print(test_accuracy)


              precision    recall  f1-score   support

         0.0       0.78      0.78      0.78        51
         1.0       0.80      0.80      0.80        55

    accuracy                           0.79       106
   macro avg       0.79      0.79      0.79       106
weighted avg       0.79      0.79      0.79       106

0.7924528301886793
