# Analyzing Liquid Biopsy, MRI, and Clinical Data
## Author: Shehbeel Arif

---

# Libraries used

In [1]:
import pandas as pd

# Preprocessing the datasets

In [117]:
mirna_df = pd.read_csv('miRNA expression raw data_plasma.csv', index_col='SDG_ID')
mirna_df = mirna_df.T
#mirna_df

clinical_df = pd.read_csv('Clinical Data miRNA Cohort.csv', index_col='SDG_ID')
clinical_df = clinical_df.drop(['Unnamed: 22'], axis=1)
#clinical_df

mri_df = pd.read_csv('MRI Imaging  results_2022-04-20.csv', index_col='SDG_ID')
mri_df = mri_df.drop(['Short_histology'], axis=1)
#mri_df

main_df = pd.concat([clinical_df, mri_df, mirna_df], axis=1, join='inner')
#main_df

In [118]:
labels = ['Short_histology']
clinical_features = ['WHO_Grade','Age_at_LBcollection','Age_at_ Initial_Diagnosis',
                     'PFS_from_InitialDiagnosis','OverallSurvival_Time_From_Initial_Tumor_Diagnosis','OverallSurvival_LBcollection']

mri_features = ['Imaging_timepoint_(age_in_days)','Total_Tumor_Volume (mm^3)', 'Enhancing_(mm^3)',
               'Non-enhancing_(mm^3)','Cystic_core_(mm^3)','Cystic_reactive_(mm^3)','AmountTumor_Enhancing',
               'AmountTumor_Nonenhancing','AmountTumor_CysticCore','AmountTumor_CysticCore']

mri_features_extra = ['Imaging_timepoint_(age_in_days)','Total_Tumor_Volume (mm^3)', 'Enhancing_(mm^3)',
               'Non-enhancing_(mm^3)','Cystic_core_(mm^3)','Cystic_reactive_(mm^3)','AmountTumor_Enhancing',
               'AmountTumor_Nonenhancing','AmountTumor_CysticCore','AmountTumor_CysticCore', 
               'Leptomeningeal_disease_(y/n)','Adjacent_surface_(y/n)','Adjacent_ventricular_system_(y/n)',
               'Edema_YesNo','Edema_(mm^3)']

mirna_features = mirna_df.columns.tolist()

---

---

# Clinical Data

In [4]:
main_df[labels+clinical_features]

Unnamed: 0,Short_histology,WHO_Grade,Age_at_LBcollection,Age_at_ Initial_Diagnosis,PFS_from_InitialDiagnosis,OverallSurvival_Time_From_Initial_Tumor_Diagnosis,OverallSurvival_LBcollection
15635-1,EP,3,6710,5717,385,1503,510
15635-100,LGG,1,4052,4052,775,789,789
15635-101,LGG,1,4905,2251,2654,3367,713
15635-108,GCT,2,6152,6152,755,755,755
15635-11,MB,4,479,479,528,528,528
15635-120,EP,3,1555,1182,373,890,517
15635-127,LGG,1,8046,7116,799,1698,768
15635-129,ATRT,4,2082,383,1699,2077,378
15635-132,MB,4,1794,1794,715,715,715
15635-134,LGG,Not Available,1544,1544,155,728,728


---

# MRI Data

In [5]:
main_df[labels+mri_features]

Unnamed: 0,Short_histology,Imaging_timepoint_(age_in_days),Total_Tumor_Volume (mm^3),Enhancing_(mm^3),Non-enhancing_(mm^3),Cystic_core_(mm^3),Cystic_reactive_(mm^3),AmountTumor_Enhancing,AmountTumor_Nonenhancing,AmountTumor_CysticCore,AmountTumor_CysticCore.1
15635-1,EP,6708,125178.0,86645.0,14437.0,24096.0,0,0.692174,0.115332,0.192494,0.192494
15635-100,LGG,4051,56379.0,17403.0,15767.0,23209.0,0,0.308679,0.279661,0.41166,0.41166
15635-101,LGG,4828,18831.0,1029.0,0.0,17802.0,0,0.054644,0.0,0.945356,0.945356
15635-108,GCT,6107,2234.0,2234.0,0.0,0.0,0,1.0,0.0,0.0,0.0
15635-11,MB,479,27321.0,11655.0,14943.0,723.0,0,0.426595,0.546942,0.026463,0.026463
15635-120,EP,1553,7781.0,4579.0,0.0,3202.0,0,0.588485,0.0,0.411515,0.411515
15635-127,LGG,8034,19038.05,12252.0,0.0,6785.840132,0,0.643564,0.0,0.356436,0.356436
15635-129,ATRT,2077,138.5,127.25,11.25,0.0,0,0.918773,0.081227,0.0,0.0
15635-132,MB,1791,19330.0,19310.0,0.0,20.0,0,0.998965,0.0,0.001035,0.001035
15635-134,LGG,1540,35120.0,29935.0,0.0,5185.0,0,0.852363,0.0,0.147637,0.147637


---

# miRNA Data

In [6]:
main_df[labels+mirna_features]

Unnamed: 0,Short_histology,let-7a-2-3p,let-7a-3p,let-7a-5p,let-7b-5p,let-7c-3p,let-7c-5p,let-7d-3p,let-7d-5p,let-7e-3p,...,miR-944,miR-95-3p,miR-95-5p,miR-9-5p,miR-96-3p,miR-96-5p,miR-98-3p,miR-99a-5p,miR-99b-3p,miR-99b-5p
15635-1,EP,2971,2870,12737,6497,2367,6231,3536,4604,2315,...,2930,2784,2490,2186,2508,2735,1211,2792,1886,2459
15635-100,LGG,353,26,26638,13114,34,10611,3812,9560,45,...,0,57,62,24,22,120,72,3586,73,919
15635-101,LGG,241,70,5121,12432,18,8041,1417,1167,35,...,39,62,92,39,27,126,33,1508,64,254
15635-108,GCT,171,143,17436,10308,117,7488,3380,3301,114,...,67,104,32,98,90,181,154,5172,52,1241
15635-11,MB,196,109,8368,7319,66,4502,3367,1833,110,...,43,143,160,78,71,86,51,4260,178,1011
15635-120,EP,107,33,22767,13290,24,9888,2394,7643,11,...,5,46,42,23,9,12,61,824,46,430
15635-127,LGG,201,63,9160,11201,44,8080,1394,2861,13,...,7,28,50,38,10,113,27,865,44,335
15635-129,ATRT,49,6,7132,16705,4,10590,2108,2283,10,...,3,24,9,14,6,331,11,1942,33,311
15635-132,MB,387,26,17234,18375,32,12720,2962,5858,38,...,3,71,28,16,0,367,40,1692,103,881
15635-134,LGG,345,93,19679,9958,54,8204,3225,6787,88,...,60,147,104,40,31,180,75,2621,154,1349


In [73]:
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# sh_classes = le.fit_transform(main_df['Short_histology'])
# main_df['class'] = sh_classes

# To reverse encoding
#le.inverse_transform([1,4,4,2,5])

In [75]:
from sklearn.model_selection import train_test_split

In [119]:
X = main_df[mirna_features].values # Feature matrix
y = main_df['Short_histology'].values.ravel() # target values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=0)

# Sanity Check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(38, 2083) (13, 2083) (38,) (13,)


In [120]:
main_df[mirna_features]

Unnamed: 0,let-7a-2-3p,let-7a-3p,let-7a-5p,let-7b-5p,let-7c-3p,let-7c-5p,let-7d-3p,let-7d-5p,let-7e-3p,let-7e-5p,...,miR-944,miR-95-3p,miR-95-5p,miR-9-5p,miR-96-3p,miR-96-5p,miR-98-3p,miR-99a-5p,miR-99b-3p,miR-99b-5p
15635-1,2971,2870,12737,6497,2367,6231,3536,4604,2315,2538,...,2930,2784,2490,2186,2508,2735,1211,2792,1886,2459
15635-100,353,26,26638,13114,34,10611,3812,9560,45,1149,...,0,57,62,24,22,120,72,3586,73,919
15635-101,241,70,5121,12432,18,8041,1417,1167,35,153,...,39,62,92,39,27,126,33,1508,64,254
15635-108,171,143,17436,10308,117,7488,3380,3301,114,1308,...,67,104,32,98,90,181,154,5172,52,1241
15635-11,196,109,8368,7319,66,4502,3367,1833,110,393,...,43,143,160,78,71,86,51,4260,178,1011
15635-120,107,33,22767,13290,24,9888,2394,7643,11,623,...,5,46,42,23,9,12,61,824,46,430
15635-127,201,63,9160,11201,44,8080,1394,2861,13,473,...,7,28,50,38,10,113,27,865,44,335
15635-129,49,6,7132,16705,4,10590,2108,2283,10,158,...,3,24,9,14,6,331,11,1942,33,311
15635-132,387,26,17234,18375,32,12720,2962,5858,38,911,...,3,71,28,16,0,367,40,1692,103,881
15635-134,345,93,19679,9958,54,8204,3225,6787,88,1309,...,60,147,104,40,31,180,75,2621,154,1349


In [77]:
y_test

array([3, 1, 0, 2, 4, 3, 5, 1, 5, 5, 4, 4, 5])

In [78]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic Regression (L1 and L2) and Lasso
from sklearn.linear_model import LogisticRegression, Lasso
# Random Forest 
from sklearn.ensemble import RandomForestClassifier
# Support Vector Machine
from sklearn.svm import SVC
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Artificial Neural Network (Multi-Layer Perceptron)
from sklearn.neural_network import MLPClassifier

In [89]:
# pipelines = {
#     'l1-lr':make_pipeline(LogisticRegression(penalty='l1',solver='liblinear')), # solver='saga' or 'liblinear'
#     'l2-lr':make_pipeline(LogisticRegression(penalty='l2')),
#     'lasso':make_pipeline(Lasso()),
#     'rf':make_pipeline(RandomForestClassifier()),
#     'svm':make_pipeline(SVC()),
#     'gnb':make_pipeline(GaussianNB()),
#     'ann':make_pipeline(MLPClassifier()),    
# }

In [123]:
pipelines = {
    #'l1-lr':make_pipeline(StandardScaler(), LogisticRegression(penalty='l1',solver='liblinear',max_iter=250)), # solver='saga' or 'liblinear'
    #'l2-lr':make_pipeline(StandardScaler(), LogisticRegression(penalty='l2',max_iter=250)),
    #'lasso':make_pipeline(StandardScaler(), Lasso()),
    'rf':make_pipeline(StandardScaler(), RandomForestClassifier()),
    'svm':make_pipeline(StandardScaler(), SVC()),
    'gnb':make_pipeline(StandardScaler(), GaussianNB()),
    'ann':make_pipeline(StandardScaler(), MLPClassifier()),    
}

In [124]:
fit_models = {}
for algo, pipeline in pipelines.items():
    model = pipeline.fit(X_train, y_train)
    fit_models[algo] = model

In [125]:
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    print(algo, accuracy_score(y_test, yhat))

rf 0.38461538461538464
svm 0.23076923076923078
gnb 0.15384615384615385
ann 0.38461538461538464


In [34]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import plotly.express as px

In [127]:
# Initialize random forest classifier
r_rf = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
r_rf.fit(X_train, y_train)

# Make predictions using random forest classifier
r_rf_y_pred = r_rf.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, r_rf_y_pred)}')

Accuracy: 0.3076923076923077


In [116]:
# Calculate a confusion matrix
r_cm = confusion_matrix(y_test, r_rf_y_pred, labels=r_rf.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(r_cm, text_auto=True,
                labels=dict(x="Predicted Relapse", y="True Relapse", color="Productivity"),
                x=main_df['Short_histology'].unique().tolist(),
                y=main_df['Short_histology'].unique().tolist()
                )
disp.show()

In [None]:
'miR-16-5p', 'miR-3197', 'miR-451a', 'miR-4745-3p', 'miR-6126', 'miR-6870-3p'

In [134]:
chao_miRNA_df = main_df[['miR-16-5p', 'miR-3197', 'miR-451a', 'miR-4745-3p', 'miR-6126', 'miR-6870-3p']]

X = chao_miRNA_df.values # Feature matrix
y = main_df['Short_histology'].values.ravel() # target values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=0)

# Sanity Check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(38, 6) (13, 6) (38,) (13,)


In [136]:
# Initialize random forest classifier
rfc = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions using random forest classifier
y_pred = rfc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 0.23076923076923078


---

# All Data