## Predictive Analysis Module
This module creates looks at the top features from the human trauma list and looks into determining a logistic prediction the ALZ dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
ALZ_plasma_p = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/ALZ_plasma_processed.csv", index_col=0).transpose()
ALZ_csf_p = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/ALZ_csf_processed.csv", index_col=0).transpose()
trauma_human_p = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/trauma_human_processed.csv", index_col=0).transpose()

sort_stress_top_all_plasma_cor = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_all_plasma_cor.csv", index_col=0)
sort_stress_top_top_plasma_cor =  pd.read_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_top_plasma_cor.csv", index_col=0)
sort_stress_top_all_csf_cor =  pd.read_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_all_csf_cor.csv", index_col=0)
sort_stress_top_top_csf_cor =  pd.read_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_top_csf_cor.csv", index_col=0)

In [4]:
def get_test_train(dataframe):
  """
  Split data into test train set
  """
  train,test = train_test_split(dataframe, test_size=0.4)
  y_train_headers = train.index
  Y_train_num = []
  for i in y_train_headers:
    if i != None and "AD" in i:
      Y_train_num.append(1)
    else:
      Y_train_num.append(0)

  y_test_headers = test.index
  Y_test_num = []
  for i in y_test_headers:
    if i != None and "AD" in i:
      Y_test_num.append(1)
    else:
      Y_test_num.append(0)
  return train, test, Y_train_num, Y_test_num

In [5]:
def get_preds(X_train, X_test, Y_train, Y_test, metab):
  """
  This function will create the logistic regression predictive model and
  get the metrics. It will also create a confusion matrix
  """
  train_set = np.array(X_train[metab]).reshape(-1,1)
  test_set = np.array(X_test[metab]).reshape(-1,1)
  LogModel = LogisticRegression().fit(train_set, Y_train)
  y_pred = LogModel.predict(test_set)

  f1_ex = f1_score(Y_test, y_pred)
  accuracy = accuracy_score(Y_test, y_pred)
  prec_ex = precision_score(Y_test, y_pred)
  recall_ex = recall_score(Y_test, y_pred)

  cm = confusion_matrix(Y_test, y_pred)
  matrix_plot = ConfusionMatrixDisplay(confusion_matrix=cm)
  return f1_ex, accuracy, prec_ex, recall_ex, matrix_plot

In [6]:
plasma_trainX, plasma_testX, plasma_trainY, plasma_testY = get_test_train(ALZ_plasma_p)
csf_trainX, csf_testX, csf_trainY, csf_testY = get_test_train(ALZ_csf_p)


In [7]:
sort_stress_top_top_plasma_cor.index[0]

'Propionylglycinemethylester'

In [8]:
f1, acc, prec, recall, matrix = get_preds(plasma_trainX, plasma_testX, plasma_trainY, plasma_testY, sort_stress_top_all_plasma_cor.index[0])

In [9]:
stress_plasma1 = pd.DataFrame()
matrix_list = []
for metabolite in range(len(sort_stress_top_all_plasma_cor)):
  f1, acc, prec, recall, matrix = get_preds(plasma_trainX, plasma_testX, plasma_trainY, plasma_testY, sort_stress_top_all_plasma_cor.index[metabolite])
  metric_list = [f1, acc, prec, recall]
  stress_plasma1[sort_stress_top_all_plasma_cor.index[metabolite]] = metric_list
  matrix_list.append(matrix)

stress_plasma1.index = ['f1', 'acc', 'prec', 'recall']
stress_plasma1 = stress_plasma1.transpose().sort_values(by=['acc'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
stress_plasma2 = pd.DataFrame()
matrix_list = []
for metabolite in range(len(sort_stress_top_top_plasma_cor)):
  f1, acc, prec, recall, matrix = get_preds(plasma_trainX, plasma_testX, plasma_trainY, plasma_testY, sort_stress_top_top_plasma_cor.index[metabolite])
  metric_list = [f1, acc, prec, recall]
  stress_plasma2[sort_stress_top_top_plasma_cor.index[metabolite]] = metric_list
  matrix_list.append(matrix)

stress_plasma2.index = ['f1', 'acc', 'prec', 'recall']
stress_plasma2 = stress_plasma2.transpose().sort_values(by=['acc'])

In [11]:
stress_csf1 = pd.DataFrame()
matrix_list = []
for metabolite in range(len(sort_stress_top_all_csf_cor)):
  f1, acc, prec, recall, matrix = get_preds(csf_trainX, csf_testX, csf_trainY, csf_testY, sort_stress_top_all_csf_cor.index[metabolite])
  metric_list = [f1, acc, prec, recall]
  stress_csf1[sort_stress_top_all_csf_cor.index[metabolite]] = metric_list
  matrix_list.append(matrix)

stress_csf1.index = ['f1', 'acc', 'prec', 'recall']
stress_csf1 = stress_csf1.transpose().sort_values(by=['acc'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
stress_csf2 = pd.DataFrame()
matrix_list = []
for metabolite in range(len(sort_stress_top_top_csf_cor)):
  f1, acc, prec, recall, matrix = get_preds(csf_trainX, csf_testX, csf_trainY, csf_testY, sort_stress_top_top_csf_cor.index[metabolite])
  metric_list = [f1, acc, prec, recall]
  stress_csf2[sort_stress_top_top_csf_cor.index[metabolite]] = metric_list
  matrix_list.append(matrix)

stress_csf2.index = ['f1', 'acc', 'prec', 'recall']
stress_csf2 = stress_csf2.transpose().sort_values(by=['acc'])

In [13]:
stress_plasma1.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/stress_top_plasma_all_pred.csv")
stress_plasma2.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/stress_top_plasma_top_pred.csv")
stress_csf1.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/stress_top_csf_all_pred.csv")
stress_csf2.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/stress_top_csf_top_pred.csv")

In [14]:
#for i in matrix_list:
#  i.plot()

In [15]:
stress_plasma1

Unnamed: 0,f1,acc,prec,recall
Dimethylglycine+6.2130203,0.2,0.555556,0.25,0.166667
Dimethylglycine+4.069634,0.0,0.555556,0.0,0.0
5-Methylthioribose,0.0,0.555556,0.0,0.0
Dimethylglycine + 2.050125,0.363636,0.611111,0.4,0.333333
Dimethylglycine + 6.4713974,0.0,0.611111,0.0,0.0
Propionylglycine methyl ester,0.222222,0.611111,0.333333,0.166667
Propionylglycinemethylester,0.222222,0.611111,0.333333,0.166667
Capryloylglycine,0.0,0.611111,0.0,0.0
Propionylglycinemethylester+3.163646,0.222222,0.611111,0.333333,0.166667
Propionylglycine methyl ester + 3.163646,0.363636,0.611111,0.4,0.333333
