<a href="https://colab.research.google.com/github/squinton-gcu/Data-Science/blob/main/Correlation_Module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Correlation Module
This module will import the processed csv files and the lists of selected features. It will determine if there is any similarities in the files. It then runs a correlation analysis of the similar features between stress and ALZ. It calculates pvalues, standard error, RMSE, and spearman correlation coefficient.
It outputs into files.

In [49]:
import numpy as np
import pandas as pd
import statsmodels.api as sm 
from scipy import stats
from sklearn.model_selection import train_test_split
from statsmodels.tools.eval_measures import rmse

In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
# read in files
ALZ_plasma_feat = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/ALZ_plasma_selected_features.csv", index_col=0)
ALZ_csf_feat = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/ALZ_csf_selected_features.csv", index_col=0)
trauma_human_feat = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/trauma_human_selected_features.csv", index_col=0)

ALZ_plasma_p = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/ALZ_plasma_processed.csv", index_col=0).transpose()
ALZ_csf_p = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/ALZ_csf_processed.csv", index_col=0).transpose()
trauma_human_p = pd.read_csv("/content/drive/MyDrive/Thesis/Processed/trauma_human_processed.csv", index_col=0).transpose()

In [52]:
ALZ_plasma_feat.columns

Index(['C49 H30 Cl N O', '11-deoxy-PGE2',
       '1-Hexadecyl-2-arachidonoyl-glycerol', 'C31 H59 Cl N6 O2',
       'C51 H36 Cl N', '8-Amino-7-oxononanoate', 'BiliverdinIX+1.4045436',
       'C23 H5 N O', 'C4 H8 N2 O4 S', 'Propionylglycinemethylester',
       '2-Pyrrolidinone + 2.8319786', 'C32 H30 N2 S3',
       '5-Acetyl-4-methylthiazole', 'C42 H83 N O7 S', 'C40 H34 Cl N3 O2',
       'C38 H58 N6 O4', 'ValSerLys', 'C31 H43 N3 O8',
       '2-Pyrrolidinone+2.8319786', 'C11 H20 N2 O3 S', 'C8 H12 N2 O3 S3',
       'C12 H25 N7 O2', 'GPEtn(16:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))', 'PROPOXUR',
       '1-octadecanoyl-rac-glycerol', '2-Pyrrolidinone + 2.0697286',
       'Terephthalicacid', 'ValSerLys+1.0849711', 'C18 H28 O3',
       'C31 H34 N8 O4 S', 'Uricacid', 'Terephthalic acid',
       'C4 H11 N O3 + 6.5848055', 'C20 H31 N5 O', 'Uric acid',
       'Biliverdin IX + 1.4045436', 'C17 H32 Cl N3 O', 'C21 H3 N3 O3',
       'C31 H48 S5', 'C28 H31 N5 O10 S'],
      dtype='object')

In [53]:
trauma_human_feat.columns

Index(['hippuric acid', 'gluconic acid lactone', 'glyceric acid', 'mannose',
       'serine', 'ribonic acid', '2-hydroxybutanoic acid', 'glycine',
       'taurine', 'n-acetylglutamate', 'alanine', 'squalene', 'aspartic acid',
       'glycolic acid', 'levoglucosan', 'proline', 'tryptophan',
       'arachidic acid', 'aminomalonate', 'glucose', 'UDP-glucuronic acid',
       'hexuronic acid', 'citric acid', 'hexadecane', 'methionine sulfoxide',
       'benzylalcohol', 'xylitol', 'ribose', 'mannitol',
       'N-acetylglycine NIST', 'palmitoleic acid', 'hexitol'],
      dtype='object')

In [54]:
ALZ_csf_feat.columns

Index(['C24 H29 N O3', 'C8 H12 O3 S', 'C36 H43 N5 O2', 'C35 H61 N17',
       'C33 H24 Cl N S', 'C13 H20 N6 O', 'Bis (2-hydroxypropyl) amine',
       'C37 H40 N2 O3 S2', 'Ne-Methyl-L-lysine', 'Bis(2-hydroxypropyl)amine',
       'C11 H25 N O5', '1a,1b-dihomo-PGJ2', 'C38 H42 N2 S5',
       'C12 H24 Cl2 N6 O', '4-Aminophenyl1-thio-beta-D-glucuronide+2.5971124',
       'C39 H53 N3 O3 + 4.4767947', 'C4 H2 O8 S2', 'Isoniazid', 'C10 H21 N O',
       'C27 H50 O S5', 'C7 H5 N O3 S2', 'N_N-Didemethylchlorpromazine',
       'C31 H65 N5 O2 S2', 'D-Homoserine',
       '12beta-Hydroxy-3-oxo-5beta-cholan-24-oic Acid + 2.8745131',
       'C19 H38 N2 O3', 'C14 H32 N2 S', 'C10 H20 N6 O',
       'beta-vinylacrylicacid', '8-Amino Caprylic acid', 'C28 H5 N5 O16'],
      dtype='object')

In [70]:
stress_topVS_all_plasma = []
for val in trauma_human_feat.columns:
  for col in ALZ_plasma_p.columns:
    if val in col:
      stress_topVS_all_plasma.append(col)

In [71]:
stress_topVS_all_csf = []
for val in trauma_human_feat.columns:
  for col in ALZ_csf_p.columns:
    if val in col:
      stress_topVS_all_csf.append(col)

In [72]:
stress_topVS_plasma_top = []
for val in trauma_human_feat.columns:
  for col in ALZ_plasma_feat.columns:
    if val in col:
      stress_topVS_plasma_top.append(col)

In [73]:
stress_topVS_csf_top = []
for val in trauma_human_feat.columns:
  for col in ALZ_csf_feat.columns:
    if val in col:
      stress_topVS_csf_top.append(col)

In [74]:
for val in ALZ_csf_feat.columns:
  for col in ALZ_plasma_feat.columns:
    if val in col:
      print(col)

In [75]:
def calculate_corr(x_array, y_array, metabolite):
  """
  calculates the spearman correlation for the given metabolite. The x_array is a
  dataframe that contains all metabolites. The y_array is the binary variables.
  """
  cor_matrix_initial = pd.DataFrame({metabolite: x_array[metabolite], 'ALZ': y_array})
  corr_matrix = cor_matrix_initial.corr(method='spearman')
  correlation_val = corr_matrix[metabolite][1]
  return correlation_val

In [76]:
def calculate_log_values(X_train, X_test, Y_train, Y_test, Metabolite):
  """
  This function will take the test and train datasets with the metabolites
  to help run the correlation analysis. The output variables include pvalue
  standard error, RMSE value.
  """
  X_train1 = X_train[Metabolite]
  X_train1 = sm.add_constant(X_train1)
  logreg1 = sm.Logit(Y_train, X_train1).fit()
  pvalue_train = logreg1.pvalues[1]
  stand_error_train = logreg1.bse[1] #standard error

  X_test1 = X_test[Metabolite]
  X_test1 = sm.add_constant(X_test1)
  logreq1_test = sm.Logit(Y_test, X_test1).fit()
  ypred = logreg1.predict(X_test1)
  rmse2 = rmse(Y_test, ypred)

  return pvalue_train, stand_error_train, rmse2

In [77]:
def correlation_main(dataframe, metabList):
  """
  This function will take the two above functions and complie it into a dataframe.
  The first part creates the test and train dataset from the processed csv file.
  It then will calculate the logistic values for the similar metabolites.
  The output is a dataframe.
  """
  # split data
  train,test = train_test_split(dataframe, test_size=0.4)
  y_train_headers = train.index
  Y_train_num = []
  for i in y_train_headers:
    if i != None and "AD" in i:
      Y_train_num.append(1)
    else:
      Y_train_num.append(0)

  y_test_headers = test.index
  Y_test_num = []
  for i in y_test_headers:
    if i != None and "AD" in i:
      Y_test_num.append(1)
    else:
      Y_test_num.append(0)

  #calculate logistic values
  output_frame = pd.DataFrame()
  for metab in metabList:
    pvalues, standard_error, RMSE_val = calculate_log_values(train, test, Y_train_num, Y_test_num, metab)
    corr_value = calculate_corr(train, Y_train_num, metab)
    logistic_list = [pvalues, standard_error, RMSE_val, corr_value]
    output_frame[metab] = logistic_list

  output_frame.index = ['pvalues', 'standard error', 'RMSE', "Spearman Corr"]

  return output_frame
  

In [83]:
stress_top_all_plasma_cor = correlation_main(ALZ_plasma_p, stress_topVS_all_plasma)
stress_top_top_plasma_cor = correlation_main(ALZ_plasma_p, stress_topVS_plasma_top)
stress_top_all_csf_cor = correlation_main(ALZ_csf_p, stress_topVS_all_csf)
stress_top_top_csf_cor = correlation_main(ALZ_csf_p, stress_topVS_csf_top)


  x = pd.concat(x[::order], 1)


Optimization terminated successfully.
         Current function value: 0.488429
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.661647
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.528371
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.683868
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.527960
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692861
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.496897
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.688621
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.516459
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.693121
  

In [84]:
sort_stress_top_all_plasma_cor = stress_top_all_plasma_cor.transpose().sort_values(by=['pvalues'])
sort_stress_top_top_plasma_cor = stress_top_top_plasma_cor.transpose().sort_values(by=['pvalues'])
sort_stress_top_all_csf_cor = stress_top_all_csf_cor.transpose().sort_values(by=['pvalues'])
sort_stress_top_top_csf_cor = stress_top_top_csf_cor.transpose().sort_values(by=['pvalues'])



In [85]:
sort_stress_top_top_csf_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr
D-Homoserine,0.161805,1.015967,0.443387,-0.222585


In [86]:
sort_stress_top_all_csf_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr
D-4-Hydroxyphenylglycine,0.067118,1.121776,0.491486,-0.281169
2-Pyridylacetylglycine,0.185319,0.577187,0.523949,0.291582
Phenylalanine,0.196756,0.433206,0.494086,-0.124964
D-Homoserine,0.208769,0.724259,0.465215,-0.11455
2-Methylbutyrylglycine,0.272908,0.762014,0.501235,0.281169
"3-Hydroxy-N-glycyl-2,6-xylidine (3-Hydroxyglycinexylidide)",0.282913,2.025842,0.467278,0.218687
3-Hydroxy-N-glycyl-2_6-xylidine(3-Hydroxyglycinexylidide),0.352415,0.484365,0.47513,0.2291
1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycero-3-phosphoserine+5.069642,0.382561,0.449541,0.485896,0.20416
Methyl o-methoxyhippuric acid,0.459211,0.454397,0.502297,0.078523
1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycero-3-phosphoserine + 5.069642,0.472783,0.438735,0.517671,-0.129054


In [87]:
sort_stress_top_top_plasma_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr
Propionylglycinemethylester,0.098036,0.4608,0.401865,0.324948


In [88]:
sort_stress_top_all_plasma_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr
Dimethylglycine + 2.050125,0.061886,0.425497,0.618412,-0.172092
2-Methylbutyrylglycine+6.5558777,0.066077,0.729579,0.54495,0.343132
2-Methylbutyrylglycine + 6.5558777,0.074588,0.732702,0.548629,0.308819
Propionylglycinemethylester+6.2598743,0.089289,0.751539,0.538589,0.331694
Propionylglycine methyl ester + 6.2598743,0.101877,0.810036,0.543336,0.297381
L-4-Hydroxy-3-methoxy-a-methylphenylalanine,0.116468,0.584686,0.591729,-0.274547
Dimethylglycine+2.050125,0.131292,0.922481,0.666703,-0.320305
Homoserine,0.16952,0.781288,0.609948,-0.274505
2-Methylbutyrylglycine + 2.0774822,0.174667,0.593103,0.592118,-0.217317
Dimethylglycine+2.8044205,0.190497,1.660355,0.525632,0.194441


In [89]:
#save values
sort_stress_top_all_plasma_cor.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_all_plasma_cor.csv")
sort_stress_top_top_plasma_cor.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_top_plasma_cor.csv")
sort_stress_top_all_csf_cor.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_all_csf_cor.csv")
sort_stress_top_top_csf_cor.to_csv("/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_top_csf_cor.csv")

## References

Difference Between T-test and Linear Regression—Ask Any Difference. (2022, January 22). https://askanydifference.com/difference-between-t-test-and-linear-regression/

Harris, C. R., Millman, K. J., van der Walt, S. J., Gommers, R., Virtanen, P., Cournapeau, D., Wieser, E., Taylor, J., Berg, S., Smith, N. J., Kern, R., Picus, M., Hoyer, S., van Kerkwijk, M. H., Brett, M., Haldane, A., del Río, J. F., Wiebe, M., Peterson, P., … Oliphant, T. E. (2020). Array programming with NumPy. Nature, 585(7825), 357–362. https://doi.org/10.1038/s41586-020-2649-2

Larose, C., & Larose, D. (2019). Data Science Using Python and R. John Wiley & Sons, Inc.

Mukaka, M. (2012). A guide to appropriate use of Correlation coefficient in medical research. Malawi Medical Journal : The Journal of Medical Association of Malawi, 24(3), 69–71.

Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., Blondel, M., Prettenhofer, P., Weiss, R., Dubourg, V., Vanderplas, J., Passos, A., Cournapeau, D., Brucher, M., Perrot, M., & Duchesnay, E. (2011). Scikit-learn: Machine Learning in Python. Journal of Machine Learning Research, 12, 2825–2830.

Ramzai, J. (2021, May 25). Clearly explained: Pearson V/S Spearman Correlation Coefficient. Medium. https://towardsdatascience.com/clearly-explained-pearson-v-s-spearman-correlation-coefficient-ada2f473b8

Reback, J., Jbrockmendel, McKinney, W., Van Den Bossche, J., Augspurger, T., Roeschke, M., Hawkins, S., Cloud, P., Gfyoung, Sinhrks, Hoefler, P., Klein, A., Terji Petersen, Tratner, J., She, C., Ayd, W., Naveh, S., JHM Darbyshire, Garcia, M., … Battiston, P. (2022). pandas-dev/pandas: Pandas 1.4.2 (v1.4.2) [Computer software]. Zenodo. https://doi.org/10.5281/ZENODO.3509134

Seabold, S., & Perktold, J. (2010). statsmodels: Econometric and statistical modeling with python. 9th Python in Science Conference.

Virtanen, P., Gommers, R., Oliphant, T. E., Haberland, M., Reddy, T., Cournapeau, D., Burovski, E., Peterson, P., Weckesser, W., Bright, J., van der Walt, S. J., Brett, M., Wilson, J., Millman, K. J., Mayorov, N., Nelson, A. R. J., Jones, E., Kern, R., Larson, E., … Vázquez-Baeza, Y. (2020). SciPy 1.0: Fundamental algorithms for scientific computing in Python. Nature Methods, 17(3), 261–272. https://doi.org/10.1038/s41592-019-0686-2

Wu, S. (2021, June 5). What are the best metrics to evaluate your regression model? Medium. https://towardsdatascience.com/what-are-the-best-metrics-to-evaluate-your-regression-model-418ca481755b

