<a href="https://colab.research.google.com/github/squinton-gcu/Data-Science/blob/main/Thesis/Modules/Correlation_Module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Correlation Module
This module imports the processed CSV files and the lists of selected features, and determines whether there are any similarities between the files. It then runs a correlation analysis of the similar features between stress and ALZ, calculating p-values, standard error, RMSE, and the Spearman correlation coefficient.
The results are written to CSV files.

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm 
from scipy import stats
from sklearn.model_selection import train_test_split
from statsmodels.tools.eval_measures import rmse

  import pandas.util.testing as tm


In [None]:
# Colab-specific: mount Google Drive at /content/drive so the Thesis data
# files read and written below are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# read in files
# Base folders on the mounted Google Drive; change them here if the data moves.
PROCESSED_DIR = "/content/drive/MyDrive/Thesis/Processed"
SELECTED_FEATURES_DIR = PROCESSED_DIR + "/Selected_Features"

# Selected-feature tables: their column names are the selected metabolites.
ALZ_plasma_feat = pd.read_csv(SELECTED_FEATURES_DIR + "/ALZ_plasma_selected_features.csv", index_col=0)
ALZ_csf_feat = pd.read_csv(SELECTED_FEATURES_DIR + "/ALZ_csf_selected_features.csv", index_col=0)
trauma_human_feat = pd.read_csv(SELECTED_FEATURES_DIR + "/trauma_human_selected_features.csv", index_col=0)

# Full processed tables, transposed after loading. The cells below read the
# metabolite names from .columns and the sample names from .index.
ALZ_plasma_p = pd.read_csv(PROCESSED_DIR + "/ALZ_plasma_processed.csv", index_col=0).transpose()
ALZ_csf_p = pd.read_csv(PROCESSED_DIR + "/ALZ_csf_processed.csv", index_col=0).transpose()
trauma_human_p = pd.read_csv(PROCESSED_DIR + "/trauma_human_processed.csv", index_col=0).transpose()

In [None]:
# Column names of the ALZ plasma selected-features table.
ALZ_plasma_feat.columns

Index(['C49 H30 Cl N O', '11-deoxy-PGE2',
       '1-Hexadecyl-2-arachidonoyl-glycerol', 'C31 H59 Cl N6 O2',
       'C51 H36 Cl N', 'C4 H4 N4 O2', 'BiliverdinIX+1.4045436', 'C23 H5 N O',
       'C4 H8 N2 O4 S', 'C38 H75 N15', 'C13 H23 Cl N4 O S',
       '5-Acetyl-4-methylthiazole', 'C40 H34 Cl N3 O2', 'C38 H58 N6 O4',
       'Propionylglycine methyl ester', 'C11 H20 N2 O3 S', 'C8 H12 N2 O3 S3',
       'Glucoheptonicacid', 'GPEtn(16:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))',
       'C32 H57 N5 S2', 'C13 H15 N O', 'DIHYDROFISSINOLIDE',
       '1-octadecanoyl-rac-glycerol', '2-Pyrrolidinone + 2.0697286',
       'Terephthalicacid', '12-amino-dodecanoicacid', 'Vigabatrin',
       'C31 H34 N8 O4 S', 'Uricacid', 'C4 H11 N O3 + 6.5848055', 'Uric acid',
       '2-Hydroxyadipicacid', 'URSINICACID'],
      dtype='object')

In [None]:
# Column names of the human trauma (stress) selected-features table.
trauma_human_feat.columns

Index(['hippuric acid', 'gluconic acid lactone', 'glyceric acid', 'mannose',
       'serine', 'ribonic acid', '2-hydroxybutanoic acid', 'glycine',
       'taurine', 'alanine', 'squalene', 'aspartic acid', 'glycolic acid',
       'levoglucosan', 'proline', 'tryptophan', 'arachidic acid',
       'aminomalonate', 'glucose', 'phosphate', 'UDP-glucuronic acid',
       'hexuronic acid', '5-methoxytryptamine', 'citric acid',
       'methionine sulfoxide', 'inosine', 'benzylalcohol', 'ethanolamine',
       'threitol', 'mannitol', 'fumaric acid'],
      dtype='object')

In [None]:
# Column names of the ALZ CSF selected-features table.
ALZ_csf_feat.columns

Index(['C24 H29 N O3', 'C36 H43 N5 O2', 'C35 H61 N17', 'C33 H24 Cl N S',
       '2-pentadecenoic acid', '4-Hydroxypyridine', 'C37 H40 N2 O3 S2',
       'C19 H4 Cl N S', 'Gln Lys Glu', 'C11 H25 N O5', '1a,1b-dihomo-PGJ2',
       'C5 H5 N5 O3 + 2.1794953', 'C22 H22 Cl N3 O', 'C4 H2 O8 S2',
       'Isoniazid', 'C10 H21 N O', 'N_N-Didemethylchlorpromazine',
       'C31 H65 N5 O2 S2', 'C29 H51 Cl N10 O2', 'C4 H7 N3 O', 'C19 H38 N2 O3',
       'C14 H32 N2 S', 'C10 H20 N6 O', 'beta-vinylacrylicacid',
       'C28 H5 N5 O16', 'Pyroglutamic acid + 6.857936'],
      dtype='object')

In [None]:
# Collect every ALZ plasma metabolite whose name contains the name of a
# selected stress feature (case-sensitive substring match).
# A metabolite whose name contains more than one stress feature name is kept
# only once, so it is not analysed repeatedly later on.
stress_topVS_all_plasma = []
for val in trauma_human_feat.columns:
  for col in ALZ_plasma_p.columns:
    if val in col and col not in stress_topVS_all_plasma:
      stress_topVS_all_plasma.append(col)

In [None]:
# Collect every ALZ CSF metabolite whose name contains the name of a selected
# stress feature (case-sensitive substring match).
# A metabolite whose name contains more than one stress feature name (for
# example both 'phosphate' and 'ethanolamine') is kept only once, so it is not
# analysed repeatedly later on.
stress_topVS_all_csf = []
for val in trauma_human_feat.columns:
  for col in ALZ_csf_p.columns:
    if val in col and col not in stress_topVS_all_csf:
      stress_topVS_all_csf.append(col)

In [None]:
# Collect every selected ALZ plasma feature whose name contains the name of a
# selected stress feature (case-sensitive substring match).
# A feature whose name contains more than one stress feature name is kept only
# once, so it is not analysed repeatedly later on.
stress_topVS_plasma_top = []
for val in trauma_human_feat.columns:
  for col in ALZ_plasma_feat.columns:
    if val in col and col not in stress_topVS_plasma_top:
      stress_topVS_plasma_top.append(col)

In [None]:
# Collect every selected ALZ CSF feature whose name contains the name of a
# selected stress feature (case-sensitive substring match).
# A feature whose name contains more than one stress feature name is kept only
# once, so it is not analysed repeatedly later on.
stress_topVS_csf_top = []
for val in trauma_human_feat.columns:
  for col in ALZ_csf_feat.columns:
    if val in col and col not in stress_topVS_csf_top:
      stress_topVS_csf_top.append(col)

In [None]:
# Print each selected plasma feature whose name contains the name of a
# selected CSF feature (case-sensitive substring match).
for col in (
  plasma_name
  for csf_name in ALZ_csf_feat.columns
  for plasma_name in ALZ_plasma_feat.columns
  if csf_name in plasma_name
):
  print(col)

In [None]:
def calculate_corr(x_array, y_array, metabolite):
  """
  Calculate the Spearman correlation between one metabolite and the binary labels.

  Parameters
  ----------
  x_array : pandas.DataFrame
      Table that contains a column named `metabolite`.
  y_array : list
      Binary labels, one per row of `x_array` and in the same row order.
  metabolite : str
      Name of the column of `x_array` to correlate with the labels.

  Returns
  -------
  float
      Spearman correlation coefficient between the metabolite and the labels.
  """
  cor_matrix_initial = pd.DataFrame({metabolite: x_array[metabolite], 'ALZ': y_array})
  corr_matrix = cor_matrix_initial.corr(method='spearman')
  # Look the coefficient up by label. The correlation matrix is indexed by
  # column name, and integer-position access on such a Series is deprecated
  # in recent pandas releases.
  correlation_val = corr_matrix.loc['ALZ', metabolite]
  return correlation_val

In [None]:
def calculate_log_values(X_train, X_test, Y_train, Y_test, Metabolite):
  """
  Fit a one-metabolite logistic regression on the training data and score it
  on the test data.

  Parameters
  ----------
  X_train, X_test : pandas.DataFrame
      Training and test tables; both must contain a column named `Metabolite`.
  Y_train, Y_test : list
      Binary labels for the rows of `X_train` and `X_test`.
  Metabolite : str
      Name of the column used as the single predictor.

  Returns
  -------
  tuple
      (p-value of the metabolite coefficient, standard error of that
      coefficient, RMSE between the test labels and the probabilities the
      training-fitted model predicts for the test rows).
  """
  # Predictors are an intercept plus the single metabolite column.
  train_design = sm.add_constant(X_train[Metabolite])
  # disp=False silences the per-fit "Optimization terminated" printout.
  logreg1 = sm.Logit(Y_train, train_design).fit(disp=False)
  # Index 0 is the intercept and index 1 is the metabolite. np.asarray keeps
  # the positional lookup valid whether the results are Series or arrays.
  pvalue_train = np.asarray(logreg1.pvalues)[1]
  stand_error_train = np.asarray(logreg1.bse)[1] #standard error

  # has_constant='add' always adds the intercept column. The default skips it
  # when the metabolite happens to be constant in the test split, which would
  # leave the test design one column short of what the fitted model expects.
  test_design = sm.add_constant(X_test[Metabolite], has_constant='add')
  # Only the model fitted on the training data is evaluated; no second model
  # is fitted on the test data.
  ypred = logreg1.predict(test_design)
  rmse2 = rmse(Y_test, ypred)

  return pvalue_train, stand_error_train, rmse2

In [None]:
def _alz_labels(sample_names):
  """Return 1 for every sample name that contains "AD" and 0 for all others."""
  return [1 if name is not None and "AD" in name else 0 for name in sample_names]


def correlation_main(dataframe, metabList, test_size=0.4, random_state=None):
  """
  Compile the logistic-regression and correlation statistics for a list of
  metabolites into one dataframe.

  The processed table is split into train and test sets, and binary labels are
  derived from the sample names in the index (1 when the name contains "AD",
  otherwise 0). For every metabolite the p-value, standard error and RMSE come
  from calculate_log_values and the Spearman correlation from calculate_corr.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Processed table with samples as rows and metabolites as columns.
  metabList : list
      Metabolite column names to analyse. Repeated names are analysed once.
  test_size : float, optional
      Fraction of rows placed in the test set (default 0.4).
  random_state : int or None, optional
      Seed passed to train_test_split. Pass an integer for a reproducible
      split. The default None keeps the unseeded behaviour.

  Returns
  -------
  pandas.DataFrame
      One column per metabolite and the rows 'pvalues', 'standard error',
      'RMSE' and 'Spearman Corr'. An empty `metabList` gives a frame with
      those four rows and no columns.
  """
  # split data (the split is random and not stratified by label)
  train, test = train_test_split(dataframe, test_size=test_size, random_state=random_state)
  Y_train_num = _alz_labels(train.index)
  Y_test_num = _alz_labels(test.index)

  #calculate logistic values
  results = {}
  for metab in metabList:
    if metab in results:
      # The same split would give identical statistics, so skip the repeat.
      continue
    pvalues, standard_error, RMSE_val = calculate_log_values(train, test, Y_train_num, Y_test_num, metab)
    corr_value = calculate_corr(train, Y_train_num, metab)
    results[metab] = [pvalues, standard_error, RMSE_val, corr_value]

  output_frame = pd.DataFrame(results, index=['pvalues', 'standard error', 'RMSE', "Spearman Corr"])

  return output_frame
  

In [None]:
# Seed NumPy's global random generator before the analyses. correlation_main
# splits the data with train_test_split without a fixed random_state, so
# without a seed every run produces different statistics.
np.random.seed(42)

stress_top_all_plasma_cor = correlation_main(ALZ_plasma_p, stress_topVS_all_plasma)
stress_top_top_plasma_cor = correlation_main(ALZ_plasma_p, stress_topVS_plasma_top)
stress_top_all_csf_cor = correlation_main(ALZ_csf_p, stress_topVS_all_csf)
stress_top_top_csf_cor = correlation_main(ALZ_csf_p, stress_topVS_csf_top)


  x = pd.concat(x[::order], 1)


Optimization terminated successfully.
         Current function value: 0.622670
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.615348
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.636331
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.606843
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.635884
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.619874
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.565316
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.625607
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.627250
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.635723
  

In [None]:
# Flip each statistics table so every metabolite is one row, then order the
# rows by ascending p-value.
(
  sort_stress_top_all_plasma_cor,
  sort_stress_top_top_plasma_cor,
  sort_stress_top_all_csf_cor,
  sort_stress_top_top_csf_cor,
) = (
  stats_table.T.sort_values(by='pvalues')
  for stats_table in (
    stress_top_all_plasma_cor,
    stress_top_top_plasma_cor,
    stress_top_all_csf_cor,
    stress_top_top_csf_cor,
  )
)



In [None]:
# Sorted statistics: stress top features matched against the CSF top features.
sort_stress_top_top_csf_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr


In [None]:
# Sorted statistics: stress top features matched against all CSF metabolites.
sort_stress_top_all_csf_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr
Diethanolamine,0.033891,1.11018,0.583919,-0.42318
Diethanolamine + 5.846524,0.062883,3.737103,0.545717,-0.379777
Diethanolamine+5.846524,0.077197,3.811653,0.53915,-0.314673
D-Homoserine,0.116657,0.221461,0.5351,-0.227901
Glutarylglycine,0.124715,2.070371,0.616797,-0.292971
Ethanolamine phosphate (Phosphoethanolamine),0.252973,0.096475,0.561916,-0.054337
2-Deoxyglucose,0.282607,1.281292,0.555194,0.227866
D-4-Hydroxyphenylglycine,0.295635,0.284597,0.508057,-0.097657
Isobutylglycine,0.305658,0.315055,0.534807,-0.238717
"3-Hydroxy-N-glycyl-2,6-xylidine (3-Hydroxyglycinexylidide)",0.39396,0.807965,0.499337,0.184463


In [None]:
# Sorted statistics: stress top features matched against the plasma top features.
sort_stress_top_top_plasma_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr
Propionylglycine methyl ester,0.106771,0.844985,0.417623,0.324948


In [None]:
# Sorted statistics: stress top features matched against all plasma metabolites.
sort_stress_top_all_plasma_cor

Unnamed: 0,pvalues,standard error,RMSE,Spearman Corr
Propionylglycinemethylester,0.065484,0.775096,0.433224,0.342962
2-Methylbutyrylglycine+6.5558777,0.065506,0.354389,0.46658,0.403485
2-Methylbutyrylglycine,0.065693,0.477201,0.520616,0.322788
Dimethylglycine+2.8044205,0.06923,0.816419,0.453713,0.342962
Propionylglycinemethylester+6.2598743,0.072163,0.463442,0.44888,0.383311
Propionylglycine methyl ester + 6.2598743,0.072846,0.476407,0.454579,0.413572
2-Methylbutyrylglycine + 6.5558777,0.077885,0.381905,0.465583,0.363137
Propionylglycine methyl ester,0.097942,0.763496,0.437416,0.28244
1-O-(1Z-hexadecenyl)-2-(4Z_7Z_10Z_13Z_16Z_19Z-docosahexaenoyl)-sn-glycero-3-phosphoethanolamine,0.102785,0.6538,0.442018,-0.353049
Dimethylglycine + 2.8044205,0.134699,0.694422,0.44935,0.221917


In [None]:
#save values
# Write every p-value-sorted statistics table to its own CSV file.
for _sorted_table, _csv_path in (
  (sort_stress_top_all_plasma_cor, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_all_plasma_cor.csv"),
  (sort_stress_top_top_plasma_cor, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_top_plasma_cor.csv"),
  (sort_stress_top_all_csf_cor, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_all_csf_cor.csv"),
  (sort_stress_top_top_csf_cor, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/sort_stress_top_top_csf_cor.csv"),
):
  _sorted_table.to_csv(_csv_path)

In [None]:
#create files for graphs
def create_matrix_graph(dataframe, metabList):
  """
  Build the table used for graphing: the chosen metabolite columns plus a
  binary label column "y".

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Processed table with samples as rows and metabolites as columns.
  metabList : list
      Metabolite column names to keep. Repeated names are kept once, in
      first-seen order.

  Returns
  -------
  pandas.DataFrame
      Copy of the requested columns with the same row index as `dataframe`
      (also when `metabList` is empty), followed by a column "y" that is 1
      when the sample name contains "AD" and 0 otherwise.
  """
  Y_dataframe_num = [1 if i is not None and "AD" in i else 0 for i in dataframe.index]

  # dict.fromkeys drops repeated names while keeping their first-seen order.
  unique_metabs = list(dict.fromkeys(metabList))
  # Select all columns in one step. This keeps the sample index even when no
  # metabolite is requested and avoids growing the frame column by column.
  cor_graph_matrix = dataframe[unique_metabs].copy()
  cor_graph_matrix["y"] = Y_dataframe_num
  return cor_graph_matrix



In [None]:
# Build one graphing table per (processed table, matched metabolite list) pair.
(
  plasma_top_stress,
  plasma_all_stress,
  csf_top_stress,
  csf_all_stress,
) = (
  create_matrix_graph(source_table, metab_names)
  for source_table, metab_names in (
    (ALZ_plasma_p, stress_topVS_plasma_top),
    (ALZ_plasma_p, stress_topVS_all_plasma),
    (ALZ_csf_p, stress_topVS_csf_top),
    (ALZ_csf_p, stress_topVS_all_csf),
  )
)


In [None]:
# Write every graphing table to its own CSV file.
for _graph_table, _csv_path in (
  (plasma_top_stress, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/plasma_top_stress_graphs.csv"),
  (plasma_all_stress, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/plasma_all_stress_graph.csv"),
  (csf_top_stress, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/csf_top_stress_graph.csv"),
  (csf_all_stress, "/content/drive/MyDrive/Thesis/Processed/Selected_Features/csf_all_stress_graph.csv"),
):
  _graph_table.to_csv(_csv_path)

## References

Difference Between T-test and Linear Regression—Ask Any Difference. (2022, January 22). https://askanydifference.com/difference-between-t-test-and-linear-regression/

Harris, C. R., Millman, K. J., van der Walt, S. J., Gommers, R., Virtanen, P., Cournapeau, D., Wieser, E., Taylor, J., Berg, S., Smith, N. J., Kern, R., Picus, M., Hoyer, S., van Kerkwijk, M. H., Brett, M., Haldane, A., del Río, J. F., Wiebe, M., Peterson, P., … Oliphant, T. E. (2020). Array programming with NumPy. Nature, 585(7825), 357–362. https://doi.org/10.1038/s41586-020-2649-2

Larose, C., & Larose, D. (2019). Data Science Using Python and R. John Wiley & Sons, Inc.

Mukaka, M. (2012). A guide to appropriate use of Correlation coefficient in medical research. Malawi Medical Journal : The Journal of Medical Association of Malawi, 24(3), 69–71.

Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., Grisel, O., Blondel, M., Prettenhofer, P., Weiss, R., Dubourg, V., Vanderplas, J., Passos, A., Cournapeau, D., Brucher, M., Perrot, M., & Duchesnay, E. (2011). Scikit-learn: Machine Learning in Python. Journal of Machine Learning Research, 12, 2825–2830.

Ramzai, J. (2021, May 25). Clearly explained: Pearson V/S Spearman Correlation Coefficient. Medium. https://towardsdatascience.com/clearly-explained-pearson-v-s-spearman-correlation-coefficient-ada2f473b8

Reback, J., Jbrockmendel, McKinney, W., Van Den Bossche, J., Augspurger, T., Roeschke, M., Hawkins, S., Cloud, P., Gfyoung, Sinhrks, Hoefler, P., Klein, A., Terji Petersen, Tratner, J., She, C., Ayd, W., Naveh, S., JHM Darbyshire, Garcia, M., … Battiston, P. (2022). pandas-dev/pandas: Pandas 1.4.2 (v1.4.2) [Computer software]. Zenodo. https://doi.org/10.5281/ZENODO.3509134

Seabold, S., & Perktold, J. (2010). statsmodels: Econometric and statistical modeling with python. 9th Python in Science Conference.

Virtanen, P., Gommers, R., Oliphant, T. E., Haberland, M., Reddy, T., Cournapeau, D., Burovski, E., Peterson, P., Weckesser, W., Bright, J., van der Walt, S. J., Brett, M., Wilson, J., Millman, K. J., Mayorov, N., Nelson, A. R. J., Jones, E., Kern, R., Larson, E., … Vázquez-Baeza, Y. (2020). SciPy 1.0: Fundamental algorithms for scientific computing in Python. Nature Methods, 17(3), 261–272. https://doi.org/10.1038/s41592-019-0686-2

Wu, S. (2021, June 5). What are the best metrics to evaluate your regression model? Medium. https://towardsdatascience.com/what-are-the-best-metrics-to-evaluate-your-regression-model-418ca481755b

