# Chromatin Profile and Metabolomics Correlation
**Author**: Rachael Jin

## Summary
This notebook computes the Pearson correlation coefficient and p-value between each histone marker and metabolite pair for 800+ cancer cell lines.

## Import data
First, we'll read in the metabolomics and chromatin profile data, which are saved as Excel workbooks.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import kendalltau, pearsonr, spearmanr
from scipy import stats
md = pd.read_excel(io='CCLE metabolomics dataset.xlsx',sheet_name="All")
md.head()

Unnamed: 0,CCL,Tissue,Medium,Culture,2-aminoadipate,3-phosphoglycerate,Alpha-glycerophosphate,4-pyridoxate,Aconitate,Adenine,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
0,DMS53,LUNG,Waymouth,Adherent,6.112727,6.034198,5.896896,6.000532,5.513618,5.868529,...,6.070239,6.133433,6.091089,6.257711,6.372732,6.202511,5.939576,6.309821,6.115974,5.999436
1,SW1116,LARGE INTESTINE,L15,Adherent,5.577413,5.727045,5.111468,6.07325,5.802494,5.824473,...,6.248653,6.633575,6.378052,6.341043,6.360945,6.33354,6.137271,7.065858,6.832174,6.363064
2,NCIH1694,LUNG,DMEM-F12 wGln,Suspension,5.886398,5.574881,5.541259,5.848375,5.665026,5.875548,...,5.942887,5.946988,5.83798,5.91335,6.13753,5.807546,5.704149,5.881193,5.785208,5.504225
3,P3HR1,HAEMATOPOIETIC AND LYMPHOID TISSUE,RPMI,Empty,5.77003,6.099229,6.233259,5.543495,5.767759,6.155905,...,6.516922,6.113791,6.282113,6.248667,6.10948,6.04357,5.846802,6.429402,5.779815,6.24153
4,HUT78,HAEMATOPOIETIC AND LYMPHOID TISSUE,RPMI,Empty,5.480683,5.469742,6.509397,6.251005,5.190578,5.897085,...,6.161981,6.777932,6.67639,6.695659,6.751029,6.385056,6.682612,6.757899,6.72857,6.87926


In [2]:
metabolites = md.drop(['Tissue', 'Medium','Culture'], axis=1)
print('\n\nmd after deleting column\n--------------')
print(metabolites)



md after deleting column
--------------
          CCL  2-aminoadipate  3-phosphoglycerate  Alpha-glycerophosphate  \
0       DMS53        6.112727            6.034198                5.896896   
1      SW1116        5.577413            5.727045                5.111468   
2    NCIH1694        5.886398            5.574881                5.541259   
3       P3HR1        5.770030            6.099229                6.233259   
4       HUT78        5.480683            5.469742                6.509397   
..        ...             ...                 ...                     ...   
917     SF268        5.977636            6.026483                6.480536   
918     SF539        5.957233            6.090834                5.323475   
919     SNB75        5.967707            5.931487                5.620542   
920     HOP92        5.962415            5.992640                6.296222   
921     MUTZ3        6.332344            5.812531                5.446330   

     4-pyridoxate  Aconitate   Ad

In [3]:
hm = pd.read_csv('GCP_proteomics_remapped.csv')
hm.head()


Unnamed: 0,Cell Line,H3K4me0,H3K4me1,H3K4me2,H3K4ac1,H3K9me0K14ac0,H3K9me1K14ac0,H3K9me2K14ac0,H3K9me3K14ac0,H3K9ac1K14ac0,...,H3K27ac1K36me0,H3K27ac1K36me1,H3K27ac1K36me2,H3K27ac1K36me3,H3.3K27me0K36me0,H3K56me0,H3K56me1,H3K79me0,H3K79me1,H3K79me2
0,DMS53,0.11602,-0.153144,-0.348607,-1.417128,-1.281177,-0.719707,-0.20808,-0.033416,-0.967821,...,0.396178,1.261963,0.492776,-0.211349,-0.554973,-0.222912,-0.31091,-0.272655,0.271469,0.469647
1,SW1116,-0.058624,0.219592,0.110946,-0.170282,0.33463,0.497303,0.307907,-0.466686,0.062518,...,-1.198709,-1.394997,-1.123119,-1.501911,-0.180229,-0.075173,,0.051018,0.099032,0.169761
2,NCIH1694,0.480909,0.29844,0.073777,0.413953,-0.479543,0.13328,0.053279,-0.220467,-0.42716,...,0.055683,-0.659294,0.114288,-1.289012,0.280396,0.117564,,0.185984,0.19176,-0.437561
3,P3HR1,-0.079957,-0.617656,-0.566702,0.079932,0.37314,0.159682,0.060946,-0.181112,0.208328,...,1.098303,0.381884,0.258282,0.751323,0.031194,-0.199316,0.037929,0.003978,-0.225147,-0.061445
4,HUT78,-0.059965,-0.063483,-0.26798,0.357422,0.075651,0.04783,0.115243,-0.498239,-0.059567,...,-0.185237,0.239421,0.358072,-0.176527,-0.351188,0.037021,,0.045495,-0.153684,-0.106306


## Merge datasets based on unique cancer cell line name
Next, we'll concatenate the two dataframes and match based on cancer cell lines.

In [4]:
result = metabolites.merge(hm,how='inner',left_on='CCL', right_on='Cell Line')
result.head()

Unnamed: 0,CCL,2-aminoadipate,3-phosphoglycerate,Alpha-glycerophosphate,4-pyridoxate,Aconitate,Adenine,Adipate,Alpha-ketoglutarate,AMP,...,H3K27ac1K36me0,H3K27ac1K36me1,H3K27ac1K36me2,H3K27ac1K36me3,H3.3K27me0K36me0,H3K56me0,H3K56me1,H3K79me0,H3K79me1,H3K79me2
0,DMS53,6.112727,6.034198,5.896896,6.000532,5.513618,5.868529,5.977177,5.693074,5.923737,...,0.396178,1.261963,0.492776,-0.211349,-0.554973,-0.222912,-0.31091,-0.272655,0.271469,0.469647
1,SW1116,5.577413,5.727045,5.111468,6.07325,5.802494,5.824473,5.888821,5.768379,5.760784,...,-1.198709,-1.394997,-1.123119,-1.501911,-0.180229,-0.075173,,0.051018,0.099032,0.169761
2,NCIH1694,5.886398,5.574881,5.541259,5.848375,5.665026,5.875548,5.894904,5.83964,5.742613,...,0.055683,-0.659294,0.114288,-1.289012,0.280396,0.117564,,0.185984,0.19176,-0.437561
3,P3HR1,5.77003,6.099229,6.233259,5.543495,5.767759,6.155905,6.111148,5.949481,6.342703,...,1.098303,0.381884,0.258282,0.751323,0.031194,-0.199316,0.037929,0.003978,-0.225147,-0.061445
4,HUT78,5.480683,5.469742,6.509397,6.251005,5.190578,5.897085,6.148333,5.607481,5.8716,...,-0.185237,0.239421,0.358072,-0.176527,-0.351188,0.037021,,0.045495,-0.153684,-0.106306


In [5]:
metabolites.info()
print('\n')
hm.info()
print('\n')
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922 entries, 0 to 921
Columns: 226 entries, CCL to C58:6 TAG
dtypes: float64(225), object(1)
memory usage: 1.6+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cell Line         866 non-null    object 
 1   H3K4me0           865 non-null    float64
 2   H3K4me1           866 non-null    float64
 3   H3K4me2           865 non-null    float64
 4   H3K4ac1           790 non-null    float64
 5   H3K9me0K14ac0     866 non-null    float64
 6   H3K9me1K14ac0     866 non-null    float64
 7   H3K9me2K14ac0     865 non-null    float64
 8   H3K9me3K14ac0     864 non-null    float64
 9   H3K9ac1K14ac0     865 non-null    float64
 10  H3K9me0K14ac1     866 non-null    float64
 11  H3K9me1K14ac1     866 non-null    float64
 12  H3K9me2K14ac1     865 non-null    float64
 13  H3K9me3K14ac1     

## Separate dataframes 
Now that the data is matched by cell lines, we can separate the dataframes again.

In [6]:
left = result.iloc[:,1:226]
left.head()

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,Alpha-glycerophosphate,4-pyridoxate,Aconitate,Adenine,Adipate,Alpha-ketoglutarate,AMP,Citrate,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
0,6.112727,6.034198,5.896896,6.000532,5.513618,5.868529,5.977177,5.693074,5.923737,5.641242,...,6.070239,6.133433,6.091089,6.257711,6.372732,6.202511,5.939576,6.309821,6.115974,5.999436
1,5.577413,5.727045,5.111468,6.07325,5.802494,5.824473,5.888821,5.768379,5.760784,5.914742,...,6.248653,6.633575,6.378052,6.341043,6.360945,6.33354,6.137271,7.065858,6.832174,6.363064
2,5.886398,5.574881,5.541259,5.848375,5.665026,5.875548,5.894904,5.83964,5.742613,5.570208,...,5.942887,5.946988,5.83798,5.91335,6.13753,5.807546,5.704149,5.881193,5.785208,5.504225
3,5.77003,6.099229,6.233259,5.543495,5.767759,6.155905,6.111148,5.949481,6.342703,6.054781,...,6.516922,6.113791,6.282113,6.248667,6.10948,6.04357,5.846802,6.429402,5.779815,6.24153
4,5.480683,5.469742,6.509397,6.251005,5.190578,5.897085,6.148333,5.607481,5.8716,5.128463,...,6.161981,6.777932,6.67639,6.695659,6.751029,6.385056,6.682612,6.757899,6.72857,6.87926


In [7]:
right = result.iloc[:,227:269]
right.head()

Unnamed: 0,H3K4me0,H3K4me1,H3K4me2,H3K4ac1,H3K9me0K14ac0,H3K9me1K14ac0,H3K9me2K14ac0,H3K9me3K14ac0,H3K9ac1K14ac0,H3K9me0K14ac1,...,H3K27ac1K36me0,H3K27ac1K36me1,H3K27ac1K36me2,H3K27ac1K36me3,H3.3K27me0K36me0,H3K56me0,H3K56me1,H3K79me0,H3K79me1,H3K79me2
0,0.11602,-0.153144,-0.348607,-1.417128,-1.281177,-0.719707,-0.20808,-0.033416,-0.967821,-1.150058,...,0.396178,1.261963,0.492776,-0.211349,-0.554973,-0.222912,-0.31091,-0.272655,0.271469,0.469647
1,-0.058624,0.219592,0.110946,-0.170282,0.33463,0.497303,0.307907,-0.466686,0.062518,-0.517698,...,-1.198709,-1.394997,-1.123119,-1.501911,-0.180229,-0.075173,,0.051018,0.099032,0.169761
2,0.480909,0.29844,0.073777,0.413953,-0.479543,0.13328,0.053279,-0.220467,-0.42716,0.215504,...,0.055683,-0.659294,0.114288,-1.289012,0.280396,0.117564,,0.185984,0.19176,-0.437561
3,-0.079957,-0.617656,-0.566702,0.079932,0.37314,0.159682,0.060946,-0.181112,0.208328,0.182229,...,1.098303,0.381884,0.258282,0.751323,0.031194,-0.199316,0.037929,0.003978,-0.225147,-0.061445
4,-0.059965,-0.063483,-0.26798,0.357422,0.075651,0.04783,0.115243,-0.498239,-0.059567,0.845077,...,-0.185237,0.239421,0.358072,-0.176527,-0.351188,0.037021,,0.045495,-0.153684,-0.106306


## Compute pearson correlation coefficient and pvalue
Finally, we'll compute the correlation coefficients between metabolites and histone markers and the p-value of correlation. Note that we're also computing metabolite-metabolite and histone-histone correlations. While those are interesting as well, we'll ignore those for downstream analyses.


In [8]:
correlation = result.corr(method ='pearson')
correlation.to_csv('correlation.csv')

In [9]:
corr = pd.concat([left, right], axis=1, keys=['left', 'right']).corr().loc['left', 'right']

In [10]:
corr.to_csv('corr.csv')

In [11]:
def calculate_pvalues(df,left,right):
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    df1 = left.dropna()._get_numeric_data()
    df2 = right.dropna()._get_numeric_data()
    df1cols = pd.DataFrame(columns=df1.columns)
    df2cols = pd.DataFrame(columns=df2.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    newpvalues = df2cols.transpose().join(df1cols, how='outer')
    for r in (df.columns):
        for c in (df.columns):
            pvalues[r][c] = round(pearsonr(df[r], df[c])[1],4)
    for r in (df1.columns):
        for c in (df2.columns):
            if pvalues[r][c] <= 0.05:
                newpvalues[r][c] = pvalues[r][c]
    newpvalues.to_csv('pvalues<0.05.csv')
    return newpvalues

calculate_pvalues(result,left,right)

Unnamed: 0,2-aminoadipate,3-phosphoglycerate,Alpha-glycerophosphate,4-pyridoxate,Aconitate,Adenine,Adipate,Alpha-ketoglutarate,AMP,Citrate,...,C56:8 TAG,C56:7 TAG,C56:6 TAG,C56:5 TAG,C56:4 TAG,C56:3 TAG,C56:2 TAG,C58:8 TAG,C58:7 TAG,C58:6 TAG
H3K4me0,,,,,,,,,,,...,,0.0051,0.0017,0.0014,0.0015,0.0155,0.0132,0.0494,0.017,0.0085
H3K4me1,,,,,0.0135,,,0.01,0.0289,,...,0.0009,0.0004,0.0004,0.0008,0.0027,0.0056,0.0008,0.001,0.0031,0.0004
H3K4me2,,,0.0012,,,,,,,,...,,,,,,,,,,
H3K4ac1,,,0.0002,,,,0.0342,,,,...,,,,,,,,,,
H3K9me0K14ac0,,,,,0.0153,0.0193,,,,,...,,,,,0.0165,0.0305,,,,
H3K9me1K14ac0,,,0.0437,,,0.0076,,,0.0006,,...,,,,,,,,,,
H3K9me2K14ac0,,,,,,,,,0.0008,,...,,,,,,,,,,
H3K9me3K14ac0,,0.0027,,,0.0102,0.0172,0.0372,,0.0,,...,0.0025,0.0384,0.0064,0.0012,0.0001,0.002,0.0015,,,0.0068
H3K9ac1K14ac0,0.0391,,,,0.0016,0.0005,,,,0.0173,...,0.0001,0.0004,0.0002,0.0004,0.0446,,,0.0018,0.0026,0.0003
H3K9me0K14ac1,,,0.0077,,,,,,0.0006,,...,,,,,,,,,,
