# Proteomics Data Analysis

## Cell lines in CCLE


In [1]:
import pandas as pd

#read in data from excel
cell_lines = pd.read_excel('CCLE_cell_lines_for_prot.xlsx', sheet_name = "Sample_Information")


In [2]:
clines_of_interest = ["MCF10A",'HCC1937',"HCC1395","HCC1599"]
mouse_clines_of_interest = ["NMUMG","4T1"]
ref_clines_of_interest = ["HeLa",'RPES']

In [3]:
cell_lines.head()

Unnamed: 0,Cell Line,CCLE Code,Tissue of Origin,Protein 10-Plex ID,Protein TMT Label,Notes
0,CAL-33,CAL33_UPPER_AERODIGESTIVE_TRACT,Upper Aerodigestive Tract,0,126,Bridge line
1,Hs578t,HS578T_BREAST,Breast,0,127c,Bridge line
2,SW837,SW837_LARGE_INTESTINE,Large Intestine,0,127n,Bridge line
3,WM-266-4,WM2664_SKIN,Skin,0,128c,Bridge line
4,CAL-51,CAL51_BREAST,Breast,0,128n,Bridge line


In [5]:
for i in clines_of_interest:
    print(i)
    if cell_lines[cell_lines["Cell Line"]==i].empty:
        print("not found")
    else:
        print('Found in dataset')

MCF10A
not found
HCC1937
Found in dataset
HCC1395
Found in dataset
HCC1599
not found


## Proteins in CCLE

In [8]:
data = pd.read_excel('CCLE_Protein.xlsx', sheet_name='Normalized Protein Expression')


In [21]:
model_proteins = ['BUB1',
                  "AURKB",
                  "CDCA8", #Borealin
                  "BIRC5", #Survivin
                  "CASC5", #Knl1
                  "SGOL1", #SGO1
                  "TTK", #Mps1
                  "GSG2", #Haspin
                  "PLK1",
                  "NDC80",
                  'INCENP'
 ]
#uniprot ID mouse:
# Aurkb: O70126; Bub1: Q9Z1S0; Ttk: P35761;

In [15]:
# get columns with sample names in them
sample_cols = [i for i in data.columns if "HCC1937" in i or "HCC1395" in i]
data_sub = data[["Gene_Symbol"]+sample_cols]
data_sub.head()

Unnamed: 0,Gene_Symbol,HCC1395_BREAST_TenPx15,HCC1937_BREAST_TenPx18
0,SLC12A2,-0.699377,-0.303711
1,HOXD13,,
2,KDM1A,-0.142239,0.096498
3,SOX1,-1.58449,0.368069
4,SOX2,0.092982,-0.247699


In [16]:
data_sub.loc[data_sub["Gene_Symbol"].isin(model_proteins)]

Unnamed: 0,Gene_Symbol,HCC1395_BREAST_TenPx15,HCC1937_BREAST_TenPx18
3476,CASC5,-0.342402,1.062156
5088,NDC80,0.400557,-0.014357
5383,PLK1,0.128413,0.324359
6040,AURKB,0.085803,0.784818
6380,BIRC5,-0.070965,1.396607
6564,TTK,0.896795,0.398749
7786,CDCA8,-0.015251,0.340106
7825,BUB1,-0.868929,0.506849
8548,GSG2,-0.024352,
8587,SGOL1,-0.057897,


In [34]:
breast = [i for i in data.columns if "BREAST" in i]

In [36]:
breast_data = data[['Gene_Symbol']+breast]

In [39]:
breast_data.index = breast_data['Gene_Symbol']
breast_data.drop('Gene_Symbol', axis=1, inplace=True)
breast_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  breast_data.drop('Gene_Symbol', axis=1, inplace=True)


Unnamed: 0_level_0,MDAMB468_BREAST_TenPx01,AU565_BREAST_TenPx01,CAL51_BREAST_TenPx01,CAL120_BREAST_TenPx02,JIMT1_BREAST_TenPx03,KPL1_BREAST_TenPx03,CAMA1_BREAST_TenPx04,BT549_BREAST_TenPx07,CAL851_BREAST_TenPx07,MDAMB157_BREAST_TenPx07,...,HCC2218_BREAST_TenPx21,EFM19_BREAST_TenPx22,MDAMB231_BREAST_TenPx27,CAL120_BREAST_TenPx28,EFM192A_BREAST_TenPx37,HCC1806_BREAST_TenPx14,HCC1395_BREAST_TenPx15,HDQP1_BREAST_TenPx41,HCC1937_BREAST_TenPx18,BT20_BREAST_TenPx24
Gene_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SLC12A2,2.111348,-0.463928,0.788565,-0.347311,-0.928188,1.737461,-0.53817,-0.725208,0.191062,-0.711907,...,2.387635,0.02421,-0.833119,-0.536117,1.350658,0.426052,-0.699377,0.858136,-0.303711,-1.045727
HOXD13,,,,,,,,,,,...,,,-0.996259,,,0.408148,,,,
KDM1A,0.379683,0.191211,0.391243,-0.245094,-0.225535,0.669501,0.372413,-0.45618,0.064371,-0.320139,...,-0.36425,0.676871,-0.634099,-0.487763,-0.385244,0.039815,-0.142239,0.213539,0.096498,0.392393
SOX1,,,,,-1.166476,0.417165,,,,,...,-0.995162,-0.232445,1.350044,,,,-1.58449,,0.368069,
SOX2,-0.246367,-0.341305,-0.124474,,-0.200521,-0.253344,-0.327802,,,,...,-3.392947,-1.404017,-2.861446,,-0.974087,-1.042297,0.092982,-0.960049,-0.247699,-3.179027


In [47]:
set(breast_data.index).intersection(set(model_proteins))

{'AURKB',
 'BIRC5',
 'BUB1',
 'CASC5',
 'CDCA8',
 'GSG2',
 'INCENP',
 'NDC80',
 'PLK1',
 'SGOL1',
 'TTK'}

In [44]:
data_model = breast_data.loc[model_proteins]

In [45]:
data_model

Unnamed: 0_level_0,MDAMB468_BREAST_TenPx01,AU565_BREAST_TenPx01,CAL51_BREAST_TenPx01,CAL120_BREAST_TenPx02,JIMT1_BREAST_TenPx03,KPL1_BREAST_TenPx03,CAMA1_BREAST_TenPx04,BT549_BREAST_TenPx07,CAL851_BREAST_TenPx07,MDAMB157_BREAST_TenPx07,...,HCC2218_BREAST_TenPx21,EFM19_BREAST_TenPx22,MDAMB231_BREAST_TenPx27,CAL120_BREAST_TenPx28,EFM192A_BREAST_TenPx37,HCC1806_BREAST_TenPx14,HCC1395_BREAST_TenPx15,HDQP1_BREAST_TenPx41,HCC1937_BREAST_TenPx18,BT20_BREAST_TenPx24
Gene_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BUB1,-0.023445,0.039755,0.168012,0.150959,0.115552,-0.484967,-0.842154,0.464926,-1.135813,1.046275,...,-0.955957,-0.859266,0.049861,0.958667,,0.11499,-0.868929,-0.187482,0.506849,-0.147275
AURKB,-0.058279,-0.722529,0.777772,0.405197,-0.042972,-0.859013,-1.574758,0.596165,-0.608539,0.265651,...,-1.823246,-0.691457,-0.041271,1.032039,-1.790283,1.688725,0.085803,-0.043956,0.784818,0.479855
CDCA8,0.146511,-0.25492,0.564989,0.351528,0.186106,-0.344574,-1.282904,0.437511,-0.414063,0.248623,...,-0.610384,-0.222247,-0.240527,0.820279,0.039516,1.180208,-0.015251,0.012745,0.340106,-0.430072
BIRC5,0.351816,-0.401229,0.647033,0.475564,0.105604,-0.749893,-1.568951,0.152072,0.150354,0.048914,...,-1.247004,,-0.874006,0.693977,0.527555,0.973702,-0.070965,-0.501863,1.396607,-0.683279
CASC5,-0.123447,-0.239041,0.176877,0.157099,0.2607,-0.893336,-0.769735,0.427673,-1.080305,0.405101,...,-0.939556,0.149713,-0.339741,0.109969,0.263727,1.478456,-0.342402,-0.586185,1.062156,0.412868
SGOL1,,,,,0.272527,-0.16164,0.077698,0.327034,-0.084952,0.864439,...,-1.681876,,-0.657557,,0.969077,1.544288,-0.057897,,,-0.734814
TTK,0.853014,-0.117931,-0.321871,0.197435,0.149444,-1.123198,-1.188538,0.565357,-0.237179,0.936025,...,-0.616247,-0.284482,0.231939,0.643331,-0.369142,-0.113739,0.896795,-0.194308,0.398749,0.307877
GSG2,,,,0.037716,,,-0.578183,,,,...,,,-0.154264,,0.679921,,-0.024352,-0.247556,,0.036266
PLK1,-0.187642,-0.034517,0.171916,-0.123829,-0.06104,-0.656116,-1.150509,0.255744,-0.449991,0.4147,...,-0.67613,-0.911527,-0.171928,0.478997,-0.948773,-0.281049,0.128413,-0.453642,0.324359,-0.177043
NDC80,0.475772,-0.548868,0.125874,1.02988,0.207462,-0.822915,-1.067373,0.709207,-0.554803,0.622958,...,-0.507762,-1.114235,0.62116,1.750572,-0.481056,0.039039,0.400557,-0.059599,-0.014357,0.643549


In [51]:
full_samples = []
for i in data_model.columns:
    if data_model[i].isna().sum() == 0:
        print(i)
        full_samples.append(i)

CAMA1_BREAST_TenPx04
T47D_BREAST_TenPx09
HCC38_BREAST_TenPx11
HCC1500_BREAST_TenPx12
MDAMB436_BREAST_TenPx12
HCC1143_BREAST_TenPx16
HCC1187_BREAST_TenPx20
MDAMB231_BREAST_TenPx27
HCC1395_BREAST_TenPx15
BT20_BREAST_TenPx24


In [50]:
data_model.to_csv('bc_CCLE_TMT_model_proteins.csv')

In [53]:
data_model[full_samples+['HCC1937_BREAST_TenPx18']].to_csv('bc_CCLE_TMT_model_proteins_full_samples.csv')