In [25]:
import pandas as pd
import numpy as np

In [26]:
import re

def clean_cell_line_name(name):
    """Normalize a cell-line name so differently formatted spellings compare equal.

    The argument is converted with ``str()``, every character that is not an
    ASCII letter or digit is removed, and what remains is upper-cased.

    Args:
        name: The raw name; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The stripped, upper-cased name.
    """
    return re.sub(r'[^a-zA-Z0-9]', '', str(name)).upper()

In [27]:
# Load the drug-sensitivity table, report its (rows, columns) shape and preview it
df_all = pd.read_csv(filepath_or_buffer='../data/drug_sensitivity.csv')
print(df_all.shape)
df_all.head(n=5)

(141222, 3)


Unnamed: 0,drug,cell_line,IC50
0,5-Fluorouracil,HL60,2.558926
1,5-azacytidine,HL60,0.917132
2,A-366,HL60,4.83616
3,ABT737,HL60,-2.817798
4,AGI-5198,HL60,3.644734


In [28]:
# Load the model annotation table, normalize StrippedCellLineName with
# clean_cell_line_name, and keep only the name, ModelID and OncotreeCode columns.
df_model = (
    pd.read_csv('../Figs/data/Model.csv')
    .assign(
        StrippedCellLineName=lambda frame: frame['StrippedCellLineName'].apply(clean_cell_line_name)
    )
    .loc[:, ['StrippedCellLineName', 'ModelID', 'OncotreeCode']]
)
df_model.head()

Unnamed: 0,StrippedCellLineName,ModelID,OncotreeCode
0,NIHOVCAR3,ACH-000001,HGSOC
1,HL60,ACH-000002,AML
2,CACO2,ACH-000003,COAD
3,HEL,ACH-000004,AML
4,HEL9217,ACH-000005,AML


In [29]:
# Attach the model annotations to each sensitivity row by matching cell_line
# against the cleaned StrippedCellLineName (pandas' default inner join, so rows
# without a match are not kept), then keep drug, cell_line, IC50 and OncotreeCode.
df_all_merged = (
    df_all
    .merge(df_model, left_on='cell_line', right_on='StrippedCellLineName')
    .loc[:, ['drug', 'cell_line', 'IC50', 'OncotreeCode']]
)
df_all_merged.head()

Unnamed: 0,drug,cell_line,IC50,OncotreeCode
0,5-Fluorouracil,HL60,2.558926,AML
1,5-azacytidine,HL60,0.917132,AML
2,A-366,HL60,4.83616,AML
3,ABT737,HL60,-2.817798,AML
4,AGI-5198,HL60,3.644734,AML


In [30]:
# Number of rows in df_all_merged for each OncotreeCode
df_all_merged.loc[:, 'OncotreeCode'].value_counts()

OncotreeCode
LUAD     10894
SCLC      7887
COAD      7667
PAAD      5351
GB        4713
         ...  
PRSCC      179
ABC        178
THME       153
AASTR      153
GBC        153
Name: count, Length: 118, dtype: int64

In [31]:
# Select every row whose OncotreeCode is one of the lung codes
# LUAD, LUSC, SCLC, NSCLC, LCLC or LUAS.
# 'LUAS' was missing from this list although a later cell removes LUAS from
# df_all_merged together with the other lung codes; without it here, LUAS rows
# would belong to neither the lung subset nor the remaining non-lung data.
df_lung = df_all_merged[df_all_merged['OncotreeCode'].isin(['LUAD', 'LUSC', 'SCLC', 'NSCLC', 'LCLC', 'LUAS'])]
print(df_lung.shape)

(25367, 4)


In [32]:
# Row count for every (cell_line, OncotreeCode) pair present in df_lung
df_lung.groupby(by=['cell_line', 'OncotreeCode']).size()

cell_line  OncotreeCode
A427       LUAD            212
A549       LUAD            219
ABC1       LUAD            211
BEN        NSCLC           180
CAL12T     NSCLC           180
                          ... 
SBC5       SCLC            217
SHP77      SCLC            217
SKMES1     LUSC            217
SW1271     SCLC            218
SW1573     LUAD            217
Length: 125, dtype: int64

In [33]:
# Drop every row whose OncotreeCode is LUAD, LUSC, SCLC, NSCLC, LCLC or LUAS,
# so that df_all_merged only holds the other codes from here on.
df_all_merged = df_all_merged.loc[
    ~df_all_merged['OncotreeCode'].isin(['LUAD', 'LUSC', 'SCLC', 'NSCLC', 'LCLC', 'LUAS'])
]
print(df_all_merged.shape)

(115246, 4)


In [34]:
# One sub-frame per OncotreeCode found in df_lung
df_lung_LUAD = df_lung.loc[df_lung['OncotreeCode'].eq('LUAD')]
df_lung_LUSC = df_lung.loc[df_lung['OncotreeCode'].eq('LUSC')]
df_lung_SCLC = df_lung.loc[df_lung['OncotreeCode'].eq('SCLC')]
df_lung_NSCLC = df_lung.loc[df_lung['OncotreeCode'].eq('NSCLC')]
df_lung_LCLC = df_lung.loc[df_lung['OncotreeCode'].eq('LCLC')]
# Mean IC50 of each subset, one value per line (LUAD, LUSC, SCLC, NSCLC, LCLC)
print(
    df_lung_LUAD['IC50'].mean(),
    df_lung_LUSC['IC50'].mean(),
    df_lung_SCLC['IC50'].mean(),
    df_lung_NSCLC['IC50'].mean(),
    df_lung_LCLC['IC50'].mean(),
    sep='\n',
)
# Shape of each subset, one per line, in the same order
print(
    df_lung_LUAD.shape,
    df_lung_LUSC.shape,
    df_lung_SCLC.shape,
    df_lung_NSCLC.shape,
    df_lung_LCLC.shape,
    sep='\n',
)

3.2642688614833855
2.847476608967291
2.774671021427666
3.455652426305354
3.0147438154761907
(10894, 4)
(2721, 4)
(7887, 4)
(1513, 4)
(2352, 4)


In [35]:
# Rows per cell line in the LUAD subset: first the shape of the count Series
# (i.e. how many distinct cell lines), then the counts themselves
print(df_lung_LUAD.groupby(by='cell_line').size().shape)
df_lung_LUAD.groupby(by='cell_line').size()

(53,)


cell_line
A427        212
A549        219
ABC1        211
CALU3       218
CALU6       217
CORL105     218
EKVX        215
HCC44       208
HCC78       218
HCC827      216
HOP62       217
LXF289      217
NCIH1355    218
NCIH1435    216
NCIH1437    180
NCIH1563    217
NCIH1568    217
NCIH1573    152
NCIH1623    217
NCIH1648    152
NCIH1651    218
NCIH1666    218
NCIH1693    217
NCIH1734    179
NCIH1755    218
NCIH1781    217
NCIH1792    218
NCIH1793    217
NCIH1838    152
NCIH1944    217
NCIH1975    216
NCIH2009    152
NCIH2023    218
NCIH2030    180
NCIH2085    218
NCIH2087    218
NCIH2122    217
NCIH2228    217
NCIH2291    180
NCIH23      217
NCIH2342    152
NCIH2347    152
NCIH2405    217
NCIH3122    217
NCIH358     218
NCIH441     218
NCIH522     217
NCIH650     217
NCIH838     180
PC14        233
RERFLCKJ    217
RERFLCMS    180
SW1573      217
dtype: int64

In [36]:
# Mean IC50 per cell line within the LUAD subset
df_lung_LUAD.groupby(by='cell_line')['IC50'].mean()

cell_line
A427        2.083734
A549        2.126839
ABC1        3.373758
CALU3       3.685103
CALU6       3.259278
CORL105     3.331660
EKVX        4.287032
HCC44       2.315300
HCC78       2.764644
HCC827      3.464406
HOP62       3.653889
LXF289      2.541765
NCIH1355    3.869713
NCIH1435    3.460048
NCIH1437    2.259255
NCIH1563    3.567885
NCIH1568    3.905032
NCIH1573    3.659925
NCIH1623    2.830177
NCIH1648    2.244419
NCIH1651    2.828278
NCIH1666    3.671152
NCIH1693    4.349220
NCIH1734    2.196774
NCIH1755    2.765261
NCIH1781    2.953486
NCIH1792    2.236629
NCIH1793    4.643482
NCIH1838    4.492617
NCIH1944    2.900394
NCIH1975    2.725264
NCIH2009    2.674907
NCIH2023    2.990786
NCIH2030    2.820422
NCIH2085    3.504790
NCIH2087    3.883753
NCIH2122    1.631861
NCIH2228    3.561005
NCIH2291    4.185628
NCIH23      2.790524
NCIH2342    3.688610
NCIH2347    4.802210
NCIH2405    5.198243
NCIH3122    2.987052
NCIH358     3.420865
NCIH441     5.336580
NCIH522     3.214659
NCI

In [37]:
# Rows per cell line in the LUSC subset: first the shape of the count Series
# (i.e. how many distinct cell lines), then the counts themselves
print(df_lung_LUSC.groupby(by='cell_line').size().shape)
df_lung_LUSC.groupby(by='cell_line').size()

(13,)


cell_line
EBC1         217
EPLC272H     217
HARA         218
HCC15        216
KNS62        218
LK2          217
LOUNH91      217
NCIH1703     153
NCIH1869     181
NCIH2170     215
NCIH520      218
RERFLCSQ1    217
SKMES1       217
dtype: int64

In [38]:
# Mean IC50 per cell line within the LUSC subset
df_lung_LUSC.groupby(by='cell_line')['IC50'].mean()

cell_line
EBC1         2.756060
EPLC272H     2.448825
HARA         2.220111
HCC15        3.204460
KNS62        3.113284
LK2          2.967368
LOUNH91      2.992204
NCIH1703     1.947610
NCIH1869     3.635554
NCIH2170     2.366260
NCIH520      3.030817
RERFLCSQ1    2.961082
SKMES1       3.236934
Name: IC50, dtype: float64

In [39]:
# Rows per cell line in the SCLC subset
df_lung_SCLC.groupby(by='cell_line').size()

cell_line
COLO668     218
CORL279     151
CORL311     217
CORL88      153
CORL95       90
DMS114      218
DMS273      217
DMS53       217
DMS79       212
HCC33       180
LU134A      180
LU135       217
LU165       218
NCIH1048    217
NCIH1092    217
NCIH1105    179
NCIH1341    152
NCIH1436    179
NCIH146     180
NCIH1694    199
NCIH1836    180
NCIH1876    216
NCIH196     217
NCIH1963    200
NCIH2029    214
NCIH2081    180
NCIH209     216
NCIH211     207
NCIH2171    178
NCIH2196    180
NCIH2227    211
NCIH446     216
NCIH524     218
NCIH526     214
NCIH69      151
NCIH82      218
NCIH841     208
SBC5        217
SHP77       217
SW1271      218
dtype: int64

In [40]:
# Mean IC50 per cell line within the SCLC subset
df_lung_SCLC.groupby(by='cell_line')['IC50'].mean()

cell_line
COLO668     2.958644
CORL279     1.840898
CORL311     2.292915
CORL88      3.377413
CORL95      3.293159
DMS114      2.996707
DMS273      2.297249
DMS53       3.465210
DMS79       3.029730
HCC33       2.536028
LU134A      2.009420
LU135       2.600281
LU165       4.729243
NCIH1048    2.608096
NCIH1092    3.994255
NCIH1105    3.127476
NCIH1341    2.525309
NCIH1436    2.876299
NCIH146     2.549774
NCIH1694    2.903202
NCIH1836    4.343933
NCIH1876    2.184054
NCIH196     3.913057
NCIH1963    2.265536
NCIH2029    3.344416
NCIH2081    2.720299
NCIH209     1.959613
NCIH211     1.567235
NCIH2171    1.512347
NCIH2196    3.194765
NCIH2227    3.090786
NCIH446     2.121046
NCIH524     2.057855
NCIH526     2.202300
NCIH69      2.314723
NCIH82      1.890956
NCIH841     3.068002
SBC5        2.299435
SHP77       2.526963
SW1271      4.275926
Name: IC50, dtype: float64

In [41]:
# Rows per cell line in the NSCLC subset
df_lung_NSCLC.groupby(by='cell_line').size()

cell_line
BEN         180
CAL12T      180
CHAGOK1     180
HOP92       217
NCIH1650    217
NCIH2110    180
NCIH2172    180
NCIH2444    179
dtype: int64

In [42]:
# Mean IC50 per cell line within the NSCLC subset
df_lung_NSCLC.groupby(by='cell_line')['IC50'].mean()

cell_line
BEN         2.940921
CAL12T      2.717184
CHAGOK1     3.142675
HOP92       3.507879
NCIH1650    4.404607
NCIH2110    2.131641
NCIH2172    3.678711
NCIH2444    4.923961
Name: IC50, dtype: float64

In [43]:
# Rows per cell line in the LCLC subset
df_lung_LCLC.groupby(by='cell_line').size()

cell_line
CORL23       217
IALM         218
LCLC103H     218
LCLC97TM1    217
NCIH1155     216
NCIH1299     218
NCIH1581     218
NCIH1915     217
NCIH460      178
NCIH661      217
NCIH810      218
dtype: int64

In [44]:
# Mean IC50 per cell line within the LCLC subset
df_lung_LCLC.groupby(by='cell_line')['IC50'].mean()

cell_line
CORL23       3.102153
IALM         4.097379
LCLC103H     3.168922
LCLC97TM1    2.824349
NCIH1155     2.313106
NCIH1299     3.083103
NCIH1581     2.484422
NCIH1915     3.383698
NCIH460      1.705192
NCIH661      3.682746
NCIH810      3.074673
Name: IC50, dtype: float64

# 分割训练和测试集 Cell Blind

In [52]:
# Seed NumPy's global RNG so the shuffle below gives the same order on every run
np.random.seed(3)
# Distinct cell lines left in df_all_merged, shuffled in place
unique_cell_lines = df_all_merged['cell_line'].unique()
np.random.shuffle(unique_cell_lines)
# Fraction of the cell lines held out for testing; the count is rounded up
test_size = 0.082
n_test = int(np.ceil(test_size * len(unique_cell_lines)))
print('测试集细胞系数量：',n_test)
# The first n_test shuffled cell lines form the test set, the rest the training set
test_cell_lines, train_cell_lines = unique_cell_lines[:n_test], unique_cell_lines[n_test:]

# Split the rows by which group their cell line belongs to, so a cell line's
# rows all land on the same side of the split
df_train = df_all_merged.loc[df_all_merged['cell_line'].isin(train_cell_lines)]
df_test = df_all_merged.loc[df_all_merged['cell_line'].isin(test_cell_lines)]

print(df_train.shape, df_test.shape, sep='\n')


测试集细胞系数量： 46
(105754, 4)
(9492, 4)


In [53]:
# Hold out a fixed, hand-picked set of lung cell lines as the lung test set;
# every other cell line in df_lung goes to the lung training set.
# The list is written once so the test selection and its complement cannot
# drift apart (it used to be duplicated, and the old comment named a different
# set of cell lines than the code actually used).
lung_test_cell_lines = [
    'NCIH2172', 'CHAGOK1', 'LCLC103H', 'NCIH1299', 'NCIH2170',
    'RERFLCSQ1', 'NCIH146', 'NCIH526', 'NCIH1963', 'LU135',
    'EKVX', 'PC14', 'HCC827', 'NCIH1355', 'NCIH2228',
]
# Fail fast if a name is misspelled or absent: isin() would otherwise ignore it
# silently and the test set would be smaller than intended.
missing_lung_test = sorted(set(lung_test_cell_lines) - set(df_lung['cell_line']))
assert not missing_lung_test, f'held-out lung cell lines not found in df_lung: {missing_lung_test}'
is_lung_test = df_lung['cell_line'].isin(lung_test_cell_lines)
df_lung_test = df_lung[is_lung_test]
df_lung_train = df_lung[~is_lung_test]
print(df_lung_train.shape)
print(df_lung_test.shape)

(22229, 4)
(3138, 4)


In [54]:
# Stack the non-lung and lung parts row-wise: train with train, test with test
df_train_final = pd.concat([df_train, df_lung_train], axis=0)
df_test_final = pd.concat([df_test, df_lung_test], axis=0)
print(df_train_final.shape, df_test_final.shape, sep='\n')

(127983, 4)
(12630, 4)


In [56]:
# Give both final frames a fresh 0..n-1 index, discarding the old row labels
df_train_final, df_test_final = (
    frame.reset_index(drop=True) for frame in (df_train_final, df_test_final)
)