# 将数据分割成训练集和测试集

In [12]:
import pandas as pd
import numpy as np

In [2]:
# Load the drug sensitivity table, keeping only the drug / cell_line / IC50 columns.
df_drug_sensitivity = pd.read_csv(
    '../data/drug_sensitivity_duplicates_mean.csv'
).loc[:, ['drug', 'cell_line', 'IC50']]
# Report (rows, columns), then preview the first rows as the cell output.
print(df_drug_sensitivity.shape)
df_drug_sensitivity.head()

(229608, 3)


Unnamed: 0,drug,cell_line,IC50
0,123138,HL60,2.397135
1,123829,HL60,1.910071
2,150412,HL60,1.909186
3,5-Fluorouracil,HL60,2.558926
4,5-azacytidine,HL60,0.917132


# 缩减数据，只保留细胞系在df_gsva_cell中的数据

In [3]:
# Read only the cell_line column of the KEGG MEDICUS table; it is used below as the
# list of cell lines to keep in the drug sensitivity data.
df_gsva_cell = pd.read_csv(
    '../data_omics/GeneExp_Wilcoxon_test_Analysis_Log10_P_value_C2_KEGG_MEDICUS.csv'
).loc[:, 'cell_line']
print(df_gsva_cell.shape)
df_gsva_cell.head()

(688,)


0        HL60
1         HEL
2    MONOMAC6
3       LS513
4       A101D
Name: cell_line, dtype: object

In [4]:
# Keep only the sensitivity rows whose cell line also appears in df_gsva_cell.
df_drug_sensitivity = df_drug_sensitivity.loc[
    df_drug_sensitivity['cell_line'].isin(df_gsva_cell)
]
print(df_drug_sensitivity.shape)
df_drug_sensitivity.head()

(171013, 3)


Unnamed: 0,drug,cell_line,IC50
0,123138,HL60,2.397135
1,123829,HL60,1.910071
2,150412,HL60,1.909186
3,5-Fluorouracil,HL60,2.558926
4,5-azacytidine,HL60,0.917132


In [5]:
# Keep only the sensitivity rows whose drug is listed in the DRUG_NAME column of the SMILES table.
df_drugs = pd.read_csv('../data/CCLE-GDSC-SMILES.csv').loc[:, 'DRUG_NAME']
print(df_drugs.shape)
df_drug_sensitivity = df_drug_sensitivity.loc[
    df_drug_sensitivity['drug'].isin(df_drugs)
]
print(df_drug_sensitivity.shape)
df_drug_sensitivity.head()

(233,)
(141222, 3)


Unnamed: 0,drug,cell_line,IC50
3,5-Fluorouracil,HL60,2.558926
4,5-azacytidine,HL60,0.917132
15,A-366,HL60,4.83616
16,ABT737,HL60,-2.817798
17,AGI-5198,HL60,3.644734


# 1.严格保持细胞系数据隔离 Cell Blind

In [6]:
# Cell-blind split: every cell line goes entirely to either the training set or the test set,
# so the two sets never share a cell line. The target ratio is train:test = 10:1.

# Fix NumPy's global seed so the shuffle below is reproducible.
np.random.seed(3)

# Distinct cell lines, shuffled in place into a random order.
unique_cell_lines = df_drug_sensitivity['cell_line'].unique()
np.random.shuffle(unique_cell_lines)

# Fraction of cell lines held out for testing; the resulting count is rounded up.
test_size = 0.09
n_test = int(np.ceil(test_size * len(unique_cell_lines)))
print('测试集细胞系数量：',n_test)

# The first n_test shuffled cell lines form the test group, the remainder the training group.
test_cell_lines, train_cell_lines = unique_cell_lines[:n_test], unique_cell_lines[n_test:]

# Select the rows of the sensitivity table that belong to each group of cell lines.
df_train = df_drug_sensitivity.loc[df_drug_sensitivity['cell_line'].isin(train_cell_lines)]
df_test = df_drug_sensitivity.loc[df_drug_sensitivity['cell_line'].isin(test_cell_lines)]

# Shapes of the two splits (one per line), followed by the held-out cell lines.
print(df_train.shape, df_test.shape, sep='\n')

print(test_cell_lines)

测试集细胞系数量： 62
(128613, 3)
(12609, 3)
['MM1S' 'JHOS4' 'SIHA' 'MOLM13' 'CAOV3' 'OCIM1' 'T98G' 'KYSE270'
 'KARPAS620' '42MGBA' 'RCK8' 'TE441T' 'TC71' 'DEL' 'NCIH660' 'OE19'
 'SKGT2' 'RERFLCMS' 'P30OHK' 'KMRC1' 'NCIH1105' 'NCIH211' 'COLO678'
 'CA922' 'HUCCT1' 'CAKI1' 'MHHCALL4' 'SKMEL1' '647V' 'DANG' 'LK2' 'TT'
 'U2OS' 'DU4475' 'A2780' 'NCIH1355' 'HS578T' 'SKNFI' 'NCIH1869' 'GAMG'
 'PC14' 'A704' 'SKNSH' 'SUDHL6' 'VAL' 'EKVX' 'TOV21G' 'UACC812' 'NCIH1563'
 'HCC1143' 'HCC1187' 'EN' 'SAS' 'COLO684' 'HCT15' 'HARA' 'EFO21' 'NCIH522'
 'NCIH1648' 'KCL22' 'NCIH2452' 'CAL33']


In [7]:
# Number of training samples per cell line.
counts = df_train['cell_line'].value_counts()
print(counts)
# Distinct drugs present in the training set for cell line SW620.
drugs = df_train.loc[df_train['cell_line'] == 'SW620', 'drug'].unique()
print(drugs)

cell_line
HT29       233
SW620      233
C32        233
MHHES1     232
A375       232
          ... 
NB1        140
SNU1040    109
CORL95      90
HCC202      14
RH18        11
Name: count, Length: 626, dtype: int64
['5-Fluorouracil' '5-azacytidine' 'A-366' 'ABT737' 'AGI-5198' 'AGI-6780'
 'AGK2' 'AMG-319' 'AT13148' 'AZ6102' 'AZ960' 'AZD1208' 'AZD1332' 'AZD2014'
 'AZD3759' 'AZD4547' 'AZD5153' 'AZD5363' 'AZD5438' 'AZD5582' 'AZD5991'
 'AZD6482' 'AZD6738' 'AZD7762' 'AZD8055' 'AZD8186' 'Acetalax' 'Afatinib'
 'Afuresertib' 'Alisertib' 'Alpelisib' 'Avagacestat' 'Axitinib' 'BI-2536'
 'BIBR-1532' 'BMS-345541' 'BMS-536924' 'BMS-754807' 'BX795' 'Bicalutamide'
 'Bleomycin' 'Bleomycin (50 uM)' 'Bortezomib' 'Bosutinib' 'Bromosporine'
 'Buparlisib' 'CCT-018159' 'CCT007093' 'CHIR-99021' 'CPI-637' 'CZC24832'
 'Camptothecin' 'Carmustine' 'Cediranib' 'Cisplatin' 'Crizotinib'
 'Cyclophosphamide' 'Cytarabine' 'Dabrafenib' 'Dacarbazine' 'Dactinomycin'
 'Dactolisib' 'Daporinad' 'Dasatinib' 'Dihydrorotenone' 'D

In [8]:
# Renumber the rows of both splits from 0 so the index written to disk is contiguous.
df_train, df_test = (frame.reset_index(drop=True) for frame in (df_train, df_test))
# Write both splits to CSV; the (reset) index is written as the first column.
df_train.to_csv(
    '../planD/data/drug_sensitivity_CellBlind_train.csv',
    index=True,
    header=True,
)
df_test.to_csv(
    '../planD/data/drug_sensitivity_CellBlind_test.csv',
    index=True,
    header=True,
)

# 2.将数据分割成训练集和测试集，严格保持药物数据隔离 Drug Blind（TODO：本节尚未实现）

# 3.将数据分割成训练和测试，混合集 Mixed-Set

In [6]:
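# Split df_drug_sensitivity into training and test sets at roughly train:test = 10:1. The split is made within each cell line, so every cell line with at least two samples appears in both sets; drugs are not stratified.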
import pandas as pd
from sklearn.model_selection import train_test_split

# 假设df_drug_sensitivity是你的数据集
# df_drug_sensitivity = pd.read_csv('path_to_your_dataset.csv')
def split_data_by_cell_line(df, test_size=1/11.5, random_state=None):
    """Split ``df`` into train/test sets separately within every cell line.

    Rows are grouped by the ``cell_line`` column and each group is split on its
    own, so a cell line with at least two rows contributes to both sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``cell_line`` column. Rows whose ``cell_line`` is missing
        are not part of any group (pandas ``groupby`` default) and therefore
        appear in neither output.
    test_size : float, default 1/11.5
        Fraction of each group with more than 10 rows that goes to the test
        set (forwarded to ``train_test_split``). The default approximates the
        intended 10:1 train:test ratio.
    random_state : None, int or numpy.random.RandomState, default None
        Seed or generator for the random splits. ``None`` keeps the original
        behaviour (``train_test_split`` is called with ``random_state=None``).
        With an int, a single generator is created and shared across groups,
        so the whole split is reproducible while groups still get different
        draws.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Concatenated per-group splits, each with a fresh 0..n-1 index.
        For an empty ``df`` two empty frames with ``df``'s columns are returned.

    Raises
    ------
    KeyError
        If ``df`` has no ``cell_line`` column.
    """
    # Build one generator for the whole call so that an integer seed does not
    # replay the same permutation for every group.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    train_data = []
    test_data = []

    # Split every cell line independently.
    for _, group in df.groupby('cell_line'):
        if len(group) <= 10:
            # Small group: hold out exactly the last row. A single-row group
            # therefore ends up only in the test set.
            group_train, group_test = group.iloc[:-1], group.iloc[-1:]
        else:
            group_train, group_test = train_test_split(
                group, test_size=test_size, random_state=rng
            )

        train_data.append(group_train)
        test_data.append(group_test)

    # pd.concat raises on an empty list, so handle "no groups" explicitly.
    if not train_data:
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    # Stitch the per-group pieces back together with a clean index.
    train_df = pd.concat(train_data, ignore_index=True)
    test_df = pd.concat(test_data, ignore_index=True)

    return train_df, test_df


In [7]:

# Seed NumPy's global random state immediately before splitting: split_data_by_cell_line is
# called without an explicit random_state, and scikit-learn's train_test_split then draws from
# the global NumPy RNG, so without a seed every run produces a different train/test partition.
np.random.seed(3)
train_df, test_df = split_data_by_cell_line(df_drug_sensitivity)
print(train_df.shape)
print(test_df.shape)
train_df.head()

(128741, 3)
(12481, 3)


Unnamed: 0,drug,cell_line,IC50
0,Entinostat,22RV1,1.987387
1,GSK2830371,22RV1,6.076753
2,SN-38,22RV1,-4.972312
3,Nutlin-3a (-),22RV1,2.716152
4,Wnt-C59,22RV1,4.158391


In [11]:
# Number of training samples per cell line in the mixed-set split.
counts = train_df['cell_line'].value_counts()
print(counts)
# Distinct drugs present in the training set for cell line HL60, and how many there are.
drugs = train_df.loc[train_df['cell_line'] == 'HL60', 'drug'].unique()
print(drugs)
print(len(drugs))

cell_line
HT29       212
C32        212
PC14       212
SW620      212
KMH2       211
          ... 
SNU16      127
SNU1040     99
CORL95      82
HCC202      12
RH18        10
Name: count, Length: 688, dtype: int64
['Lenalidomide' '5-Fluorouracil' 'YK-4-279' 'Vismodegib' 'Leflunomide'
 'Bleomycin (50 uM)' 'Mirin' 'Wee1 Inhibitor' 'Ulixertinib' 'IWP-2'
 'Taselisib' 'Obatoclax Mesylate' 'Tamoxifen' 'Pyridostatin' 'SN-38'
 'AZD8055' 'WZ4003' 'AZD4547' 'AZD2014' 'EPZ5676'
 'Podophyllotoxin bromide' 'POMHEX' 'Fludarabine' 'Bosutinib' 'LCL161'
 'AZD6738' 'GSK2830371' 'Doramapimod' 'Osimertinib' 'Sepantronium bromide'
 'GSK2578215A' 'JQ1' 'Refametinib' 'Niraparib' 'RVX-208' 'ICL-SIRT078'
 'VE-822' 'Vinblastine' 'GSK1904529A' 'WIKI4' 'Dihydrorotenone'
 'Elephantin' 'I-BRD9' 'Venetoclax' 'Schweinfurthin A' 'glutathione'
 'VX-11e' 'AZD5363' 'UNC0379' 'Carmustine' 'AZD1208' 'Rapamycin'
 'GSK-LSD1' 'CZC24832' 'Epirubicin' 'GSK2606414' 'LJI308' 'Trametinib'
 'Entospletinib' 'BIBR-1532' 'CPI-637' 'GN

In [12]:
# No index reset is performed here; the frames are written with whatever index they carry
# (split_data_by_cell_line builds them with ignore_index=True).
# Write both mixed-set splits to CSV, with the index as the first column.
train_df.to_csv(
    '../planD/data/drug_sensitivity_MixedSet_train.csv',
    index=True,
    header=True,
)
test_df.to_csv(
    '../planD/data/drug_sensitivity_MixedSet_test.csv',
    index=True,
    header=True,
)

In [6]:
# Load the cell line -> OncotreeCode mapping and rename StrippedCellLineName to cell_line
# so it can be joined with the drug sensitivity tables.
df_model = (
    pd.read_csv('../Figs/data/Model.csv')
    .loc[:, ['StrippedCellLineName', 'OncotreeCode']]
    .rename(columns={'StrippedCellLineName': 'cell_line'})
)
print(df_model.shape)
df_model.head()

(1864, 2)


Unnamed: 0,cell_line,OncotreeCode
0,NIHOVCAR3,HGSOC
1,HL60,AML
2,CACO2,COAD
3,HEL,AML
4,HEL9217,AML


# 调整数据，将药物Erlotinib加入测试集

In [19]:
# Load the Erlotinib-adjusted mixed-set training split, drop the first column
# (the index column written by to_csv), and order the rows by cell line, then drug.
df_drug_sensitivity_MixedSet_Erlotinib_train = (
    pd.read_csv('../model_omics_experiment/data/drug_sensitivity_MixedSet_Erlotinib_train.csv')
    .iloc[:, 1:]
    .sort_values(by=['cell_line', 'drug'])
)
print(df_drug_sensitivity_MixedSet_Erlotinib_train.shape)
df_drug_sensitivity_MixedSet_Erlotinib_train.head()

(128737, 4)


Unnamed: 0,drug,cell_line,IC50,OncotreeCode
0,5-Fluorouracil,22RV1,2.319585,PRAD
1,5-azacytidine,22RV1,3.192165,PRAD
2,A-366,22RV1,4.685324,PRAD
3,ABT737,22RV1,3.98776,PRAD
4,AGI-5198,22RV1,6.255464,PRAD


In [4]:
# Attach each cell line's OncotreeCode (cancer type) from df_model via a left join on cell_line.
# The training CSV is written back to the path it was read from, so on a re-run the loaded
# frame already contains an OncotreeCode column; merging again would then produce
# OncotreeCode_x / OncotreeCode_y. Dropping any existing column first keeps this cell idempotent.
df_drug_sensitivity_MixedSet_Erlotinib_train = pd.merge(
    df_drug_sensitivity_MixedSet_Erlotinib_train.drop(columns=['OncotreeCode'], errors='ignore'),
    df_model,
    on='cell_line',
    how='left',
)
print(df_drug_sensitivity_MixedSet_Erlotinib_train.shape)
df_drug_sensitivity_MixedSet_Erlotinib_train.head()

NameError: name 'df_model' is not defined

In [20]:
# Renumber the rows from 0 (the frame was sorted earlier, so its index is out of order).
df_drug_sensitivity_MixedSet_Erlotinib_train = (
    df_drug_sensitivity_MixedSet_Erlotinib_train.reset_index(drop=True)
)
# Write the result back to the same file it was loaded from, index included.
df_drug_sensitivity_MixedSet_Erlotinib_train.to_csv(
    '../model_omics_experiment/data/drug_sensitivity_MixedSet_Erlotinib_train.csv',
    index=True,
    header=True,
)

In [21]:
# Load the Erlotinib-adjusted mixed-set test split, drop the first column
# (the index column written by to_csv), and order the rows by cell line, then drug.
df_drug_sensitivity_MixedSet_Erlotinib_test = (
    pd.read_csv('../model_omics_experiment/data/drug_sensitivity_MixedSet_Erlotinib_test.csv')
    .iloc[:, 1:]
    .sort_values(by=['cell_line', 'drug'])
)
print(df_drug_sensitivity_MixedSet_Erlotinib_test.shape)
df_drug_sensitivity_MixedSet_Erlotinib_test.head()

(12485, 4)


Unnamed: 0,drug,cell_line,IC50,OncotreeCode
0,AZD5582,22RV1,2.402949,PRAD
1,BMS-754807,22RV1,2.273743,PRAD
2,Cediranib,22RV1,2.109625,PRAD
3,Crizotinib,22RV1,2.502973,PRAD
4,Daporinad,22RV1,-3.112784,PRAD


In [22]:
# Attach each cell line's OncotreeCode (cancer type) from df_model via a left join on cell_line.
# The test CSV is written back to the path it was read from, so on a re-run the loaded
# frame already contains an OncotreeCode column; merging again would then produce
# OncotreeCode_x / OncotreeCode_y. Dropping any existing column first keeps this cell idempotent.
df_drug_sensitivity_MixedSet_Erlotinib_test = pd.merge(
    df_drug_sensitivity_MixedSet_Erlotinib_test.drop(columns=['OncotreeCode'], errors='ignore'),
    df_model,
    on='cell_line',
    how='left',
)
print(df_drug_sensitivity_MixedSet_Erlotinib_test.shape)
df_drug_sensitivity_MixedSet_Erlotinib_test.head()

NameError: name 'df_model' is not defined

In [23]:
# Renumber the rows from 0 (the frame was sorted earlier, so its index is out of order).
df_drug_sensitivity_MixedSet_Erlotinib_test = (
    df_drug_sensitivity_MixedSet_Erlotinib_test.reset_index(drop=True)
)
# Write the result back to the same file it was loaded from, index included.
df_drug_sensitivity_MixedSet_Erlotinib_test.to_csv(
    '../model_omics_experiment/data/drug_sensitivity_MixedSet_Erlotinib_test.csv',
    index=True,
    header=True,
)


In [24]:
# Check whether any (cell_line, drug) pair occurs in both the training and the test split.
train = df_drug_sensitivity_MixedSet_Erlotinib_train.loc[:, ['cell_line', 'drug']]
test = df_drug_sensitivity_MixedSet_Erlotinib_test.loc[:, ['cell_line', 'drug']]
print(train.shape)
print(test.shape)
# Turn every row into a hashable (cell_line, drug) tuple and collect the distinct pairs.
train_set = set(map(tuple, train.values))
test_set = set(map(tuple, test.values))
print(len(train_set))
print(len(test_set))
# An empty set here means no pair is shared between the two splits.
print(train_set.intersection(test_set))

(128737, 2)
(12485, 2)
128737
12485
set()
