In [1]:
import os

import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# Preview the Cell-Blind test CSV; its first column is used as the row index.
df_temp = pd.read_csv(
    '../model_omics_experiment/data/drug_sensitivity_CellBlind_test.csv',
    index_col=0,
)
df_temp.head()

Unnamed: 0,drug,cell_line,IC50
0,5-Fluorouracil,PC14,4.138791
1,5-azacytidine,PC14,2.782368
2,A-366,PC14,4.651003
3,ABT737,PC14,2.341568
4,AGI-5198,PC14,5.232588


In [3]:
# Load the full drug-sensitivity table (columns: drug, cell_line, IC50),
# report its dimensions, and preview the first rows.
df = pd.read_csv(
    '../data/drug_sensitivity.csv',
)
print(df.shape)
df.head()

(141222, 3)


Unnamed: 0,drug,cell_line,IC50
0,5-Fluorouracil,HL60,2.558926
1,5-azacytidine,HL60,0.917132
2,A-366,HL60,4.83616
3,ABT737,HL60,-2.817798
4,AGI-5198,HL60,3.644734


In [6]:
# Mixed-Set split strategy: run an independent shuffled K-fold inside every
# cell_line group, then merge the i-th fold of all groups into the i-th
# dataset-wide fold. Each cell line therefore contributes rows to both the
# train and the test file of every fold.

# Number of cross-validation folds.
n_splits = 10

# Output folder for the fold CSVs. It is created up front because to_csv
# does not create missing parent directories.
mixed_dir = '../model_omics_experiment/data/k_fold_data/mixed'
os.makedirs(mixed_dir, exist_ok=True)

# Group the rows by cell line.
grouped = df.groupby('cell_line')

# Flat list of (train, test) pairs: n_splits consecutive entries per group,
# in fold order.
cv_datasets = []
for name, group in grouped:
    # NOTE: KFold raises ValueError when a group has fewer than n_splits rows.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(group):
        cv_datasets.append((group.iloc[train_index], group.iloc[test_index]))

# Assemble the dataset-wide folds. Every group appended exactly n_splits
# pairs, so the stride slice [i::n_splits] selects fold i of each group.
for i in range(n_splits):
    fold_pairs = cv_datasets[i::n_splits]
    total_train_df = pd.concat([pair[0] for pair in fold_pairs])
    total_test_df = pd.concat([pair[1] for pair in fold_pairs])
    # Write the fold to disk (row index is not saved).
    total_train_df.to_csv('{}/MixedSet_train_Fold{}.csv'.format(mixed_dir, i), index=False)
    total_test_df.to_csv('{}/MixedSet_test_Fold{}.csv'.format(mixed_dir, i), index=False)
    print(total_train_df.shape, total_test_df.shape)
    

(126876, 3) (14346, 3)
(126904, 3) (14318, 3)
(126945, 3) (14277, 3)
(126982, 3) (14240, 3)
(126998, 3) (14224, 3)
(127015, 3) (14207, 3)
(127061, 3) (14161, 3)
(127283, 3) (13939, 3)
(127444, 3) (13778, 3)
(127490, 3) (13732, 3)


In [8]:
# Cell-Blind split strategy: K-fold over the *unique cell lines* rather than
# over rows, so all rows of a given cell line land on the same side of a
# fold (train or test, never both).

# Output folder for the fold CSVs. It is created up front because to_csv
# does not create missing parent directories.
cell_blind_dir = '../model_omics_experiment/data/k_fold_data/cell_blind'
os.makedirs(cell_blind_dir, exist_ok=True)

# All distinct cell lines, in order of first appearance in df.
cell_lines = df['cell_line'].unique()
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for count, (train_index, test_index) in enumerate(kf.split(cell_lines)):
    # Map the positional fold indices back to cell-line names.
    train_cell_lines = cell_lines[train_index]
    test_cell_lines = cell_lines[test_index]

    # Select every row belonging to the chosen cell lines.
    train_df = df[df['cell_line'].isin(train_cell_lines)]
    test_df = df[df['cell_line'].isin(test_cell_lines)]

    # Write the fold to disk (row index is not saved).
    train_df.to_csv('{}/CellBlind_train_Fold{}.csv'.format(cell_blind_dir, count), index=False)
    test_df.to_csv('{}/CellBlind_test_Fold{}.csv'.format(cell_blind_dir, count), index=False)

    print("TRAIN:", train_df.shape, "TEST:", test_df.shape)
    # Sanity check: the printed overlap between train and test cell lines
    # should be the empty set.
    print("Cell lines:", set(train_df['cell_line']).intersection(set(test_df['cell_line'])))

TRAIN: (127041, 3) TEST: (14181, 3)
Cell lines: set()
TRAIN: (127062, 3) TEST: (14160, 3)
Cell lines: set()
TRAIN: (126737, 3) TEST: (14485, 3)
Cell lines: set()
TRAIN: (127406, 3) TEST: (13816, 3)
Cell lines: set()
TRAIN: (127236, 3) TEST: (13986, 3)
Cell lines: set()
TRAIN: (126898, 3) TEST: (14324, 3)
Cell lines: set()
TRAIN: (127212, 3) TEST: (14010, 3)
Cell lines: set()
TRAIN: (127013, 3) TEST: (14209, 3)
Cell lines: set()
TRAIN: (127402, 3) TEST: (13820, 3)
Cell lines: set()
TRAIN: (126991, 3) TEST: (14231, 3)
Cell lines: set()
