In [1]:
import os

import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# Preview the Cell Blind test CSV; its first column is read as the row index.
cell_blind_test_csv = '../model_omics_experiment/data/drug_sensitivity_CellBlind_test.csv'
df_temp = pd.read_csv(cell_blind_test_csv, index_col=0)
df_temp.head(5)

Unnamed: 0,drug,cell_line,IC50
0,5-Fluorouracil,PC14,4.138791
1,5-azacytidine,PC14,2.782368
2,A-366,PC14,4.651003
3,ABT737,PC14,2.341568
4,AGI-5198,PC14,5.232588


In [3]:
# Load the drug-sensitivity table, report its dimensions, and show the first rows.
sensitivity_csv = '../data/drug_sensitivity.csv'
df = pd.read_csv(sensitivity_csv)
print(df.shape)
df.head(5)

(141222, 3)


Unnamed: 0,drug,cell_line,IC50
0,5-Fluorouracil,HL60,2.558926
1,5-azacytidine,HL60,0.917132
2,A-366,HL60,4.83616
3,ABT737,HL60,-2.817798
4,AGI-5198,HL60,3.644734


In [18]:
# Mixed Set split strategy: each cell line's rows are K-fold split on their
# own, so every cell line contributes rows to both the train and the test
# part of every fold.
n_splits = 11      # number of cross-validation folds
random_state = 42  # fixed seed so re-running the cell rewrites identical folds
mixed_dir = '../model_omics_experiment/data/k_fold_data/mixed'

# Create the output folder up front; to_csv does not create missing directories.
os.makedirs(mixed_dir, exist_ok=True)

# Group the rows by cell line.
grouped = df.groupby('cell_line')
min_group_size = min(grouped.size())
# KFold cannot cut a group into more folds than it has rows; fail early with
# a message that names the problem instead of erroring part-way through.
if min_group_size < n_splits:
    raise ValueError(
        'smallest cell_line group has {} rows, fewer than n_splits={}'.format(
            min_group_size, n_splits))

# (train, test) pairs: n_splits consecutive entries per cell line, so entry
# i, i + n_splits, i + 2 * n_splits, ... all belong to fold i.
cv_datasets = []
for group_idx, (name, group) in enumerate(grouped):
    # Offset the seed per group: with one shared seed, every group of the
    # same size would be shuffled with the same row permutation.
    kf = KFold(n_splits=n_splits, shuffle=True,
               random_state=random_state + group_idx)
    for train_index, test_index in kf.split(group):
        cv_datasets.append((group.iloc[train_index], group.iloc[test_index]))

# Assemble the K-fold split of the whole data set from the per-group pieces.
for i in range(n_splits):
    fold_parts = cv_datasets[i::n_splits]
    # Concatenate this fold's pieces and renumber the rows from 0.
    total_train_df = pd.concat([train for train, _ in fold_parts]).reset_index(drop=True)
    total_test_df = pd.concat([test for _, test in fold_parts]).reset_index(drop=True)
    # Write the fold to disk (the fresh row index is kept as the first column).
    total_train_df.to_csv('{}/MixedSet_train_Fold{}.csv'.format(mixed_dir, i), index=True)
    total_test_df.to_csv('{}/MixedSet_test_Fold{}.csv'.format(mixed_dir, i), index=True)
    print(total_train_df.shape, total_test_df.shape)
    

(128137, 3) (13085, 3)
(128166, 3) (13056, 3)
(128198, 3) (13024, 3)
(128236, 3) (12986, 3)
(128300, 3) (12922, 3)
(128308, 3) (12914, 3)
(128324, 3) (12898, 3)
(128379, 3) (12843, 3)
(128601, 3) (12621, 3)
(128763, 3) (12459, 3)
(128808, 3) (12414, 3)


In [19]:
# Cell Blind split strategy: the K-fold split is made over the distinct cell
# lines rather than over rows, so no cell line appears in both the train and
# the test part of a fold.
n_splits = 11      # number of cross-validation folds
random_state = 42  # fixed seed so re-running the cell rewrites identical folds
cell_blind_dir = '../model_omics_experiment/data/k_fold_data/cell_blind'

# Create the output folder up front; to_csv does not create missing directories.
os.makedirs(cell_blind_dir, exist_ok=True)

# All distinct cell lines, in order of first appearance.
cell_lines = df['cell_line'].unique()
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
for count, (train_index, test_index) in enumerate(kf.split(cell_lines)):
    train_cell_lines = cell_lines[train_index]
    test_cell_lines = cell_lines[test_index]

    # Select the rows of each side's cell lines and renumber them from 0.
    train_df = df[df['cell_line'].isin(train_cell_lines)].reset_index(drop=True)
    test_df = df[df['cell_line'].isin(test_cell_lines)].reset_index(drop=True)

    # Write the fold to disk (the fresh row index is kept as the first column).
    train_df.to_csv('{}/CellBlind_train_Fold{}.csv'.format(cell_blind_dir, count), index=True)
    test_df.to_csv('{}/CellBlind_test_Fold{}.csv'.format(cell_blind_dir, count), index=True)

    print("TRAIN:", train_df.shape, "TEST:", test_df.shape)
    # Sanity check: cell lines shared by train and test; expected to be empty.
    print("Cell lines:", set(train_df['cell_line']).intersection(set(test_df['cell_line'])))

TRAIN: (128411, 3) TEST: (12811, 3)
Cell lines: set()
TRAIN: (128186, 3) TEST: (13036, 3)
Cell lines: set()
TRAIN: (128226, 3) TEST: (12996, 3)
Cell lines: set()
TRAIN: (128150, 3) TEST: (13072, 3)
Cell lines: set()
TRAIN: (128474, 3) TEST: (12748, 3)
Cell lines: set()
TRAIN: (128293, 3) TEST: (12929, 3)
Cell lines: set()
TRAIN: (129046, 3) TEST: (12176, 3)
Cell lines: set()
TRAIN: (128270, 3) TEST: (12952, 3)
Cell lines: set()
TRAIN: (128358, 3) TEST: (12864, 3)
Cell lines: set()
TRAIN: (128365, 3) TEST: (12857, 3)
Cell lines: set()
TRAIN: (128441, 3) TEST: (12781, 3)
Cell lines: set()
