In [1]:
import os

import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# Load the Cell-Blind test CSV, taking its first column as the row index,
# and preview the first rows.
df_temp = pd.read_csv(
    '../model_omics_experiment/data/drug_sensitivity_CellBlind_test.csv',
    index_col=0,
)
df_temp.head()

Unnamed: 0,drug,cell_line,IC50
0,5-Fluorouracil,PC14,4.138791
1,5-azacytidine,PC14,2.782368
2,A-366,PC14,4.651003
3,ABT737,PC14,2.341568
4,AGI-5198,PC14,5.232588


In [3]:
# Read the drug-sensitivity table; `df` is the input of the splitting cells below.
df = pd.read_csv(
    '../data/drug_sensitivity.csv',
)
# Report (rows, columns), then show the first rows as the cell output.
print(df.shape)
df.head()

(141222, 3)


Unnamed: 0,drug,cell_line,IC50
0,5-Fluorouracil,HL60,2.558926
1,5-azacytidine,HL60,0.917132
2,A-366,HL60,4.83616
3,ABT737,HL60,-2.817798
4,AGI-5198,HL60,3.644734


In [5]:
# Mixed-Set split strategy: the rows of every cell line are K-fold split
# separately, so each cell line contributes rows to both the training and the
# test side of every fold.
n_splits = 10  # number of cross-validation folds
seed = 42      # fixed seed so re-running this cell regenerates identical folds
out_dir = '../model_omics_experiment/data/k_fold_data/mixed/10fold'

# Group the measurements by cell line.
grouped = df.groupby('cell_line')
group_sizes = grouped.size()
min_group_size = group_sizes.min()
# KFold cannot split a group that has fewer rows than folds; fail early with
# a message that names the cause instead of erroring mid-loop.
if min_group_size < n_splits:
    raise ValueError(
        'Smallest cell_line group has {} rows, fewer than n_splits={}'.format(
            min_group_size, n_splits
        )
    )

# One splitter is reused for every group (its settings do not change).
# An integer random_state makes the shuffling reproducible.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

# cv_datasets holds (train, test) pairs: n_splits consecutive entries per
# cell line, in group order.
cv_datasets = []
for name, group in grouped:
    for train_index, test_index in kf.split(group):
        cv_datasets.append((group.iloc[train_index], group.iloc[test_index]))

# Create the output folder so the CSV export works on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

# Assemble the dataset-wide folds: entries i, i + n_splits, i + 2 * n_splits, ...
# are the i-th fold of each cell line.
# NOTE: KFold gives the first (n_samples % n_splits) folds one extra test row,
# so low-numbered folds end up with slightly larger test sets.
total_rows = int(group_sizes.sum())
for i in range(n_splits):
    fold_pairs = cv_datasets[i::n_splits]
    # Concatenate the per-cell-line parts and renumber the rows from 0.
    total_train_df = pd.concat([pair[0] for pair in fold_pairs]).reset_index(drop=True)
    total_test_df = pd.concat([pair[1] for pair in fold_pairs]).reset_index(drop=True)
    # Every grouped row must land in exactly one side of the fold.
    assert len(total_train_df) + len(total_test_df) == total_rows, \
        'fold {} does not cover all grouped rows'.format(i)
    # Write the fold to disk (the fresh 0..n-1 index is stored as the first column).
    total_train_df.to_csv('{}/MixedSet_train_Fold{}.csv'.format(out_dir, i), index=True)
    total_test_df.to_csv('{}/MixedSet_test_Fold{}.csv'.format(out_dir, i), index=True)
    print(total_train_df.shape, total_test_df.shape)
    

(126876, 3) (14346, 3)
(126904, 3) (14318, 3)
(126945, 3) (14277, 3)
(126982, 3) (14240, 3)
(126998, 3) (14224, 3)
(127015, 3) (14207, 3)
(127061, 3) (14161, 3)
(127283, 3) (13939, 3)
(127444, 3) (13778, 3)
(127490, 3) (13732, 3)


In [8]:
# Cell-Blind split strategy: K-fold is applied to the list of cell lines
# rather than to the rows, so every cell line appears in either the training
# set or the test set of a fold, never both.
# Uses the `df` table loaded earlier in the notebook.
n_splits = 10  # number of cross-validation folds
seed = 42      # fixed seed so re-running this cell regenerates identical folds
out_dir = '../model_omics_experiment/data/k_fold_data/cell_blind/10fold'

# Create the output folder so the CSV export works on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

# All distinct cell lines, in order of first appearance.
cell_lines = df['cell_line'].unique()
kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

for fold, (train_index, test_index) in enumerate(kf.split(cell_lines)):
    train_cell_lines = cell_lines[train_index]
    test_cell_lines = cell_lines[test_index]

    # Select the rows of each side and renumber them from 0.
    train_df = df[df['cell_line'].isin(train_cell_lines)].reset_index(drop=True)
    test_df = df[df['cell_line'].isin(test_cell_lines)].reset_index(drop=True)

    # Enforce the cell-blind guarantee before anything is written to disk.
    overlap = set(train_df['cell_line']).intersection(set(test_df['cell_line']))
    assert not overlap, 'cell lines leaked between train and test: {}'.format(overlap)

    # Write the fold to disk (the fresh 0..n-1 index is stored as the first column).
    train_df.to_csv('{}/CellBlind_train_Fold{}.csv'.format(out_dir, fold), index=True)
    test_df.to_csv('{}/CellBlind_test_Fold{}.csv'.format(out_dir, fold), index=True)

    print("TRAIN:", train_df.shape, "TEST:", test_df.shape)
    print("Cell lines:", overlap)

TRAIN: (127179, 3) TEST: (14043, 3)
Cell lines: set()
TRAIN: (127388, 3) TEST: (13834, 3)
Cell lines: set()
TRAIN: (126964, 3) TEST: (14258, 3)
Cell lines: set()
TRAIN: (126955, 3) TEST: (14267, 3)
Cell lines: set()
TRAIN: (127456, 3) TEST: (13766, 3)
Cell lines: set()
TRAIN: (126943, 3) TEST: (14279, 3)
Cell lines: set()
TRAIN: (126905, 3) TEST: (14317, 3)
Cell lines: set()
TRAIN: (126796, 3) TEST: (14426, 3)
Cell lines: set()
TRAIN: (127117, 3) TEST: (14105, 3)
Cell lines: set()
TRAIN: (127295, 3) TEST: (13927, 3)
Cell lines: set()
