# Split the preprocessed datasets into training, validation, and test sets.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Batch 1

In [2]:
# Load the two batch-1 tab-separated inputs. In the cells shown here, df1 is
# the table that gets split and df2 is used only to inspect cell-type counts.
# NOTE(review): both paths are absolute and user-specific; consider a
# configurable data directory.
df1, df2 = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "/Users/isabelgiray/Desktop/classification_median_batch_1.tsv",
        "/Users/isabelgiray/Desktop/batch_1_clustering.tsv",
    )
)


In [3]:
# Distinct barcodes of df1, in order of first appearance; the split below is
# made on these rather than on individual rows.
barcodes_unique = pd.unique(df1['barcode'])

In [4]:
# Hold out 20% of the unique barcodes as the test set; the fixed random_state
# makes the partition reproducible across reruns.
train_barcodes, test_barcodes = train_test_split(
    barcodes_unique,
    random_state=2,
    test_size=0.2,
)

In [5]:
# Route each df1 row to the train or test frame according to which side of
# the barcode split its barcode landed on.
train_df, test_df = (
    df1.loc[df1['barcode'].isin(barcode_subset)]
    for barcode_subset in (train_barcodes, test_barcodes)
)

In [6]:
# Keep only rows with a non-zero expression_count in the training portion.
# test_df is not filtered here.
train_df = train_df.loc[train_df['expression_count'].ne(0)]

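After dropping zero-expression rows, this training portion (before the validation split) holds roughly 80% of all non-zero rows; the final training set, after the validation barcodes are split off below, holds roughly 70% of the non-zero rows.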

In [7]:
# Class counts for the training portion and the test set, shown as a tuple.
tuple(
    frame['classification'].value_counts()
    for frame in (train_df, test_df)
)

(classification
 low     198570
 high    198470
 Name: count, dtype: int64,
 classification
 zero    1372498
 low       50765
 high      50737
 Name: count, dtype: int64)

In [8]:
# Rows of the clustering table on each side of the barcode split.
# Note: df2 names the column 'barcodes', whereas df1 uses 'barcode'.
test_train = df2.loc[df2['barcodes'].isin(train_barcodes)]
test_test = df2.loc[df2['barcodes'].isin(test_barcodes)]

In [9]:
# Cell-type counts for the full clustering table, then for its train and
# test subsets, to compare composition across the split.
tuple(
    frame['celltype'].value_counts()
    for frame in (df2, test_train, test_test)
)

(celltype
 cancer     4002
 APC's       155
 T cells      55
 Name: count, dtype: int64,
 celltype
 cancer     2798
 APC's       107
 T cells      39
 Name: count, dtype: int64,
 celltype
 cancer     698
 APC's       30
 T cells      9
 Name: count, dtype: int64)

Now split the training barcodes into the actual training set and a validation set, so that the overall training : validation : test split is 70% : 10% : 20% of the barcodes.

In [10]:
# Carve the validation barcodes out of the 80% training portion: 10/80 of it
# is 10% of all barcodes, which leaves 70% for the actual training set.
training_barcodes, val_barcodes = train_test_split(
    train_barcodes,
    random_state=2,
    test_size=10 / 80,
)

In [11]:
# Divide the training-portion rows between the actual training frame and the
# validation frame, again by barcode membership.
training_df, val_df = (
    train_df.loc[train_df['barcode'].isin(barcode_subset)]
    for barcode_subset in (training_barcodes, val_barcodes)
)

In [12]:
# Class counts for the training, validation and test frames.
tuple(
    frame['classification'].value_counts()
    for frame in (training_df, val_df, test_df)
)

(classification
 high    174324
 low     173400
 Name: count, dtype: int64,
 classification
 low     25170
 high    24146
 Name: count, dtype: int64,
 classification
 zero    1372498
 low       50765
 high      50737
 Name: count, dtype: int64)

In [13]:
# Clustering-table rows for each of the three barcode subsets
# (training, validation, test).
test_training, test_val, test_test = (
    df2.loc[df2['barcodes'].isin(barcode_subset)]
    for barcode_subset in (training_barcodes, val_barcodes, test_barcodes)
)


In [14]:
# Cell-type counts within the training, validation and test subsets.
tuple(
    frame['celltype'].value_counts()
    for frame in (test_training, test_val, test_test)
)

(celltype
 cancer     2448
 APC's        93
 T cells      35
 Name: count, dtype: int64,
 celltype
 cancer     350
 APC's       14
 T cells      4
 Name: count, dtype: int64,
 celltype
 cancer     698
 APC's       30
 T cells      9
 Name: count, dtype: int64)

In [15]:
# Write the three batch-1 splits as tab-separated files without the index.
# NOTE(review): output paths are absolute and user-specific.
training_df.to_csv(
    path_or_buf='/Users/isabelgiray/Desktop/batch1_training.tsv',
    index=False,
    sep='\t',
)
val_df.to_csv(
    path_or_buf='/Users/isabelgiray/Desktop/batch1_val.tsv',
    index=False,
    sep='\t',
)
test_df.to_csv(
    path_or_buf='/Users/isabelgiray/Desktop/batch1_test.tsv',
    index=False,
    sep='\t',
)

# Batch 2

In [18]:
# Load the two batch-2 tab-separated inputs. In the cells shown here, df3 is
# the table that gets split and df4 is used only to inspect cell-type counts.
# NOTE(review): both paths are absolute and user-specific; consider a
# configurable data directory.
df3, df4 = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "/Users/isabelgiray/Desktop/classification_median_batch_2.tsv",
        "/Users/isabelgiray/Desktop/batch_2_clustering.tsv",
    )
)

In [19]:
# Distinct barcodes of df3, in order of first appearance. This rebinds
# barcodes_unique, which previously held the batch-1 barcodes.
barcodes_unique = pd.unique(df3['barcode'])

In [20]:
# Hold out 20% of the batch-2 barcodes as the test set, using the same fixed
# random_state as the batch-1 split.
train_barcodes, test_barcodes = train_test_split(
    barcodes_unique,
    random_state=2,
    test_size=0.2,
)

In [21]:
# Route each df3 row to the train or test frame according to which side of
# the barcode split its barcode landed on.
train_df, test_df = (
    df3.loc[df3['barcode'].isin(barcode_subset)]
    for barcode_subset in (train_barcodes, test_barcodes)
)

In [22]:
# Keep only rows with a non-zero expression_count in the training portion.
# test_df is not filtered here.
train_df = train_df.loc[train_df['expression_count'].ne(0)]

In [23]:
# Class counts for the training portion and the test set, shown as a tuple.
tuple(
    frame['classification'].value_counts()
    for frame in (train_df, test_df)
)

(classification
 low     241458
 high    238440
 Name: count, dtype: int64,
 classification
 zero    2170155
 high      61332
 low       58513
 Name: count, dtype: int64)

In [24]:
# Rows of the batch-2 clustering table on each side of the barcode split.
# Note: df4 names the column 'barcodes', whereas df3 uses 'barcode'.
test_train = df4.loc[df4['barcodes'].isin(train_barcodes)]
test_test = df4.loc[df4['barcodes'].isin(test_barcodes)]

In [25]:
# Cell-type counts for the full clustering table, then for its train and
# test subsets, to compare composition across the split.
tuple(
    frame['celltype'].value_counts()
    for frame in (df4, test_train, test_test)
)

(celltype
 cancer     6357
 healthy     192
 Name: count, dtype: int64,
 celltype
 cancer     4450
 healthy     128
 Name: count, dtype: int64,
 celltype
 cancer     1112
 healthy      33
 Name: count, dtype: int64)

In [26]:
# Carve the validation barcodes out of the 80% training portion: 10/80 of it
# is 10% of all barcodes, which leaves 70% for the actual training set.
training_barcodes, val_barcodes = train_test_split(
    train_barcodes,
    random_state=2,
    test_size=10 / 80,
)

In [27]:
# Divide the training-portion rows between the actual training frame and the
# validation frame, again by barcode membership.
training_df, val_df = (
    train_df.loc[train_df['barcode'].isin(barcode_subset)]
    for barcode_subset in (training_barcodes, val_barcodes)
)

In [28]:
# Class counts for the training, validation and test frames.
tuple(
    frame['classification'].value_counts()
    for frame in (training_df, val_df, test_df)
)

(classification
 low     211456
 high    208559
 Name: count, dtype: int64,
 classification
 low     30002
 high    29881
 Name: count, dtype: int64,
 classification
 zero    2170155
 high      61332
 low       58513
 Name: count, dtype: int64)

In [29]:
# Clustering-table rows for each of the three barcode subsets
# (training, validation, test).
test_training, test_val, test_test = (
    df4.loc[df4['barcodes'].isin(barcode_subset)]
    for barcode_subset in (training_barcodes, val_barcodes, test_barcodes)
)

In [30]:
# Cell-type counts within the training, validation and test subsets.
tuple(
    frame['celltype'].value_counts()
    for frame in (test_training, test_val, test_test)
)

(celltype
 cancer     3893
 healthy     112
 Name: count, dtype: int64,
 celltype
 cancer     557
 healthy     16
 Name: count, dtype: int64,
 celltype
 cancer     1112
 healthy      33
 Name: count, dtype: int64)

In [31]:
# Write the three batch-2 splits as tab-separated files without the index.
# NOTE(review): output paths are absolute and user-specific.
training_df.to_csv(
    path_or_buf='/Users/isabelgiray/Desktop/batch2_training.tsv',
    index=False,
    sep='\t',
)
val_df.to_csv(
    path_or_buf='/Users/isabelgiray/Desktop/batch2_val.tsv',
    index=False,
    sep='\t',
)
test_df.to_csv(
    path_or_buf='/Users/isabelgiray/Desktop/batch2_test.tsv',
    index=False,
    sep='\t',
)