# Split the preprocessed datasets into training, validation and test sets.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Batch 1

In [2]:
# Load the batch 1 inputs: the classification table (df1) and the clustering /
# cell-type annotation table (df2), both tab-separated.
df1 = pd.read_csv("../data/preprocessing/classification_median_batch_1.tsv", sep="\t")
df2 = pd.read_csv("../data/preprocessing/batch_1_clustering.tsv", sep="\t")
# The best-quality barcodes are taken from the header row (column names) of this file.
# NOTE(review): it is parsed with the default comma separator despite the .tsv
# extension — confirm this matches the file layout.
best_barcodes_1 = pd.read_csv('../data/preprocessing/batch_1_best_quality_barcodes.tsv').columns


In [3]:
# Give the annotation table the same 'barcode' key column name as the classification table.
df2 = df2.rename({'barcodes': 'barcode'}, axis='columns')

In [4]:
# Keep only the rows whose barcode is in the best-quality list, in both tables.
df1 = df1.loc[df1['barcode'].isin(best_barcodes_1)]
df2 = df2.loc[df2['barcode'].isin(best_barcodes_1)]

In [5]:
# Discard rows with an expression count of exactly zero.
df1 = df1.loc[df1['expression_count'].ne(0)]

In [6]:
# Hold out 20% of the best-quality barcodes for testing. Splitting on barcodes
# (not rows) keeps every row of a given barcode inside a single split.
train_barcodes, test_barcodes = train_test_split(
    best_barcodes_1,
    random_state=2,
    test_size=0.2,
)

In [7]:
# Select the classification rows that belong to each barcode split.
train_df = df1.loc[df1['barcode'].isin(train_barcodes)]
test_df = df1.loc[df1['barcode'].isin(test_barcodes)]

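After this first 80/20 barcode split, the non-zero training set (which still includes the future validation barcodes) contains roughly 80% of the entire non-zero set; it only drops to about 70% once the validation set is split off below.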

In [44]:
# Class counts of the train and test splits, shown side by side as a tuple.
tuple(split['classification'].value_counts() for split in (train_df, test_df))

(classification
 low     37640
 high    30655
 Name: count, dtype: int64,
 classification
 low     11310
 high     7325
 Name: count, dtype: int64)

In [8]:
# Annotation rows for the train and test barcodes, for inspecting the cell-type mix of each split.
test_train = df2.loc[df2['barcode'].isin(train_barcodes)]
test_test = df2.loc[df2['barcode'].isin(test_barcodes)]

In [47]:
# Cell-type counts for all filtered barcodes, then for the train and test barcodes.
tuple(table['celltype'].value_counts() for table in (df2, test_train, test_test))

(celltype
 cancer     449
 APC's       46
 T cells     14
 Name: count, dtype: int64,
 celltype
 cancer     359
 APC's       37
 T cells     11
 Name: count, dtype: int64,
 celltype
 cancer     90
 APC's       9
 T cells     3
 Name: count, dtype: int64)

Now split the training set into the 'actual' training set and a validation set, so that the final training:validation:test proportions are 70%:10%:20%.

In [9]:
# The train barcodes are 80% of the total; taking 10/80 = 1/8 of them for
# validation yields 10% of the total, i.e. a 70:10:20 split overall.
training_barcodes, val_barcodes = train_test_split(
    train_barcodes,
    random_state=2,
    test_size=1 / 8,
)

In [10]:
# Divide the rows of the provisional training set between final training and validation.
training_df = train_df.loc[train_df['barcode'].isin(training_barcodes)]
val_df = train_df.loc[train_df['barcode'].isin(val_barcodes)]

In [50]:
# Class counts of the training, validation and test splits.
tuple(split['classification'].value_counts() for split in (training_df, val_df, test_df))

(classification
 low     32885
 high    26456
 Name: count, dtype: int64,
 classification
 low     4755
 high    4199
 Name: count, dtype: int64,
 classification
 low     11310
 high     7325
 Name: count, dtype: int64)

In [11]:
# Annotation rows for the training, validation and test barcodes.
test_training = df2.loc[df2['barcode'].isin(training_barcodes)]
test_val = df2.loc[df2['barcode'].isin(val_barcodes)]
test_test = df2.loc[df2['barcode'].isin(test_barcodes)]


In [53]:
# Cell-type counts of the training, validation and test barcodes.
tuple(table['celltype'].value_counts() for table in (test_training, test_val, test_test))

(celltype
 cancer     317
 APC's       32
 T cells      7
 Name: count, dtype: int64,
 celltype
 cancer     42
 APC's       5
 T cells     4
 Name: count, dtype: int64,
 celltype
 cancer     90
 APC's       9
 T cells     3
 Name: count, dtype: int64)

In [54]:
# Persist the three batch 1 splits as tab-separated files without the index column.
# They are written to the project-relative ../data/splits/ directory — the same
# location the batch 2 splits use, and where batch1_training_filtered.tsv is read
# back for undersampling — instead of an absolute, user-specific Desktop path.
test_df.to_csv('../data/splits/batch1_test_filtered.tsv', sep='\t', index=False)
training_df.to_csv('../data/splits/batch1_training_filtered.tsv', sep='\t', index=False)
val_df.to_csv('../data/splits/batch1_val_filtered.tsv', sep='\t', index=False)

# Batch 2

In [2]:
# Load the batch 2 inputs: the classification table (df3) and the clustering /
# cell-type annotation table (df4), both tab-separated.
df3 = pd.read_csv("../data/preprocessing/classification_median_batch_2.tsv", sep="\t")
df4 = pd.read_csv("../data/preprocessing/batch_2_clustering.tsv", sep="\t")
# The best-quality barcodes are taken from the header row (column names) of this file.
# NOTE(review): it is parsed with the default comma separator despite the .tsv
# extension — confirm this matches the file layout.
best_barcodes_2 = pd.read_csv('../data/preprocessing/batch_2_best_quality_barcodes.tsv').columns

In [3]:
# Give the annotation table the same 'barcode' key column name as the classification table.
df4 = df4.rename({'barcodes': 'barcode'}, axis='columns')

In [4]:
# Keep only the rows whose barcode is in the best-quality list, in both tables.
df3 = df3.loc[df3['barcode'].isin(best_barcodes_2)]
df4 = df4.loc[df4['barcode'].isin(best_barcodes_2)]

In [5]:
# Discard rows with an expression count of exactly zero.
df3 = df3.loc[df3['expression_count'].ne(0)]

In [6]:
# Hold out 20% of the best-quality barcodes for testing. Splitting on barcodes
# (not rows) keeps every row of a given barcode inside a single split.
train_barcodes, test_barcodes = train_test_split(
    best_barcodes_2,
    random_state=2,
    test_size=0.2,
)

In [7]:
# Select the classification rows that belong to each barcode split.
train_df = df3.loc[df3['barcode'].isin(train_barcodes)]
test_df = df3.loc[df3['barcode'].isin(test_barcodes)]

In [8]:
# Class counts of the train and test splits, shown side by side as a tuple.
tuple(split['classification'].value_counts() for split in (train_df, test_df))

(classification
 high    16839
 low     10260
 Name: count, dtype: int64,
 classification
 high    3907
 low     3421
 Name: count, dtype: int64)

In [9]:
# Annotation rows for the train and test barcodes, for inspecting the cell-type mix of each split.
test_train = df4.loc[df4['barcode'].isin(train_barcodes)]
test_test = df4.loc[df4['barcode'].isin(test_barcodes)]

In [10]:
# Cell-type counts for all filtered barcodes, then for the train and test barcodes.
tuple(table['celltype'].value_counts() for table in (df4, test_train, test_test))

(celltype
 cancer     299
 healthy     44
 Name: count, dtype: int64,
 celltype
 cancer     237
 healthy     37
 Name: count, dtype: int64,
 celltype
 cancer     62
 healthy     7
 Name: count, dtype: int64)

In [11]:
# The train barcodes are 80% of the total; taking 10/80 = 1/8 of them for
# validation yields 10% of the total, i.e. a 70:10:20 split overall.
training_barcodes, val_barcodes = train_test_split(
    train_barcodes,
    random_state=2,
    test_size=1 / 8,
)

In [12]:
# Divide the rows of the provisional training set between final training and validation.
training_df = train_df.loc[train_df['barcode'].isin(training_barcodes)]
val_df = train_df.loc[train_df['barcode'].isin(val_barcodes)]

In [13]:
# Class counts of the training, validation and test splits.
tuple(split['classification'].value_counts() for split in (training_df, val_df, test_df))

(classification
 high    14425
 low      9071
 Name: count, dtype: int64,
 classification
 high    2414
 low     1189
 Name: count, dtype: int64,
 classification
 high    3907
 low     3421
 Name: count, dtype: int64)

In [14]:
# Annotation rows for the training, validation and test barcodes.
test_training = df4.loc[df4['barcode'].isin(training_barcodes)]
test_val = df4.loc[df4['barcode'].isin(val_barcodes)]
test_test = df4.loc[df4['barcode'].isin(test_barcodes)]

In [15]:
# Cell-type counts of the training, validation and test barcodes.
tuple(table['celltype'].value_counts() for table in (test_training, test_val, test_test))

(celltype
 cancer     210
 healthy     29
 Name: count, dtype: int64,
 celltype
 cancer     27
 healthy     8
 Name: count, dtype: int64,
 celltype
 cancer     62
 healthy     7
 Name: count, dtype: int64)

In [16]:
# Persist the three batch 2 splits under ../data/splits/ as tab-separated files,
# without writing the DataFrame index as a column.
test_df.to_csv('../data/splits/batch2_test_filtered.tsv', sep='\t', index=False)
training_df.to_csv('../data/splits/batch2_training_filtered.tsv', sep='\t', index=False)
val_df.to_csv('../data/splits/batch2_val_filtered.tsv', sep='\t', index=False)

# Try undersampling the training set

In [2]:
import os
# This variable is set before scikit-learn and SciPy are imported below.
# NOTE(review): presumably it switches on SciPy's array-API support, which is
# read at import time — confirm it is still required for this notebook.
os.environ["SCIPY_ARRAY_API"] = "1"

import sklearn
import scipy
import pandas as pd 

In [3]:
# Reload the full (unfiltered) batch 1 classification table.
clas = pd.read_csv('../data/preprocessing/classification_median_batch_1.tsv',sep='\t')

In [4]:
# The best-quality barcodes are taken from the header row (column names) of this file.
best_barcodes_1 = pd.read_csv('../data/preprocessing/batch_1_best_quality_barcodes.tsv').columns

In [5]:
# Class distribution of the full table, before any barcode filtering or splitting
clas['classification'].value_counts()

classification
zero    6863458
low      249335
high     249207
Name: count, dtype: int64

In [6]:
# Restrict to the best-quality barcodes, then drop rows with an expression count of zero.
clas = clas.loc[clas['barcode'].isin(best_barcodes_1)]
clas = clas.loc[clas['expression_count'].ne(0)]

In [7]:
# Distribution after filtering for EpiAneufinder results
# (best-quality barcodes only, zero-expression rows removed)
clas['classification'].value_counts()

classification
low     48950
high    37980
Name: count, dtype: int64

In [35]:
# Number of best-quality barcodes in batch 1
len(best_barcodes_1)

509

In [8]:
# Load the filtered batch 1 training split from ../data/splits/.
clas_train = pd.read_csv('../data/splits/batch1_training_filtered.tsv',sep='\t')

In [9]:
# Class counts in the filtered training set, before undersampling
clas_train['classification'].value_counts()

classification
low     32885
high    26456
Name: count, dtype: int64

In [23]:
# Fraction of 'high' rows in the filtered training set. It is computed from
# clas_train itself rather than from counts typed in by hand, so it cannot go
# stale if the split changes. float() keeps the displayed value a plain Python float.
float(clas_train['classification'].eq('high').mean())

0.44583003319795755

Perform subsampling on the training set.

In [10]:
from imblearn.under_sampling import RandomUnderSampler

In [11]:
# Every column other than the label is carried through resampling as a feature
feature_cols = [col for col in clas_train.columns if col != 'classification']

# Randomly undersample the full barcode-gene training table; the seed is fixed for reproducibility
rus = RandomUnderSampler(random_state=42)
train_resampled, y_train_resampled = rus.fit_resample(
    clas_train[feature_cols],
    clas_train['classification'],
)

# Rebuild one DataFrame with the label appended as the last column
clas_train_resampled = pd.DataFrame(train_resampled, columns=feature_cols).assign(
    classification=y_train_resampled
)

# Print the new class proportions
print("Undersampled Train class distribution:\n", clas_train_resampled['classification'].value_counts(normalize=True))


Undersampled Train class distribution:
 classification
high    0.5
low     0.5
Name: proportion, dtype: float64


In [12]:
# Absolute class counts of the undersampled training set
clas_train_resampled['classification'].value_counts()

classification
high    26456
low     26456
Name: count, dtype: int64

Check whether the undersampled training set is still balanced with respect to the cell-type annotation.

In [16]:
# Unique barcodes still present after undersampling.
# Per the author's check, this is the same barcode set as before; only genes were removed.
barcodes = clas_train_resampled['barcode'].drop_duplicates()

In [24]:
# Load the batch 1 clustering table, which carries the cell-type annotation per barcode.
anno = pd.read_csv('../data/preprocessing/batch_1_clustering.tsv',sep='\t')

In [25]:
# Keep only the annotation rows of barcodes present in the undersampled training set.
anno = anno.loc[anno['barcodes'].isin(barcodes)]

In [26]:
# Cell-type counts among the barcodes of the undersampled training set
anno['celltype'].value_counts()

celltype
cancer     317
APC's       32
T cells      7
Name: count, dtype: int64

In [None]:
# Save the undersampled batch 1 training set as a tab-separated file without the index column.
# NOTE(review): unlike the other split files, this file name has no `.tsv` extension —
# confirm what downstream readers expect before renaming it.
clas_train_resampled.to_csv('../data/splits/batch1_train_subsampled',sep='\t',index=False)