In [21]:
import pickle
import numpy as np
import pandas as pd

cell_type = "k562"

# Destination pickle for the train/valid/test splits built later in the notebook
filename = f'../../data/{cell_type}_datasets.pkl'

# Source table: CSV written from R with one row per record
froot = f'../../data/{cell_type}_epAllmer_zeta_norm.csv'

# Pin seqnames to str. read_csv infers dtypes chunk by chunk, so a column that
# holds both numeric and non-numeric labels ends up with mixed int/str values
# and raises a DtypeWarning.
# NOTE(review): seqnames is assumed to be the mixed-type column (numeric and
# lettered chromosome names) -- confirm against the full warning text.
df = pd.read_csv(froot, dtype={'seqnames': str})

# The R export uses 1-based positions; shift both coordinates to 0-based for Python
df['start'] = df['start'] - 1
df['end'] = df['end'] - 1

print(df.head())

  exec(code_obj, self.user_global_ns, self.user_ns)


  seqnames    start      end strand  ensembl_gene_id    score      ctcf  \
0        1  1002760  1002760      +  ENSG00000187608  0.00000 -0.124708   
1        1  1002761  1002761      +  ENSG00000187608  0.00000 -0.124708   
2        1  1002762  1002762      +  ENSG00000187608  0.00000 -0.124708   
3        1  1002763  1002763      +  ENSG00000187608  0.00000 -0.124708   
4        1  1002764  1002764      +  ENSG00000187608  0.62265 -0.124708   

   h4k20me1  h3k79me2   h3k4me1  ...      sj3      rpts  wgbs  lambda_alphaj  \
0  -0.47533 -0.202922 -0.276494  ...  0.04604 -0.187111   0.0       0.044328   
1  -0.47533 -0.202922 -0.276494  ...  0.04604 -0.187111   0.0       0.044328   
2  -0.47533 -0.202922 -0.276494  ...  0.04604 -0.187111   0.0       0.044328   
3  -0.47533 -0.202922 -0.276494  ...  0.04604 -0.187111   0.0       0.044328   
4  -0.47533 -0.202922 -0.276494  ...  0.04604 -0.187111   0.0       0.044328   

       zeta  A  T  G  C  combined_zeta  
0  1.066961  0  0  1  0    

In [22]:
from sklearn.model_selection import train_test_split

# Gene-level split: 80% of genes for training, the remaining 20% halved into
# validation (10%) and test (10%). Splitting by gene keeps every row of a gene
# in exactly one split.
train_size = 0.8
split_seed = 42  # shared by both train_test_split calls so the partition is reproducible

# Lazy groupby handle; kept in case later cells reference `grouped`.
grouped = df.groupby('ensembl_gene_id')

# Sorted unique gene ids. This is the same key order that
# `grouped.groups.keys()` yields (groupby sorts keys and drops NaN by default),
# so the split is unchanged, but it avoids building the per-gene row-index
# dictionary over the whole frame.
gene_ids = sorted(df['ensembl_gene_id'].dropna().unique())

# split by gene into train, val, test sets
train_idx, temp_idx = train_test_split(gene_ids, test_size=(1.0 - train_size), random_state=split_seed)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=split_seed)

# map each gene id to the label of the split it was assigned to
dataset_mapping = {gene_id: 'train' for gene_id in train_idx}
dataset_mapping.update({gene_id: 'val' for gene_id in val_idx})
dataset_mapping.update({gene_id: 'test' for gene_id in test_idx})

# label every row with its gene's split, then slice the frame per label
df['dataset'] = df['ensembl_gene_id'].map(dataset_mapping)
train_data = df[df['dataset'] == 'train']
valid_data = df[df['dataset'] == 'val']
test_data = df[df['dataset'] == 'test']

print("train data size: " + str(len(train_data)))
print("val data size: " + str(len(valid_data)))
print("test data size: " + str(len(test_data)) + "\n")

# Rows whose gene id is missing map to NaN and fall into none of the three
# frames; report them instead of dropping them silently.
n_unassigned = int(df['dataset'].isna().sum())
if n_unassigned:
    print("rows without a gene id (in no split): " + str(n_unassigned))

train data size: 136927782
val data size: 17166667
test data size: 17113151



In [23]:
# Sanity check: show the first row of the training split, including its 'dataset' label
first_train_row = train_data.iloc[0]
print(first_train_row)

seqnames                         1
start                      1002760
end                        1002760
strand                           +
ensembl_gene_id    ENSG00000187608
score                          0.0
ctcf                     -0.124708
h4k20me1                  -0.47533
h3k79me2                 -0.202922
h3k4me1                  -0.276494
h3k9me3                  -0.260088
h3k36me3                 -0.680032
sj5                      -0.012938
sj3                        0.04604
rpts                     -0.187111
wgbs                           0.0
lambda_alphaj             0.044328
zeta                      1.066961
A                                0
T                                0
G                                1
C                                0
combined_zeta             0.786135
dataset                      train
Name: 0, dtype: object


In [24]:
# Report the number of distinct genes that ended up in each split
for split_label, split_frame in (("train", train_data), ("val", valid_data), ("test", test_data)):
    gene_count = len(split_frame.groupby('ensembl_gene_id'))
    print(f"{split_label} # genes: {gene_count}")

train # genes: 2547
val # genes: 318
test # genes: 319


In [25]:
# Bundle the three splits into one dictionary keyed by split name
combined_datasets = dict(train=train_data, valid=valid_data, test=test_data)

# Write the bundle to disk; pickle protocol 4 requires Python >= 3.4
with open(filename, 'wb') as pkl_out:
    pickle.dump(combined_datasets, pkl_out, protocol=4)


In [26]:
# Round-trip check: reload the pickle that was just written and inspect it.
# pickle.load is only safe on trusted files; this one was produced by this notebook.
with open(filename, 'rb') as pkl_in:
    combined_datasets = pickle.load(pkl_in)

# First row of the reloaded training split
dataset1 = combined_datasets['train']
print(dataset1.iloc[0])

seqnames                         1
start                      1002760
end                        1002760
strand                           +
ensembl_gene_id    ENSG00000187608
score                          0.0
ctcf                     -0.124708
h4k20me1                  -0.47533
h3k79me2                 -0.202922
h3k4me1                  -0.276494
h3k9me3                  -0.260088
h3k36me3                 -0.680032
sj5                      -0.012938
sj3                        0.04604
rpts                     -0.187111
wgbs                           0.0
lambda_alphaj             0.044328
zeta                      1.066961
A                                0
T                                0
G                                1
C                                0
combined_zeta             0.786135
dataset                      train
Name: 0, dtype: object
