In [2]:
import pandas as pd
import numpy as np

In [32]:

import gzip
import json

def parse_data(info_path, json_zip_path):
    """Build one labelled feature row per (transcript, position, sequence).

    The gzipped file at ``json_zip_path`` is read line by line, each line
    being a JSON object nested as
    ``{transcript_id: {position: {sequence: [reads...]}}}``.  The reads of
    every innermost entry are averaged column-wise, giving nine numbers that
    are stored under the dwell-time / sd / mean columns for offsets -1, 0
    and 1.  The CSV at ``info_path`` is then left-joined on transcript id
    and position so that each row carries the columns of that file.

    Args:
        info_path: Path of the CSV file holding the labels.
        json_zip_path: Path of the gzip-compressed, line-delimited JSON file.

    Returns:
        pandas.DataFrame: the averaged features merged with the label table.
    """
    feature_columns = (
        'transcript_id', 'transcript_pos', 'nucleo_seq',
        'dwell_time_-1', 'sd_-1', 'mean_-1',
        'dwell_time_0', 'sd_0', 'mean_0',
        'dwell_time_1', 'sd_1', 'mean_1',
    )

    # Read every JSON line of the compressed file up front.
    print(f"Loading {json_zip_path}...")
    with gzip.open(json_zip_path, "r") as handle:
        records = [json.loads(raw_line) for raw_line in handle]

    # Read the label table.
    print(f"Loading {info_path}...")
    labels = pd.read_csv(info_path)

    # Flatten the nested dictionaries into one list per output row.
    print("Transferring data from json to dataframe...")
    rows = []
    for record in records:
        for trans_id, positions in record.items():
            for trans_pos, contexts in positions.items():
                for nucleo_seq, reads in contexts.items():
                    # Individual reads are collapsed to their column means;
                    # the per-read values are not kept.
                    averaged = np.array(reads).mean(axis=0)
                    rows.append([trans_id, int(trans_pos), nucleo_seq, *averaged])

    features = pd.DataFrame(rows, columns=list(feature_columns))

    # Attach the labels; a left join keeps every feature row.
    print("Merging dataframes to obtain labels")
    return features.merge(
        labels,
        how="left",
        left_on=["transcript_id", "transcript_pos"],
        right_on=["transcript_id", "transcript_position"],
    )

  

In [33]:
# Load and merge the dataset.  Both paths are absolute and specific to one
# machine, so they must be edited before running this cell elsewhere.
data = parse_data(
    "/Users/shaun/Desktop/DSA4266/project2/data.info",
    "/Users/shaun/Desktop/DSA4266/project2/dataset0.json.gz",
)

Loading /Users/shaun/Desktop/DSA4266/project2/dataset0.json.gz...
Loading /Users/shaun/Desktop/DSA4266/project2/data.info...
Transferring data from json to dataframe...
Merging dataframes to obtain labels


In [34]:
# Display the merged frame returned by parse_data.
data

Unnamed: 0,transcript_id,transcript_pos,nucleo_seq,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,gene_id,transcript_position,label
0,ENST00000000233,244,AAGACCA,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,4.386989,80.570270,ENSG00000004059,244,0
1,ENST00000000233,261,CAAACTG,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,3.016599,94.290698,ENSG00000004059,261,0
2,ENST00000000233,316,GAAACAG,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,2.087146,89.364324,ENSG00000004059,316,0
3,ENST00000000233,332,AGAACAT,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,2.236520,89.154000,ENSG00000004059,332,0
4,ENST00000000233,368,AGGACAA,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,4.260253,85.178788,ENSG00000004059,368,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,4.005616,82.004110,ENSG00000167747,1348,1
121834,ENST00000641834,1429,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,3.644638,80.497101,ENSG00000167747,1429,0
121835,ENST00000641834,1531,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,2.181562,84.190625,ENSG00000167747,1531,1
121836,ENST00000641834,1537,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,2.540877,82.289474,ENSG00000167747,1537,0


In [35]:
def onehote(sequence):
    """One-hot encode a nucleotide sequence.

    Every character of ``sequence`` becomes one row of the result, holding a
    single 1.0 in the column of its base (column order A, C, G, T).  Any
    character other than A, C, G or T raises ``KeyError``.

    Returns:
        np.ndarray: array with one row per character and four columns.
    """
    base_to_column = dict(zip("ACGT", range(4)))
    identity = np.eye(4)
    # Selecting rows of the identity matrix yields the one-hot vectors.
    columns = [base_to_column[base] for base in sequence]
    return identity[columns]

In [36]:
def kmers(sequence, ksize=5):
    """Return every contiguous window of length ``ksize`` in ``sequence``.

    Windows are listed from left to right.  A sequence shorter than
    ``ksize`` yields an empty list.
    """
    last_start = len(sequence) - ksize
    return [sequence[start:start + ksize] for start in range(last_start + 1)]

In [37]:
# Split every 7-mer in `nucleo_seq` into its three overlapping 5-mers and store
# them as separate columns.
kmer_columns = ["nucleo_seq_-1", "nucleo_seq_0", "nucleo_seq_1"]
kmer_df = pd.DataFrame(
    # Only one column is needed, so map over it instead of applying row-wise.
    data["nucleo_seq"].map(kmers).to_list(),
    columns=kmer_columns,
    # Reuse the index of `data` so the column-wise concat aligns row for row
    # even when `data` does not carry a default 0..n-1 index.
    index=data.index,
)
# Dropping any existing k-mer columns first keeps this cell safe to re-run:
# without it, a second execution would append duplicate columns.
data = pd.concat([data.drop(columns=kmer_columns, errors="ignore"), kmer_df], axis=1)

In [38]:
# Display the frame again now that the three k-mer columns have been appended.
data

Unnamed: 0,transcript_id,transcript_pos,nucleo_seq,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,gene_id,transcript_position,label,nucleo_seq_-1,nucleo_seq_0,nucleo_seq_1
0,ENST00000000233,244,AAGACCA,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,4.386989,80.570270,ENSG00000004059,244,0,AAGAC,AGACC,GACCA
1,ENST00000000233,261,CAAACTG,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,3.016599,94.290698,ENSG00000004059,261,0,CAAAC,AAACT,AACTG
2,ENST00000000233,316,GAAACAG,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,2.087146,89.364324,ENSG00000004059,316,0,GAAAC,AAACA,AACAG
3,ENST00000000233,332,AGAACAT,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,2.236520,89.154000,ENSG00000004059,332,0,AGAAC,GAACA,AACAT
4,ENST00000000233,368,AGGACAA,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,4.260253,85.178788,ENSG00000004059,368,0,AGGAC,GGACA,GACAA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,4.005616,82.004110,ENSG00000167747,1348,1,GGGAC,GGACA,GACAT
121834,ENST00000641834,1429,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,3.644638,80.497101,ENSG00000167747,1429,0,CTGAC,TGACA,GACAC
121835,ENST00000641834,1531,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,2.181562,84.190625,ENSG00000167747,1531,1,TGGAC,GGACA,GACAC
121836,ENST00000641834,1537,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,2.540877,82.289474,ENSG00000167747,1537,0,CTGAC,TGACC,GACCA


In [62]:
def create_pwm_from_sequences(sequence, seq_len=7):
    """
    Create a Position Weight Matrix (PWM) from one or more nucleotide sequences.

    Args:
        sequence (str or iterable of str): Either a single sequence given as a
            string, or a collection of sequences (list, pandas Series, ...).
        seq_len (int): Number of columns of the matrix, i.e. the longest
            sequence that can be counted. Defaults to 7.

    Returns:
        np.ndarray: Matrix of shape (4, seq_len); rows are A, C, G, T and
        columns are positions in the sequence.

        * For a collection, entry [b, p] is the fraction of sequences that
          have base b at position p.
        * For a single string, the counts are divided by the length of the
          string, which is the result this function has always produced for
          string input.
        * For empty input, an all-zero matrix is returned.

    Raises:
        IndexError: If an A/C/G/T character occurs at a position >= seq_len.
    """
    row_of_base = {"A": 0, "C": 1, "G": 2, "T": 3}

    if isinstance(sequence, str):
        # A bare string is one sequence, not a collection of one-character
        # sequences. The divisor stays the string length so existing callers
        # that pass a single string get unchanged values.
        sequences = [sequence]
        divisor = len(sequence)
    else:
        sequences = list(sequence)
        divisor = len(sequences)

    # Initialize the PWM matrix with zeros: 4 rows for A, C, G, T
    pwm = np.zeros((4, seq_len))

    # Count the occurrences of each nucleotide at each position
    for seq in sequences:
        for position, nucleotide in enumerate(seq):
            row = row_of_base.get(nucleotide)
            # Characters other than A/C/G/T are not counted.
            if row is not None:
                pwm[row, position] += 1

    # Nothing was counted; return the zeros rather than dividing 0 by 0.
    if divisor == 0:
        return pwm

    # Convert counts to probabilities
    return pwm / divisor

In [64]:
# Preview the matrix for the row labelled 0 only.  Computing it for every row
# and then discarding all but the first result is unnecessary work.
create_pwm_from_sequences(data.loc[0, "nucleo_seq"])

array([[0.14285714, 0.14285714, 0.        , 0.14285714, 0.        ,
        0.        , 0.14285714],
       [0.        , 0.        , 0.        , 0.        , 0.14285714,
        0.14285714, 0.        ],
       [0.        , 0.        , 0.14285714, 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

In [61]:
# NOTE(review): this cell's execution count (61) is lower than that of the cell
# defining create_pwm_from_sequences (62), so the output shown below was
# presumably produced by an earlier version of the function. Re-run after
# "Restart Kernel -> Run All" to confirm the result still matches.
create_pwm_from_sequences(data["nucleo_seq"])

array([[0.31899736, 0.3764589 , 0.47591884, 1.        , 0.        ,
        0.37444804, 0.26014051],
       [0.18503258, 0.        , 0.        , 0.        , 1.        ,
        0.30063691, 0.21133801],
       [0.25936079, 0.34526174, 0.52408116, 0.        , 0.        ,
        0.        , 0.24189498],
       [0.23660927, 0.27827935, 0.        , 0.        , 0.        ,
        0.32491505, 0.2866265 ]])

In [40]:
# One-hot encode the full sequence and each of its three k-mer windows.
# Keys are the new column names, values the string columns they are built from.
onehot_sources = {
    "seq": "nucleo_seq",
    "seq_-1": "nucleo_seq_-1",
    "seq_0": "nucleo_seq_0",
    "seq_1": "nucleo_seq_1",
}
for onehot_col, source_col in onehot_sources.items():
    # Each encoding reads a single column, so map over that column rather than
    # applying a lambda to every row of the frame.
    data[onehot_col] = data[source_col].map(onehote)


In [41]:
# Display the frame with the one-hot encoded sequence columns added.
data

Unnamed: 0,transcript_id,transcript_pos,nucleo_seq,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,...,gene_id,transcript_position,label,nucleo_seq_-1,nucleo_seq_0,nucleo_seq_1,seq,seq_-1,seq_0,seq_1
0,ENST00000000233,244,AAGACCA,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,...,ENSG00000004059,244,0,AAGAC,AGACC,GACCA,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
1,ENST00000000233,261,CAAACTG,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,...,ENSG00000004059,261,0,CAAAC,AAACT,AACTG,"[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
2,ENST00000000233,316,GAAACAG,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,...,ENSG00000004059,316,0,GAAAC,AAACA,AACAG,"[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
3,ENST00000000233,332,AGAACAT,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,...,ENSG00000004059,332,0,AGAAC,GAACA,AACAT,"[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
4,ENST00000000233,368,AGGACAA,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,...,ENSG00000004059,368,0,AGGAC,GGACA,GACAA,"[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,...,ENSG00000167747,1348,1,GGGAC,GGACA,GACAT,"[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
121834,ENST00000641834,1429,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,...,ENSG00000167747,1429,0,CTGAC,TGACA,GACAC,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
121835,ENST00000641834,1531,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,...,ENSG00000167747,1531,1,TGGAC,GGACA,GACAC,"[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
121836,ENST00000641834,1537,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,...,ENSG00000167747,1537,0,CTGAC,TGACC,GACCA,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [...","[[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."


In [42]:
from sklearn.model_selection import GroupShuffleSplit 

# Split by gene rather than by row: gene_id is passed as the grouping key, so
# all rows of a gene end up on the same side of the split.  About 10% of the
# groups are held out, and random_state fixes the shuffle for reproducibility.
splitter = GroupShuffleSplit(test_size=.10, n_splits=2, random_state = 42)
split = splitter.split(data, groups=data['gene_id'])
# Two splits are configured but only the first is consumed here; `split` is a
# generator that still holds the second one.
train_idx, test_idx = next(split)

# The splitter yields positional indices, hence iloc.
train = data.iloc[train_idx]
test = data.iloc[test_idx]

In [45]:
# Show the training rows that belong to one specific gene.
train.loc[train["gene_id"] == "ENSG00000006451"]

Unnamed: 0,transcript_id,transcript_pos,nucleo_seq,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,...,gene_id,transcript_position,label,nucleo_seq_-1,nucleo_seq_0,nucleo_seq_1,seq,seq_-1,seq_0,seq_1
