In [1]:
import pandas as pd
import numpy as np
import gzip
import json

To parse data from json and info

In [2]:

def parse_data(info_path, json_zip_path):
   # loads data
   print(f"Loading {json_zip_path}...")
   with gzip.open(json_zip_path, "r") as f:
      data = [json.loads(line) for line in f]
   
   # loads data with label
   print(f"Loading {info_path}...")
   info = pd.read_csv(info_path)

   #transfer information from json dict to list
   print("Transferring data from json to dataframe...")
   res = []
   for row in data:
      for trans_id in row.keys():
         for trans_pos in row[trans_id].keys():
            for nucleo_seq in row[trans_id][trans_pos].keys():
               temp = list(np.mean(np.array(row[trans_id][trans_pos][nucleo_seq]), axis=0))
               # to get raw data without aggregation
               # for features in row[trans_id][trans_pos][nucleo_seq]:
               res.append([trans_id, int(trans_pos), nucleo_seq] + temp)

   data = pd.DataFrame(res, columns = ['transcript_id', 'transcript_pos', 'nucleo_seq',
                                       'dwell_time_-1', 'sd_-1', 'mean_-1',
                                       'dwell_time_0', 'sd_0', 'mean_0',
                                       'dwell_time_1', 'sd_1', 'mean_1'
                                       ])
   # Merge json data with labels
   print("Merging dataframes to obtain labels")
   merged_data = pd.merge(data,info,left_on=["transcript_id","transcript_pos"],right_on=["transcript_id","transcript_position"], how="left")

   return merged_data

  

In [3]:
data = parse_data("/Users/shaun/Desktop/DSA4266/project2/data.info",'/Users/shaun/Desktop/DSA4266/project2/dataset0.json.gz')

Loading /Users/shaun/Desktop/DSA4266/project2/dataset0.json.gz...
Loading /Users/shaun/Desktop/DSA4266/project2/data.info...
Transferring data from json to dataframe...
Merging dataframes to obtain labels


In [4]:
data

Unnamed: 0,transcript_id,transcript_pos,nucleo_seq,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,gene_id,transcript_position,label
0,ENST00000000233,244,AAGACCA,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,4.386989,80.570270,ENSG00000004059,244,0
1,ENST00000000233,261,CAAACTG,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,3.016599,94.290698,ENSG00000004059,261,0
2,ENST00000000233,316,GAAACAG,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,2.087146,89.364324,ENSG00000004059,316,0
3,ENST00000000233,332,AGAACAT,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,2.236520,89.154000,ENSG00000004059,332,0
4,ENST00000000233,368,AGGACAA,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,4.260253,85.178788,ENSG00000004059,368,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,4.005616,82.004110,ENSG00000167747,1348,1
121834,ENST00000641834,1429,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,3.644638,80.497101,ENSG00000167747,1429,0
121835,ENST00000641834,1531,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,2.181562,84.190625,ENSG00000167747,1531,1
121836,ENST00000641834,1537,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,2.540877,82.289474,ENSG00000167747,1537,0


To parse the intermediate submission dataset

In [31]:
def parse_test_data(json_zip_path):
    # loads data
    print(f"Loading {json_zip_path}...")
    with gzip.open(json_zip_path, "r") as f:
        data = [json.loads(line) for line in f]


    #transfer information from json dict to list
    print("Transferring data from json to dataframe...")
    res = []
    for row in data:
        for trans_id in row.keys():
            for trans_pos in row[trans_id].keys():
                for nucleo_seq in row[trans_id][trans_pos].keys():
                    temp = list(np.mean(np.array(row[trans_id][trans_pos][nucleo_seq]), axis=0))
                    # to get raw data without aggregation
                    # for features in row[trans_id][trans_pos][nucleo_seq]:
                    res.append([trans_id, int(trans_pos), nucleo_seq] + temp)

    data = pd.DataFrame(res, columns = ['transcript_id', 'transcript_pos', 'nucleo_seq',
                                        'dwell_time_-1', 'sd_-1', 'mean_-1',
                                        'dwell_time_0', 'sd_0', 'mean_0',
                                        'dwell_time_1', 'sd_1', 'mean_1'
                                        ])
    # Get one hot encoding
    kmer_df = pd.DataFrame(data.apply(lambda x: kmers(x.nucleo_seq),axis=1).to_list(), columns=["nucleo_seq_-1","nucleo_seq_0","nucleo_seq_1"])
    data = pd.concat([data,kmer_df], axis=1)
    data["seq"] = data.apply(lambda x: onehote(x.nucleo_seq), axis=1)
    one_hot = get_indiv_one_hot(data)

    # Get pwm
    log_odds_dict, ppm = get_log_odds(data.nucleo_seq)
    data["pwm_score"] = data.apply(lambda x: get_PWM_score(x.nucleo_seq),axis=1)
    data = pd.concat([data,one_hot],axis=1)

    return data

Feature engineering 

One Hot Encoding of the sequences

In [5]:
def onehote(sequence):
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
    seq2 = [mapping[i] for i in sequence]
    return np.eye(4)[seq2]

def get_indiv_one_hot(df):
    res=[]
    for idx, row in df.iterrows():
        temp = row["seq"].reshape(1,-1).reshape(-1)
        res.append(temp)
    
    colName=[]

    for j in range(1, 8):
        for i in ["A", "C", "G", "T"]:
           colName.append(i + "_" + str(j))
    
    return pd.DataFrame(res,columns=colName)

Get Kmer sequence

In [6]:
def kmers(sequence, ksize=5):
    kmers = []
    n_kmers = len(sequence) - ksize + 1

    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)

    return kmers

To get position weight matrix score for each nucleo_seq

In [7]:
def create_pwm_from_sequences(sequences):
    """
    Create a Position Weight Matrix (PWM) from a list of sequences.
    
    Args:
        sequences (list of str): List of DNA sequences of equal length.
        
    Returns:
        np.ndarray: The PWM matrix.
    """
    # Check if all sequences have the same length
    seq_length = len(sequences[0])
    if not all(len(seq) == seq_length for seq in sequences):
        raise ValueError("All sequences must have the same length")

    # Initialize the PWM matrix with zeros
    pwm = np.zeros((4, seq_length))  # 4 rows for A, C, G, T

    # Count the occurrences of each nucleotide at each position
    for seq in sequences:
        for i, nucleotide in enumerate(seq):
            if nucleotide == 'A':
                pwm[0, i] += 1
            elif nucleotide == 'C':
                pwm[1, i] += 1
            elif nucleotide == 'G':
                pwm[2, i] += 1
            elif nucleotide == 'T':
                pwm[3, i] += 1

    # Convert counts to probabilities
    pwm = pwm / len(sequences)

    return pd.DataFrame(data=pwm, columns=[1,2,3,4,5,6,7], index=["A","C","G","T"])

def log_odds(x):
    """
    Function to calculate log odds
    """
    if x == 0:
        return 0
    else:
        return np.log2(x/0.25)


def get_log_odds(sequences):
    """To get log odd dictionary which is used to calculate PWM score for each nucleo_seq"""
    ppm = create_pwm_from_sequences(sequences)
    log_odds_pos = ppm.applymap(log_odds)
    log_odds_dict = log_odds_pos.to_dict()

    return log_odds_dict, ppm


def get_PWM_score(seq):
    """Calculating PWM score for nucleo_seq based on the log odds"""
    res = 0
    for i in range(len(seq)):
        base = seq[i]
        dic = log_odds_dict[(i+1)]
        res = res + dic[base]
    return res

Data preprocessing

In [8]:
kmer_df = pd.DataFrame(data.apply(lambda x: kmers(x.nucleo_seq),axis=1).to_list(), columns=["nucleo_seq_-1","nucleo_seq_0","nucleo_seq_1"])
data = pd.concat([data,kmer_df], axis=1)

In [9]:
data["seq"] = data.apply(lambda x: onehote(x.nucleo_seq), axis=1)
one_hot = get_indiv_one_hot(data)
# data["seq_-1"] = data.apply(lambda x: onehote(x["nucleo_seq_-1"]), axis=1)
# data["seq_0"] = data.apply(lambda x: onehote(x.nucleo_seq_0), axis=1)
# data["seq_1"] = data.apply(lambda x: onehote(x.nucleo_seq_1), axis=1)


In [10]:
one_hot

Unnamed: 0,A_1,C_1,G_1,T_1,A_2,C_2,G_2,T_2,A_3,C_3,...,G_5,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
121834,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
121835,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
121836,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
log_odds_dict, ppm = get_log_odds(data.nucleo_seq)


In [12]:
log_odds_dict

{1: {'A': 0.35161637660242673,
  'C': -0.434148743074179,
  'G': 0.05303229856801224,
  'T': -0.07942150937578545},
 2: {'A': 0.5905642890360848,
  'C': 0.0,
  'G': 0.4657623808413223,
  'T': 0.15460578471573547},
 3: {'A': 0.9287874813640022, 'C': 0.0, 'G': 1.067862143702843, 'T': 0.0},
 4: {'A': 2.0, 'C': 0.0, 'G': 0.0, 'T': 0.0},
 5: {'A': 0.0, 'C': 2.0, 'G': 0.0, 'T': 0.0},
 6: {'A': 0.5828374337870843,
  'C': 0.2660940550604804,
  'G': 0.0,
  'T': 0.3781344807057834},
 7: {'A': 0.057363008159643646,
  'C': -0.2423758556183202,
  'G': -0.04754729329247997,
  'T': 0.19724392534880456}}

In [13]:
ppm

Unnamed: 0,1,2,3,4,5,6,7
A,0.318997,0.376459,0.475919,1.0,0.0,0.374448,0.260141
C,0.185033,0.0,0.0,0.0,1.0,0.300637,0.211338
G,0.259361,0.345262,0.524081,0.0,0.0,0.0,0.241895
T,0.236609,0.278279,0.0,0.0,0.0,0.324915,0.286627


In [14]:
data["pwm_score"] = data.apply(lambda x: get_PWM_score(x.nucleo_seq),axis=1)

In [15]:
data = pd.concat([data,one_hot],axis=1)

In [16]:
data

Unnamed: 0,transcript_id,transcript_pos,nucleo_seq,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,...,G_5,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7
0,ENST00000000233,244,AAGACCA,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,ENST00000000233,261,CAAACTG,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,ENST00000000233,316,GAAACAG,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,ENST00000000233,332,AGAACAT,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,ENST00000000233,368,AGGACAA,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
121834,ENST00000641834,1429,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
121835,ENST00000641834,1531,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
121836,ENST00000641834,1537,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


Train test split by gene ID

In [17]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=.10, n_splits=2, random_state = 42)
split = splitter.split(data, groups=data['gene_id'])
train_idx, test_idx = next(split)

train = data.iloc[train_idx].reset_index(drop=True)
test = data.iloc[test_idx].reset_index(drop=True)

# further split test into val and test sets
val_splitter = GroupShuffleSplit(test_size=0.1, n_splits=2, random_state=42)
val_split = val_splitter.split(train, groups= train["gene_id"])
null_idx, val_idx = next(val_split)

val = train.iloc[val_idx].reset_index(drop=True)

In [18]:
features = ['dwell_time_-1',
       'sd_-1', 'mean_-1', 'dwell_time_0', 'sd_0', 'mean_0', 'dwell_time_1',
       'sd_1', 'mean_1', 'pwm_score', 'A_1', 'C_1', 'G_1', 'T_1', 'A_2', 'C_2', 'G_2', 'T_2', 'A_3', 'C_3',
       'G_3', 'T_3', 'A_4', 'C_4', 'G_4', 'T_4', 'A_5', 'C_5', 'G_5', 'T_5',
       'A_6', 'C_6', 'G_6', 'T_6', 'A_7', 'C_7', 'G_7', 'T_7']

In [19]:
X_train = train[features]
X_val = val[features]
X_test = test[features]

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns =X_train.columns)
X_train_scaled["label"] = train["label"]

X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns)
X_val_scaled["label"] = val["label"]

X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
X_test_scaled["label"] = test["label"]

In [21]:
X_train["label"] = train["label"]
X_val["label"] = val["label"]
X_test["label"] = test["label"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["label"] = train["label"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val["label"] = val["label"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["label"] = test["label"]


In [22]:
X_train.to_csv("trainset.csv")
X_test.to_csv("testset.csv")
X_val.to_csv("valset.csv")

In [23]:
X_train_scaled.to_csv("stdtrainset.csv")
X_test_scaled.to_csv("stdtestset.csv")
X_val_scaled.to_csv("stdvalset.csv")

In [24]:
X_train_scaled

Unnamed: 0,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,pwm_score,...,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7,label
0,0.086521,-0.061286,1.143673,0.740391,1.019254,1.205214,0.223718,1.582161,-1.141916,1.124939,...,0.0,-0.775307,1.528697,0.0,-0.693718,1.684813,-0.517113,-0.564808,-0.633901,0
1,-0.839637,-0.581488,-0.097565,-0.775008,-0.883272,-0.242814,0.505861,0.049860,1.636934,-1.181438,...,0.0,-0.775307,-0.654152,0.0,1.441508,-0.593538,-0.517113,1.770514,-0.633901,0
2,-0.302030,-0.723954,-0.469877,-0.418340,-0.692742,-0.961247,0.386090,-0.989407,0.639177,0.557397,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,-0.517113,1.770514,-0.633901,0
3,1.404788,1.101941,1.644044,0.301845,-1.033132,-1.050465,-0.736462,-0.822385,0.596579,1.609351,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,-0.517113,-0.564808,1.577533,0
4,1.449696,1.070286,0.632134,1.986879,0.327096,0.887154,2.288659,1.440451,-0.208536,1.607324,...,0.0,1.289812,-0.654152,0.0,-0.693718,1.684813,-0.517113,-0.564808,-0.633901,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109656,0.830441,-0.541343,0.659456,-0.486953,-0.103518,0.436286,-0.386635,1.155730,-0.851515,1.208473,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,-0.517113,-0.564808,1.577533,1
109657,0.158355,0.087040,0.016470,1.292003,1.808367,0.322611,-1.148793,0.752101,-1.156735,-1.902746,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,1.933814,-0.564808,-0.633901,0
109658,0.028464,-0.218967,0.281974,-0.737239,-0.181365,0.212946,-0.498356,-0.883835,-0.408672,-0.229255,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,1.933814,-0.564808,-0.633901,1
109659,-0.036874,-0.594499,-0.126514,-0.415996,0.639640,0.992287,-0.450277,-0.482068,-0.793719,-1.945482,...,0.0,-0.775307,1.528697,0.0,-0.693718,1.684813,-0.517113,-0.564808,-0.633901,0


In [25]:
X_train

Unnamed: 0,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,pwm_score,...,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7,label
0,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,4.386989,80.570270,6.333500,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,3.016599,94.290698,5.415790,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,2.087146,89.364324,6.107674,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,2.236520,89.154000,6.526248,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,4.260253,85.178788,6.525441,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109656,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,4.005616,82.004110,6.366738,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
109657,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,3.644638,80.497101,5.128781,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
109658,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,2.181562,84.190625,5.794665,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
109659,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,2.540877,82.289474,5.111776,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0


In [26]:
X_test

Unnamed: 0,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,pwm_score,...,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7,label
0,0.008884,4.900645,111.441935,0.010946,9.120968,121.032258,0.012280,2.513839,90.670968,4.924078,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0
1,0.012514,6.078824,116.970588,0.011593,4.955000,126.029412,0.006507,2.674206,92.835294,6.320738,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
2,0.008177,4.556774,109.096774,0.010551,6.230000,125.387097,0.007994,3.073226,82.077419,5.759568,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,0.011033,7.111515,115.818182,0.009392,6.670606,122.606061,0.011874,3.255455,90.857576,6.320738,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
4,0.006671,4.071034,110.834483,0.008029,7.431724,121.965517,0.007770,3.116897,87.524138,5.363698,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12172,0.005171,3.546190,108.071429,0.008366,8.419524,116.809524,0.005716,3.548095,79.142857,5.251657,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
12173,0.009185,2.428750,105.375000,0.007499,2.211667,96.825000,0.007478,2.254167,88.854167,6.080131,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
12174,0.008836,3.795217,120.913043,0.009140,6.997826,124.217391,0.007519,3.289565,82.286957,6.219205,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
12175,0.005975,1.985000,93.829167,0.007158,2.457083,100.995833,0.006356,2.249167,92.162500,5.579350,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0


In [27]:
X_test_scaled

Unnamed: 0,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,pwm_score,...,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7,label
0,0.433349,0.288245,0.058287,1.671152,1.815312,0.813059,4.034631,-0.512301,0.903816,-2.417203,...,0.0,-0.775307,-0.654152,0.0,1.441508,-0.593538,1.933814,-0.564808,-0.633901,0
1,2.464205,0.896658,0.547712,2.054349,-0.091947,1.214525,-0.423513,-0.332986,1.342165,1.092867,...,0.0,-0.775307,-0.654152,0.0,1.441508,1.684813,-0.517113,-0.564808,-0.633901,0
2,0.037500,0.110670,-0.149319,1.437648,0.491772,1.162922,0.725093,0.113178,-0.836667,-0.317460,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,-0.517113,1.770514,-0.633901,0
3,1.635936,1.429941,0.445695,0.751206,0.693490,0.939496,3.720834,0.316937,0.941611,1.092867,...,0.0,-0.775307,-0.654152,0.0,1.441508,1.684813,-0.517113,-0.564808,-0.633901,0
4,-0.805254,-0.140166,0.004512,-0.055253,1.041944,0.888036,0.551703,0.162008,0.266477,-1.312356,...,0.0,-0.775307,-0.654152,0.0,1.441508,-0.593538,-0.517113,-0.564808,1.577533,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12172,-1.644191,-0.411196,-0.240088,0.144060,1.494178,0.473808,-1.034369,0.644153,-1.431015,-1.593935,...,0.0,-0.775307,1.528697,0.0,-0.693718,-0.593538,-0.517113,-0.564808,1.577533,0
12173,0.601903,-0.988243,-0.478789,-0.368828,-1.347898,-1.131728,0.326741,-0.802653,0.535853,0.488175,...,0.0,1.289812,-0.654152,0.0,-0.693718,1.684813,-0.517113,-0.564808,-0.633901,0
12174,0.406430,-0.282598,0.896718,0.602340,0.843298,1.068949,0.358245,0.355077,-0.794229,0.837696,...,0.0,1.289812,-0.654152,0.0,-0.693718,1.684813,-0.517113,-0.564808,-0.633901,0
12175,-1.194772,-1.217396,-1.500886,-0.570558,-1.235541,-0.796648,-0.540063,-0.808244,1.205902,-0.770381,...,0.0,-0.775307,-0.654152,0.0,1.441508,-0.593538,-0.517113,-0.564808,1.577533,0


In [28]:
X_val

Unnamed: 0,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,pwm_score,...,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7,label
0,0.011193,6.899200,122.200000,0.009972,3.988800,95.036000,0.006014,2.003440,83.268000,5.771039,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.006806,4.252500,88.221429,0.007042,3.508929,91.032143,0.008742,2.157143,83.935714,5.160144,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,0.006211,9.455938,117.937500,0.006846,4.092812,93.318750,0.005559,1.590937,83.178125,5.638585,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,0.006895,4.659333,119.100000,0.009256,6.915333,129.500000,0.008259,3.747667,84.540000,6.790124,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.008886,5.597647,117.323529,0.007156,5.240000,121.500000,0.006392,3.253559,88.529412,6.460619,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10949,0.008662,6.390870,122.608696,0.007193,3.280435,98.408696,0.005929,2.120870,87.656522,6.227664,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
10950,0.008654,3.899200,117.160000,0.009772,6.039600,126.120000,0.009442,4.726800,80.696000,6.545333,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
10951,0.007058,4.703636,96.631818,0.009510,9.734545,114.590909,0.005405,3.561818,84.222727,5.615962,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
10952,0.005757,5.218182,88.954545,0.005589,7.836818,117.909091,0.004892,3.077273,80.586364,5.597803,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [29]:
X_val_scaled

Unnamed: 0,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1,pwm_score,...,T_5,A_6,C_6,G_6,T_6,A_7,C_7,G_7,T_7,label
0,1.725392,1.320301,1.010646,1.095017,-0.534292,-1.275455,-0.803721,-1.083003,-0.595535,-0.288630,...,0.0,-0.775307,1.528697,0.0,-0.693718,1.684813,-0.517113,-0.564808,-0.633901,0
1,-0.729699,-0.046457,-1.997312,-0.639540,-0.753986,-1.597120,1.302666,-0.911140,-0.460300,-1.823925,...,0.0,-0.775307,1.528697,0.0,-0.693718,-0.593538,1.933814,-0.564808,-0.633901,0
2,-1.062691,2.640602,0.633308,-0.755458,-0.486673,-1.413417,-1.155337,-1.544242,-0.613737,-0.621511,...,0.0,-0.775307,1.528697,0.0,-0.693718,1.684813,-0.517113,-0.564808,-0.633901,0
3,-0.679551,0.163632,0.736218,0.670801,0.805531,1.493349,0.929579,0.867303,-0.337912,2.272522,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,-0.517113,-0.564808,1.577533,0
4,0.434623,0.648178,0.578956,-0.572009,0.038531,0.850637,-0.512089,0.314817,0.470079,1.444414,...,0.0,-0.775307,-0.654152,0.0,1.441508,-0.593538,-0.517113,-0.564808,1.577533,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10949,0.309114,1.057799,1.046826,-0.550272,-0.858595,-1.004496,-0.869903,-0.951699,0.293289,0.858953,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,-0.517113,-0.564808,1.577533,0
10950,0.304764,-0.228902,0.564479,0.976405,0.404603,1.221803,1.843411,1.962121,-1.116452,1.657316,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,-0.517113,1.770514,-0.633901,0
10951,-0.588681,0.186510,-1.252781,0.821334,2.096220,0.295567,-1.273954,0.659497,-0.402170,-0.678369,...,0.0,1.289812,-0.654152,0.0,-0.693718,-0.593538,1.933814,-0.564808,-0.633901,0
10952,-1.316627,0.452222,-1.932412,-1.499354,1.227404,0.562147,-1.670589,0.117703,-1.138657,-0.724007,...,0.0,-0.775307,1.528697,0.0,-0.693718,-0.593538,1.933814,-0.564808,-0.633901,0


In [36]:
test_data1 = parse_test_data("/Users/shaun/Desktop/dataset1.json.gz")

Loading /Users/shaun/Desktop/dataset1.json.gz...
Transferring data from json to dataframe...


In [34]:
test_data2 = parse_test_data("/Users/shaun/Desktop/dataset2.json.gz")

Loading /Users/shaun/Desktop/dataset2.json.gz...
Transferring data from json to dataframe...


In [37]:
test_data1.to_csv("dataset1.csv")
test_data2.to_csv("dataset2.csv")

Save standardscaler

In [38]:
import pickle
pickle.dump(scaler, open('scaler.pkl', 'wb'))