In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_pseudo, load_condons

pd.options.display.precision = 3
pd.options.display.max_colwidth = 10

%matplotlib inline

In [2]:
%time records = load_pseudo()
numerical_response = pd.read_csv('../data/pseudo/Perron_phenotype-GSU-training.csv')
records = records.merge(numerical_response[['strain', 'carb.lag.delta', 'toby.lag.delta']],
                        left_on='lab-id', right_on='strain', how='left')
records.rename(columns={'carb.lag.delta': 'carb_num', 'toby.lag.delta': 'toby_num'}, inplace=True)
records.drop(columns=['strain', 'lab-id'], inplace=True)
records.head()

CPU times: user 4.44 s, sys: 116 ms, total: 4.55 s
Wall time: 4.57 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,carb,toby,carb_num,toby_num
0,TA151,ATGAGT...,31842,6.588,ATGAGT...,28410,5.878,True,False,-2.0,16.0
1,IC1,ATGAGT...,46071,9.532,ATGAGT...,34714,7.182,False,False,2.0,14.0
2,A237,ATGAGT...,44514,9.21,ATGAGT...,35933,7.434,True,False,-1.0,4.0
3,5920,ATGAGT...,49497,10.241,ATGAGT...,36873,7.629,,,,
4,LiA96,ATGAGT...,44067,9.117,ATGAGT...,34454,7.128,False,False,0.0,18.0


In [3]:
# Boolean row selector: True where both the 'toby' and 'carb' labels are present.
mask = records[['toby', 'carb']].notna().all(axis=1)

# Feature selection

In [4]:
# 22 seconds
%time o_c = load_condons('../data/pseudo/concatenated.fasta')
%time i_c = load_condons('../data/pseudo/concatenated_naive_impute.fasta')

# 1.5 minutes
d = {}
for label, content in o_c.iteritems():
    d.update(content.value_counts().to_dict())
d_sorted = dict(sorted(d.items(), key=lambda x: x[1], reverse=True))
mapping = {key: i for i, key in enumerate(d_sorted.keys())}

import json
with open('../data/pseudo/preprocess/others/condon_mapping.json', 'w') as output:
    json.dump(mapping, output, indent='\t')

import json
with open('../data/pseudo/preprocess/others/condon_mapping.json', 'r') as input_:
    mapping = json.load(input_)

# 22 seconds
%time o_c_ = o_c.applymap(lambda x: mapping[x])
%time i_c_ = i_c.applymap(lambda x: mapping[x])
np.save('../data/pseudo/preprocess/o_c_-_-.npy', o_c_)
np.save('../data/pseudo/preprocess/i_c_-_-.npy', i_c_)

In [5]:
# Reload the integer-encoded matrices from the .npy files; from this cell on,
# `o_c_` and `i_c_` are the arrays returned by np.load.
o_c_ = np.load('../data/pseudo/preprocess/o_c_-_-.npy')
i_c_ = np.load('../data/pseudo/preprocess/i_c_-_-.npy')

## Remove based on SNP counts
This is similar to a variance threshold, but seems to work better: a column is kept only if its most common variant occurs fewer than 121 times.

In [6]:
# 2 minutes
%time variant_counts_o = o_c.apply(pd.Series.value_counts, axis=0)
%time variant_counts_i = i_c.apply(pd.Series.value_counts, axis=0)
np.save('../data/pseudo/preprocess/others/variant_counts_o.npy', variant_counts_o)
np.save('../data/pseudo/preprocess/others/variant_counts_i.npy', variant_counts_i)

variant_counts_o = pd.DataFrame(np.load('../data/pseudo/preprocess/others/variant_counts_o.npy'))
variant_counts_i = pd.DataFrame(np.load('../data/pseudo/preprocess/others/variant_counts_i.npy'))

# True     85753
variant_max_counts_o = variant_counts_o.max()
(pd.Series(variant_max_counts_o<121)).value_counts()

# True      56191
variant_max_counts_i = variant_counts_i.max()
(variant_max_counts_i<121).value_counts()

o_c_v = o_c_[mask][:, variant_max_counts_o<121]
i_c_v = i_c_[mask][:, variant_max_counts_i<121]
np.save('../data/pseudo/preprocess/o_c_v_-.npy', o_c_v)
np.save('../data/pseudo/preprocess/i_c_v_-.npy', i_c_v)

## $\chi^2$ on the previous result
Running $\chi^2$ on the unfiltered data gives a `divide by 0` warning, because some features are all 0's.

The warning disappears once those features are removed (in the previous step).

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Target for the chi-squared scoring: the 'toby' label of the labelled rows,
# cast to 32-bit integers. `.loc` avoids the chained `records['toby'][mask]`.
# NOTE(review): only 'toby' drives the selection; the same selected features
# are presumably reused for 'carb' as well — confirm that this is intended.
toby_labels = records.loc[mask, 'toby'].astype('i4')

# Keep the best-scoring half of the columns. `k` is derived from the actual
# width of each matrix rather than hard-coded (85753 // 2 and 56191 // 2), so
# the cell keeps working if the upstream filter retains a different number of
# columns.
o_c_x = SelectKBest(chi2, k=o_c_v.shape[1] // 2).fit_transform(o_c_v, toby_labels)
i_c_x = SelectKBest(chi2, k=i_c_v.shape[1] // 2).fit_transform(i_c_v, toby_labels)

np.save('../data/pseudo/preprocess/o_c_x_-.npy', o_c_x)
np.save('../data/pseudo/preprocess/i_c_x_-.npy', i_c_x)

# Feature extraction

In [8]:
# Reload the feature-selected matrices from disk: `*_v` are the outputs of the
# SNP-count filter, `*_x` the outputs of the chi-squared selection.
o_c_v = np.load('../data/pseudo/preprocess/o_c_v_-.npy')
i_c_v = np.load('../data/pseudo/preprocess/i_c_v_-.npy')
o_c_x = np.load('../data/pseudo/preprocess/o_c_x_-.npy')
i_c_x = np.load('../data/pseudo/preprocess/i_c_x_-.npy')

## String kernel

In [9]:
from strkernel.mismatch_kernel import MismatchKernel

%time o_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_)
%time i_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_)
np.save('../data/pseudo/preprocess/o_c_-_s.npy', o_c__s.kernel)
np.save('../data/pseudo/preprocess/i_c_-_s.npy', i_c__s.kernel)

%time o_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_v)
%time i_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_v)
np.save('../data/pseudo/preprocess/o_c_v_s.npy', o_c_v_s.kernel)
np.save('../data/pseudo/preprocess/i_c_v_s.npy', i_c_v_s.kernel)

%time o_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_x)
%time i_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_x)
np.save('../data/pseudo/preprocess/o_c_x_s.npy', o_c_x_s.kernel)
np.save('../data/pseudo/preprocess/i_c_x_s.npy', i_c_x_s.kernel)

## PCA

In [10]:
from sklearn.decomposition import PCA

%time o_c__p = PCA(n_components=119).fit_transform(o_c_)
%time i_c__p = PCA(n_components=119).fit_transform(i_c_)
np.save('../data/pseudo/preprocess/o_c_-_p.npy', o_c__p)
np.save('../data/pseudo/preprocess/i_c_-_p.npy', i_c__p)

%time o_c_v_p = PCA(n_components=119).fit_transform(o_c_v)
%time i_c_v_p = PCA(n_components=119).fit_transform(i_c_v)
np.save('../data/pseudo/preprocess/o_c_v_p.npy', o_c_v_p)
np.save('../data/pseudo/preprocess/i_c_v_p.npy', i_c_v_p)

%time o_c_x_p = PCA(n_components=119).fit_transform(o_c_x)
%time i_c_x_p = PCA(n_components=119).fit_transform(i_c_x)
np.save('../data/pseudo/preprocess/o_c_x_p.npy', o_c_x_p)
np.save('../data/pseudo/preprocess/i_c_x_p.npy', i_c_x_p)

## TSNE

In [11]:
from sklearn.manifold import TSNE

%time o_c__t = TSNE(n_components=3).fit_transform(o_c_)
%time i_c__t = TSNE(n_components=3).fit_transform(i_c_)
np.save('../data/pseudo/preprocess/o_c_-_t.npy', o_c__t)
np.save('../data/pseudo/preprocess/i_c_-_t.npy', i_c__t)

%time o_c_v_t = TSNE(n_components=3).fit_transform(o_c_v)
%time i_c_v_t = TSNE(n_components=3).fit_transform(i_c_v)
np.save('../data/pseudo/preprocess/o_c_v_t.npy', o_c_v_t)
np.save('../data/pseudo/preprocess/i_c_v_t.npy', i_c_v_t)

%time o_c_x_t = TSNE(n_components=3).fit_transform(o_c_x)
%time i_c_x_t = TSNE(n_components=3).fit_transform(i_c_x)
np.save('../data/pseudo/preprocess/o_c_x_t.npy', o_c_x_t)
np.save('../data/pseudo/preprocess/i_c_x_t.npy', i_c_x_t)

In [12]:
# verify all possible combinations are created
import os

# Everything currently in the preprocess directory.
d = os.listdir('../data/pseudo/preprocess/')

# Expected file names: impute ('i'/'o') x encoding ('n'/'c') x selection
# ('-'/'v'/'x') x extraction ('-'/'p'/'t'/'s').
# NOTE(review): the 'n' variants are not written by the cells shown here —
# presumably another notebook produces them.
s = {
    '{}_{}_{}_{}.npy'.format(impute, c_or_n, selection, extraction)
    for impute in 'io'
    for c_or_n in 'nc'
    for selection in '-vx'
    for extraction in '-pts'
}

# True when no expected file is missing.
s.issubset(d)

True

# Machine learning

In [None]:
import os
import pickle
from sklearn.preprocessing import OneHotEncoder

# Seed value; not referenced by the cells visible here — presumably passed to
# the estimators later in the notebook (TODO confirm).
random_state = 42

In [13]:
# read the numerical data and use onehot encoder
s = {'{}_c_{}_{}.npy'.format(impute, selection, extraction)
     for impute in 'io'
     for selection in '-vx'
     for extraction in '-pts'}

data = {d: np.load(os.path.join('../data/pseudo/preprocess', d)) for d in s}
# mask all data to remove x with NAN labels
for k, v in data.items():
    if v.shape[0] != 119:
        data[k] = v[mask]

for file, X in data.items():
    encoder = OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)
    %time X_encode = encoder.fit_transform(X)
    
    np.save(os.path.join('../data/pseudo/preprocess/onehot/', file), X_encode)
    with open(os.path.join('../data/pseudo/preprocess/onehot-encoder/', file[:file.index('.')]), 'wb') as output:
        pickle.dump(encoder, output)