In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_pseudo, load_nucleotides

# Display options: show floats with 3 decimal places and truncate wide cell
# contents (such as the long sequence strings) to 10 characters.
pd.options.display.precision = 3
pd.options.display.max_colwidth = 10

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
%time records = load_pseudo()
mask = (records['toby'].notna() & records['carb'].notna())
records.head()

CPU times: user 5.31 s, sys: 120 ms, total: 5.43 s
Wall time: 5.44 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,lab-id,carb,toby
0,TA151,ATGAGT...,31842,6.588,ATGAGT...,28410,5.878,210.0,True,False
1,IC1,ATGAGT...,46071,9.532,ATGAGT...,34714,7.182,55.0,False,False
2,A237,ATGAGT...,44514,9.21,ATGAGT...,35933,7.434,14.0,True,False
3,5920,ATGAGT...,49497,10.241,ATGAGT...,36873,7.629,,,
4,LiA96,ATGAGT...,44067,9.117,ATGAGT...,34454,7.128,175.0,False,False


# Feature selection

In [4]:
# 40*2 seconds
# Read the two concatenated FASTA files (the plain one and the
# "naive impute" one, going by the file names) with load_nucleotides.
# NOTE(review): o_n and i_n are reassigned from records['sequence'] and
# records['sequence_i'] further down in this same cell, so these two results
# appear to be overwritten before they are used — confirm whether both
# loading paths are still needed.
%time o_n = load_nucleotides('../data/pseudo/concatenated.fasta')
%time i_n = load_nucleotides('../data/pseudo/concatenated_naive_impute.fasta')

# 45*2 seconds
# Translation table mapping each sequence symbol to a digit character:
# '-' -> '0', 'A' -> '1', 'C' -> '2', 'T' -> '3', 'G' -> '4'.
forward = str.maketrans('-ACTG', '01234')
def transformation(str):
    """Encode a sequence string as a list of integer codes.

    Each character is translated through ``forward`` and the resulting digit
    characters are converted to ints, so the output has one entry per input
    character. A character that does not translate to a digit makes ``int``
    raise ``ValueError``.

    Note: the parameter is named ``str`` and therefore shadows the builtin
    inside this function; ``str.translate`` below is a method call on the
    argument, not on the builtin type.
    """
    digits = str.translate(forward)
    return list(map(int, digits))
%time o_n = pd.DataFrame(records['sequence'].apply(transformation).to_list())
%time i_n = pd.DataFrame(records['sequence_i'].apply(transformation).to_list())
np.save('../data/pseudo/preprocess/o_n_-_-.npy', o_n[mask])
np.save('../data/pseudo/preprocess/i_n_-_-.npy', i_n[mask])

CPU times: user 45 s, sys: 1.01 s, total: 46 s
Wall time: 42.5 s
CPU times: user 47.2 s, sys: 972 ms, total: 48.1 s
Wall time: 42.5 s


In [6]:
# Reload the encoded matrices written by the previous cell. The saved files
# contain only the rows selected by `mask`, and np.load returns plain NumPy
# arrays, so from here on o_n / i_n are ndarrays rather than DataFrames.
o_n = np.load('../data/pseudo/preprocess/o_n_-_-.npy')
i_n = np.load('../data/pseudo/preprocess/i_n_-_-.npy')

### Variance threshold (not as good as the one based on SNP counts)

In [7]:
from sklearn.feature_selection import VarianceThreshold
# Feature selector that removes low-variance columns (threshold 0.01).
selector = VarianceThreshold(0.01)

# justification (not rigorous) for why < 0.016 is the threshold to drop a column:
# a column of 122 values in which only 2 entries differ from the rest by 1
# has variance (2/122) * (120/122) ~= 0.016.
a, b = 4, 3
arr = np.ones((122, 1))*a
arr[:2] = b
# Only the last expression of a cell is displayed, so intermediate results
# are printed explicitly instead of being silently discarded.
print(np.var(arr))

# Fit on the first matrix and record which columns were kept.
o_n_v = selector.fit_transform(o_n)
o_n_v_selected = pd.Series(selector.get_support())
print(o_n_v_selected.value_counts())

# The same selector object is re-fitted on the second matrix.
i_n_v = selector.fit_transform(i_n)
i_n_v_selected = pd.Series(selector.get_support())
i_n_v_selected.value_counts()

### Remove based on SNP counts
Similar to the variance threshold, but seems to work better.

In [None]:
# less than 6*2 minutes
%time snp_counts_o = o_n.apply(pd.Series.value_counts, axis=0)
%time snp_counts_i = i_n.apply(pd.Series.value_counts, axis=0)
np.save('../data/pseudo/preprocess/others/snp_counts_o.npy', snp_counts_o.to_numpy())
np.save('../data/pseudo/preprocess/others/snp_counts_i.npy', snp_counts_i.to_numpy())

In [8]:
# Reload the cached per-column value counts.
snp_counts_o = pd.DataFrame(np.load('../data/pseudo/preprocess/others/snp_counts_o.npy'))
snp_counts_i = pd.DataFrame(np.load('../data/pseudo/preprocess/others/snp_counts_i.npy'))

# For every column, the count of its most frequent value.
# True     204101  (recorded earlier; presumably the number of columns kept below)
snp_max_counts_o = snp_counts_o.max()
# True     91789   (recorded earlier; presumably the number of columns kept below)
snp_max_counts_i = snp_counts_i.max()


def _labelled_rows(samples):
    """Return `samples` as a DataFrame restricted to the rows selected by `mask`.

    `samples` may be the full DataFrame built from `records` (one row per
    record) or the array reloaded from the preprocess directory, which was
    saved with `mask` already applied. The mask is therefore only applied when
    the row count still matches `mask`; otherwise the rows are used as they are.
    Indexing the reloaded ndarray with `[mask].loc[...]` directly would fail,
    because ndarrays have no `.loc`.
    """
    frame = pd.DataFrame(samples)
    if len(frame) == len(mask):
        return frame[mask.to_numpy()]
    return frame


# Keep only the columns whose most frequent value occurs fewer than 121 times.
o_n_v = _labelled_rows(o_n).loc[:, (snp_max_counts_o<121)]
i_n_v = _labelled_rows(i_n).loc[:, (snp_max_counts_i<121)]
np.save('../data/pseudo/preprocess/o_n_v_-.npy', o_n_v)
np.save('../data/pseudo/preprocess/i_n_v_-.npy', i_n_v)

### $\chi^2$ on the previous result
Running $\chi^2$ on the unfiltered data gives a `divide by 0` warning, because some features are all 0's.

There is no warning if those features are removed first (in the previous step).

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the best-scoring half of the columns under the chi-squared test against
# the 'toby' label (cast to int32, restricted to the rows selected by `mask`).
# k is derived from the actual number of columns instead of being hard-coded;
# the previous literals (204101//2 and 91789//2) appear to be the column counts
# recorded for the original dataset, so the result is unchanged for that data
# while other inputs no longer risk requesting more features than exist.
o_n_x = SelectKBest(chi2, k=o_n_v.shape[1]//2).fit_transform(o_n_v, records['toby'][mask].astype('i4'))
i_n_x = SelectKBest(chi2, k=i_n_v.shape[1]//2).fit_transform(i_n_v, records['toby'][mask].astype('i4'))
np.save('../data/pseudo/preprocess/o_n_x_-.npy', o_n_x)
np.save('../data/pseudo/preprocess/i_n_x_-.npy', i_n_x)

# Feature extraction

In [12]:
# Reload the feature-selected matrices from disk as NumPy arrays:
# *_v_-  : columns kept by the SNP-count filter,
# *_x_-  : columns kept by the chi-squared selection.
o_n_v = np.load('../data/pseudo/preprocess/o_n_v_-.npy')
i_n_v = np.load('../data/pseudo/preprocess/i_n_v_-.npy')
o_n_x = np.load('../data/pseudo/preprocess/o_n_x_-.npy')
i_n_x = np.load('../data/pseudo/preprocess/i_n_x_-.npy')

### String kernel

In [13]:
from strkernel.mismatch_kernel import MismatchKernel

# 9*2 minutes
%time o_n__s = MismatchKernel(l=5, k=4, m=1).get_kernel(o_n)
%time i_n__s = MismatchKernel(l=5, k=4, m=1).get_kernel(i_n)
np.save('../data/pseudo/preprocess/o_n_-_s.npy', o_n__s.kernel)
np.save('../data/pseudo/preprocess/i_n_-_s.npy', i_n__s.kernel)

# 3+1 minutes
%time o_n_v_s = MismatchKernel(l=5, k=4, m=1).get_kernel(o_n_v)
%time i_n_v_s = MismatchKernel(l=5, k=4, m=1).get_kernel(i_n_v)
np.save('../data/pseudo/preprocess/o_n_v_s.npy', o_n_v_s.kernel)
np.save('../data/pseudo/preprocess/i_n_v_s.npy', i_n_v_s.kernel)

# 2+1 minutes
%time o_n_x_s = MismatchKernel(l=5, k=4, m=1).get_kernel(o_n_x)
%time i_n_x_s = MismatchKernel(l=5, k=4, m=1).get_kernel(i_n_x)
np.save('../data/pseudo/preprocess/o_n_x_s.npy', o_n_x_s.kernel)
np.save('../data/pseudo/preprocess/i_n_x_s.npy', i_n_x_s.kernel)

CPU times: user 1h 20min 23s, sys: 5min 38s, total: 1h 26min 2s
Wall time: 8min 49s
CPU times: user 1h 13min 33s, sys: 5min 33s, total: 1h 19min 7s
Wall time: 9min 24s
CPU times: user 7min 14s, sys: 1min 4s, total: 8min 19s
Wall time: 3min 44s
CPU times: user 1min 29s, sys: 5.91 s, total: 1min 35s
Wall time: 1min 35s
CPU times: user 1min 42s, sys: 8.22 s, total: 1min 50s
Wall time: 1min 50s
CPU times: user 45.9 s, sys: 3.78 s, total: 49.7 s
Wall time: 49.8 s


### PCA

In [14]:
from sklearn.decomposition import PCA

%time o_n__p = PCA(n_components=119).fit_transform(o_n)
%time i_n__p = PCA(n_components=119).fit_transform(i_n)
np.save('../data/pseudo/preprocess/o_n_-_p.npy', o_n__p)
np.save('../data/pseudo/preprocess/i_n_-_p.npy', i_n__p)

%time o_n_v_p = PCA(n_components=119).fit_transform(o_n_v)
%time i_n_v_p = PCA(n_components=119).fit_transform(i_n_v)
np.save('../data/pseudo/preprocess/o_n_v_p.npy', o_n_v_p)
np.save('../data/pseudo/preprocess/i_n_v_p.npy', i_n_v_p)

%time o_n_x_p = PCA(n_components=119).fit_transform(o_n_x)
%time i_n_x_p = PCA(n_components=119).fit_transform(i_n_x)
np.save('../data/pseudo/preprocess/o_n_x_p.npy', o_n_x_p)
np.save('../data/pseudo/preprocess/i_n_x_p.npy', i_n_x_p)

CPU times: user 6min 34s, sys: 21.5 s, total: 6min 55s
Wall time: 14.7 s
CPU times: user 6min, sys: 19.8 s, total: 6min 19s
Wall time: 13.7 s
CPU times: user 2min 21s, sys: 8.61 s, total: 2min 30s
Wall time: 5.76 s
CPU times: user 1min 8s, sys: 7.81 s, total: 1min 16s
Wall time: 2.73 s
CPU times: user 1min 21s, sys: 7.9 s, total: 1min 28s
Wall time: 3.34 s
CPU times: user 44.4 s, sys: 5.28 s, total: 49.7 s
Wall time: 1.79 s


### T-SNE

In [15]:
from sklearn.manifold import TSNE

%time o_n__t = TSNE(n_components=3).fit_transform(o_n)
%time i_n__t = TSNE(n_components=3).fit_transform(i_n)
np.save('../data/pseudo/preprocess/o_n_-_t.npy', o_n__t)
np.save('../data/pseudo/preprocess/i_n_-_t.npy', i_n__t)

%time o_n_v_t = TSNE(n_components=3).fit_transform(o_n_v)
%time i_n_v_t = TSNE(n_components=3).fit_transform(i_n_v)
np.save('../data/pseudo/preprocess/o_n_v_t.npy', o_n_v_t)
np.save('../data/pseudo/preprocess/i_n_v_t.npy', i_n_v_t)

%time o_n_x_t = TSNE(n_components=3).fit_transform(o_n_x)
%time i_n_x_t = TSNE(n_components=3).fit_transform(i_n_x)
np.save('../data/pseudo/preprocess/o_n_x_t.npy', o_n_x_t)
np.save('../data/pseudo/preprocess/i_n_x_t.npy', i_n_x_t)

CPU times: user 3min 28s, sys: 12.8 s, total: 3min 41s
Wall time: 10.6 s
CPU times: user 3min 12s, sys: 11.9 s, total: 3min 24s
Wall time: 10 s
CPU times: user 1min 18s, sys: 4.59 s, total: 1min 23s
Wall time: 5.35 s
CPU times: user 44.4 s, sys: 2.52 s, total: 46.9 s
Wall time: 4 s
CPU times: user 43.8 s, sys: 2.44 s, total: 46.2 s
Wall time: 4.04 s
CPU times: user 21.4 s, sys: 1.33 s, total: 22.8 s
Wall time: 3.53 s


# Encoding

In [7]:
import os

# File names of every preprocessed matrix, following the pattern
# <impute>_n_<selection>_<extraction>.npy with
#   impute     in 'i', 'o'
#   selection  in '-', 'v', 'x'
#   extraction in '-', 'p', 't', 's'
s = {
    '{}_n_{}_{}.npy'.format(impute, selection, extraction)
    for extraction in '-pts'
    for selection in '-vx'
    for impute in 'io'
}

# Map each file name to the array stored under that name.
data = dict(
    (d, np.load(os.path.join('../data/pseudo/preprocess', d)))
    for d in s
)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every loaded matrix and write the dense int32 result to the
# 'onehot' sub-directory under the same file name.
# NOTE(review): `data` also holds the PCA / t-SNE / string-kernel outputs;
# confirm that one-hot encoding those matrices is intended.
for file, X in data.items():
    # A fresh encoder per file: the categories are learned from that file only.
    encoder = OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)
    encoder.fit(X)
    X_encode = encoder.transform(X)
    np.save(os.path.join('../data/pseudo/preprocess/onehot', file), X_encode)