In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_staph, load_condons

# Notebook display settings: 3 decimals for floats, and long cell values
# (the sequences) truncated to 10 characters.
pd.set_option('display.precision', 3)
pd.set_option('display.max_colwidth', 10)

In [2]:
%time records = load_staph(False)
mask = records['resp'].notna()
records.head()

CPU times: user 10.6 s, sys: 248 ms, total: 10.8 s
Wall time: 10.9 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,resp
0,NRS001,ATGAAC...,2511,0.255,ATGAAC...,2356,0.24,False
1,NRS002,------...,25278,2.571,ATGAAC...,2236,0.227,False
2,NRS003,ATGAAC...,48213,4.904,ATGAAC...,2253,0.229,False
3,NRS021,ATGAAA...,2442,0.248,ATGAAA...,2088,0.212,False
4,NRS022,ATGAAC...,3885,0.395,ATGAAC...,2154,0.219,False


# Feature selection

In [6]:
# 1*2 minutes
%time o_c = load_condons('../data/staph/core_gene_alignment-narsa.fasta')
%time i_c = load_condons('../data/staph/core_gene_alignment-narsa_naive_impute.fasta')

# 1.5 minutes
d = {}
for label, content in o_c.iteritems():
    d.update(content.value_counts().to_dict())
d_sorted = dict(sorted(d.items(), key=lambda x: x[1], reverse=True))
mapping = {key: i for i, key in enumerate(d_sorted.keys())}

import json
with open('../data/staph/preprocess/others/condon_mapping.json', 'w') as output:
    json.dump(mapping, output, indent='\t')

import json
with open('../data/staph/preprocess/others/condon_mapping.json', 'r') as input_:
    mapping = json.load(input_)

# 1*2 minutes
%time o_c_ = o_c.applymap(lambda x: mapping[x])
%time i_c_ = i_c.applymap(lambda x: mapping[x])
np.save('../data/staph/preprocess/o_c_-_-.npy', o_c_.to_numpy()[mask])
np.save('../data/staph/preprocess/i_c_-_-.npy', i_c_.to_numpy()[mask])

CPU times: user 1min 10s, sys: 2.55 s, total: 1min 12s
Wall time: 1min 15s
CPU times: user 1min 14s, sys: 4.67 s, total: 1min 19s
Wall time: 1min 18s


In [8]:
# Reload the integer-encoded codon matrices (original / imputed) from disk.
o_c_, i_c_ = (
    np.load(path)
    for path in (
        '../data/staph/preprocess/o_c_-_-.npy',
        '../data/staph/preprocess/i_c_-_-.npy',
    )
)

### Remove based on SNP counts
This is similar to a variance threshold, but it seems to work better.

In [None]:
# 4*2 minutes
%time variant_counts_o = o_c.apply(pd.Series.value_counts, axis=0)
%time variant_counts_i = i_c.apply(pd.Series.value_counts, axis=0)
np.save('../data/staph/preprocess/others/variant_counts_o.npy', variant_counts_o)
np.save('../data/staph/preprocess/others/variant_counts_i.npy', variant_counts_i)

variant_counts_o = pd.DataFrame(np.load('../data/staph/preprocess/others/variant_counts_o.npy'))
variant_counts_i = pd.DataFrame(np.load('../data/staph/preprocess/others/variant_counts_i.npy'))

# True     96576
variant_max_counts_o = variant_counts_o.max()
# True      40054
variant_max_counts_i = variant_counts_i.max()

o_c_v = o_c_[mask][:, variant_max_counts_o<124]
i_c_v = i_c_[mask][:, variant_max_counts_i<124]
np.save('../data/staph/preprocess/o_c_v_-.npy', o_c_v)
np.save('../data/staph/preprocess/i_c_v_-.npy', i_c_v)

### $\chi^2$ on the previous result
Some features are all 0's, which makes `chi2` emit a `divide by 0` warning.

The warning goes away if those features are removed in the previous step.

In [42]:
# Keep the half of the count-filtered columns with the highest chi2 score
# against the response label.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Response labels of the rows selected by `mask`, as 32-bit integers.
resp_labels = records['resp'][mask].astype('i4')

# k is derived from the actual number of columns instead of the previously
# hard-coded 96576//2 and 40054//2, so the cell keeps working when the
# upstream filter yields a different column count.
# NOTE(review): chi2 is documented for non-negative count/frequency features;
# here it is applied to integer codon codes -- confirm this is intended.
o_c_x = SelectKBest(chi2, k=o_c_v.shape[1] // 2).fit_transform(o_c_v, resp_labels)
i_c_x = SelectKBest(chi2, k=i_c_v.shape[1] // 2).fit_transform(i_c_v, resp_labels)

np.save('../data/staph/preprocess/o_c_x_-.npy', o_c_x)
np.save('../data/staph/preprocess/i_c_x_-.npy', i_c_x)

  chisq /= f_exp


# Feature extraction

In [9]:
# Reload the feature-selected matrices: `v` = count-filtered, `x` = chi2-selected.
o_c_v, i_c_v = (
    np.load(path)
    for path in (
        '../data/staph/preprocess/o_c_v_-.npy',
        '../data/staph/preprocess/i_c_v_-.npy',
    )
)
o_c_x, i_c_x = (
    np.load(path)
    for path in (
        '../data/staph/preprocess/o_c_x_-.npy',
        '../data/staph/preprocess/i_c_x_-.npy',
    )
)

### String kernel

In [None]:
# takes very long but about one night
from strkernel.mismatch_kernel import MismatchKernel

%time o_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_)
%time i_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_)
np.save('../data/staph/preprocess/o_c_-_s.npy', o_c__s.kernel)
np.save('../data/staph/preprocess/i_c_-_s.npy', i_c__s.kernel)

%time o_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_v)
%time i_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_v)
np.save('../data/staph/preprocess/o_c_v_s.npy', o_c_v_s.kernel)
np.save('../data/staph/preprocess/i_c_v_s.npy', i_c_v_s.kernel)

%time o_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_x)
%time i_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_x)
np.save('../data/staph/preprocess/o_c_x_s.npy', o_c_x_s.kernel)
np.save('../data/staph/preprocess/i_c_x_s.npy', i_c_x_s.kernel)

### PCA

In [10]:
from sklearn.decomposition import PCA

%time o_c__p = PCA(n_components=124).fit_transform(o_c_)
%time i_c__p = PCA(n_components=124).fit_transform(i_c_)
np.save('../data/staph/preprocess/o_c_-_p.npy', o_c__p)
np.save('../data/staph/preprocess/i_c_-_p.npy', i_c__p)

%time o_c_v_p = PCA(n_components=124).fit_transform(o_c_v)
%time i_c_v_p = PCA(n_components=124).fit_transform(i_c_v)
np.save('../data/staph/preprocess/o_c_v_p.npy', o_c_v_p)
np.save('../data/staph/preprocess/i_c_v_p.npy', i_c_v_p)

%time o_c_x_p = PCA(n_components=124).fit_transform(o_c_x)
%time i_c_x_p = PCA(n_components=124).fit_transform(i_c_x)
np.save('../data/staph/preprocess/o_c_x_p.npy', o_c_x_p)
np.save('../data/staph/preprocess/i_c_x_p.npy', i_c_x_p)

CPU times: user 4min 24s, sys: 19.1 s, total: 4min 43s
Wall time: 11.3 s
CPU times: user 3min 59s, sys: 21.9 s, total: 4min 21s
Wall time: 11.7 s
CPU times: user 55.4 s, sys: 7.39 s, total: 1min 2s
Wall time: 3.09 s
CPU times: user 32.7 s, sys: 4.58 s, total: 37.3 s
Wall time: 1.66 s
CPU times: user 35 s, sys: 5.21 s, total: 40.2 s
Wall time: 1.85 s
CPU times: user 21.7 s, sys: 2.88 s, total: 24.6 s
Wall time: 1.02 s


### TSNE

In [11]:
from sklearn.manifold import TSNE

%time o_c__t = TSNE(n_components=3).fit_transform(o_c_)
%time i_c__t = TSNE(n_components=3).fit_transform(i_c_)
np.save('../data/staph/preprocess/o_c_-_t.npy', o_c__t)
np.save('../data/staph/preprocess/i_c_-_t.npy', i_c__t)

%time o_c_v_t = TSNE(n_components=3).fit_transform(o_c_v)
%time i_c_v_t = TSNE(n_components=3).fit_transform(i_c_v)
np.save('../data/staph/preprocess/o_c_v_t.npy', o_c_v_t)
np.save('../data/staph/preprocess/i_c_v_t.npy', i_c_v_t)

%time o_c_x_t = TSNE(n_components=3).fit_transform(o_c_x)
%time i_c_x_t = TSNE(n_components=3).fit_transform(i_c_x)
np.save('../data/staph/preprocess/o_c_x_t.npy', o_c_x_t)
np.save('../data/staph/preprocess/i_c_x_t.npy', i_c_x_t)

CPU times: user 45.3 s, sys: 2.5 s, total: 47.8 s
Wall time: 4.03 s
CPU times: user 47.4 s, sys: 2.43 s, total: 49.8 s
Wall time: 4.38 s
CPU times: user 43 s, sys: 2.61 s, total: 45.6 s
Wall time: 4.42 s
CPU times: user 19.1 s, sys: 1.04 s, total: 20.1 s
Wall time: 3.02 s
CPU times: user 27.3 s, sys: 1.33 s, total: 28.7 s
Wall time: 3.98 s
CPU times: user 9.44 s, sys: 2.28 s, total: 11.7 s
Wall time: 3.36 s


In [52]:
# Sanity check: every expected output file is present in the preprocess directory.
# The expected names cover impute (i/o) x c_or_n (n/c) x selection (-/v/x)
# x extraction (-/p/t/s). The `n` files are not written by the cells above.
import os
d = os.listdir('../data/staph/preprocess/')
s = {
    '{}_{}_{}_{}.npy'.format(impute, c_or_n, selection, extraction)
    for impute in 'io'
    for c_or_n in 'nc'
    for selection in '-vx'
    for extraction in '-pts'
}
# True when no expected file name is missing from the directory listing.
s.issubset(d)

True

# Encoding

In [4]:
import os

random_state = 42

# File names of every codon-level (`c`) array: impute x selection x extraction.
s = {
    '{}_c_{}_{}.npy'.format(impute, selection, extraction)
    for impute in 'io'
    for selection in '-vx'
    for extraction in '-pts'
}


def _load_preprocessed(name):
    """Load one array, by file name, from the preprocess directory."""
    return np.load(os.path.join('../data/staph/preprocess', name))


# File name -> loaded array.
data = {name: _load_preprocessed(name) for name in s}

In [None]:
# One-hot encode every loaded array and save it under preprocess/onehot.
from sklearn.preprocessing import OneHotEncoder

onehot_dir = '../data/staph/preprocess/onehot'
# np.save does not create missing directories.
os.makedirs(onehot_dir, exist_ok=True)


def _dense_int32_encoder():
    """Build a OneHotEncoder that returns a dense int32 array.

    scikit-learn renamed `sparse` to `sparse_output` (added in 1.2, old name
    removed in 1.4); try the new name first and fall back for older versions.
    """
    try:
        return OneHotEncoder(categories='auto', sparse_output=False, dtype=np.int32)
    except TypeError:
        return OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)


# `data` also holds the PCA, t-SNE and kernel outputs; one-hot encoding treats
# every distinct value in a column as its own category.
# NOTE(review): confirm that encoding those real-valued arrays is intended.
for file, X in data.items():
    encoder = _dense_int32_encoder()
    X_encode = encoder.fit_transform(X)
    np.save(os.path.join(onehot_dir, file), X_encode)