In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_staph, load_condons

pd.options.display.precision = 3
pd.options.display.max_colwidth = 10

In [2]:
%time records = load_staph()
numerical_response = pd.read_csv('../data/staph/nrs_metadata3.txt', delimiter='\t')
numerical_response
records = records.merge(numerical_response[['sample_tag', 'Total.Area']],
                        left_on='id', right_on='sample_tag', how='left')
records.drop(columns='sample_tag', inplace=True)
records.head()

CPU times: user 10.4 s, sys: 245 ms, total: 10.6 s
Wall time: 10.7 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,resp,Total.Area
0,NRS001,ATGAAC...,2511,0.255,ATGAAC...,2356,0.24,False,0.0
1,NRS002,------...,25278,2.571,ATGAAC...,2236,0.227,False,0.0
2,NRS003,ATGAAC...,48213,4.904,ATGAAC...,2253,0.229,False,0.0
3,NRS021,ATGAAA...,2442,0.248,ATGAAA...,2088,0.212,False,473.152
4,NRS022,ATGAAC...,3885,0.395,ATGAAC...,2154,0.219,False,6686.806


In [3]:
mask = records['resp'].notna()

# Feature selection

In [None]:
# 48 seconds
%time o_c = load_condons('../data/staph/core_gene_alignment-narsa.fasta')
%time i_c = load_condons('../data/staph/core_gene_alignment-narsa_naive_impute.fasta')

# 1.5 minutes
d = {}
for label, content in o_c.iteritems():
    d.update(content.value_counts().to_dict())
d_sorted = dict(sorted(d.items(), key=lambda x: x[1], reverse=True))
mapping = {key: i for i, key in enumerate(d_sorted.keys())}

import json
with open('../data/staph/preprocess/others/condon_mapping.json', 'w') as output:
    json.dump(mapping, output, indent='\t')

import json
with open('../data/staph/preprocess/others/condon_mapping.json', 'r') as input_:
    mapping = json.load(input_)

# 22 seconds
%time o_c_ = o_c.applymap(lambda x: mapping[x])
%time i_c_ = i_c.applymap(lambda x: mapping[x])
np.save('../data/staph/preprocess/o_c_-_-.npy', o_c_)
np.save('../data/staph/preprocess/i_c_-_-.npy', i_c_)

In [40]:
o_c_ = np.load('../data/staph/preprocess/o_c_-_-.npy')
i_c_ = np.load('../data/staph/preprocess/i_c_-_-.npy')

## Remove based on SNP counts
similar to variance threshold but seems better

In [None]:
# 4 minutes each
%time variant_counts_o = o_c.apply(pd.Series.value_counts, axis=0)
%time variant_counts_i = i_c.apply(pd.Series.value_counts, axis=0)
np.save('../data/staph/preprocess/others/variant_counts_o.npy', variant_counts_o)
np.save('../data/staph/preprocess/others/variant_counts_i.npy', variant_counts_i)

variant_counts_o = pd.DataFrame(np.load('../data/staph/preprocess/others/variant_counts_o.npy'))
variant_counts_i = pd.DataFrame(np.load('../data/staph/preprocess/others/variant_counts_i.npy'))

# True     96576
variant_max_counts_o = variant_counts_o.max()
(pd.Series(variant_max_counts_o<124)).value_counts()

# True      40054
variant_max_counts_i = variant_counts_i.max()
(variant_max_counts_i<124).value_counts()

o_c_v = o_c_[mask][:, variant_max_counts_o<124]
i_c_v = i_c_[mask][:, variant_max_counts_i<124]
np.save('../data/staph/preprocess/o_c_v_-.npy', o_c_v)
np.save('../data/staph/preprocess/i_c_v_-.npy', i_c_v)

## $\chi^2$ on the previous result
because some features are all 0's, so gives `divide by 0` warning

no warning if we remove those features (on the previous step)

In [42]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

o_c_x = SelectKBest(chi2, k=96576//2).fit_transform(o_c_v, records['resp'][mask].astype('i4'))
i_c_x = SelectKBest(chi2, k=40054//2).fit_transform(i_c_v, records['resp'][mask].astype('i4'))

np.save('../data/staph/preprocess/o_c_x_-.npy', o_c_x)
np.save('../data/staph/preprocess/i_c_x_-.npy', i_c_x)

  chisq /= f_exp


# Feature extraction

In [43]:
o_c_v = np.load('../data/staph/preprocess/o_c_v_-.npy')
i_c_v = np.load('../data/staph/preprocess/i_c_v_-.npy')
o_c_x = np.load('../data/staph/preprocess/o_c_x_-.npy')
i_c_x = np.load('../data/staph/preprocess/i_c_x_-.npy')

## String kernel

In [None]:
from strkernel.mismatch_kernel import MismatchKernel

%time o_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_)
%time i_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_)
np.save('../data/staph/preprocess/o_c_-_s.npy', o_c__s.kernel)
np.save('../data/staph/preprocess/i_c_-_s.npy', i_c__s.kernel)

In [None]:
%time o_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_v)
%time i_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_v)
np.save('../data/staph/preprocess/o_c_v_s.npy', o_c_v_s.kernel)
np.save('../data/staph/preprocess/i_c_v_s.npy', i_c_v_s.kernel)

In [None]:
%time o_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_x)
%time i_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_x)
np.save('../data/staph/preprocess/o_c_x_s.npy', o_c_x_s.kernel)
np.save('../data/staph/preprocess/i_c_x_s.npy', i_c_x_s.kernel)

## PCA

In [46]:
from sklearn.decomposition import PCA

%time o_c__p = PCA(n_components=124).fit_transform(o_c_)
%time i_c__p = PCA(n_components=124).fit_transform(i_c_)
np.save('../data/staph/preprocess/o_c_-_p.npy', o_c__p)
np.save('../data/staph/preprocess/i_c_-_p.npy', i_c__p)

%time o_c_v_p = PCA(n_components=124).fit_transform(o_c_v)
%time i_c_v_p = PCA(n_components=124).fit_transform(i_c_v)
np.save('../data/staph/preprocess/o_c_v_p.npy', o_c_v_p)
np.save('../data/staph/preprocess/i_c_v_p.npy', i_c_v_p)

%time o_c_x_p = PCA(n_components=124).fit_transform(o_c_x)
%time i_c_x_p = PCA(n_components=124).fit_transform(i_c_x)
np.save('../data/staph/preprocess/o_c_x_p.npy', o_c_x_p)
np.save('../data/staph/preprocess/i_c_x_p.npy', i_c_x_p)

CPU times: user 3min 21s, sys: 6.23 s, total: 3min 27s
Wall time: 6.36 s
CPU times: user 3min 15s, sys: 6.23 s, total: 3min 21s
Wall time: 6.02 s
CPU times: user 46.1 s, sys: 1.61 s, total: 47.7 s
Wall time: 1.43 s
CPU times: user 25.5 s, sys: 1.22 s, total: 26.7 s
Wall time: 812 ms
CPU times: user 31.9 s, sys: 1.2 s, total: 33.1 s
Wall time: 979 ms
CPU times: user 14.5 s, sys: 713 ms, total: 15.2 s
Wall time: 425 ms


## TSNE

In [47]:
from sklearn.manifold import TSNE

%time o_c__t = TSNE(n_components=3).fit_transform(o_c_)
%time i_c__t = TSNE(n_components=3).fit_transform(i_c_)
np.save('../data/staph/preprocess/o_c_-_t.npy', o_c__t)
np.save('../data/staph/preprocess/i_c_-_t.npy', i_c__t)

%time o_c_v_t = TSNE(n_components=3).fit_transform(o_c_v)
%time i_c_v_t = TSNE(n_components=3).fit_transform(i_c_v)
np.save('../data/staph/preprocess/o_c_v_t.npy', o_c_v_t)
np.save('../data/staph/preprocess/i_c_v_t.npy', i_c_v_t)

%time o_c_x_t = TSNE(n_components=3).fit_transform(o_c_x)
%time i_c_x_t = TSNE(n_components=3).fit_transform(i_c_x)
np.save('../data/staph/preprocess/o_c_x_t.npy', o_c_x_t)
np.save('../data/staph/preprocess/i_c_x_t.npy', i_c_x_t)

CPU times: user 1min 59s, sys: 3.72 s, total: 2min 3s
Wall time: 6.21 s
CPU times: user 2min 2s, sys: 4 s, total: 2min 6s
Wall time: 6.1 s
CPU times: user 39.1 s, sys: 1.21 s, total: 40.3 s
Wall time: 3.46 s
CPU times: user 16.1 s, sys: 513 ms, total: 16.6 s
Wall time: 2.55 s
CPU times: user 20.3 s, sys: 669 ms, total: 20.9 s
Wall time: 2.73 s
CPU times: user 10.4 s, sys: 510 ms, total: 10.9 s
Wall time: 2.35 s


In [52]:
# verify all possible combinations are created
import os
d = os.listdir('../data/staph/preprocess/')
s = {'{}_{}_{}_{}.npy'.format(impute, c_or_n, selection, extraction) for impute in 'io' for c_or_n in 'nc' for selection in '-vx' for extraction in '-pts'}
len(s - set(d)) == 0

True

# Machine learning

In [53]:
import os

random_state = 42

s = {'{}_c_{}_{}.npy'.format(impute, selection, extraction)
     for impute in 'io'
     for selection in '-vx'
     for extraction in '-pts'}

data_u = {d: np.load(os.path.join('../data/staph/preprocess', d)) for d in s}
# mask all data to remove x with NAN labels
for k, v in data_u.items():
    if v.shape[0] != 124:
        data_u[k] = v[mask]

In [55]:
import pickle
from sklearn.preprocessing import OneHotEncoder

for file, X in data_u.items():
    encoder = OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)
    %time X_encode = encoder.fit_transform(X)
    
    np.save(os.path.join('../data/staph/preprocess/onehot', file), X_encode)
#     with open(os.path.join('../data/staph/preprocess/onehot-encoder', file[:file.index('.')]), 'wb') as output:
#         pickle.dump(encoder, output)

CPU times: user 3.81 ms, sys: 187 µs, total: 3.99 ms
Wall time: 3.2 ms
CPU times: user 1.54 ms, sys: 642 µs, total: 2.18 ms
Wall time: 2.2 ms
CPU times: user 9.11 s, sys: 1.06 s, total: 10.2 s
Wall time: 3.4 s
CPU times: user 19.3 ms, sys: 0 ns, total: 19.3 ms
Wall time: 19.4 ms
CPU times: user 41.5 ms, sys: 0 ns, total: 41.5 ms
Wall time: 41.3 ms
CPU times: user 27.6 ms, sys: 0 ns, total: 27.6 ms
Wall time: 27.7 ms
CPU times: user 4.1 s, sys: 0 ns, total: 4.1 s
Wall time: 1.7 s
CPU times: user 114 ms, sys: 0 ns, total: 114 ms
Wall time: 3.21 ms
CPU times: user 537 ms, sys: 0 ns, total: 537 ms
Wall time: 15 ms
CPU times: user 638 ms, sys: 0 ns, total: 638 ms
Wall time: 17.8 ms
CPU times: user 16.9 s, sys: 1.5 s, total: 18.4 s
Wall time: 7.76 s
CPU times: user 974 µs, sys: 8.39 ms, total: 9.37 ms
Wall time: 9.17 ms
CPU times: user 14.4 ms, sys: 652 µs, total: 15.1 ms
Wall time: 15.1 ms
CPU times: user 14.3 ms, sys: 698 µs, total: 15 ms
Wall time: 15 ms
CPU times: user 909 µs, sys: 46 µs

In [5]:
data_e = {d: np.load(os.path.join('../data/staph/preprocess/onehot', d)) for d in s}

In [57]:
from sklearn.model_selection import train_test_split
X = data_u['o_c_v_-.npy']
y = records['resp'][mask].astype('?')

## Logistic regression

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [61]:
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
#     'class_weight': [None, 'balanced', {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:32}, {0:1, 1:64}, {0:1, 1:128}],
    'l1_ratio': [0., 0.2, 0.4, 0.6, 0.8, 1.]
}
clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=1, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=True, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))
clf.fit(X_train, y_train)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 out of   1 | elapsed:  2.4min finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 out of   1 | elapsed:  1.8min finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 out of   1 | elapsed:  1.9min finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 out of   1 | elapsed:   12.3s finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 out of   1 | elapsed:    0.8s finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 out of   1 | elapsed:    1.0s finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 out of   1 | elapsed:   

OSError: [Errno 12] Cannot allocate memory

In [None]:
model_u_logistic = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=False, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))
    clf.fit(X_train, y_train)
    model_u_logistic[d] = clf

In [None]:
model_e_logistic = {}
for d, X in data_e.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=False, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))
    clf.fit(X_train, y_train)
    model_e_logistic[d] = clf