In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_staph, load_nucleotides

pd.options.display.precision = 3
pd.options.display.max_colwidth = 10

%matplotlib inline

In [11]:
%time records = load_staph()
numerical_response = pd.read_csv('../data/staph/nrs_metadata3.txt', delimiter='\t')
numerical_response
records = records.merge(numerical_response[['sample_tag', 'Total.Area']],
                        left_on='id', right_on='sample_tag', how='left')
records.drop(columns='sample_tag', inplace=True)
records.head()

CPU times: user 9.93 s, sys: 149 ms, total: 10.1 s
Wall time: 10.1 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,resp,Total.Area
0,NRS001,ATGAAC...,2511,0.255,ATGAAC...,2356,0.24,False,0.0
1,NRS002,------...,25278,2.571,ATGAAC...,2236,0.227,False,0.0
2,NRS003,ATGAAC...,48213,4.904,ATGAAC...,2253,0.229,False,0.0
3,NRS021,ATGAAA...,2442,0.248,ATGAAA...,2088,0.212,False,473.152
4,NRS022,ATGAAC...,3885,0.395,ATGAAC...,2154,0.219,False,6686.806


In [12]:
mask = records['resp'].notna()

# Feature selection

In [4]:
o_n = np.load('../data/pseudo/preprocess/o_n_-_-.npy')
i_n = np.load('../data/pseudo/preprocess/i_n_-_-.npy')

In [14]:
# 1.5 minutes
%time o_n = load_nucleotides('../data/staph/core_gene_alignment-narsa.fasta')
%time i_n = load_nucleotides('../data/staph/core_gene_alignment-narsa_naive_impute.fasta')

CPU times: user 1min 31s, sys: 1.6 s, total: 1min 32s
Wall time: 1min 27s
CPU times: user 1min 29s, sys: 1.41 s, total: 1min 30s
Wall time: 1min 23s


In [16]:
# 1.5 minutes
forward = str.maketrans('-ACTGN', '012345')
def transformation(str):
    return [int(i) for i in str.translate(forward)]
%time o_n = pd.DataFrame(records['sequence'].apply(transformation).to_list())
%time i_n = pd.DataFrame(records['sequence_i'].apply(transformation).to_list())
np.save('../data/staph/preprocess/o_n_-_-.npy', o_n)
np.save('../data/staph/preprocess/i_n_-_-.npy', i_n)

CPU times: user 1min 36s, sys: 2.06 s, total: 1min 38s
Wall time: 1min 31s
CPU times: user 1min 36s, sys: 1.75 s, total: 1min 37s
Wall time: 1min 32s


## Variance threshold

In [7]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(0.01)

# justification(not rigorous) for why < 0.016 is the threshold to drop a column
a, b = 4, 3
arr = np.ones((122, 1))*a
arr[:2] = b
np.var(arr)

o_n_v = selector.fit_transform(o_n)
o_n_v_selected = pd.Series(selector.get_support())
o_n_v_selected.value_counts()

i_n_v = selector.fit_transform(i_n)
i_n_v_selected = pd.Series(selector.get_support())
i_n_v_selected.value_counts()

## Remove based on SNP counts
similar to variance threshold but seems better

In [None]:
# 11 minutes each
%time snp_counts_o = o_n.apply(pd.Series.value_counts, axis=0)
%time snp_counts_i = i_n.apply(pd.Series.value_counts, axis=0)
np.save('../data/staph/preprocess/others/snp_counts_o.npy', snp_counts_o.to_numpy())
np.save('../data/staph/preprocess/others/snp_counts_i.npy', snp_counts_i.to_numpy())

snp_counts_o = pd.DataFrame(np.load('../data/staph/preprocess/others/snp_counts_o.npy'))
snp_counts_i = pd.DataFrame(np.load('../data/staph/preprocess/others/snp_counts_i.npy'))

# True     218693
snp_max_counts_o = snp_counts_o.max()
(snp_max_counts_o<124).value_counts()

# True     42467
snp_max_counts_i = snp_counts_i.max()
(snp_max_counts_i<124).value_counts()

o_n_v = o_n[mask].loc[:, (snp_max_counts_o<124)]
i_n_v = i_n[mask].loc[:, (snp_max_counts_i<124)]
np.save('../data/staph/preprocess/o_n_v_-.npy', o_n_v)
np.save('../data/staph/preprocess/i_n_v_-.npy', i_n_v)

## $\chi^2$ on the previous result
because some features are all 0's, so gives `divide by 0` warning

no warning if we remove those features (on the previous step)

In [31]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

o_n_x = SelectKBest(chi2, k=218693//2).fit_transform(o_n_v, records['resp'][mask].astype('i4'))
i_n_x = SelectKBest(chi2, k=42467//2).fit_transform(i_n_v, records['resp'][mask].astype('i4'))
np.save('../data/staph/preprocess/o_n_x_-.npy', o_n_x)
np.save('../data/staph/preprocess/i_n_x_-.npy', i_n_x)

# Feature extraction

In [34]:
o_n_v = np.load('../data/staph/preprocess/o_n_v_-.npy')
i_n_v = np.load('../data/staph/preprocess/i_n_v_-.npy')
o_n_x = np.load('../data/staph/preprocess/o_n_x_-.npy')
i_n_x = np.load('../data/staph/preprocess/i_n_x_-.npy')

## String kernel

In [37]:
from strkernel.mismatch_kernel import MismatchKernel

# 14 minutes each
%time o_n__s = MismatchKernel(l=6, k=3, m=1).get_kernel(o_n)
%time i_n__s = MismatchKernel(l=6, k=3, m=1).get_kernel(i_n)
np.save('../data/staph/preprocess/o_n_-_s.npy', o_n__s.kernel)
np.save('../data/staph/preprocess/i_n_-_s.npy', i_n__s.kernel)

# 3 and 1 minutes
%time o_n_v_s = MismatchKernel(l=6, k=3, m=1).get_kernel(o_n_v)
%time i_n_v_s = MismatchKernel(l=6, k=3, m=1).get_kernel(i_n_v)
np.save('../data/staph/preprocess/o_n_v_s.npy', o_n_v_s.kernel)
np.save('../data/staph/preprocess/i_n_v_s.npy', i_n_v_s.kernel)

# 2 minutes
%time o_n_x_s = MismatchKernel(l=6, k=3, m=1).get_kernel(o_n_x)
%time i_n_x_s = MismatchKernel(l=6, k=3, m=1).get_kernel(i_n_x)
np.save('../data/staph/preprocess/o_n_x_s.npy', o_n_x_s.kernel)
np.save('../data/staph/preprocess/i_n_x_s.npy', i_n_x_s.kernel)

CPU times: user 5h 58min 49s, sys: 17min 14s, total: 6h 16min 3s
Wall time: 14min 17s
CPU times: user 5h 51min 48s, sys: 16min 54s, total: 6h 8min 42s
Wall time: 14min 16s
CPU times: user 8min 37s, sys: 1min 19s, total: 9min 56s
Wall time: 2min 45s
CPU times: user 28 s, sys: 1.48 s, total: 29.5 s
Wall time: 29.5 s
CPU times: user 1min 14s, sys: 4.57 s, total: 1min 19s
Wall time: 1min 19s
CPU times: user 15.4 s, sys: 648 ms, total: 16 s
Wall time: 16.1 s


## PCA

In [35]:
from sklearn.decomposition import PCA

%time o_n__p = PCA(n_components=124).fit_transform(o_n)
%time i_n__p = PCA(n_components=124).fit_transform(i_n)
np.save('../data/staph/preprocess/o_n_-_p.npy', o_n__p)
np.save('../data/staph/preprocess/i_n_-_p.npy', i_n__p)

%time o_n_v_p = PCA(n_components=124).fit_transform(o_n_v)
%time i_n_v_p = PCA(n_components=124).fit_transform(i_n_v)
np.save('../data/staph/preprocess/o_n_v_p.npy', o_n_v_p)
np.save('../data/staph/preprocess/i_n_v_p.npy', i_n_v_p)

%time o_n_x_p = PCA(n_components=124).fit_transform(o_n_x)
%time i_n_x_p = PCA(n_components=124).fit_transform(i_n_x)
np.save('../data/staph/preprocess/o_n_x_p.npy', o_n_x_p)
np.save('../data/staph/preprocess/i_n_x_p.npy', i_n_x_p)

CPU times: user 10min 18s, sys: 18.4 s, total: 10min 36s
Wall time: 19.4 s
CPU times: user 10min 22s, sys: 18.8 s, total: 10min 41s
Wall time: 19.6 s
CPU times: user 2min 5s, sys: 3.89 s, total: 2min 9s
Wall time: 3.92 s
CPU times: user 20 s, sys: 728 ms, total: 20.7 s
Wall time: 577 ms
CPU times: user 1min 5s, sys: 2.14 s, total: 1min 7s
Wall time: 1.87 s
CPU times: user 10.1 s, sys: 391 ms, total: 10.4 s
Wall time: 291 ms


## T-SNE

In [36]:
from sklearn.manifold import TSNE

%time o_n__t = TSNE(n_components=3).fit_transform(o_n)
%time i_n__t = TSNE(n_components=3).fit_transform(i_n)
np.save('../data/staph/preprocess/o_n_-_t.npy', o_n__t)
np.save('../data/staph/preprocess/i_n_-_t.npy', i_n__t)

%time o_n_v_t = TSNE(n_components=3).fit_transform(o_n_v)
%time i_n_v_t = TSNE(n_components=3).fit_transform(i_n_v)
np.save('../data/staph/preprocess/o_n_v_t.npy', o_n_v_t)
np.save('../data/staph/preprocess/i_n_v_t.npy', i_n_v_t)

%time o_n_x_t = TSNE(n_components=3).fit_transform(o_n_x)
%time i_n_x_t = TSNE(n_components=3).fit_transform(i_n_x)
np.save('../data/staph/preprocess/o_n_x_t.npy', o_n_x_t)
np.save('../data/staph/preprocess/i_n_x_t.npy', i_n_x_t)

CPU times: user 5min 34s, sys: 12 s, total: 5min 46s
Wall time: 13 s
CPU times: user 5min 27s, sys: 11.6 s, total: 5min 39s
Wall time: 12.8 s
CPU times: user 1min 16s, sys: 2.29 s, total: 1min 18s
Wall time: 4.47 s
CPU times: user 16.6 s, sys: 478 ms, total: 17 s
Wall time: 2.56 s
CPU times: user 39.9 s, sys: 1.2 s, total: 41.1 s
Wall time: 3.42 s
CPU times: user 10.6 s, sys: 263 ms, total: 10.8 s
Wall time: 2.54 s


# Machine learning

In [38]:
import os

random_state = 42

s = {'{}_n_{}_{}.npy'.format(impute, selection, extraction)
     for impute in 'io'
     for selection in '-vx'
     for extraction in '-pts'}

data_u = {d: np.load(os.path.join('../data/staph/preprocess', d)) for d in s}
# mask all data to remove x with NAN labels
for k, v in data_u.items():
    if v.shape[0] != 124:
        data_u[k] = v[mask]

In [40]:
import pickle
from sklearn.preprocessing import OneHotEncoder

for file, X in data_u.items():
    encoder = OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)
    %time X_encode = encoder.fit_transform(X)
    
    np.save(os.path.join('../data/staph/preprocess/onehot', file), X_encode)
    with open(os.path.join('../data/staph/preprocess/onehot-encoder', file[:file.index('.')]), 'wb') as output:
        pickle.dump(encoder, output)

CPU times: user 39.5 ms, sys: 1.21 ms, total: 40.7 ms
Wall time: 39.3 ms
CPU times: user 38.9 ms, sys: 512 µs, total: 39.4 ms
Wall time: 39.5 ms
CPU times: user 2.13 ms, sys: 99 µs, total: 2.23 ms
Wall time: 2.27 ms
CPU times: user 1min 28s, sys: 3.4 s, total: 1min 32s
Wall time: 1min 15s
CPU times: user 986 µs, sys: 3.18 ms, total: 4.17 ms
Wall time: 4.21 ms
CPU times: user 737 µs, sys: 266 µs, total: 1 ms
Wall time: 1.03 ms
CPU times: user 14.9 ms, sys: 958 µs, total: 15.8 ms
Wall time: 15.9 ms
CPU times: user 18.1 ms, sys: 6.33 ms, total: 24.4 ms
Wall time: 24.5 ms
CPU times: user 4.81 s, sys: 183 ms, total: 4.99 s
Wall time: 1.74 s
CPU times: user 2.88 ms, sys: 4.57 ms, total: 7.45 ms
Wall time: 7.16 ms
CPU times: user 19.2 ms, sys: 113 µs, total: 19.3 ms
Wall time: 19.4 ms
CPU times: user 16.7 ms, sys: 0 ns, total: 16.7 ms
Wall time: 16.8 ms
CPU times: user 15.5 ms, sys: 0 ns, total: 15.5 ms
Wall time: 15.6 ms
CPU times: user 27.9 s, sys: 1.01 s, total: 29 s
Wall time: 17.2 s
CPU 

In [5]:
data_e = {d: np.load(os.path.join('../data/staph/preprocess/onehot', d)) for d in s}

In [6]:
from sklearn.model_selection import train_test_split
y = records['carb'][mask].astype('?')

In [205]:
X = data['i_n_x_p.npy']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.6)

## Logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [9]:
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              'class_weight': [None, 'balanced', {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:32}, {0:1, 1:64}, {0:1, 1:128}],
              'l1_ratio': [0., 0.2, 0.4, 0.6, 0.8, 1.]}
clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=1, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))

In [212]:
weights = np.ones(y_test.shape)
weights[y_test] = clf.best_estimator_.class_weight[1]
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 8., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 8., 1., 1., 8., 1., 8., 1., 1., 8., 1., 8., 1.,
       1., 1., 1., 8., 1., 1., 8., 8., 1., 1., 1., 1., 8., 1.])

In [213]:
clf.best_estimator_.score(X_test, y_test, sample_weight=weights)

0.8050847457627118

In [214]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, clf.best_estimator_.predict(X_test))

array([[15, 23],
       [ 0, 10]])

In [None]:
model_u_logistic = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=False, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))
    clf.fit(X_train, y_train)
    model_u_logistic[d] = clf

In [None]:
model_e_logistic = {}
for d, X in data_e.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=False, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))
    clf.fit(X_train, y_train)
    model_e_logistic[d] = clf

## Random forest

In [215]:
from sklearn.ensemble import RandomForestClassifier

In [234]:
param_grid = {'n_estimators': [10, 100, 1000],
              'min_samples_split': [2, 3],
              'min_samples_leaf': [2, 3],
              'max_features': ['auto', 'sqrt', 'log2', None],
              'class_weight': [None, 'balanced', {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:32}, {0:1, 1:64}, {0:1, 1:128}]}
clf = GridSearchCV(RandomForestClassifier(verbose=0, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))

In [235]:
%time clf.fit(X_train, y_train)

CPU times: user 6min 25s, sys: 22.1 s, total: 6min 47s
Wall time: 10min 22s




GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fractio...
             iid='warn', n_jobs=None,
             param_grid={'class_weight': [None, 'balanced', {0: 1, 1: 4},
                                          {0: 1, 1: 8}, {0: 1, 1: 32},
                                          {0: 1, 1: 64}

In [236]:
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=5,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [237]:
weights = np.ones(y_test.shape)
weights[y_test] = clf.best_estimator_.class_weight[1]
weights

TypeError: 'NoneType' object is not subscriptable

In [240]:
clf.best_estimator_.score(X_test, y_test,
#                           sample_weight=weights
                         )

0.7708333333333334

In [241]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, clf.best_estimator_.predict(X_test))

array([[37,  1],
       [10,  0]])

In [None]:
print('hello')