In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_staph, load_nucleotides

pd.options.display.precision = 3
pd.options.display.max_colwidth = 10

%matplotlib inline

In [3]:
%time records = load_staph()
mask = records['resp'].notna()
records.head()

CPU times: user 10.6 s, sys: 219 ms, total: 10.9 s
Wall time: 10.9 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,resp,Total.Area
0,NRS001,ATGAAC...,2511,0.255,ATGAAC...,2356,0.24,False,0.0
1,NRS002,------...,25278,2.571,ATGAAC...,2236,0.227,False,0.0
2,NRS003,ATGAAC...,48213,4.904,ATGAAC...,2253,0.229,False,0.0
3,NRS021,ATGAAA...,2442,0.248,ATGAAA...,2088,0.212,False,473.152
4,NRS022,ATGAAC...,3885,0.395,ATGAAC...,2154,0.219,False,6686.806


# Feature selection

In [4]:
from Bio import SeqIO
import Bio
with open("../data/staph/core_gene_alignment-narsa.fasta", "w") as output:
    for sequence in SeqIO.parse('../data/staph/core_gene_alignment-narsa.aln', 'fasta'):
        seq = Bio.Seq.Seq(sequence.seq._data.upper(), alphabet=Bio.Alphabet.Alphabet())
        record = Bio.SeqRecord.SeqRecord(seq, id=sequence.id, description='')
        SeqIO.write(record, output, "fasta")

In [6]:
# 1.5*2 minutes
%time o_n = load_nucleotides('../data/staph/core_gene_alignment-narsa.fasta')
%time i_n = load_nucleotides('../data/staph/core_gene_alignment-narsa_naive_impute.fasta')

# 1.5*2 minutes
forward = str.maketrans('-ACTGN', '012345')
def transformation(str):
    return [int(i) for i in str.translate(forward)]
%time o_n = pd.DataFrame(records['sequence'].apply(transformation).to_list())
%time i_n = pd.DataFrame(records['sequence_i'].apply(transformation).to_list())
np.save('../data/staph/preprocess/o_n_-_-.npy', o_n[mask])
np.save('../data/staph/preprocess/i_n_-_-.npy', i_n[mask])

CPU times: user 1min 30s, sys: 1.5 s, total: 1min 32s
Wall time: 1min 28s
CPU times: user 1min 28s, sys: 1.19 s, total: 1min 29s
Wall time: 1min 24s
CPU times: user 1min 33s, sys: 2.07 s, total: 1min 35s
Wall time: 1min 31s
CPU times: user 1min 34s, sys: 1.91 s, total: 1min 36s
Wall time: 1min 32s


In [5]:
o_n = np.load('../data/staph/preprocess/o_n_-_-.npy')
i_n = np.load('../data/staph/preprocess/i_n_-_-.npy')

### Remove based on SNP counts
similar to variance threshold but seems better

In [None]:
# 11*2 minutes
%time snp_counts_o = o_n.apply(pd.Series.value_counts, axis=0)
%time snp_counts_i = i_n.apply(pd.Series.value_counts, axis=0)
np.save('../data/staph/preprocess/others/snp_counts_o.npy', snp_counts_o.to_numpy())
np.save('../data/staph/preprocess/others/snp_counts_i.npy', snp_counts_i.to_numpy())

snp_counts_o = pd.DataFrame(np.load('../data/staph/preprocess/others/snp_counts_o.npy'))
snp_counts_i = pd.DataFrame(np.load('../data/staph/preprocess/others/snp_counts_i.npy'))

# True     218693
snp_max_counts_o = snp_counts_o.max()
# True     42467
snp_max_counts_i = snp_counts_i.max()

o_n_v = o_n[mask].loc[:, (snp_max_counts_o<124)]
i_n_v = i_n[mask].loc[:, (snp_max_counts_i<124)]
np.save('../data/staph/preprocess/o_n_v_-.npy', o_n_v)
np.save('../data/staph/preprocess/i_n_v_-.npy', i_n_v)

### $\chi^2$ on the previous result
because some features are all 0's, so gives `divide by 0` warning

no warning if we remove those features (on the previous step)

In [31]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

o_n_x = SelectKBest(chi2, k=218693//2).fit_transform(o_n_v, records['resp'][mask].astype('i4'))
i_n_x = SelectKBest(chi2, k=42467//2).fit_transform(i_n_v, records['resp'][mask].astype('i4'))
np.save('../data/staph/preprocess/o_n_x_-.npy', o_n_x)
np.save('../data/staph/preprocess/i_n_x_-.npy', i_n_x)

# Feature extraction

In [6]:
o_n_v = np.load('../data/staph/preprocess/o_n_v_-.npy')
i_n_v = np.load('../data/staph/preprocess/i_n_v_-.npy')
o_n_x = np.load('../data/staph/preprocess/o_n_x_-.npy')
i_n_x = np.load('../data/staph/preprocess/i_n_x_-.npy')

### String kernel

In [7]:
from strkernel.mismatch_kernel import MismatchKernel

# 17~27*2 minutes
%time o_n__s = MismatchKernel(l=6, k=3, m=1).get_kernel(o_n)
%time i_n__s = MismatchKernel(l=6, k=3, m=1).get_kernel(i_n)
np.save('../data/staph/preprocess/o_n_-_s.npy', o_n__s.kernel)
np.save('../data/staph/preprocess/i_n_-_s.npy', i_n__s.kernel)

# 3+1 minutes
%time o_n_v_s = MismatchKernel(l=6, k=3, m=1).get_kernel(o_n_v)
%time i_n_v_s = MismatchKernel(l=6, k=3, m=1).get_kernel(i_n_v)
np.save('../data/staph/preprocess/o_n_v_s.npy', o_n_v_s.kernel)
np.save('../data/staph/preprocess/i_n_v_s.npy', i_n_v_s.kernel)

# 2*1 minutes
%time o_n_x_s = MismatchKernel(l=6, k=3, m=1).get_kernel(o_n_x)
%time i_n_x_s = MismatchKernel(l=6, k=3, m=1).get_kernel(i_n_x)
np.save('../data/staph/preprocess/o_n_x_s.npy', o_n_x_s.kernel)
np.save('../data/staph/preprocess/i_n_x_s.npy', i_n_x_s.kernel)

CPU times: user 6h 28min 52s, sys: 24min 9s, total: 6h 53min 2s
Wall time: 17min 41s
CPU times: user 6h 56min 3s, sys: 47min 50s, total: 7h 43min 53s
Wall time: 27min 1s
CPU times: user 10min 30s, sys: 1min 23s, total: 11min 54s
Wall time: 4min 19s
CPU times: user 36.7 s, sys: 2.15 s, total: 38.8 s
Wall time: 38.9 s
CPU times: user 1min 38s, sys: 7.05 s, total: 1min 46s
Wall time: 1min 46s
CPU times: user 20 s, sys: 1.08 s, total: 21 s
Wall time: 21.1 s


### PCA

In [10]:
from sklearn.decomposition import PCA

%time o_n__p = PCA(n_components=124).fit_transform(o_n)
%time i_n__p = PCA(n_components=124).fit_transform(i_n)
np.save('../data/staph/preprocess/o_n_-_p.npy', o_n__p)
np.save('../data/staph/preprocess/i_n_-_p.npy', i_n__p)

%time o_n_v_p = PCA(n_components=124).fit_transform(o_n_v)
%time i_n_v_p = PCA(n_components=124).fit_transform(i_n_v)
np.save('../data/staph/preprocess/o_n_v_p.npy', o_n_v_p)
np.save('../data/staph/preprocess/i_n_v_p.npy', i_n_v_p)

%time o_n_x_p = PCA(n_components=124).fit_transform(o_n_x)
%time i_n_x_p = PCA(n_components=124).fit_transform(i_n_x)
np.save('../data/staph/preprocess/o_n_x_p.npy', o_n_x_p)
np.save('../data/staph/preprocess/i_n_x_p.npy', i_n_x_p)

CPU times: user 11min 40s, sys: 36.8 s, total: 12min 17s
Wall time: 27.1 s
CPU times: user 11min 45s, sys: 35.3 s, total: 12min 21s
Wall time: 26.4 s
CPU times: user 3min 8s, sys: 9.61 s, total: 3min 17s
Wall time: 6.91 s
CPU times: user 27.3 s, sys: 1.82 s, total: 29.2 s
Wall time: 904 ms
CPU times: user 1min 34s, sys: 6.01 s, total: 1min 40s
Wall time: 3.34 s
CPU times: user 16.7 s, sys: 1.54 s, total: 18.3 s
Wall time: 583 ms


### T-SNE

In [11]:
from sklearn.manifold import TSNE

%time o_n__t = TSNE(n_components=3).fit_transform(o_n)
%time i_n__t = TSNE(n_components=3).fit_transform(i_n)
np.save('../data/staph/preprocess/o_n_-_t.npy', o_n__t)
np.save('../data/staph/preprocess/i_n_-_t.npy', i_n__t)

%time o_n_v_t = TSNE(n_components=3).fit_transform(o_n_v)
%time i_n_v_t = TSNE(n_components=3).fit_transform(i_n_v)
np.save('../data/staph/preprocess/o_n_v_t.npy', o_n_v_t)
np.save('../data/staph/preprocess/i_n_v_t.npy', i_n_v_t)

%time o_n_x_t = TSNE(n_components=3).fit_transform(o_n_x)
%time i_n_x_t = TSNE(n_components=3).fit_transform(i_n_x)
np.save('../data/staph/preprocess/o_n_x_t.npy', o_n_x_t)
np.save('../data/staph/preprocess/i_n_x_t.npy', i_n_x_t)

CPU times: user 5min 20s, sys: 17 s, total: 5min 37s
Wall time: 14.8 s
CPU times: user 5min 20s, sys: 17.6 s, total: 5min 38s
Wall time: 14.5 s
CPU times: user 1min 16s, sys: 4.14 s, total: 1min 20s
Wall time: 5.02 s
CPU times: user 17.8 s, sys: 962 ms, total: 18.8 s
Wall time: 2.73 s
CPU times: user 38.9 s, sys: 2.37 s, total: 41.3 s
Wall time: 3.66 s
CPU times: user 10.8 s, sys: 349 ms, total: 11.1 s
Wall time: 2.53 s


# Machine learning

In [8]:
import os

random_state = 42

s = {'{}_n_{}_{}.npy'.format(impute, selection, extraction)
     for impute in 'io'
     for selection in '-vx'
     for extraction in '-pts'}

data_u = {d: np.load(os.path.join('../data/staph/preprocess', d)) for d in s}

In [9]:
from sklearn.preprocessing import OneHotEncoder

for file, X in data_u.items():
    encoder = OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)
    X_encode = encoder.fit_transform(X)
    np.save(os.path.join('../data/staph/preprocess/onehot', file), X_encode)

In [5]:
data_e = {d: np.load(os.path.join('../data/staph/preprocess/onehot', d)) for d in s}

In [6]:
from sklearn.model_selection import train_test_split
y = records['carb'][mask].astype('?')

In [205]:
X = data['i_n_x_p.npy']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.6)

## Logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [9]:
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              'class_weight': [None, 'balanced', {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:32}, {0:1, 1:64}, {0:1, 1:128}],
              'l1_ratio': [0., 0.2, 0.4, 0.6, 0.8, 1.]}
clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=1, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))

In [212]:
weights = np.ones(y_test.shape)
weights[y_test] = clf.best_estimator_.class_weight[1]
weights

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 8., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 8., 1., 1., 8., 1., 8., 1., 1., 8., 1., 8., 1.,
       1., 1., 1., 8., 1., 1., 8., 8., 1., 1., 1., 1., 8., 1.])

In [213]:
clf.best_estimator_.score(X_test, y_test, sample_weight=weights)

0.8050847457627118

In [214]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, clf.best_estimator_.predict(X_test))

array([[15, 23],
       [ 0, 10]])

In [None]:
model_u_logistic = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=False, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))
    clf.fit(X_train, y_train)
    model_u_logistic[d] = clf

In [None]:
model_e_logistic = {}
for d, X in data_e.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=False, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))
    clf.fit(X_train, y_train)
    model_e_logistic[d] = clf

## Random forest

In [215]:
from sklearn.ensemble import RandomForestClassifier

In [234]:
param_grid = {'n_estimators': [10, 100, 1000],
              'min_samples_split': [2, 3],
              'min_samples_leaf': [2, 3],
              'max_features': ['auto', 'sqrt', 'log2', None],
              'class_weight': [None, 'balanced', {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:32}, {0:1, 1:64}, {0:1, 1:128}]}
clf = GridSearchCV(RandomForestClassifier(verbose=0, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))

In [235]:
%time clf.fit(X_train, y_train)

CPU times: user 6min 25s, sys: 22.1 s, total: 6min 47s
Wall time: 10min 22s




GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fractio...
             iid='warn', n_jobs=None,
             param_grid={'class_weight': [None, 'balanced', {0: 1, 1: 4},
                                          {0: 1, 1: 8}, {0: 1, 1: 32},
                                          {0: 1, 1: 64}

In [236]:
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=5,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [237]:
weights = np.ones(y_test.shape)
weights[y_test] = clf.best_estimator_.class_weight[1]
weights

TypeError: 'NoneType' object is not subscriptable

In [240]:
clf.best_estimator_.score(X_test, y_test,
#                           sample_weight=weights
                         )

0.7708333333333334

In [241]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, clf.best_estimator_.predict(X_test))

array([[37,  1],
       [10,  0]])

In [None]:
print('hello')