In [1]:
import numpy as np
import pandas as pd

from load import load_staph

pd.options.display.precision = 3
pd.options.display.max_colwidth = 12

%matplotlib inline

In [2]:
records = load_staph(True)
mask = records['resp'].notna()
records.head()

Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,resp,Total.Area
0,NRS001,ATGAACAT...,2511,0.255,ATGAACAT...,2356,0.24,False,0.0
1,NRS002,--------...,25278,2.571,ATGAACAT...,2236,0.227,False,0.0
2,NRS003,ATGAACAT...,48213,4.904,ATGAACAT...,2253,0.229,False,0.0
3,NRS021,ATGAAAAT...,2442,0.248,ATGAAAAT...,2088,0.212,False,473.152
4,NRS022,ATGAACAT...,3885,0.395,ATGAACAT...,2154,0.219,False,6686.806


In [3]:
import os

random_state = 42

s = {'{}_{}_{}_{}.npy'.format(impute, nc, selection, extraction)
     for impute in 'io'
     for nc in 'nc'
     for selection in '-vx'
     for extraction in '-pts'}

# numerical data
data_u = {d: np.load(os.path.join('../data/staph/preprocess', d)) for d in s}
for k, v in data_u.items():
    if v.shape[0] != 124:
        print(k)

# one-hot encoded data
data_e = {d: np.load(os.path.join('../data/staph/preprocess/onehot', d)) for d in s}

In [4]:
from sklearn.model_selection import train_test_split

# Classification

In [5]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [6]:
# silence warning for getting 0 on f1 score
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

In [8]:
y_c = records['resp'][mask].astype('?')
metrics_c = [accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score]

In [9]:
def train_clfs(clf, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_c, random_state=random_state,
                                                            stratify=y_c, train_size=0.7)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_c}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Logistic regression

In [10]:
# 4*2 minutes
lr = LogisticRegression(penalty='none', class_weight='balanced',
                         solver='lbfgs', max_iter=2000, n_jobs=5, warm_start=False)
%time lr_u = train_clfs(lr, 'logistic', False)
%time lr_e = train_clfs(lr, 'logistic', True)

CPU times: user 1min 31s, sys: 8.09 s, total: 1min 39s
Wall time: 3min 51s
CPU times: user 4min, sys: 14.6 s, total: 4min 14s
Wall time: 4min 13s


###  Random forest

In [11]:
# 1*2 minutes
rfc = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
%time rfc_u = train_clfs(rfc, 'rf', False)
%time rfc_e = train_clfs(rfc, 'rf', True)

CPU times: user 1min 24s, sys: 9.95 s, total: 1min 34s
Wall time: 59.3 s
CPU times: user 1min 31s, sys: 8.63 s, total: 1min 40s
Wall time: 54.5 s


###  Support vector machine

In [12]:
# 1.75 and 2.5 minutes
svc = SVC(gamma='auto', class_weight='balanced')
%time svc_u = train_clfs(svc, 'svm', False)
%time svc_e = train_clfs(svc, 'svm', True)

CPU times: user 1min 43s, sys: 3.51 s, total: 1min 47s
Wall time: 1min 47s
CPU times: user 2min 25s, sys: 2.42 s, total: 2min 28s
Wall time: 2min 28s


### Results

In [13]:
results_c = pd.DataFrame(lr_e + lr_u + rfc_u + rfc_e + svc_u + svc_e)
results_c.to_csv('result/result_staph_clf.csv', index=False)
results_c.head()

Unnamed: 0,accuracy,balanced_accuracy,f1,roc_auc,file,encode,model
0,0.842,0.5,0.0,0.5,o_n_v_s.npy,True,logistic
1,0.842,0.5,0.0,0.5,i_n_x_t.npy,True,logistic
2,0.842,0.5,0.0,0.5,i_c_-_p.npy,True,logistic
3,0.868,0.651,0.444,0.651,o_n_v_-.npy,True,logistic
4,0.842,0.5,0.0,0.5,o_c_x_t.npy,True,logistic


In [14]:
results_c.sort_values(by=['f1'], ascending=False)

Unnamed: 0,accuracy,balanced_accuracy,f1,roc_auc,file,encode,model
82,0.711,0.828,0.522,0.828,i_c_-_t.npy,False,logistic
93,0.895,0.667,0.500,0.667,o_n_-_-.npy,False,logistic
51,0.895,0.667,0.500,0.667,o_n_v_-.npy,False,logistic
221,0.895,0.667,0.500,0.667,i_c_-_-.npy,False,svm
141,0.895,0.667,0.500,0.667,o_n_-_-.npy,False,rf
...,...,...,...,...,...,...,...
148,0.842,0.500,0.000,0.500,o_c_x_t.npy,True,rf
150,0.842,0.500,0.000,0.500,o_c_v_p.npy,True,rf
152,0.842,0.500,0.000,0.500,o_n_x_p.npy,True,rf
153,0.842,0.500,0.000,0.500,i_n_-_t.npy,True,rf


# Regression

In [16]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [17]:
y_r = records['carb_num'][mask]
metrics_r = [r2_score, mean_squared_error]

In [22]:
def train_regs(reg, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_r, random_state=random_state,
                                                            train_size=0.7)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_r}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Linear regression

In [23]:
lrr = LinearRegression(n_jobs=5)
%time lrr_u = train_regs(lrr, 'linear', False)
%time lrr_e = train_regs(lrr, 'linear', True)

CPU times: user 4min 48s, sys: 11.1 s, total: 4min 59s
Wall time: 15.1 s
CPU times: user 8min 48s, sys: 18.1 s, total: 9min 7s
Wall time: 20.1 s


### Random forest

In [None]:
# 20*2 minutes
rfr = RandomForestRegressor(n_estimators=500, n_jobs=5)
%time rfr_u = train_regs(rfr, 'rf', False)
%time rfr_e = train_regs(rfr, 'rf', True)

CPU times: user 1h 38min, sys: 7.95 s, total: 1h 38min 8s
Wall time: 20min 15s


### Support vector machine

In [None]:
svr = SVR(gamma='auto')
%time svr_u = train_regs(svr, 'svm', False)
%time svr_e = train_regs(svr, 'svm', True)

In [None]:
results_r = pd.DataFrame(lrr_e + lrr_u + rfr_u + rfr_e + svr_u + svr_e)
results_r.to_csv('result/result_staph_reg.csv', index=False)
results_r.head()

In [None]:
results_r.sort_values(by=['f1'], ascending=False)