In [1]:
import numpy as np
import pandas as pd

from load import load_staph

pd.options.display.precision = 3
pd.options.display.max_colwidth = 12

%matplotlib inline

In [2]:
records = load_staph(True)
mask = records['resp'].notna()
records.head()

Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,resp,Total.Area
0,NRS001,ATGAACAT...,2511,0.255,ATGAACAT...,2356,0.24,False,0.0
1,NRS002,--------...,25278,2.571,ATGAACAT...,2236,0.227,False,0.0
2,NRS003,ATGAACAT...,48213,4.904,ATGAACAT...,2253,0.229,False,0.0
3,NRS021,ATGAAAAT...,2442,0.248,ATGAAAAT...,2088,0.212,False,473.152
4,NRS022,ATGAACAT...,3885,0.395,ATGAACAT...,2154,0.219,False,6686.806


In [3]:
import os

random_state = 42

s = {'{}_{}_{}_{}.npy'.format(impute, nc, selection, extraction)
     for impute in 'io'
     for nc in 'nc'
     for selection in '-vx'
     for extraction in '-pts'}

# numerical data
data_u = {d: np.load(os.path.join('../data/staph/preprocess', d)) for d in s}
for k, v in data_u.items():
    if v.shape[0] != 124:
        print(k)

# one-hot encoded data
data_e = {d: np.load(os.path.join('../data/staph/preprocess/onehot', d)) for d in s}

In [4]:
from sklearn.model_selection import train_test_split

# Classification

In [5]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [6]:
# silence warning for getting 0 on f1 score
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

In [7]:
y_c = records['resp'][mask].astype('?')
metrics_c = [accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score]

In [8]:
def train_clfs(clf, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_c, random_state=random_state,
                                                            stratify=y_c, train_size=0.7)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_c}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Logistic regression

In [9]:
# 4*2 minutes
lr = LogisticRegression(penalty='none', class_weight='balanced',
                         solver='lbfgs', max_iter=2000, n_jobs=5, warm_start=False)
%time lr_u = train_clfs(lr, 'logistic', False)
%time lr_e = train_clfs(lr, 'logistic', True)

CPU times: user 1min 43s, sys: 8.22 s, total: 1min 51s
Wall time: 3min 58s
CPU times: user 4min 26s, sys: 14.8 s, total: 4min 41s
Wall time: 4min 31s


###  Random forest

In [10]:
# 1*2 minutes
rfc = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
%time rfc_u = train_clfs(rfc, 'rf', False)
%time rfc_e = train_clfs(rfc, 'rf', True)

CPU times: user 1min 22s, sys: 11.7 s, total: 1min 33s
Wall time: 1min 1s
CPU times: user 1min 17s, sys: 8.89 s, total: 1min 26s
Wall time: 49.4 s


###  Support vector machine

In [11]:
# 1.75 and 2.5 minutes
svc = SVC(gamma='auto', class_weight='balanced')
%time svc_u = train_clfs(svc, 'svm', False)
%time svc_e = train_clfs(svc, 'svm', True)

CPU times: user 1min 38s, sys: 3.8 s, total: 1min 42s
Wall time: 1min 42s
CPU times: user 2min 27s, sys: 5.9 s, total: 2min 33s
Wall time: 2min 33s


### Results

In [12]:
results_c = pd.DataFrame(lr_e + lr_u + rfc_u + rfc_e + svc_u + svc_e)
results_c.to_csv('result/result_staph_clf.csv', index=False)
results_c.head()

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,file,encode,model
0,0.868,0.651,0.667,0.333,0.444,o_n_-_-.npy,True,logistic
1,0.842,0.5,0.0,0.0,0.0,o_n_x_t.npy,True,logistic
2,0.842,0.5,0.0,0.0,0.0,o_c_v_t.npy,True,logistic
3,0.842,0.5,0.0,0.0,0.0,i_c_v_t.npy,True,logistic
4,0.842,0.5,0.0,0.0,0.0,o_n_x_p.npy,True,logistic


In [13]:
results_c.sort_values(by=['f1'], ascending=False)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,file,encode,model
95,0.711,0.828,0.353,1.000,0.522,i_c_-_t.npy,False,logistic
144,0.895,0.667,1.000,0.333,0.500,o_n_-_-.npy,True,rf
96,0.895,0.667,1.000,0.333,0.500,o_n_-_-.npy,False,rf
232,0.895,0.667,1.000,0.333,0.500,i_c_-_-.npy,False,svm
192,0.895,0.667,1.000,0.333,0.500,o_n_-_-.npy,False,svm
...,...,...,...,...,...,...,...,...
151,0.842,0.500,0.000,0.000,0.000,i_n_v_t.npy,True,rf
152,0.842,0.500,0.000,0.000,0.000,i_c_v_p.npy,True,rf
153,0.842,0.500,0.000,0.000,0.000,i_n_v_p.npy,True,rf
21,0.842,0.500,0.000,0.000,0.000,o_n_v_p.npy,True,logistic


# Regression

In [14]:
from sklearn.metrics import r2_score, max_error, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [15]:
y_r = records['resp'][mask]
metrics_r = [r2_score, max_error, mean_absolute_error, mean_squared_error]

In [16]:
def train_regs(reg, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_r, random_state=random_state,
                                                            train_size=0.7)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_r}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Linear regression

In [17]:
# 30*2 seconds
lrr = LinearRegression(n_jobs=5)
%time lrr_u = train_regs(lrr, 'linear', False)
%time lrr_e = train_regs(lrr, 'linear', True)

CPU times: user 7min 30s, sys: 16 s, total: 7min 46s
Wall time: 26.2 s
CPU times: user 11min 17s, sys: 22.7 s, total: 11min 39s
Wall time: 28.3 s


### Random forest

In [18]:
# 13 and 25 minutes
rfr = RandomForestRegressor(n_estimators=500, n_jobs=5)
%time rfr_u = train_regs(rfr, 'rf', False)
%time rfr_e = train_regs(rfr, 'rf', True)

CPU times: user 57min 59s, sys: 14.4 s, total: 58min 13s
Wall time: 12min 15s
CPU times: user 1h 51min 47s, sys: 19 s, total: 1h 52min 6s
Wall time: 22min 53s


### Support vector machine

In [19]:
# 1.5*2 minutes
svr = SVR(gamma='auto')
%time svr_u = train_regs(svr, 'svm', False)
%time svr_e = train_regs(svr, 'svm', True)

CPU times: user 1min 17s, sys: 3.18 s, total: 1min 20s
Wall time: 1min 20s
CPU times: user 1min 31s, sys: 2.32 s, total: 1min 34s
Wall time: 1min 34s


In [20]:
results_r = pd.DataFrame(lrr_e + lrr_u + rfr_u + rfr_e + svr_u + svr_e)
results_r.to_csv('result/result_staph_reg.csv', index=False)
results_r.head()

Unnamed: 0,r2,max_error,mean_absolute_error,mean_squared_error,file,encode,model
0,-0.428,1.458,0.315,0.258,o_n_-_-.npy,True,linear
1,-0.058,0.869,0.312,0.191,o_n_x_t.npy,True,linear
2,-0.02,0.923,0.301,0.184,o_c_v_t.npy,True,linear
3,-0.038,0.87,0.307,0.188,i_c_v_t.npy,True,linear
4,-0.056,0.866,0.309,0.191,o_n_x_p.npy,True,linear


In [21]:
results_r.sort_values(by=['mean_squared_error'])

Unnamed: 0,r2,max_error,mean_absolute_error,mean_squared_error,file,encode,model
143,0.213,0.896,0.251,0.142,i_c_-_t.npy,False,rf
232,0.182,0.894,0.292,0.148,i_c_-_-.npy,False,svm
221,0.101,0.903,0.263,0.162,i_n_x_-.npy,False,svm
135,0.093,0.912,0.260,0.164,o_c_v_p.npy,False,rf
104,0.090,0.992,0.269,0.164,i_c_v_p.npy,False,rf
...,...,...,...,...,...,...,...
91,-37.295,7.637,2.054,6.922,o_n_-_s.npy,False,linear
73,-48.700,16.229,1.387,8.983,i_c_x_s.npy,False,linear
81,-74.312,12.340,2.294,13.612,i_n_-_s.npy,False,linear
80,-77.714,15.103,2.599,14.227,o_n_x_s.npy,False,linear
