In [1]:
import numpy as np
import pandas as pd

from load import load_pseudo

pd.options.display.precision = 3
pd.options.display.max_colwidth = 12

%matplotlib inline

In [2]:
records = load_pseudo(True)
mask = (records['toby'].notna() & records['carb'].notna())
records.head()

Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,carb,toby,carb_num,toby_num
0,TA151,ATGAGTGA...,31842,6.588,ATGAGTGA...,28410,5.878,True,False,-2.0,16.0
1,IC1,ATGAGTGA...,46071,9.532,ATGAGTGA...,34714,7.182,False,False,2.0,14.0
2,A237,ATGAGTGA...,44514,9.21,ATGAGTGA...,35933,7.434,True,False,-1.0,4.0
3,5920,ATGAGTGA...,49497,10.241,ATGAGTGA...,36873,7.629,,,,
4,LiA96,ATGAGTGA...,44067,9.117,ATGAGTGA...,34454,7.128,False,False,0.0,18.0


In [3]:
import os

random_state = 42

s = {'{}_{}_{}_{}.npy'.format(impute, nc, selection, extraction)
     for impute in 'io'
     for nc in 'nc'
     for selection in '-vx'
     for extraction in '-pts'}

# numerical data
data_u = {d: np.load(os.path.join('../data/pseudo/preprocess', d)) for d in s}
for k, v in data_u.items():
    if v.shape[0] != 119:
        print(k)

# one-hot encoded data
data_e = {d: np.load(os.path.join('../data/pseudo/preprocess/onehot', d)) for d in s}

In [4]:
from sklearn.model_selection import train_test_split

# Classification

In [5]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [6]:
# silence warning for getting 0 on f1 score
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

In [7]:
y_c = records['carb'][mask].astype('?')
metrics_c = [accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score]

In [8]:
def train_clfs(clf, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_c, random_state=random_state,
                                                            stratify=y_c, train_size=0.7)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_c}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Logistic regression

In [9]:
# 2 and 3 minutes
lr = LogisticRegression(penalty='none', class_weight='balanced',
                         solver='lbfgs', max_iter=2000, n_jobs=5, warm_start=False)
%time lr_u = train_clfs(lr, 'logistic', False)
%time lr_e = train_clfs(lr, 'logistic', True)

CPU times: user 1min 34s, sys: 8.28 s, total: 1min 42s
Wall time: 2min 3s
CPU times: user 4min 27s, sys: 16.5 s, total: 4min 44s
Wall time: 3min 7s


###  Random forest

In [10]:
# 45*2 seconds
rfc = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
%time rfc_u = train_clfs(rfc, 'rf', False)
%time rfc_e = train_clfs(rfc, 'rf', True)

CPU times: user 1min 2s, sys: 10.1 s, total: 1min 12s
Wall time: 49.5 s
CPU times: user 1min 2s, sys: 8.98 s, total: 1min 11s
Wall time: 44.8 s


###  Support vector machine

In [11]:
# 1 and 1.75 minutes
svc = SVC(gamma='auto', class_weight='balanced')
%time svc_u = train_clfs(svc, 'svm', False)
%time svc_e = train_clfs(svc, 'svm', True)

CPU times: user 51.5 s, sys: 4.33 s, total: 55.8 s
Wall time: 55.9 s
CPU times: user 1min 35s, sys: 3.94 s, total: 1min 39s
Wall time: 1min 39s


### Results

In [12]:
results_c = pd.DataFrame(lr_e + lr_u + rfc_u + rfc_e + svc_u + svc_e)
results_c.to_csv('result/result_pseudo_clf.csv', index=False)
results_c.head()

Unnamed: 0,accuracy,balanced_accuracy,f1,roc_auc,file,encode,model
0,0.778,0.5,0.0,0.5,i_c_x_t.npy,True,logistic
1,0.778,0.5,0.0,0.5,o_c_x_t.npy,True,logistic
2,0.861,0.821,0.706,0.821,o_n_x_-.npy,True,logistic
3,0.778,0.5,0.0,0.5,o_c_-_p.npy,True,logistic
4,0.778,0.5,0.0,0.5,i_c_v_s.npy,True,logistic


In [13]:
results_c.sort_values(by=['f1'], ascending=False)

Unnamed: 0,accuracy,balanced_accuracy,f1,roc_auc,file,encode,model
210,0.889,0.884,0.778,0.884,o_n_v_-.npy,False,svm
50,0.889,0.839,0.750,0.839,o_n_x_-.npy,False,logistic
263,0.861,0.866,0.737,0.866,o_c_-_-.npy,True,svm
242,0.861,0.821,0.706,0.821,o_n_x_-.npy,True,svm
219,0.861,0.821,0.706,0.821,o_n_-_-.npy,False,svm
...,...,...,...,...,...,...,...
149,0.778,0.500,0.000,0.500,o_n_x_s.npy,True,rf
151,0.778,0.500,0.000,0.500,i_n_v_t.npy,True,rf
152,0.778,0.500,0.000,0.500,i_n_-_t.npy,True,rf
155,0.778,0.500,0.000,0.500,o_c_x_p.npy,True,rf


# Regression

In [16]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [17]:
y_r = records['carb_num'][mask]
metrics_r = [r2_score, mean_squared_error]

In [22]:
def train_regs(reg, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_r, random_state=random_state,
                                                            train_size=0.7)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_r}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Linear regression

In [23]:
# 20*2 seconds
lrr = LinearRegression(n_jobs=5)
%time lrr_u = train_regs(lrr, 'linear', False)
%time lrr_e = train_regs(lrr, 'linear', True)

CPU times: user 4min 48s, sys: 11.1 s, total: 4min 59s
Wall time: 15.1 s
CPU times: user 8min 48s, sys: 18.1 s, total: 9min 7s
Wall time: 20.1 s


### Random forest

In [24]:
# 20 and 53 minutes
rfr = RandomForestRegressor(n_estimators=500, n_jobs=5)
%time rfr_u = train_regs(rfr, 'rf', False)
%time rfr_e = train_regs(rfr, 'rf', True)

CPU times: user 1h 38min, sys: 7.95 s, total: 1h 38min 8s
Wall time: 20min 15s
CPU times: user 4h 22min 20s, sys: 10.3 s, total: 4h 22min 30s
Wall time: 53min 17s


### Support vector machine

In [25]:
# 1 and 1.5 minutes
svr = SVR(gamma='auto')
%time svr_u = train_regs(svr, 'svm', False)
%time svr_e = train_regs(svr, 'svm', True)

CPU times: user 48.8 s, sys: 1.72 s, total: 50.5 s
Wall time: 50.6 s
CPU times: user 1min 26s, sys: 1.97 s, total: 1min 28s
Wall time: 1min 28s


In [26]:
results_r = pd.DataFrame(lrr_e + lrr_u + rfr_u + rfr_e + svr_u + svr_e)
results_r.to_csv('result/result_pseudo_reg.csv', index=False)
results_r.head()

Unnamed: 0,r2,mean_squared_error,file,encode,model
0,0.0008847,111.842,i_n_v_s.npy,True,linear
1,-0.9008,212.781,i_n_x_-.npy,True,linear
2,-0.002881,112.264,o_n_v_p.npy,True,linear
3,-0.2565,140.657,o_c_x_-.npy,True,linear
4,-0.004616,112.458,i_n_-_p.npy,True,linear


In [29]:
results_r.sort_values(by=['mean_squared_error'])

Unnamed: 0,r2,mean_squared_error,file,encode,model
110,0.052,106.118,o_c_x_s.npy,False,rf
115,0.049,106.413,o_c_v_p.npy,False,rf
14,0.020,109.688,o_c_x_s.npy,True,linear
15,0.020,109.722,i_n_x_s.npy,True,linear
105,0.016,110.198,i_n_x_t.npy,False,rf
...,...,...,...,...,...
48,-20.476,2404.067,i_n_v_s.npy,False,linear
63,-25.124,2924.399,i_n_x_s.npy,False,linear
82,-41.163,4719.836,i_c_v_s.npy,False,linear
75,-52.805,6022.988,i_c_x_s.npy,False,linear
