In [1]:
import numpy as np
import pandas as pd

from load import load_pseudo

pd.options.display.precision = 3
pd.options.display.max_colwidth = 12

%matplotlib inline

In [2]:
records = load_pseudo(True)
mask = (records['toby'].notna() & records['carb'].notna())
records.head()

Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,carb,toby,carb_num,toby_num
0,TA151,ATGAGTGA...,31842,6.588,ATGAGTGA...,28410,5.878,True,False,-2.0,16.0
1,IC1,ATGAGTGA...,46071,9.532,ATGAGTGA...,34714,7.182,False,False,2.0,14.0
2,A237,ATGAGTGA...,44514,9.21,ATGAGTGA...,35933,7.434,True,False,-1.0,4.0
3,5920,ATGAGTGA...,49497,10.241,ATGAGTGA...,36873,7.629,,,,
4,LiA96,ATGAGTGA...,44067,9.117,ATGAGTGA...,34454,7.128,False,False,0.0,18.0


In [3]:
import os

random_state = 42

s = {'{}_{}_{}_{}.npy'.format(impute, nc, selection, extraction)
     for impute in 'io'
     for nc in 'nc'
     for selection in '-vx'
     for extraction in '-pts'}

# numerical data
data_u = {d: np.load(os.path.join('../data/pseudo/preprocess', d)) for d in s}
for k, v in data_u.items():
    if v.shape[0] != 119:
        print(k)

# one-hot encoded data
data_e = {d: np.load(os.path.join('../data/pseudo/preprocess/onehot', d)) for d in s}

In [4]:
from sklearn.model_selection import train_test_split

# Classification

In [5]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [6]:
# silence warning for getting 0 on f1 score
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

In [7]:
y_c = records['carb'][mask].astype('?')
metrics_c = [accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score]

In [8]:
def train_clfs(clf, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_c, random_state=random_state,
                                                            stratify=y_c, train_size=0.7)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_c}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Logistic regression

In [9]:
# 3 to 5 minutes in total
lr = LogisticRegression(penalty='none', class_weight='balanced',
                         solver='lbfgs', max_iter=2000, n_jobs=5, warm_start=False)
%time lr_u = train_clfs(lr, 'logistic', False)
%time lr_e = train_clfs(lr, 'logistic', True)

CPU times: user 1min 29s, sys: 6.7 s, total: 1min 36s
Wall time: 1min 41s
CPU times: user 4min 9s, sys: 14.3 s, total: 4min 23s
Wall time: 2min 26s


###  Random forest

In [10]:
# 45*2 seconds
rfc = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
%time rfc_u = train_clfs(rfc, 'rf', False)
%time rfc_e = train_clfs(rfc, 'rf', True)

CPU times: user 1min 2s, sys: 8.71 s, total: 1min 11s
Wall time: 47.4 s
CPU times: user 1min 8s, sys: 8.4 s, total: 1min 17s
Wall time: 47 s


###  Support vector machine

In [11]:
# 1 and 2 minutes
svc = SVC(gamma='auto', class_weight='balanced')
%time svc_u = train_clfs(svc, 'svm', False)
%time svc_e = train_clfs(svc, 'svm', True)

CPU times: user 49.3 s, sys: 2.2 s, total: 51.5 s
Wall time: 51.7 s
CPU times: user 1min 41s, sys: 2.42 s, total: 1min 44s
Wall time: 1min 44s


### Results

In [12]:
results_c = pd.DataFrame(lr_e + lr_u + rfc_u + rfc_e + svc_u + svc_e)
results_c.to_csv('result/result_pseudo_clf.csv', index=False)
results_c.head()

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,file,encode,model
0,0.778,0.5,0.0,0.0,0.0,i_c_v_t.npy,True,logistic
1,0.778,0.5,0.0,0.0,0.0,o_c_x_p.npy,True,logistic
2,0.778,0.5,0.0,0.0,0.0,o_n_x_t.npy,True,logistic
3,0.75,0.705,0.455,0.625,0.526,o_n_v_-.npy,True,logistic
4,0.778,0.5,0.0,0.0,0.0,i_c_-_s.npy,True,logistic


In [13]:
results_c.sort_values(by=['f1'], ascending=False)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,file,encode,model
195,0.889,0.884,0.700,0.875,0.778,o_n_v_-.npy,False,svm
89,0.889,0.839,0.750,0.750,0.750,o_n_x_-.npy,False,logistic
254,0.861,0.866,0.636,0.875,0.737,o_c_-_-.npy,True,svm
41,0.861,0.821,0.667,0.750,0.706,o_n_x_-.npy,True,logistic
281,0.861,0.821,0.667,0.750,0.706,o_n_x_-.npy,True,svm
...,...,...,...,...,...,...,...,...
149,0.778,0.500,0.000,0.000,0.000,o_n_-_t.npy,True,rf
150,0.778,0.500,0.000,0.000,0.000,o_n_x_p.npy,True,rf
151,0.778,0.500,0.000,0.000,0.000,o_c_-_p.npy,True,rf
152,0.778,0.500,0.000,0.000,0.000,o_n_v_s.npy,True,rf


# Regression

In [14]:
from sklearn.metrics import r2_score, max_error, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [15]:
y_r = records['carb_num'][mask]
metrics_r = [r2_score, max_error, mean_absolute_error, mean_squared_error]

In [16]:
def train_regs(reg, model, encode):
    results = []
    data = data_e if encode else data_u
    for d, X in data.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y_r, random_state=random_state,
                                                            train_size=0.7)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        m = {metric.__name__.replace('_score', ''): metric(y_test, y_pred)
             for metric in metrics_r}
        m.update({'file': d, 'encode': encode, 'model': model})
        results.append(m)
    return results

### Linear regression

In [17]:
# 20*2 seconds
lrr = LinearRegression(n_jobs=5)
%time lrr_u = train_regs(lrr, 'linear', False)
%time lrr_e = train_regs(lrr, 'linear', True)

CPU times: user 4min 51s, sys: 13.3 s, total: 5min 4s
Wall time: 15.9 s
CPU times: user 8min 53s, sys: 25.6 s, total: 9min 19s
Wall time: 21.9 s


### Random forest

In [18]:
# 20 and 50 minutes
rfr = RandomForestRegressor(n_estimators=500, n_jobs=5)
%time rfr_u = train_regs(rfr, 'rf', False)
%time rfr_e = train_regs(rfr, 'rf', True)

CPU times: user 1h 30min 13s, sys: 28 s, total: 1h 30min 41s
Wall time: 18min 38s
CPU times: user 3h 57min, sys: 56.4 s, total: 3h 57min 56s
Wall time: 48min 11s


### Support vector machine

In [19]:
# 1 and 1.5 minutes
svr = SVR(gamma='auto')
%time svr_u = train_regs(svr, 'svm', False)
%time svr_e = train_regs(svr, 'svm', True)

CPU times: user 48.2 s, sys: 3.18 s, total: 51.4 s
Wall time: 51.5 s
CPU times: user 1min 24s, sys: 3.68 s, total: 1min 28s
Wall time: 1min 28s


In [20]:
results_r = pd.DataFrame(lrr_e + lrr_u + rfr_u + rfr_e + svr_u + svr_e)
results_r.to_csv('result/result_pseudo_reg.csv', index=False)
results_r.head()

Unnamed: 0,r2,max_error,mean_absolute_error,mean_squared_error,file,encode,model
0,-4.61,64.827,18.871,628.008,i_c_v_t.npy,True,linear
1,-0.003,34.173,8.021,112.33,o_c_x_p.npy,True,linear
2,-0.096,35.593,8.754,122.673,o_n_x_t.npy,True,linear
3,-0.135,32.478,8.937,127.029,o_n_v_-.npy,True,linear
4,0.014,32.913,7.861,110.353,i_c_-_s.npy,True,linear


In [21]:
results_r.sort_values(by=['mean_squared_error'])

Unnamed: 0,r2,max_error,mean_absolute_error,mean_squared_error,file,encode,model
134,0.079,32.932,7.994,103.135,o_c_v_p.npy,False,rf
136,0.027,29.160,7.833,108.943,i_n_x_t.npy,False,rf
129,0.021,24.740,7.780,109.563,o_c_x_s.npy,False,rf
33,0.020,33.059,7.873,109.688,o_c_x_s.npy,True,linear
35,0.020,32.705,7.846,109.722,i_n_x_s.npy,True,linear
...,...,...,...,...,...,...,...
71,-20.476,195.814,33.036,2404.067,i_n_v_s.npy,False,linear
83,-25.124,150.952,39.258,2924.399,i_n_x_s.npy,False,linear
78,-41.163,165.162,56.333,4719.836,i_c_v_s.npy,False,linear
67,-52.805,225.895,55.153,6022.988,i_c_x_s.npy,False,linear
