In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_pseudo, load_nucleotides

pd.options.display.precision = 3
pd.options.display.max_colwidth = 10

%matplotlib inline

In [2]:
%time records = load_pseudo()
numerical_response = pd.read_csv('../data/pseudo/Perron_phenotype-GSU-training.csv')
records = records.merge(numerical_response[['strain', 'carb.lag.delta', 'toby.lag.delta']],
                        left_on='lab-id', right_on='strain', how='left')
records.rename(columns={'carb.lag.delta': 'carb_num', 'toby.lag.delta': 'toby_num'}, inplace=True)
records.drop(columns=['strain', 'lab-id'], inplace=True)
records.head()

CPU times: user 5.33 s, sys: 127 ms, total: 5.46 s
Wall time: 5.49 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,carb,toby,carb_num,toby_num
0,TA151,ATGAGT...,31842,6.588,ATGAGT...,28410,5.878,True,False,-2.0,16.0
1,IC1,ATGAGT...,46071,9.532,ATGAGT...,34714,7.182,False,False,2.0,14.0
2,A237,ATGAGT...,44514,9.21,ATGAGT...,35933,7.434,True,False,-1.0,4.0
3,5920,ATGAGT...,49497,10.241,ATGAGT...,36873,7.629,,,,
4,LiA96,ATGAGT...,44067,9.117,ATGAGT...,34454,7.128,False,False,0.0,18.0


In [3]:
mask = (records['toby'].notna() & records['carb'].notna())

# Feature selection

In [4]:
o_n = np.load('../data/pseudo/preprocess/o_n_-_-.npy')
i_n = np.load('../data/pseudo/preprocess/i_n_-_-.npy')

In [5]:
# 40 seconds
%time o_n = load_nucleotides('../data/pseudo/concatenated.fasta')
%time i_n = load_nucleotides('../data/pseudo/concatenated_naive_impute.fasta')

In [6]:
# 45 seconds
forward = str.maketrans('-ACTG', '01234')
def transformation(str):
    return [int(i) for i in str.translate(forward)]
%time o_n = pd.DataFrame(records['sequence'].apply(transformation).to_list())
%time i_n = pd.DataFrame(records['sequence_i'].apply(transformation).to_list())
np.save('../data/pseudo/preprocess/o_n_-_-.npy', o_n)
np.save('../data/pseudo/preprocess/i_n_-_-.npy', i_n)

## Variance threshold

In [7]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(0.01)

# justification(not rigorous) for why < 0.016 is the threshold to drop a column
a, b = 4, 3
arr = np.ones((122, 1))*a
arr[:2] = b
np.var(arr)

o_n_v = selector.fit_transform(o_n)
o_n_v_selected = pd.Series(selector.get_support())
o_n_v_selected.value_counts()

i_n_v = selector.fit_transform(i_n)
i_n_v_selected = pd.Series(selector.get_support())
i_n_v_selected.value_counts()

## Remove based on SNP counts
similar to variance threshold but seems better

In [8]:
# less than 6 min
%time snp_counts_o = o_n.apply(pd.Series.value_counts, axis=0)
%time snp_counts_i = i_n.apply(pd.Series.value_counts, axis=0)
np.save('../data/pseudo/preprocess/others/snp_counts_o.npy', snp_counts_o.to_numpy())
np.save('../data/pseudo/preprocess/others/snp_counts_i.npy', snp_counts_i.to_numpy())

snp_counts_o = pd.DataFrame(np.load('../data/pseudo/preprocess/others/snp_counts_o.npy'))
snp_counts_i = pd.DataFrame(np.load('../data/pseudo/preprocess/others/snp_counts_i.npy'))

# True     204101
snp_max_counts_o = snp_counts_o.max()
(snp_max_counts_o<121).value_counts()

# True     ???
snp_max_counts_i = snp_counts_i.max()
(snp_max_counts_i<121).value_counts()

o_n_v = o_n[mask].loc[:, (snp_max_counts_o<121)]
i_n_v = i_n[mask].loc[:, (snp_max_counts_i<121)]
np.save('../data/pseudo/preprocess/o_n_v_-.npy', o_n_v)
np.save('../data/pseudo/preprocess/i_n_v_-.npy', i_n_v)

## $\chi^2$ on the previous result
because some features are all 0's, so gives `divide by 0` warning

no warning if we remove those features (on the previous step)

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

o_n_x = SelectKBest(chi2, k=250032//2).fit_transform(o_n_v, records['toby'][mask].astype('i4'))
i_n_x = SelectKBest(chi2, k=105124//2).fit_transform(i_n_v, records['toby'][mask].astype('i4'))
np.save('../data/pseudo/preprocess/o_n_x_-.npy', o_n_x)
np.save('../data/pseudo/preprocess/i_n_x_-.npy', i_n_x)

# Feature extraction

In [10]:
o_n_v = np.load('../data/pseudo/preprocess/o_n_v_-.npy')
i_n_v = np.load('../data/pseudo/preprocess/i_n_v_-.npy')
o_n_x = np.load('../data/pseudo/preprocess/o_n_x_-.npy')
i_n_x = np.load('../data/pseudo/preprocess/i_n_x_-.npy')

## String kernel

In [11]:
from strkernel.mismatch_kernel import MismatchKernel

# 9 minutes
%time o_n__s = MismatchKernel(l=5, k=4, m=1).get_kernel(o_n)
%time i_n__s = MismatchKernel(l=5, k=4, m=1).get_kernel(i_n)
np.save('../data/pseudo/preprocess/o_n_-_s.npy', o_n__s.kernel)
np.save('../data/pseudo/preprocess/i_n_-_s.npy', i_n__s.kernel)

# 3 and 1 minutes
%time o_n_v_s = MismatchKernel(l=5, k=4, m=1).get_kernel(o_n_v)
%time i_n_v_s = MismatchKernel(l=5, k=4, m=1).get_kernel(i_n_v)
np.save('../data/pseudo/preprocess/o_n_v_s.npy', o_n_v_s.kernel)
np.save('../data/pseudo/preprocess/i_n_v_s.npy', i_n_v_s.kernel)

# 2 minutes
%time o_n_x_s = MismatchKernel(l=5, k=4, m=1).get_kernel(o_n_x)
%time i_n_x_s = MismatchKernel(l=5, k=4, m=1).get_kernel(i_n_x)
np.save('../data/pseudo/preprocess/o_n_x_s.npy', o_n_x_s.kernel)
np.save('../data/pseudo/preprocess/i_n_x_s.npy', i_n_x_s.kernel)

## PCA

In [12]:
from sklearn.decomposition import PCA

%time o_n__p = PCA(n_components=119).fit_transform(o_n)
%time i_n__p = PCA(n_components=119).fit_transform(i_n)
np.save('../data/pseudo/preprocess/o_n_-_p.npy', o_n__p)
np.save('../data/pseudo/preprocess/i_n_-_p.npy', i_n__p)

%time o_n_v_p = PCA(n_components=119).fit_transform(o_n_v)
%time i_n_v_p = PCA(n_components=119).fit_transform(i_n_v)
np.save('../data/pseudo/preprocess/o_n_v_p.npy', o_n_v_p)
np.save('../data/pseudo/preprocess/i_n_v_p.npy', i_n_v_p)

%time o_n_x_p = PCA(n_components=119).fit_transform(o_n_x)
%time i_n_x_p = PCA(n_components=119).fit_transform(i_n_x)
np.save('../data/pseudo/preprocess/o_n_x_p.npy', o_n_x_p)
np.save('../data/pseudo/preprocess/i_n_x_p.npy', i_n_x_p)

## T-SNE

In [13]:
from sklearn.manifold import TSNE

%time o_n__t = TSNE(n_components=3).fit_transform(o_n)
%time i_n__t = TSNE(n_components=3).fit_transform(i_n)
np.save('../data/pseudo/preprocess/o_n_-_t.npy', o_n__t)
np.save('../data/pseudo/preprocess/i_n_-_t.npy', i_n__t)

%time o_n_v_t = TSNE(n_components=3).fit_transform(o_n_v)
%time i_n_v_t = TSNE(n_components=3).fit_transform(i_n_v)
np.save('../data/pseudo/preprocess/o_n_v_t.npy', o_n_v_t)
np.save('../data/pseudo/preprocess/i_n_v_t.npy', i_n_v_t)

%time o_n_x_t = TSNE(n_components=3).fit_transform(o_n_x)
%time i_n_x_t = TSNE(n_components=3).fit_transform(i_n_x)
np.save('../data/pseudo/preprocess/o_n_x_t.npy', o_n_x_t)
np.save('../data/pseudo/preprocess/i_n_x_t.npy', i_n_x_t)

# Machine learning

In [4]:
import os

random_state = 42

s = {'{}_n_{}_{}.npy'.format(impute, selection, extraction)
     for impute in 'io'
     for selection in '-vx'
     for extraction in '-pts'}

data_u = {d: np.load(os.path.join('../data/pseudo/preprocess', d)) for d in s}
# mask all data to remove x with NAN labels
for k, v in data_u.items():
    if v.shape[0] != 119:
        data_u[k] = v[mask]

In [71]:
import pickle
from sklearn.preprocessing import OneHotEncoder

for file, X in data.items():
    encoder = OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)
    %time X_encode = encoder.fit_transform(X)
    
    np.save(os.path.join('../data/pseudo/preprocess/onehot', file), X_encode)
    with open(os.path.join('../data/pseudo/preprocess/onehot-encoder', file[:file.index('.')]), 'wb') as output:
        pickle.dump(encoder, output)

In [5]:
data_e = {d: np.load(os.path.join('../data/pseudo/preprocess/onehot', d)) for d in s}

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## Classification

In [23]:
y = records['carb'][mask].astype('?')
X = data_u['i_n_v_-.npy']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.6)

### Logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [11]:
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              'class_weight': [None, 'balanced', {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:32}, {0:1, 1:64}, {0:1, 1:128}],
              'l1_ratio': [0., 0.2, 0.4, 0.6, 0.8, 1.]}
clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=1, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))

In [53]:
clf = LogisticRegression(penalty='none', class_weight='balanced',
                         solver='lbfgs', max_iter=2000, n_jobs=5)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
print(confusion_matrix(y_test, clf.predict(X_test)))

[[28  0]
 [ 8  0]]


In [54]:
model_u_logistic = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = LogisticRegression(penalty='none', class_weight='balanced',
                             solver='lbfgs', max_iter=2000, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_u_logistic[d] = clf

i_n_x_-.npy: 0.7777777777777778
[[25  3]
 [ 5  3]]
o_n_-_t.npy: 0.6944444444444444
[[22  6]
 [ 5  3]]
i_n_-_-.npy: 0.6944444444444444
[[24  4]
 [ 7  1]]
o_n_v_-.npy: 0.7777777777777778
[[22  6]
 [ 2  6]]
o_n_-_p.npy: 0.6666666666666666
[[16 12]
 [ 0  8]]
i_n_v_t.npy: 0.6666666666666666
[[18 10]
 [ 2  6]]
o_n_v_s.npy: 0.6944444444444444
[[19  9]
 [ 2  6]]
o_n_x_-.npy: 0.8333333333333334
[[24  4]
 [ 2  6]]
i_n_x_p.npy: 0.6111111111111112
[[17 11]
 [ 3  5]]
i_n_v_s.npy: 0.6388888888888888
[[17 11]
 [ 2  6]]
i_n_x_s.npy: 0.6388888888888888
[[17 11]
 [ 2  6]]
i_n_-_p.npy: 0.5833333333333334
[[16 12]
 [ 3  5]]
o_n_x_p.npy: 0.6111111111111112
[[15 13]
 [ 1  7]]
o_n_x_s.npy: 0.6388888888888888
[[17 11]
 [ 2  6]]
o_n_x_t.npy: 0.4166666666666667
[[13 15]
 [ 6  2]]
i_n_v_p.npy: 0.6111111111111112
[[17 11]
 [ 3  5]]
i_n_v_-.npy: 0.75
[[25  3]
 [ 6  2]]
o_n_-_s.npy: 0.75
[[21  7]
 [ 2  6]]
o_n_-_-.npy: 0.8055555555555556
[[25  3]
 [ 4  4]]
i_n_x_t.npy: 0.4722222222222222
[[15 13]
 [ 6  2]]
i_n_-_t.

In [55]:
model_e_logistic = {}
for d, X in data_e.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = LogisticRegression(penalty='none', class_weight='balanced',
                             solver='lbfgs', max_iter=2000, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_e_logistic[d] = clf

i_n_x_-.npy: 0.7222222222222222
[[23  5]
 [ 5  3]]
o_n_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_-_-.npy: 0.7777777777777778
[[25  3]
 [ 5  3]]
o_n_v_-.npy: 0.75
[[22  6]
 [ 3  5]]
o_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_v_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_-.npy: 0.8333333333333334
[[25  3]
 [ 3  5]]
i_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_x_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_-.npy: 0.7222222222222222
[[23  5]
 [ 5  3]]
o_n_-_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_-_-.npy: 0.7777777777777778
[[22  6]
 [ 2  6]]
i_n_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8

### Random forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
clf.fit(X_train, y_train)
confusion_matrix(y_test, clf.predict(X_test))

array([[28,  0],
       [ 8,  0]])

In [57]:
model_u_random = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_u_random[d] = clf

i_n_x_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_n_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_-_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_n_v_-.npy: 0.8333333333333334
[[27  1]
 [ 5  3]]
o_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_t.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_n_v_s.npy: 0.75
[[25  3]
 [ 6  2]]
o_n_x_-.npy: 0.8055555555555556
[[27  1]
 [ 6  2]]
i_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_s.npy: 0.75
[[27  1]
 [ 8  0]]
i_n_x_s.npy: 0.7222222222222222
[[26  2]
 [ 8  0]]
i_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_s.npy: 0.7222222222222222
[[25  3]
 [ 7  1]]
o_n_x_t.npy: 0.75
[[27  1]
 [ 8  0]]
i_n_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_-.npy: 0.8055555555555556
[[27  1]
 [ 6  2]]
o_n_-_s.npy: 0.8055555555555556
[[26  2]
 [ 5  3]]
o_n_-_-.npy: 0.75
[[26  2]
 [ 7  1]]
i_n_x_t.npy: 0.75
[[27  1]
 [ 8  0]]
i_n_-_t.npy: 0.7222222222222222
[[26  2]
 [ 8  0]]

In [58]:
model_e_random = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_e_random[d] = clf

i_n_x_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_n_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_-_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_n_v_-.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_t.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_n_v_s.npy: 0.75
[[25  3]
 [ 6  2]]
o_n_x_-.npy: 0.8333333333333334
[[27  1]
 [ 5  3]]
i_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_s.npy: 0.75
[[27  1]
 [ 8  0]]
i_n_x_s.npy: 0.7222222222222222
[[26  2]
 [ 8  0]]
i_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_s.npy: 0.7222222222222222
[[25  3]
 [ 7  1]]
o_n_x_t.npy: 0.75
[[27  1]
 [ 8  0]]
i_n_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_n_-_s.npy: 0.8055555555555556
[[26  2]
 [ 5  3]]
o_n_-_-.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
i_n_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_-_t.npy: 0.75
[[27

### Support vector machine

In [59]:
from sklearn.svm import SVC

In [63]:
model_u_svm = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = SVC(gamma='auto', class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_u_svm[d] = clf

i_n_x_-.npy: 0.7777777777777778
[[23  5]
 [ 3  5]]
o_n_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_-_-.npy: 0.8055555555555556
[[25  3]
 [ 4  4]]
o_n_v_-.npy: 0.8888888888888888
[[25  3]
 [ 1  7]]
o_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_v_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_n_x_-.npy: 0.8888888888888888
[[26  2]
 [ 2  6]]
i_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_n_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_n_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_-.npy: 0.8055555555555556
[[23  5]
 [ 2  6]]
o_n_-_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_n_-_-.npy: 0.8611111111111112
[[25  3]
 [ 2  6]]
i_n_x_t.npy: 0.7777777777777778

In [64]:
model_e_svm = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = SVC(gamma='auto', class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_e_svm[d] = clf

i_n_x_-.npy: 0.7777777777777778
[[23  5]
 [ 3  5]]
o_n_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_-_-.npy: 0.8055555555555556
[[25  3]
 [ 4  4]]
o_n_v_-.npy: 0.8888888888888888
[[25  3]
 [ 1  7]]
o_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_v_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_n_x_-.npy: 0.8888888888888888
[[26  2]
 [ 2  6]]
i_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_n_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_n_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_n_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_n_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_n_v_-.npy: 0.8055555555555556
[[23  5]
 [ 2  6]]
o_n_-_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_n_-_-.npy: 0.8611111111111112
[[25  3]
 [ 2  6]]
i_n_x_t.npy: 0.7777777777777778

## Regression

In [67]:
y = records['carb_num'][mask]
X = data_u['i_n_v_-.npy']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.6)

### Linear regression

In [65]:
from sklearn.linear_model import LinearRegression

In [68]:
reg = LinearRegression(n_jobs=5)
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

-0.5576861056531992

In [70]:
model_u_linear = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = LinearRegression(n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_u_linear[d] = clf

i_n_x_-.npy: -1.0429679767140527
o_n_-_t.npy: -0.25335024886382285
i_n_-_-.npy: -0.9181924822179699
o_n_v_-.npy: -0.13730328170158757
o_n_-_p.npy: -0.16533320987865197
i_n_v_t.npy: -0.04703137946065428
o_n_v_s.npy: -4.322237131402667
o_n_x_-.npy: -0.2556004451938423
i_n_x_p.npy: -1.039553922398774
i_n_v_s.npy: -20.476127914659585
i_n_x_s.npy: -11.23674680450402
i_n_-_p.npy: -0.9199558897108064
o_n_x_p.npy: -0.2632762012651939
o_n_x_s.npy: -4.4642080691905885
o_n_x_t.npy: -0.014700652051022667
i_n_v_p.npy: -0.9499575747410811
i_n_v_-.npy: -0.950169715214543
o_n_-_s.npy: -7.7157630320248565
o_n_-_-.npy: -0.1603923407681085
i_n_x_t.npy: -0.13770547164200098
i_n_-_t.npy: 0.006902683747397997
o_n_v_t.npy: -0.042306774904844024
o_n_v_p.npy: -0.13803847963604432
i_n_-_s.npy: -13.098631740901306


In [71]:
model_e_linear = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = LinearRegression(n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_e_linear[d] = clf

i_n_x_-.npy: -1.0429679767140527
o_n_-_t.npy: -0.25335024886382285
i_n_-_-.npy: -0.9181924822179699
o_n_v_-.npy: -0.13730328170158757
o_n_-_p.npy: -0.16533320987865197
i_n_v_t.npy: -0.04703137946065428
o_n_v_s.npy: -4.322237131402667
o_n_x_-.npy: -0.2556004451938423
i_n_x_p.npy: -1.039553922398774
i_n_v_s.npy: -20.476127914659585
i_n_x_s.npy: -11.23674680450402
i_n_-_p.npy: -0.9199558897108064
o_n_x_p.npy: -0.2632762012651939
o_n_x_s.npy: -4.4642080691905885
o_n_x_t.npy: -0.014700652051022667
i_n_v_p.npy: -0.9499575747410811
i_n_v_-.npy: -0.950169715214543
o_n_-_s.npy: -7.7157630320248565
o_n_-_-.npy: -0.1603923407681085
i_n_x_t.npy: -0.13770547164200098
i_n_-_t.npy: 0.006902683747397997
o_n_v_t.npy: -0.042306774904844024
o_n_v_p.npy: -0.13803847963604432
i_n_-_s.npy: -13.098631740901306


### Random forest

In [72]:
from sklearn.ensemble import RandomForestRegressor

In [73]:
model_u_rr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = RandomForestRegressor(n_estimators=500, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_u_rr[d] = clf

i_n_x_-.npy: -0.20303416537538976
o_n_-_t.npy: -0.20613788469491845
i_n_-_-.npy: -0.22399553833852637
o_n_v_-.npy: -0.23145226005679786
o_n_-_p.npy: -0.059301730072513825
i_n_v_t.npy: -0.11012481568281474
o_n_v_s.npy: -0.570538177727536
o_n_x_-.npy: -0.26012765413989913
i_n_x_p.npy: 0.002157851277950873
i_n_v_s.npy: -0.16117861336127248
i_n_x_s.npy: -0.06020384427472503
i_n_-_p.npy: -0.1262951731781965
o_n_x_p.npy: -0.13501035639251158
o_n_x_s.npy: -0.052760277261573174
o_n_x_t.npy: -0.17121698640712513
i_n_v_p.npy: -0.005869143235269769
i_n_v_-.npy: -0.20461427742700367
o_n_-_s.npy: -1.158306895254901
o_n_-_-.npy: -0.22106844324354125
i_n_x_t.npy: -0.19325903333425254
i_n_-_t.npy: 0.011403695276958281
o_n_v_t.npy: -0.16882134020789086
o_n_v_p.npy: -0.0648949353993773
i_n_-_s.npy: -0.06631627979817489


In [74]:
model_e_rr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = RandomForestRegressor(n_estimators=500, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_e_rr[d] = clf

i_n_x_-.npy: -0.1925639154098544
o_n_-_t.npy: -0.22575066960765433
i_n_-_-.npy: -0.20201676759767295
o_n_v_-.npy: -0.2612391656786788
o_n_-_p.npy: -0.08617507248614564
i_n_v_t.npy: -0.14167842212357673
o_n_v_s.npy: -0.5747468773884035
o_n_x_-.npy: -0.24548701723234734
i_n_x_p.npy: -0.02062880377181653
i_n_v_s.npy: -0.1756157687281148
i_n_x_s.npy: -0.039723899969671184
i_n_-_p.npy: -0.08192058198461516
o_n_x_p.npy: -0.10542525484573617
o_n_x_s.npy: -0.0729427364415891
o_n_x_t.npy: -0.16866643965921324
i_n_v_p.npy: -0.011340009705257836
i_n_v_-.npy: -0.20520039085720554
o_n_-_s.npy: -1.2100129233229477
o_n_-_-.npy: -0.2662107389781905
i_n_x_t.npy: -0.21137328958614843
i_n_-_t.npy: 0.009126489067799004
o_n_v_t.npy: -0.1299208254983597
o_n_v_p.npy: -0.048218848824064686
i_n_-_s.npy: -0.0681710614574429


### Support vector machine

In [75]:
from sklearn.svm import SVR

In [76]:
model_u_svr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = SVR(gamma='auto')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_u_svr[d] = clf

i_n_x_-.npy: -0.08651327032195755
o_n_-_t.npy: -0.07849453836880826
i_n_-_-.npy: -0.08941467144290449
o_n_v_-.npy: -0.08118517992701979
o_n_-_p.npy: -0.07849453836879694
i_n_v_t.npy: -0.07849453836880826
o_n_v_s.npy: -0.07800730304654402
o_n_x_-.npy: -0.08080126595184267
i_n_x_p.npy: -0.07849470358760224
i_n_v_s.npy: -0.07808437765706344
i_n_x_s.npy: -0.07805524445569256
i_n_-_p.npy: -0.07863266017010906
o_n_x_p.npy: -0.07849453836880826
o_n_x_s.npy: -0.07815699098170126
o_n_x_t.npy: -0.07849453836880826
i_n_v_p.npy: -0.07849290209618665
i_n_v_-.npy: -0.091714455611138
o_n_-_s.npy: -0.07801840342626942
o_n_-_-.npy: -0.08426548690742375
i_n_x_t.npy: -0.07849453836880826
i_n_-_t.npy: -0.07849453836880826
o_n_v_t.npy: -0.07849453836880826
o_n_v_p.npy: -0.07849453836880826
i_n_-_s.npy: -0.07803530310792328


In [77]:
model_e_svr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = SVR(gamma='auto')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_e_svr[d] = clf

i_n_x_-.npy: -0.08651327032195755
o_n_-_t.npy: -0.07849453836880826
i_n_-_-.npy: -0.08941467144290449
o_n_v_-.npy: -0.08118517992701979
o_n_-_p.npy: -0.07849453836879694
i_n_v_t.npy: -0.07849453836880826
o_n_v_s.npy: -0.07800730304654402
o_n_x_-.npy: -0.08080126595184267
i_n_x_p.npy: -0.07849470358760224
i_n_v_s.npy: -0.07808437765706344
i_n_x_s.npy: -0.07805524445569256
i_n_-_p.npy: -0.07863266017010906
o_n_x_p.npy: -0.07849453836880826
o_n_x_s.npy: -0.07815699098170126
o_n_x_t.npy: -0.07849453836880826
i_n_v_p.npy: -0.07849290209618665
i_n_v_-.npy: -0.091714455611138
o_n_-_s.npy: -0.07801840342626942
o_n_-_-.npy: -0.08426548690742375
i_n_x_t.npy: -0.07849453836880826
i_n_-_t.npy: -0.07849453836880826
o_n_v_t.npy: -0.07849453836880826
o_n_v_p.npy: -0.07849453836880826
i_n_-_s.npy: -0.07803530310792328
