In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load import load_pseudo, load_condons

pd.options.display.precision = 3
pd.options.display.max_colwidth = 10

In [2]:
%time records = load_pseudo()
numerical_response = pd.read_csv('../data/pseudo/Perron_phenotype-GSU-training.csv')
records = records.merge(numerical_response[['strain', 'carb.lag.delta', 'toby.lag.delta']],
                        left_on='lab-id', right_on='strain', how='left')
records.rename(columns={'carb.lag.delta': 'carb_num', 'toby.lag.delta': 'toby_num'}, inplace=True)
records.drop(columns=['strain', 'lab-id'], inplace=True)
records.head()

CPU times: user 4.64 s, sys: 92 ms, total: 4.74 s
Wall time: 4.76 s


Unnamed: 0,id,sequence,missing,missing_%,sequence_i,missing_i,missing_%_i,carb,toby,carb_num,toby_num
0,TA151,ATGAGT...,31842,6.588,ATGAGT...,28410,5.878,True,False,-2.0,16.0
1,IC1,ATGAGT...,46071,9.532,ATGAGT...,34714,7.182,False,False,2.0,14.0
2,A237,ATGAGT...,44514,9.21,ATGAGT...,35933,7.434,True,False,-1.0,4.0
3,5920,ATGAGT...,49497,10.241,ATGAGT...,36873,7.629,,,,
4,LiA96,ATGAGT...,44067,9.117,ATGAGT...,34454,7.128,False,False,0.0,18.0


In [3]:
mask = (records['toby'].notna() & records['carb'].notna())

# Feature selection

In [6]:
# 22 seconds
%time o_c = load_condons('../data/pseudo/concatenated.fasta')
%time i_c = load_condons('../data/pseudo/concatenated_naive_impute.fasta')

# 1.5 minutes
d = {}
for label, content in o_c.iteritems():
    d.update(content.value_counts().to_dict())
d_sorted = dict(sorted(d.items(), key=lambda x: x[1], reverse=True))
mapping = {key: i for i, key in enumerate(d_sorted.keys())}

import json
with open('../data/pseudo/preprocess/others/condon_mapping.json', 'w') as output:
    json.dump(mapping, output, indent='\t')

import json
with open('../data/pseudo/preprocess/others/condon_mapping.json', 'r') as input_:
    mapping = json.load(input_)

# 22 seconds
%time o_c_ = o_c.applymap(lambda x: mapping[x])
%time i_c_ = i_c.applymap(lambda x: mapping[x])
np.save('../data/pseudo/preprocess/o_c_-_-.npy', o_c_)
np.save('../data/pseudo/preprocess/i_c_-_-.npy', i_c_)

CPU times: user 22 s, sys: 956 ms, total: 22.9 s
Wall time: 23 s
CPU times: user 21.7 s, sys: 725 ms, total: 22.4 s
Wall time: 22.4 s
CPU times: user 21.6 s, sys: 415 ms, total: 22 s
Wall time: 22.1 s
CPU times: user 21.9 s, sys: 343 ms, total: 22.2 s
Wall time: 22.3 s


In [7]:
o_c_ = np.load('../data/pseudo/preprocess/o_c_-_-.npy')
i_c_ = np.load('../data/pseudo/preprocess/i_c_-_-.npy')

## Remove based on SNP counts
similar to variance threshold but seems better

In [None]:
# 2 minutes
%time variant_counts_o = o_c.apply(pd.Series.value_counts, axis=0)
%time variant_counts_i = i_c.apply(pd.Series.value_counts, axis=0)
np.save('../data/pseudo/preprocess/others/variant_counts_o.npy', variant_counts_o)
np.save('../data/pseudo/preprocess/others/variant_counts_i.npy', variant_counts_i)

variant_counts_o = pd.DataFrame(np.load('../data/pseudo/preprocess/others/variant_counts_o.npy'))
variant_counts_i = pd.DataFrame(np.load('../data/pseudo/preprocess/others/variant_counts_i.npy'))

# True     85753
variant_max_counts_o = variant_counts_o.max()
(pd.Series(variant_max_counts_o<121)).value_counts()

# True      56191
variant_max_counts_i = variant_counts_i.max()
(variant_max_counts_i<121).value_counts()

o_c_v = o_c_[mask][:, variant_max_counts_o<121]
i_c_v = i_c_[mask][:, variant_max_counts_i<121]
np.save('../data/pseudo/preprocess/o_c_v_-.npy', o_c_v)
np.save('../data/pseudo/preprocess/i_c_v_-.npy', i_c_v)

## $\chi^2$ on the previous result
because some features are all 0's, so gives `divide by 0` warning

no warning if we remove those features (on the previous step)

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

o_c_x = SelectKBest(chi2, k=85753//2).fit_transform(o_c_v, records['toby'][mask].astype('i4'))
i_c_x = SelectKBest(chi2, k=56191//2).fit_transform(i_c_v, records['toby'][mask].astype('i4'))

np.save('../data/pseudo/preprocess/o_c_x_-.npy', o_c_x)
np.save('../data/pseudo/preprocess/i_c_x_-.npy', i_c_x)

# Feature extraction

In [8]:
o_c_v = np.load('../data/pseudo/preprocess/o_c_v_-.npy')
i_c_v = np.load('../data/pseudo/preprocess/i_c_v_-.npy')
o_c_x = np.load('../data/pseudo/preprocess/o_c_x_-.npy')
i_c_x = np.load('../data/pseudo/preprocess/i_c_x_-.npy')

## String kernel

In [9]:
from strkernel.mismatch_kernel import MismatchKernel

%time o_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_)
%time i_c__s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_)
np.save('../data/pseudo/preprocess/o_c_-_s.npy', o_c__s.kernel)
np.save('../data/pseudo/preprocess/i_c_-_s.npy', i_c__s.kernel)

%time o_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_v)
%time i_c_v_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_v)
np.save('../data/pseudo/preprocess/o_c_v_s.npy', o_c_v_s.kernel)
np.save('../data/pseudo/preprocess/i_c_v_s.npy', i_c_v_s.kernel)

%time o_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(o_c_x)
%time i_c_x_s = MismatchKernel(l=125, k=2, m=1).get_kernel(i_c_x)
np.save('../data/pseudo/preprocess/o_c_x_s.npy', o_c_x_s.kernel)
np.save('../data/pseudo/preprocess/i_c_x_s.npy', i_c_x_s.kernel)

## PCA

In [10]:
from sklearn.decomposition import PCA

%time o_c__p = PCA(n_components=119).fit_transform(o_c_)
%time i_c__p = PCA(n_components=119).fit_transform(i_c_)
np.save('../data/pseudo/preprocess/o_c_-_p.npy', o_c__p)
np.save('../data/pseudo/preprocess/i_c_-_p.npy', i_c__p)

%time o_c_v_p = PCA(n_components=119).fit_transform(o_c_v)
%time i_c_v_p = PCA(n_components=119).fit_transform(i_c_v)
np.save('../data/pseudo/preprocess/o_c_v_p.npy', o_c_v_p)
np.save('../data/pseudo/preprocess/i_c_v_p.npy', i_c_v_p)

%time o_c_x_p = PCA(n_components=119).fit_transform(o_c_x)
%time i_c_x_p = PCA(n_components=119).fit_transform(i_c_x)
np.save('../data/pseudo/preprocess/o_c_x_p.npy', o_c_x_p)
np.save('../data/pseudo/preprocess/i_c_x_p.npy', i_c_x_p)

## TSNE

In [11]:
from sklearn.manifold import TSNE

%time o_c__t = TSNE(n_components=3).fit_transform(o_c_)
%time i_c__t = TSNE(n_components=3).fit_transform(i_c_)
np.save('../data/pseudo/preprocess/o_c_-_t.npy', o_c__t)
np.save('../data/pseudo/preprocess/i_c_-_t.npy', i_c__t)

%time o_c_v_t = TSNE(n_components=3).fit_transform(o_c_v)
%time i_c_v_t = TSNE(n_components=3).fit_transform(i_c_v)
np.save('../data/pseudo/preprocess/o_c_v_t.npy', o_c_v_t)
np.save('../data/pseudo/preprocess/i_c_v_t.npy', i_c_v_t)

%time o_c_x_t = TSNE(n_components=3).fit_transform(o_c_x)
%time i_c_x_t = TSNE(n_components=3).fit_transform(i_c_x)
np.save('../data/pseudo/preprocess/o_c_x_t.npy', o_c_x_t)
np.save('../data/pseudo/preprocess/i_c_x_t.npy', i_c_x_t)

In [10]:
# verify all possible combinations are created
import os
d = os.listdir('../data/pseudo/preprocess/')
s = {'{}_{}_{}_{}.npy'.format(impute, c_or_n, selection, extraction) for impute in 'io' for c_or_n in 'nc' for selection in '-vx' for extraction in '-pts'}
s - set(d)

set()

# Machine learning

In [11]:
import os

random_state = 42

s = {'{}_c_{}_{}.npy'.format(impute, selection, extraction)
     for impute in 'io'
     for selection in '-vx'
     for extraction in '-pts'}

data_u = {d: np.load(os.path.join('../data/pseudo/preprocess', d)) for d in s}
# mask all data to remove x with NAN labels
for k, v in data_u.items():
    if v.shape[0] != 119:
        data_u[k] = v[mask]

In [13]:
import pickle
from sklearn.preprocessing import OneHotEncoder

for file, X in data.items():
    encoder = OneHotEncoder(categories='auto', sparse=False, dtype=np.int32)
    %time X_encode = encoder.fit_transform(X)
    
    np.save(os.path.join('../data/pseudo/preprocess/onehot', file), X_encode)
    with open(os.path.join('../data/pseudo/preprocess/onehot-encoder', file[:file.index('.')]), 'wb') as output:
        pickle.dump(encoder, output)

In [12]:
data_e = {d: np.load(os.path.join('../data/pseudo/preprocess/onehot', d)) for d in s}

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
y = records['carb'][mask].astype('?')

## Classification

### Logistic regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [15]:
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              'class_weight': [None, 'balanced', {0:1, 1:4}, {0:1, 1:8}, {0:1, 1:32}, {0:1, 1:64}, {0:1, 1:128}],
              'l1_ratio': [0., 0.2, 0.4, 0.6, 0.8, 1.]}
clf = GridSearchCV(LogisticRegression(penalty='elasticnet', solver='saga', max_iter=2000, verbose=1, n_jobs=5),
                   param_grid=param_grid,
                   scoring=['recall', 'balanced_accuracy'],
                   refit='balanced_accuracy',
                   cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state))

In [16]:
model_u_logistic = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = LogisticRegression(penalty='none', class_weight='balanced',
                             solver='lbfgs', max_iter=2000, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_u_logistic[d] = clf

o_c_x_t.npy: 0.7222222222222222
[[20  8]
 [ 2  6]]
o_c_-_p.npy: 0.6388888888888888
[[16 12]
 [ 1  7]]
i_c_x_p.npy: 0.5833333333333334
[[15 13]
 [ 2  6]]
i_c_v_t.npy: 0.25
[[ 4 24]
 [ 3  5]]
i_c_x_s.npy: 0.6388888888888888
[[19  9]
 [ 4  4]]
i_c_x_-.npy: 0.7222222222222222
[[24  4]
 [ 6  2]]
o_c_x_s.npy: 0.6666666666666666
[[18 10]
 [ 2  6]]
o_c_-_t.npy: 0.4166666666666667
[[15 13]
 [ 8  0]]
o_c_v_-.npy: 0.7777777777777778
[[22  6]
 [ 2  6]]
o_c_v_p.npy: 0.6388888888888888
[[16 12]
 [ 1  7]]
o_c_v_t.npy: 0.7222222222222222
[[21  7]
 [ 3  5]]
i_c_v_s.npy: 0.6666666666666666
[[19  9]
 [ 3  5]]
i_c_-_p.npy: 0.5555555555555556
[[14 14]
 [ 2  6]]
i_c_-_-.npy: 0.7777777777777778
[[24  4]
 [ 4  4]]
o_c_x_-.npy: 0.7777777777777778
[[22  6]
 [ 2  6]]
o_c_-_s.npy: 0.75
[[21  7]
 [ 2  6]]
o_c_x_p.npy: 0.6388888888888888
[[16 12]
 [ 1  7]]
i_c_v_-.npy: 0.7222222222222222
[[23  5]
 [ 5  3]]
i_c_x_t.npy: 0.5833333333333334
[[17 11]
 [ 4  4]]
o_c_-_-.npy: 0.75
[[22  6]
 [ 3  5]]
o_c_v_s.npy: 0.6944444

In [17]:
model_e_logistic = {}
for d, X in data_e.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = LogisticRegression(penalty='none', class_weight='balanced',
                             solver='lbfgs', max_iter=2000, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_e_logistic[d] = clf

o_c_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_-.npy: 0.75
[[24  4]
 [ 5  3]]
o_c_x_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_-.npy: 0.8055555555555556
[[23  5]
 [ 2  6]]
o_c_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_-_-.npy: 0.75
[[25  3]
 [ 6  2]]
o_c_x_-.npy: 0.7777777777777778
[[22  6]
 [ 2  6]]
o_c_-_s.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_-.npy: 0.7777777777777778
[[24  4]
 [ 4  4]]
i_c_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_-.npy: 0.8055555555555556
[[24  4]
 [ 3  5]]
o_c_v_s.

### Random forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
model_u_random = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_u_random[d] = clf

o_c_x_t.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_s.npy: 0.7222222222222222
[[26  2]
 [ 8  0]]
i_c_x_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_c_x_s.npy: 0.8055555555555556
[[26  2]
 [ 5  3]]
o_c_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_-.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_c_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_s.npy: 0.8055555555555556
[[28  0]
 [ 7  1]]
i_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_-_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_c_x_-.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_c_-_s.npy: 0.8055555555555556
[[26  2]
 [ 5  3]]
o_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
i_c_x_t.npy: 0.75
[[26  2]
 [ 7  1]]
o_c_-_-.npy: 0.75
[[26  2]
 [ 7  1]]
o_c_v_s.

In [20]:
model_e_random = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = RandomForestClassifier(n_estimators=500, n_jobs=5, class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_e_random[d] = clf

o_c_x_t.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_s.npy: 0.7222222222222222
[[26  2]
 [ 8  0]]
i_c_x_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_c_x_s.npy: 0.8055555555555556
[[26  2]
 [ 5  3]]
o_c_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_-.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_c_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_t.npy: 0.75
[[27  1]
 [ 8  0]]
i_c_v_s.npy: 0.8055555555555556
[[28  0]
 [ 7  1]]
i_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_-_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
o_c_x_-.npy: 0.7777777777777778
[[27  1]
 [ 7  1]]
o_c_-_s.npy: 0.8055555555555556
[[26  2]
 [ 5  3]]
o_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_-.npy: 0.7777777777777778
[[26  2]
 [ 6  2]]
i_c_x_t.npy: 0.75
[[27  1]
 [ 8  0]]
o_c_-_-.npy: 0.75
[[26  2]
 [ 7  1]]
o_c_v_s.npy: 0.8055555

### Support vector machine

In [21]:
from sklearn.svm import SVC

In [22]:
model_u_svm = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = SVC(gamma='auto', class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_u_svm[d] = clf

o_c_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_c_x_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_c_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_-_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_x_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_-.npy: 0.7777777777777778

In [23]:
model_e_svm = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, stratify=y, train_size=0.7)
    clf = SVC(gamma='auto', class_weight='balanced')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    model_e_svm[d] = clf

o_c_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_c_x_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_x_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_c_-_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_v_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
i_c_-_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_-_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_x_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_s.npy: 0.2222222222222222
[[ 0 28]
 [ 0  8]]
o_c_x_p.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_v_-.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
i_c_x_t.npy: 0.7777777777777778
[[28  0]
 [ 8  0]]
o_c_-_-.npy: 0.7777777777777778

## Classification

In [28]:
y = records['carb_num'][mask]
X = data_u['i_c_v_-.npy']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.6)

### Linear regression

In [24]:
from sklearn.linear_model import LinearRegression

In [29]:
model_u_linear = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = LinearRegression(n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_u_linear[d] = clf

o_c_x_t.npy: -0.06236691803697636
o_c_-_p.npy: -0.1616166231717644
i_c_x_p.npy: -0.7516873737948402
i_c_v_t.npy: -0.046520717838766545
i_c_x_s.npy: -52.804848710674165
i_c_x_-.npy: -0.7634902330970657
o_c_x_s.npy: -12.043767033444734
o_c_-_t.npy: 0.0031379546348111464
o_c_v_-.npy: -0.1457025264115981
o_c_v_p.npy: -0.1336045007027069
o_c_v_t.npy: -0.09286239716640687
i_c_v_s.npy: -41.16346551093732
i_c_-_p.npy: -0.6959725926467928
i_c_-_-.npy: -0.6866939142203072
o_c_x_-.npy: -0.23725683493187688
o_c_-_s.npy: -10.815286898441316
o_c_x_p.npy: -0.23768100366691902
i_c_v_-.npy: -0.7061418255484204
i_c_x_t.npy: -0.044110352006261344
o_c_-_-.npy: -0.15649042254701828
o_c_v_s.npy: -4.970325111333776
i_c_v_p.npy: -0.6967222031624971
i_c_-_t.npy: 0.019653437690379305
i_c_-_s.npy: -71.91066692863257


In [30]:
model_e_linear = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = LinearRegression(n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_e_linear[d] = clf

o_c_x_t.npy: -0.06236691803697636
o_c_-_p.npy: -0.1616166231717644
i_c_x_p.npy: -0.7516873737948402
i_c_v_t.npy: -0.046520717838766545
i_c_x_s.npy: -52.804848710674165
i_c_x_-.npy: -0.7634902330970657
o_c_x_s.npy: -12.043767033444734
o_c_-_t.npy: 0.0031379546348111464
o_c_v_-.npy: -0.1457025264115981
o_c_v_p.npy: -0.1336045007027069
o_c_v_t.npy: -0.09286239716640687
i_c_v_s.npy: -41.16346551093732
i_c_-_p.npy: -0.6959725926467928
i_c_-_-.npy: -0.6866939142203072
o_c_x_-.npy: -0.23725683493187688
o_c_-_s.npy: -10.815286898441316
o_c_x_p.npy: -0.23768100366691902
i_c_v_-.npy: -0.7061418255484204
i_c_x_t.npy: -0.044110352006261344
o_c_-_-.npy: -0.15649042254701828
o_c_v_s.npy: -4.970325111333776
i_c_v_p.npy: -0.6967222031624971
i_c_-_t.npy: 0.019653437690379305
i_c_-_s.npy: -71.91066692863257


### Random forest regression

In [31]:
from sklearn.ensemble import RandomForestRegressor

In [32]:
model_u_rr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = RandomForestRegressor(n_estimators=500, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_u_rr[d] = clf

o_c_x_t.npy: -0.09737593316606463
o_c_-_p.npy: -0.1322842534395765
i_c_x_p.npy: -0.03854408183297031
i_c_v_t.npy: -0.11639450119937145
i_c_x_s.npy: -0.18070000716865664
i_c_x_-.npy: -0.2384167250268825
o_c_x_s.npy: 0.02466407973751683
o_c_-_t.npy: 0.05117599481650981
o_c_v_-.npy: -0.1935143872728775
o_c_v_p.npy: 0.08448985624086669
o_c_v_t.npy: -0.5281207610907386
i_c_v_s.npy: -0.04596060293914905
i_c_-_p.npy: -0.08601197816316986
i_c_-_-.npy: -0.1813005795583007
o_c_x_-.npy: -0.2111609532107308
o_c_-_s.npy: -0.9910387390884778
o_c_x_p.npy: 0.0035553260360088323
i_c_v_-.npy: -0.24095694692437042
i_c_x_t.npy: -0.09328442581819174
o_c_-_-.npy: -0.20460981278778023
o_c_v_s.npy: -0.5991909637431416
i_c_v_p.npy: -0.005574371832694602
i_c_-_t.npy: -0.3728404500813369
i_c_-_s.npy: -0.05593235380076633


In [33]:
model_e_rr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = RandomForestRegressor(n_estimators=500, n_jobs=5)
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_e_rr[d] = clf

o_c_x_t.npy: -0.1115954061043869
o_c_-_p.npy: -0.13487763015247167
i_c_x_p.npy: -0.06578505522622624
i_c_v_t.npy: -0.1272774503846259
i_c_x_s.npy: -0.14928058010973588
i_c_x_-.npy: -0.20304404753370653
o_c_x_s.npy: 0.022372111831040264
o_c_-_t.npy: 0.05791313937522402
o_c_v_-.npy: -0.20945897477184383
o_c_v_p.npy: 0.09925719639361441
o_c_v_t.npy: -0.5274312145358295
i_c_v_s.npy: -0.07454944972290378
i_c_-_p.npy: -0.09297292144806857
i_c_-_-.npy: -0.20443909244809633
o_c_x_-.npy: -0.17117404725798901
o_c_-_s.npy: -0.9797854263420553
o_c_x_p.npy: 0.015096239763985775
i_c_v_-.npy: -0.23545545827014802
i_c_x_t.npy: -0.06769170332791097
o_c_-_-.npy: -0.2101866583583778
o_c_v_s.npy: -0.6444252342220629
i_c_v_p.npy: -0.0006828394496678492
i_c_-_t.npy: -0.3355599125423918
i_c_-_s.npy: -0.04616066205299263


### Support vector machine

In [34]:
from sklearn.svm import SVR

In [35]:
model_u_svr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = SVR(gamma='auto')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_u_svr[d] = clf

o_c_x_t.npy: -0.07849453836880826
o_c_-_p.npy: -0.07849453836880826
i_c_x_p.npy: -0.07849453836880826
i_c_v_t.npy: -0.07849453836880826
i_c_x_s.npy: -0.07807466984972877
i_c_x_-.npy: -0.07849243855588028
o_c_x_s.npy: -0.07809771255883113
o_c_-_t.npy: -0.07849453836880826
o_c_v_-.npy: -0.07849455641444947
o_c_v_p.npy: -0.07849453836880826
o_c_v_t.npy: -0.07849453836880826
i_c_v_s.npy: -0.07806637720859144
i_c_-_p.npy: -0.07849453836880826
i_c_-_-.npy: -0.07897358015607425
o_c_x_-.npy: -0.0784945383756761
o_c_-_s.npy: -0.07801058474447276
o_c_x_p.npy: -0.07849453836880826
i_c_v_-.npy: -0.07851038991206472
i_c_x_t.npy: -0.07849453836880826
o_c_-_-.npy: -0.07850324051490443
o_c_v_s.npy: -0.07803351988881957
i_c_v_p.npy: -0.07849453836880826
i_c_-_t.npy: -0.07849453836880826
i_c_-_s.npy: -0.07803630371112602


In [36]:
model_e_svr = {}
for d, X in data_u.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, train_size=0.7)
    clf = SVR(gamma='auto')
    clf.fit(X_train, y_train)
    print('{}: {}'.format(d, clf.score(X_test, y_test)))
    model_e_svr[d] = clf

o_c_x_t.npy: -0.07849453836880826
o_c_-_p.npy: -0.07849453836880826
i_c_x_p.npy: -0.07849453836880826
i_c_v_t.npy: -0.07849453836880826
i_c_x_s.npy: -0.07807466984972877
i_c_x_-.npy: -0.07849243855588028
o_c_x_s.npy: -0.07809771255883113
o_c_-_t.npy: -0.07849453836880826
o_c_v_-.npy: -0.07849455641444947
o_c_v_p.npy: -0.07849453836880826
o_c_v_t.npy: -0.07849453836880826
i_c_v_s.npy: -0.07806637720859144
i_c_-_p.npy: -0.07849453836880826
i_c_-_-.npy: -0.07897358015607425
o_c_x_-.npy: -0.0784945383756761
o_c_-_s.npy: -0.07801058474447276
o_c_x_p.npy: -0.07849453836880826
i_c_v_-.npy: -0.07851038991206472
i_c_x_t.npy: -0.07849453836880826
o_c_-_-.npy: -0.07850324051490443
o_c_v_s.npy: -0.07803351988881957
i_c_v_p.npy: -0.07849453836880826
i_c_-_t.npy: -0.07849453836880826
i_c_-_s.npy: -0.07803630371112602
