In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from preproc.filters import markov_filter, rnn_filter
from eda.tools import seq_to_num, acc_score, prep_submit
from models.baseline import Baseline
from models.diff_table import DiffTable
from models.markov_chain import MarkovChain
from models.linear_model import LinearModel
from models.nonlinear_model import NonLinearModel
from models.lin_reg import LinReg
from models.pipeline import Pipeline
from models.rnn import RNN

Using TensorFlow backend.


In [2]:
# Read the train and test splits; the first CSV column becomes the index.
df_train = pd.read_csv('data/train.csv', index_col=0)
df_test = pd.read_csv('data/test.csv', index_col=0)

In [3]:
# seq_to_num yields an (X, y) pair for each split's "Sequence" column.
# NOTE(review): pad=False presumably keeps sequences at their native
# lengths — confirm in eda/tools.py.
X_train, y_train = seq_to_num(df_train['Sequence'], pad=False)
X_test, y_test = seq_to_num(df_test['Sequence'], pad=False)

In [4]:
# Stages handed to Pipeline, as (label, model instance, filter) tuples.
# The labels are the ones that appear in the "solved by <label>" output lines;
# the third element is a filter callable from preproc.filters, or None.
# NOTE(review): presumably stages are tried in list order and a filter
# restricts which sequences its stage attempts — confirm in models/pipeline.py.
models = [
    ('DT', DiffTable(), None),
    ('LRR', LinearModel(), None),
    ('NLRR', NonLinearModel(), None),
    ('MC', MarkovChain(), markov_filter),
    ('RNN', RNN(), rnn_filter),
    ('LR', LinReg(), None)
]

In [5]:
# Run the staged pipeline over the training sequences. With verbose=True the
# cell output lists a "solved by <label>: <count>" line per stage.
# No fallback model is given here, and predict's result is unpacked as
# (ind, pred).
# NOTE(review): presumably `ind` holds the positions of sequences that some
# stage solved and `pred` the predictions — confirm in models/pipeline.py.
pipe = Pipeline(models, verbose=True)
ind, pred = pipe.predict(X_train)

solved by LRR: 8272
solved by NLRR: 385
solved by MC: 1843


  linalg.lstsq(X, y)
12it [00:00, 116.50it/s]

solved by RNN: 20529


47987it [06:10, 129.65it/s]

solved by LR: 18356





In [8]:
# Score targets against predictions, both restricted to the positions in `ind`.
# NOTE(review): this looks like accuracy over the pipeline-solved subset only,
# not over the whole training set — confirm before quoting the number.
acc_score(y_train[ind], pred[ind])

0.27225109794915187

In [6]:
# Number of entries in `ind`, i.e. how many positions the score above covers.
len(ind)

49385

In [2]:
# Read the Kaggle test file (first CSV column as index) and convert its
# "Sequence" column. With target_split=False the result is bound to a single
# name instead of being unpacked into an (X, y) pair.
df_validate = pd.read_csv('data/kaggle_test.csv', index_col=0)
X_val = seq_to_num(df_validate['Sequence'], target_split=False, pad=False)

In [3]:
# Fresh model instances for the submission run, as
# (label, model instance, filter) tuples. The labels are the ones that appear
# in the "solved by <label>" output lines; the third element is a filter
# callable from preproc.filters, or None.
# NOTE(review): presumably stages are tried in list order — confirm in
# models/pipeline.py.
models = [
    ('DT', DiffTable(), None),
    ('LRR', LinearModel(), None),
    ('NLRR', NonLinearModel(), None),
    ('MC', MarkovChain(), markov_filter),
    ('RNN', RNN(), rnn_filter),
    ('LR', LinReg(), None)
]

In [4]:
# Run the staged pipeline on the Kaggle test sequences, with a Baseline model
# as fallback; the output reports the fallback's count on its own
# "solved by fallback-model ..." line.
# NOTE(review): here predict's result is bound to a single name rather than
# unpacked into two values — presumably because the fallback yields a
# prediction for every sequence; confirm in models/pipeline.py.
pipe = Pipeline(models, fallback=Baseline(), verbose=True)
pred = pipe.predict(X_val)

113845it [03:16, 578.26it/s]
  if a == '-100':


solved by DT: 4178
solved by LRR: 9443


  return [x1 ** 2, x2 ** 2, x1 * x2, x1, x2]
  return sum([x[0] ** 2 * solution[0], x[1] ** 2 * solution[1], x[0] * x[1] * solution[2], x[0] * solution[3],
  x[1] * solution[4], solution[5]])


solved by NLRR: 588
solved by MC: 2610


  linalg.lstsq(X, y)
16it [00:00, 158.53it/s]

solved by RNN: 28654


  resids = np.sum(np.abs(x[n:])**2, axis=0)
68372it [09:25, 120.90it/s]


solved by LR: 26088
solved by fallback-model Baseline(function=mode): 42284


In [5]:
# Pass the predictions and the target file name to prep_submit.
# NOTE(review): presumably this writes a Kaggle submission CSV to the given
# path — confirm in eda/tools.py.
prep_submit(pred, 'submit_with_linreg.csv')