In [1]:
import numpy as np
import pandas as pd
from tools import seq_to_num, acc_score, prep_submit

In [2]:
df_train = pd.read_csv("../data/train.csv", index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

In [3]:
X_train, y_train = seq_to_num(df_train.Sequence, pad=False)

Using TensorFlow backend.


In [24]:
def good_len(data):
    """
    filter out seqences which have good length increase
    """
    for seq in data:
        lengths = [len(f"{_:.0f}") for _ in seq]
        if lengths.count(1) == len(lengths):
            print("Sequence of single-digit numbers")
            continue
        seq_good = True
        for cur, next in zip(lengths, lengths[1:]):
            pred_length = cur * 2
            if abs(next - pred_length) > 1:
                seq_good = False
                break
        if seq_good:
            print(f"Good seq: {seq[:5]}")
        

In [25]:
good_len(X_train[:100])

Sequence of single-digit numbers
Sequence of single-digit numbers
Sequence of single-digit numbers
Good seq: [101.]
Sequence of single-digit numbers
Sequence of single-digit numbers
Good seq: [1.000e+00 2.000e+00 6.000e+00 4.000e+01 1.608e+03]
Sequence of single-digit numbers
Good seq: [ 3.  6. 63.]
Good seq: [2.000e+00 2.300e+01 2.357e+03]


In [88]:
import sys
sys.path.append('..')
from models.diff_table import DiffTable

In [114]:
def quad_good(data, verbose=False, stoplen=2):
    """
    a_n = a_{n-1} ** 2 + c
    
    Note:
        - Works only for monotonically increasing sequences.
        - Encapsulates all sequences solved by DiffTable
    """
    sequences = []
    indices = []
    predictions = []
    ind_iter = data.index if isinstance(data, (np.ndarray, pd.Series)) else range(len(data))
    for seq, ind in zip(data, ind_iter):
        if len(seq) < stoplen:
            continue
        divisor = seq[0] - 1
        if divisor == 0 and len(seq) > 1:
            divisor = seq[1] - 1
        if divisor == 0:
            continue
        diffs = [(next - cur) / divisor for cur, next in zip(seq, seq[1:])]
        if len(diffs) < stoplen:
            continue
        if np.all([np.isclose(prev, diff) for prev, diff  in zip(seq, diffs)]):
            if verbose:
                print(f"Good seq: {seq[:5]}")
            sequences.append(seq)
            indices.append(ind)
            predictions.append(seq[-1] * (divisor + 1))
            continue
        quad_diffs = [(next - cur ** 2) for cur, next in zip(seq, seq[1:])]
        _, _, pred = DiffTable().predict([quad_diffs], maxstep=10, stoplen=stoplen)
        if len(pred) > 0:
            if verbose:
                print(f"a_n = a_n-1^2 + {pred[0]}, {ind}: {seq[:5]}")
            sequences.append(seq)
            indices.append(ind)
            predictions.append(seq[-1] ** 2 + pred[0])
    return sequences, indices, predictions

In [54]:
X_train[X_train.map(is_increasing)].shape

(41822,)

In [131]:
X_increase = X_train[X_train.map(is_increasing)]
X_increase.shape

(41822,)

In [134]:
seq_diff, ind_diff, pred_diff = DiffTable().predict(X_increase, maxstep=10, stoplen=10)

In [159]:
X_unsolved = X_increase[~X_increase.index.isin(ind_diff)]

In [160]:
X_unsolved.shape

(39678,)

In [161]:
seq, ind, pred = quad_good(X_unsolved, stoplen=10, verbose=False)

In [162]:
len(ind)

25

In [156]:
cnt = 0
y_ind, pred_int = [], []
for pred_i, i in enumerate(ind):
    if i in ind_diff:
        cnt += 1
    else:
        pred_int.append(pred_i)
        y_ind.append(i)
print(f"{cnt}/{len(ind)} are already predicted by common differences")

1208/1233 are already predicted by common differences


In [157]:
len(ind)

1233

In [163]:
acc_score(y_train[ind], pred)

0.68

In [170]:
DiffTable().predict([[0, 3, 8, 15, 24]])

([[0, 3, 8, 15, 24]], [0], [35])

In [52]:
is_increasing = lambda seq: np.all([cur < next for cur, next in zip(seq, seq[1:])])

In [164]:
quad_good([[0, 3, 8, 15]])

([], [], [])