In [1]:
import pandas as pd
import numpy as np
from tools import seq_to_num, acc_score, prep_submit

In [2]:
# Load the train/test CSV files; the first CSV column becomes the DataFrame index.
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

In [4]:
# seq_to_num is a project helper imported from `tools`; here it returns two objects.
# NOTE(review): presumably train_X holds the leading terms and train_y the term to
# predict for each sequence — confirm against tools.seq_to_num.
train_X, train_y = seq_to_num(df_train.Sequence, pad=False)

Using TensorFlow backend.


In [248]:
def diff(data, only_seq=False, verbose=False):
    """
    Predict the next term of every sequence that has a constant difference of some order.

    Each sequence is differenced repeatedly (order 1, 2, ...) until one order
    consists of a single repeated value. The next term is then extrapolated as
    the last term of the sequence plus the last element of every difference
    order computed so far. Sequences with fewer than three terms are skipped,
    and an order is only tried while it still leaves at least two differences.

    @params:
        * data: iterable of numeric sequences exposing ``.index`` (e.g. a pandas Series)
        * only_seq: if True, return only the list of matched sequences
        * verbose: if True, print one line per matched sequence

    @returns:
        * list of sequences that have a constant difference of some order
        * list of the corresponding indices
        * list of predicted terms
    """
    sequences = []
    indices = []
    predictions = []
    for seq, idx in zip(data, data.index):
        if len(seq) < 3:
            # Too short: even the first difference would have fewer than two values.
            continue
        last_elems = [seq[-1]]  # last element of the sequence and of each difference order
        diffs = list(seq)
        for order in range(1, len(seq) - 1):
            diffs = [nxt - cur for cur, nxt in zip(diffs, diffs[1:])]
            last_elems.append(diffs[-1])
            # A set keeps the comparison exact for arbitrary-precision ints and
            # avoids sorting / array conversion just to test for constancy.
            if len(set(diffs)) == 1:
                if verbose:
                    print(f"Seq {seq[:5]}... has constant {order}-th difference {diffs[0]}")
                sequences.append(seq)
                indices.append(idx)
                # Summing the trailing elements walks the difference table one step forward.
                predictions.append(sum(last_elems))
                break
    if only_seq:
        return sequences
    return sequences, indices, predictions

In [249]:
# Run the constant-difference detector on the training inputs, then score its
# predictions against the targets of the matched rows only (train_y[ind]).
seq, ind, pred = diff(train_X, verbose=False)
acc_score(train_y[ind], pred)

0.9046701974000962

In [250]:
# Number of training sequences matched by diff().
len(seq)

2077

In [251]:
def _constant_diff_prediction(seq, step):
    """Return (order, constant, prediction) for the first constant difference order of ``seq``, or None."""
    if len(seq) < step + 2:
        # Fewer than two lag-`step` differences can be formed.
        return None
    last_elems = [seq[-step]]  # the term the next lag-`step` difference is added to
    diffs = list(seq)
    max_order = (len(seq) - 2) // (step + 1) + 1
    for order in range(1, max_order + 1):
        # Only the first order uses the lag; higher orders difference adjacent values.
        lag = step if order == 1 else 1
        diffs = [nxt - cur for cur, nxt in zip(diffs, diffs[lag:])]
        if not diffs:
            return None
        last_elems.append(diffs[-1])
        # A set keeps the comparison exact for arbitrary-precision ints.
        if len(set(diffs)) == 1:
            return order, diffs[0], sum(last_elems)
    return None


def diff_varstep(data, maxstep=1, only_seq=False, verbose=False):
    """
    Predict the next term of sequences that have a constant lagged difference.

    For every sequence, steps 1..maxstep are tried in increasing order. For a
    given step, the first difference is taken between terms ``step`` positions
    apart (``a[k + step] - a[k]``); higher orders are ordinary adjacent
    differences of the previous order. Orders 1 up to
    ``(len(seq) - 2) // (step + 1) + 1`` are examined. The first (step, order)
    pair whose differences are all equal yields the prediction
    ``seq[-step] + sum of the last element of each difference order``.
    Sequences shorter than ``step + 2`` are skipped for that step.

    @params:
        * data: iterable of numeric sequences exposing ``.index`` (e.g. a pandas Series)
        * maxstep: largest lag tried for the first difference
        * only_seq: if True, return only the list of matched sequences
        * verbose: if True, print one line per matched sequence

    @returns:
        * list of sequences that have a constant difference
        * list of the corresponding indices
        * list of predicted terms
    """
    sequences = []
    indices = []
    predictions = []
    for seq, idx in zip(data, data.index):
        for step in range(1, maxstep + 1):
            match = _constant_diff_prediction(seq, step)
            if match is None:
                continue
            order, constant, prediction = match
            if verbose:
                print(f"Seq {seq[:5]}... has constant {order}-th difference {constant} with step {step}")
            sequences.append(seq)
            indices.append(idx)
            predictions.append(prediction)
            break  # the smallest matching step wins
    if only_seq:
        return sequences
    return sequences, indices, predictions

In [225]:
# Sanity check: the second differences of this sequence are constant (1),
# so the expected prediction is 16 + 5 + 1 = 22.
test = [2, 4, 7, 11, 16]
diff_varstep(pd.Series([test], dtype='object'), maxstep=2, verbose=True)

Seq [2, 4, 7, 11, 16]... has constant 2-th difference 1 with step 1


([[2, 4, 7, 11, 16]], [0], [22])

In [226]:
# Sanity check for the lagged search: terms two positions apart differ by a
# constant 6, so only step 2 matches and the expected prediction is 8 + 6 = 14.
# diff() takes no `step` argument, so the lagged detector is called explicitly.
test = [2, 6, 8, 12]
diff_varstep(pd.Series([test], dtype='object'), maxstep=2, verbose=True)

Seq [2, 6, 8, 12]... has constant 1-th difference 6 with step 2


([[2, 6, 8, 12]], [0], [14])

In [252]:
# Same detection on the training inputs, now also trying lags 1..10 for the first difference.
seq2, ind2, pred2 = diff_varstep(train_X, verbose=False, maxstep=10)

In [253]:
# Number of training sequences matched by diff_varstep().
len(seq2)

3091

In [254]:
acc_score(train_y[ind2], pred2)  # score on the rows matched by diff_varstep (any order, lags 1..10)

0.9330313814299579

In [239]:
# Matched-row count multiplied by the score on those rows.
# NOTE(review): this cell's execution count (239) is lower than that of the cell
# defining `seq`, `ind` and `pred` (249), so its saved output is stale — re-run it.
len(seq) * acc_score(train_y[ind], pred)

15.0

In [256]:
# Load the Kaggle test set; the first CSV column becomes the DataFrame index.
kg_test = pd.read_csv('../data/kaggle_test.csv', index_col=0)

In [260]:
# With target_split=False a single object is bound (no separate targets).
# NOTE(review): presumably the full sequences, indexed like kg_test — confirm in tools.seq_to_num.
X = seq_to_num(kg_test.Sequence, pad=False, target_split=False)

In [262]:
# Predict next terms for the Kaggle sequences; the matched sequences themselves are discarded.
# NOTE(review): maxstep=11 here vs. maxstep=10 in the training evaluation — confirm this is
# intentional. This also rebinds `ind` and `pred`, which held the training results until now.
_, ind, pred = diff_varstep(X, maxstep=11)

In [265]:
def mmode(arr):
    """
    Return the most frequent value of every row in ``arr``.

    Ties are resolved in favour of the value that occurs first in the row,
    and an empty row contributes 0.

    @params:
        * arr: iterable of rows, each row an iterable of hashable values
          (numpy arrays and plain lists both work)

    @returns:
        * list with one mode per row, in row order
    """
    modes = []
    for row in arr:
        # Single pass per row; dict insertion order records first occurrence.
        counts = {}
        for value in row:
            counts[value] = counts.get(value, 0) + 1
        if counts:
            # max() returns the first maximal key, i.e. the earliest value on ties.
            modes.append(max(counts, key=counts.get))
        else:
            modes.append(0)
    return modes

In [269]:
# Fallback for rows the difference detector did not match: predict each
# sequence's most frequent term, keyed by the same unmatched index labels.
mode_pred = pd.Series(mmode(X[~X.index.isin(ind)]), index=X.index[~X.index.isin(ind)])

In [272]:
# Start from an all-zero int64 Series over every Kaggle row, then fill in the
# difference-based predictions and the mode fallback for the remaining rows.
# NOTE(review): assumes every predicted value fits in int64 — confirm for sequences with very large terms.
pred_total = pd.Series(np.zeros(kg_test.shape[0]), index=kg_test.index, dtype=np.int64)
pred_total[ind] = pred
pred_total[X.index[~X.index.isin(ind)]] = mode_pred

In [273]:
# prep_submit is a project helper imported from `tools`.
# NOTE(review): presumably it writes the submission file from these predictions — confirm in tools.prep_submit.
prep_submit(pred_total)