In [1]:
import pandas as pd
import numpy as np
from tools import seq_to_num, acc_score, prep_submit

In [2]:
# Load the train and test CSVs; the first column of each file becomes the index.
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

In [3]:
# Split the raw `Sequence` column into inputs and targets with the project helper.
# NOTE(review): pad=False presumably keeps sequences at their original, variable
# length — confirm against tools.seq_to_num.
train_X, train_y = seq_to_num(df_train.Sequence, pad=False)

Using TensorFlow backend.


In [120]:
def diff(data, only_seq=False, verbose=False):
    """
    Predict the next term of sequences whose first or second difference is constant.

    For every sequence in `data` with at least three terms:
      * if all first differences are equal (arithmetic progression), the
        prediction is ``last_term + common_difference``;
      * otherwise, if there are more than two first differences and all second
        differences are equal, the prediction is
        ``last_term + last_first_difference + common_second_difference``.
    Sequences matching neither rule (or shorter than three terms) are skipped.

    @params:
        * data: pandas Series whose values are sequences (lists / arrays of numbers);
          its index labels are reported back in `indices`
        * only_seq: if True, return only the list of matched sequences
        * verbose: if True, print a line for every matched sequence

    @returns:
        * list of sequences that have constant first or second difference
        * list of the corresponding indices
        * list of predicted terms
        (only the first list when `only_seq` is True)
    """
    sequences = []
    indices = []
    predictions = []
    for seq, idx in zip(data, data.index):
        # At least three terms are needed to observe two first differences.
        if len(seq) < 3:
            continue
        diffs = [nxt - cur for cur, nxt in zip(seq, seq[1:])]
        uniques = np.unique(diffs)
        if len(uniques) == 1:
            if verbose:
                print(f"Seq {seq[:5]}... has constant first difference {uniques[-1]}")
            sequences.append(seq)
            indices.append(idx)
            next_term = seq[-1] + uniques[-1]
            predictions.append(next_term)
        elif len(diffs) > 2:
            # Require at least two second differences; a single one is trivially "constant".
            sec_diff = [nxt - cur for cur, nxt in zip(diffs, diffs[1:])]
            sec_uniques = np.unique(sec_diff)
            if len(sec_uniques) == 1:
                if verbose:
                    print(f"Seq {seq[:5]}... has constant second difference {sec_uniques[-1]}")
                sequences.append(seq)
                indices.append(idx)
                # Use the LAST first difference, not `uniques[-1]`: np.unique sorts
                # its output, so `uniques[-1]` is the largest difference and is only
                # the last one when the second difference is positive.
                next_term = seq[-1] + diffs[-1] + sec_uniques[-1]
                predictions.append(next_term)

    if only_seq:
        return sequences
    else:
        return sequences, indices, predictions

In [121]:
# Sanity check: first differences are 2, 3, 4, 5 and second differences are all 1,
# so the next term should be 16 + 5 + 1 = 22.
test = [2, 4, 7, 11, 16]
diff(pd.Series([test], dtype='object'))

([[2, 4, 7, 11, 16]], [0], [22])

In [130]:
# Run the constant-difference detector on the first 500 entries of train_X,
# printing each sequence it matches.
seq, ind, pred = diff(train_X[:500], verbose=True)

Seq [ 736. 1736. 2736. 3736. 4736.]... has constant first difference 1000.0
Seq [ 1090.  4358.  9804. 17428. 27230.]... has constant second difference 2178.0
Seq [10. 28. 46. 64. 82.]... has constant first difference 18.0


In [131]:
# Predicted next terms for the matched sequences.
pred

[34736.0, 1115168.0, 946.0]

In [132]:
# Target values for the matched sequences, looked up by their index labels.
train_y[ind]

Id
126395      34736.0
205820    1115168.0
67083         946.0
Name: Sequence, dtype: float64

In [133]:
acc_score(train_y[ind], pred)  # constant first- or second-order difference: perfect prediction on the 3 matched sequences

1.0