In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from tools import seq_to_num, acc_score

In [2]:
# Load the training sequences; the first CSV column becomes the index.
df_train = pd.read_csv('../data/train.csv', index_col=0)
# seq_to_num comes from the local `tools` module. Judging by the cells below it
# returns the numeric terms of each sequence plus the term to predict — TODO confirm.
X_train, y_train = seq_to_num(df_train.Sequence, pad=False)

Using TensorFlow backend.


In [323]:
# Take the sequence at position 4 as a worked example; the 4:5 slice is
# positional, and .values[0] unwraps the single stored sequence.
seq = X_train[4:5].values[0]
# Matching one-element slice of the targets (kept as a Series, not a scalar).
target = y_train[4:5]

In [324]:
# Display the example sequence next to the value that should follow it.
seq, target

(array([1.0000e+00, 1.2100e+02, 1.3100e+02, 1.4100e+02, 1.5100e+02,
        1.6100e+02, 1.7100e+02, 1.8100e+02, 1.9100e+02, 1.2321e+04,
        1.2421e+04, 1.2521e+04, 1.2621e+04, 1.2721e+04, 1.2821e+04,
        1.2921e+04, 1.3431e+04, 1.3531e+04, 1.3631e+04, 1.3731e+04,
        1.3831e+04, 1.3931e+04, 1.4541e+04, 1.4641e+04, 1.4741e+04,
        1.4841e+04, 1.4941e+04, 1.5651e+04, 1.5751e+04, 1.5851e+04,
        1.5951e+04, 1.6761e+04, 1.6861e+04, 1.6961e+04, 1.7871e+04,
        1.7971e+04]), Id
 152540    18981.0
 Name: Sequence, dtype: float64)

In [325]:
# Raw array view of the example target, without the Series index.
target.values

array([18981.])

In [137]:
# Hand-built sliding-window data for the example sequence: each row of X holds
# `num_of_points` consecutive terms and the matching entry of y is the term
# that immediately follows that window.
num_of_points = 5
X, y = [seq[i: i + num_of_points] for i in range(len(seq) - num_of_points)], seq[num_of_points:]

In [317]:
def fit_n_points(seq, n_points):
    """
    Fit a linear regression that maps each window of `n_points` consecutive
    terms of `seq` to the term following it, and return the fitted estimator.
    """
    features, targets = create_data(seq, n_points)
    model = LinearRegression()
    return model.fit(features, targets)

In [3]:
from sklearn.preprocessing import PolynomialFeatures

def create_data(seq, num_of_points, poly_deg=1):
    """
    Turn a sequence into a supervised data set of sliding windows.

    Row k of the returned X is seq[k : k + num_of_points]; the matching target
    is seq[k + num_of_points]. When poly_deg is greater than 1 the windows are
    expanded with PolynomialFeatures(poly_deg) before being returned.

    Returns the pair (X, y).
    """
    n_windows = len(seq) - num_of_points
    windows = []
    for start in range(n_windows):
        windows.append(seq[start: start + num_of_points])
    targets = seq[num_of_points:]
    if poly_deg > 1:
        windows = PolynomialFeatures(poly_deg).fit_transform(windows)
    return windows, targets

  return f(*args, **kwds)
  return f(*args, **kwds)


In [4]:
def pred_best_reg(seq, min_num, max_num, poly_deg=1, verbose=False):
    """
    Predict the next term of `seq` with a linear regression on sliding windows.

    Window sizes from `min_num` to `max_num` are tried in increasing order.
    Both bounds are clamped to [1, len(seq) - 1]. For each size a regression is
    fitted on the windows and scored with `acc_score` against the rounded
    in-sample predictions. The first size reaching the highest score is kept,
    and the search stops early on a perfect score.

    Parameters
    ----------
    seq : sequence of numbers
    min_num, max_num : int
        Smallest and largest window size to try.
    poly_deg : int
        When greater than 1, windows are expanded with
        PolynomialFeatures(poly_deg) for both fitting and prediction.
    verbose : bool
        Print the best score and the window size that produced it.

    Returns
    -------
    The rounded prediction, or None when no window size can be tried (sequence
    shorter than two terms, or min_num > max_num after clamping) or when the
    raw prediction is more than 0.01 away from an integer.
    """
    # A window needs at least one term and must leave at least one target.
    longest = len(seq) - 1
    min_num = max(1, min(longest, min_num))
    max_num = min(longest, max_num)
    if max_num < min_num:
        return None

    lr = LinearRegression()
    best_acc, best_num_of_points = -1, -1
    for num_of_points in range(min_num, max_num + 1):
        X, y = create_data(seq, num_of_points, poly_deg)
        lr.fit(X, y)
        pred = lr.predict(X).round()
        acc = acc_score(y, pred)
        if acc > best_acc:
            best_acc = acc
            best_num_of_points = num_of_points
        # Tolerant comparison: the score is a float.
        if np.isclose(best_acc, 1):
            break
    if verbose:
        print(f"Best acc: {best_acc}, num of points: {best_num_of_points}")
    if best_num_of_points < 1:
        # No score ever beat the -1 sentinel, so there is no usable window size.
        return None

    # Refit on the winning window size: the estimator currently holds whichever
    # size was tried last, which is not necessarily the best one.
    X, y = create_data(seq, best_num_of_points, poly_deg)
    lr.fit(X, y)
    last_window = [seq[-best_num_of_points:]]
    if poly_deg > 1:
        # The prediction row must go through the same feature expansion as X.
        last_window = PolynomialFeatures(poly_deg).fit_transform(last_window)
    pred = lr.predict(last_window)[0]
    if np.abs(pred - pred.round()) > .01:
        return None
    return pred.round()

In [345]:
# Try window sizes 1..40 (capped by the sequence length) on the example sequence.
pred_best_reg(seq, 1, 40)

In [7]:
import tqdm
from sklearn.linear_model import LinearRegression

class LinReg:
    """
    Next-term predictor based on linear regression over sliding windows.

    For every sequence, window sizes 1..max_prev (capped at len(seq) - 1) are
    tried in increasing order. The first size with the highest in-sample score
    from `acc_score` is used to extrapolate one term. Predictions more than
    0.01 away from an integer are discarded.

    Parameters
    ----------
    max_prev : int
        Largest number of preceding terms used as regression features.
    poly_deg : int
        When greater than 1, windows are expanded with
        PolynomialFeatures(poly_deg); otherwise the raw windows are used.
    verbose : bool
        Print the best score and window size for each sequence.
    """

    def __init__(self, max_prev=40, poly_deg=1, verbose=False):
        self.max_prev = max_prev
        self.poly_deg = poly_deg
        self.verbose = verbose
        # One estimator instance is reused (refitted) for every sequence.
        self._mod = LinearRegression()

    def predict(self, data):
        """
        Predict the next term for each sequence in `data`.

        `data` is a sized iterable of sequences. For a pandas Series the
        returned indices are its index labels; for any other container
        (list, ndarray, ...) they are positions 0..len(data) - 1.

        Sequences with fewer than two terms, and sequences for which no
        near-integer prediction is found, are skipped.

        Returns three parallel lists: (sequences, indices, predictions).
        """
        sequences = []
        indices = []
        predictions = []
        # Only pandas objects expose `.index`; an ndarray must fall back to positions.
        ind_iter = data.index if isinstance(data, pd.Series) else range(len(data))
        # zip() has no length, so pass the total explicitly to get a real progress bar.
        for seq, ind in tqdm.tqdm(zip(data, ind_iter), total=len(data)):
            if len(seq) < 2:
                continue
            pred = self._pred_best_reg(seq)
            if pred is None:
                continue
            sequences.append(seq)
            indices.append(ind)
            predictions.append(pred)
        return sequences, indices, predictions

    def _pred_best_reg(self, seq):
        """
        Search for the best window size for `seq` and predict its next term.

        Returns the rounded prediction, or None when no window size can be
        tried or the raw prediction is more than 0.01 away from an integer.
        """
        min_num = min(len(seq) - 1, 1)
        max_num = min(len(seq) - 1, self.max_prev)
        if min_num < 1 or max_num < min_num:
            # Sequence too short or max_prev < 1: there is nothing to fit.
            return None
        best_acc, best_num_of_points = -1, -1
        for num_of_points in range(min_num, max_num + 1):
            X, y = self._create_data(seq, num_of_points)
            self._mod.fit(X, y)
            pred = self._mod.predict(X).round()
            acc = acc_score(y, pred)
            if acc > best_acc:
                best_acc = acc
                best_num_of_points = num_of_points
            if np.isclose(best_acc, 1):
                break
        if self.verbose:
            print(f"Best acc: {best_acc}, num of points: {best_num_of_points}")
        if best_num_of_points < 1:
            # No score ever beat the -1 sentinel, so there is no usable window size.
            return None
        # Refit on the winning window size: the estimator currently holds
        # whichever size was tried last, which is not necessarily the best one.
        X, y = self._create_data(seq, best_num_of_points)
        self._mod.fit(X, y)
        pred_data = [seq[-best_num_of_points:]]
        if self.poly_deg > 1:
            # The prediction row must go through the same feature expansion as X.
            pred_data = PolynomialFeatures(self.poly_deg).fit_transform(pred_data)
        pred = self._mod.predict(pred_data)[0]
        if np.fabs(pred - pred.round()) > .01:
            return None
        return pred.round()

    def _create_data(self, seq, num_of_points):
        """
        Build sliding-window training data from `seq`.

        Row k of X is seq[k : k + num_of_points] and its target is
        seq[k + num_of_points]. X is polynomially expanded when
        self.poly_deg is greater than 1. Returns (X, y).
        """
        X = [seq[i: i + num_of_points] for i in range(len(seq) - num_of_points)]
        if self.poly_deg > 1:
            X = PolynomialFeatures(self.poly_deg).fit_transform(X)
        y = seq[num_of_points:]
        return X, y

In [8]:
# Predictor with the default settings (max_prev=40, poly_deg=1, verbose=False).
lr = LinReg()

In [9]:
# Run the predictor over every training sequence. Only the indices and the
# predictions are kept; the returned sequences are discarded.
# Slow cell: the recorded run took roughly 12 minutes.
_, ind, pred = lr.predict(X_train)

  linalg.lstsq(X, y)
79016it [12:24, 106.19it/s]


In [10]:
# Score only the sequences that received a prediction; skipped sequences are
# not part of `ind` and therefore do not count against this number.
acc_score(y_train[ind], pred)

0.2979236898559608

In [11]:
# Load the Kaggle test sequences; the first CSV column becomes the index.
df_val = pd.read_csv("../data/kaggle_test.csv", index_col=0)
# With target_split=False a single value is unpacked here, presumably the
# sequences without a separated target — TODO confirm in tools.seq_to_num.
X_val = seq_to_num(df_val.Sequence, target_split=False, pad=False)

In [13]:
# Number of training sequences for which a prediction was produced.
len(ind)

26104