In [None]:
# revised ensembling from https://www.kaggle.com/viktorbarbarich/linear-regression-baseline
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold

# Import dependencies and begin ETL

In [None]:
%%time
data = pd.read_pickle('../input/ump-train-picklefile/train.pkl')
data.drop(columns = ['row_id'], inplace = True) #row id is the time_id and investment_id joined using '-'

# Let's plot the trajectory of some investments

In [None]:
%matplotlib inline

for investment in np.random.choice(pd.unique(data['investment_id']), 20):
    data[data['investment_id']==investment].plot('time_id', 'target')

# Data Pre-processing

In [None]:
# X is the very same object as `data` (no copy is made); popping 'target'
# removes that column from the shared frame in place and hands it back as y.
X = data
y = X.pop('target')
X.describe()

# Features appear already scaled with mean ~ 0 and std dev ~ 1, so there is no need to scale them.

In [None]:
# Number of rows recorded for each distinct time_id, most frequent first.
time_id_counts = X['time_id'].value_counts()
time_id_counts

# There are 1211 different time steps with varying numbers of investments per time step.

In [None]:
# Number of rows recorded for each distinct investment_id, most frequent first.
investment_id_counts = X['investment_id'].value_counts()
investment_id_counts

# There are 3579 different investments

In [None]:
# Disabled experiment, kept for reference only: every line in this cell is
# commented out, so nothing here executes.
# from sklearn.decomposition import IncrementalPCA

# transformer = IncrementalPCA(n_components=10)
# transformer.partial_fit(X)
# X = transformer.transform(X)

# Neither PCA nor incremental PCA works within the memory limits of this kernel. Unsupervised dimensionality reduction may be an option later... 

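# Let's train the models for ensembling. Note: the code below does not call StratifiedKFold; each model is instead fit on an interleaved subset of the rows (every CNT_MODELS-th row), which is intended to keep the distribution of time steps similar across subsets.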

In [None]:
CNT_MODELS = 30  # size of the ensemble: 30 linear models

# Base model k is fit on the rows selected by X.loc[k::CNT_MODELS], i.e. every
# CNT_MODELS-th row starting at offset k, together with the matching targets
# looked up in y by index label.
row_subsets = (X.loc[offset::CNT_MODELS] for offset in range(CNT_MODELS))
models = [
    LinearRegression().fit(subset, y.loc[subset.index])
    for subset in row_subsets
]


# Let's stack the models creating a meta model.

In [None]:
# Collect each base model's predictions on the full training frame; column k
# of df holds the output of models[k].
base_predictions = {
    position: base_model.predict(X)
    for position, base_model in enumerate(models)
}
df = pd.DataFrame(base_predictions)

# The meta model is a linear regression from the base predictions to y.
regr = LinearRegression().fit(df.values, y)

# Predict values. First predict from each model in the ensemble. Finally, use those predictions as input into the meta model.

In [None]:
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()

# Dedicated counter for the synthetic time_id written into each test batch.
# It must not share a name with the inner enumerate() variable: previously both
# were `i`, so after the first batch the counter was clobbered by the model
# loop and every later batch received time_id == len(models) instead of
# 1, 2, 3, ...
# NOTE(review): the counter starts at 0, as in the original code; confirm that
# this matches the time_id range the base models saw during training.
time_step = 0

for (test_df, sample_prediction_df) in iter_test:
    # Turn the index into a leading column, drop row_id, and reuse that leading
    # column as 'time_id' so the frame carries a time_id feature.
    test_df.reset_index(inplace=True)
    test_df.pop('row_id')
    test_df.rename(columns={'index': 'time_id'}, inplace=True)
    test_df['time_id'] = time_step

    # First level: one column of predictions per base model.
    df = pd.DataFrame(columns=np.arange(len(models)))
    for i, model in enumerate(models):
        df[i] = model.predict(test_df.values)

    # Second level: the meta model combines the base predictions.
    sample_prediction_df['target'] = regr.predict(df)
    env.predict(sample_prediction_df)
    time_step += 1

In [None]:
# Display the test_df left over from the final iteration of the prediction
# loop (i.e. after that loop's in-place column edits).
test_df