#  Ubiquant Market Prediction with Ridge Regression
## Import Packages

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import math

## Utilities

In [None]:
def reduce_memory_usage(df, features):
    """Cast the listed columns of ``df`` to ``float16`` in place.

    Args:
        df: DataFrame whose columns are overwritten.
        features: Iterable of column names to convert.

    Returns:
        None. ``df`` is modified directly.
    """
    for column_name in features:
        # Overwrite the column with its half-precision copy.
        df[column_name] = df[column_name].astype(np.float16)
        # Run the garbage collector after every converted column.
        gc.collect()

In [None]:
import tensorflow as tf
from sklearn import metrics

def symmetric_mean_absolute_percentage_error(A, F):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Each term is ``2 * |F - A| / (|A| + |F|)``. Terms where both the actual
    and the forecast value are zero have a zero denominator; they are counted
    as a perfect match (0) instead of producing NaN for the whole score.

    Args:
        A: Actual values (array-like).
        F: Forecast values (array-like, same shape as ``A``).

    Returns:
        ``100 / len(A)`` times the sum of the per-element terms.

    Raises:
        ZeroDivisionError: If ``A`` is empty.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # Only divide where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 100 / len(actual) * np.sum(terms)
def evaluate(model, x_val, y_val):
    """Score ``model`` on ``(x_val, y_val)``, print each metric and return them.

    Args:
        model: Object exposing ``predict``.
        x_val: Inputs passed to ``model.predict``.
        y_val: Reference targets compared against the predictions.

    Returns:
        Dict with the keys ``r2``, ``mse``, ``mae``, ``mape``, ``rmse`` and
        ``smape``.
    """
    predictions = model.predict(x_val)
    r2_value = metrics.r2_score(y_val, predictions)
    mse_value = metrics.mean_squared_error(y_val, predictions)
    scores = {
        "r2": r2_value,
        "mse": mse_value,
        "mae": metrics.mean_absolute_error(y_val, predictions),
        # MAPE comes from Keras; .numpy() unwraps the returned tensor.
        "mape": tf.keras.metrics.mean_absolute_percentage_error(y_val, predictions).numpy(),
        "rmse": np.sqrt(mse_value),
        "smape": symmetric_mean_absolute_percentage_error(y_val, predictions),
    }
    # (printed label, result key) pairs, in the order they are reported.
    report_rows = (
        ("R2 Score:", "r2"),
        ("MSE:", "mse"),
        ("MAE:", "mae"),
        ("MAPE", "mape"),
        ("RMSE:", "rmse"),
        ("SMAPE:", "smape"),
    )
    for label, key in report_rows:
        print(label, scores[key])
    return scores

In [None]:
def inference(models, df, feature_names=None):
    """Average the predictions of several fitted models on ``df``.

    Args:
        models: Non-empty iterable of objects exposing ``predict``.
        df: DataFrame containing the feature columns.
        feature_names: Columns handed to every model. When omitted, the
            notebook-level ``features`` list is used.

    Returns:
        Element-wise mean of the per-model predictions (``np.mean`` over
        axis 0).

    Raises:
        ValueError: If ``models`` is empty.
    """
    if feature_names is None:
        # The previous code referenced ``feature_columns``, which is never
        # defined in this notebook; the column list is the global ``features``.
        feature_names = features
    # Select the columns once instead of once per model.
    model_inputs = df[feature_names]
    y_preds = [model.predict(model_inputs) for model in models]
    if not y_preds:
        # np.mean over an empty list would return NaN.
        raise ValueError("inference() needs at least one fitted model")
    return np.mean(y_preds, axis=0)

## Import dataset

In [None]:
%%time
# Names of the 300 feature columns: f_0 ... f_299.
features = [f'f_{i}' for i in range(300)]
# Read only the feature columns plus "target" from the parquet file.
X = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet', columns=features + ["target"])
# Show the first rows as the cell output.
X.head()

## Reducing Memory Usage
There are 3,141,410 records in total, and each record has 303 columns. If we convert every column to int16 or float16 (2 bytes per value), the training data will occupy (3141410 x 303 x 2) / (1024^3) GiB, which is about 1.8 GiB.

In [None]:
# Print the frame summary (column dtypes and memory usage) as loaded.
X.info()

In [None]:
%%time
# The float16 downcast is currently disabled; uncomment the call below to apply it.
#reduce_memory_usage(X, features + ["target"])

In [None]:
# Print the frame summary again, for comparison with the earlier X.info() output.
X.info()

In [None]:
# Remove the "target" column from X and keep it as y; X now holds only features.
y = X.pop("target")
# Show the first target values as the cell output.
y.head()

## Modeling
I will start modeling with Ridge Regression. You may replace it with other models.

In [None]:
%%time
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import Ridge, LinearRegression, ElasticNet, Lasso
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
kfold = KFold(5, shuffle=False, random_state=None)
from sklearn.pipeline import Pipeline
models = []
use_k_fold = False
if use_k_fold:
    for (train_indices, valid_indices) in kfold.split(X):
        X_train, X_val = X.iloc[train_indices], X.iloc[valid_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[valid_indices]
        # MinMaxScaler
        # RobustScaler
        model = Pipeline([('scaler', MinMaxScaler()), ('ridge', Ridge())])
        model.fit(X_train, y_train)
        evaluate(model, X_val, y_val)
        models.append(model)
        del X_train
        del X_val
        del y_train
        del y_val
        gc.collect()
else:
    model = Ridge()
    #model = StackingRegressor([
    #    ("ridge", Ridge()), 
    #    ("lr", LinearRegression()),
    #    ("elastic", ElasticNet()),
    #    ("lasso", Lasso())
    # ])
    #model = Pipeline([('scaler', RobustScaler()), ('ridge', Ridge())])
    model.fit(X, y)
    evaluate(model, X, y)
    models.append(model)

In [None]:
# Drop the feature frame and the target, then run the garbage collector.
del X, y
gc.collect()

## Submission

In [None]:
# NOTE(review): `ubiquant` is presumably the competition's submission API module — confirm.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 
# Each iteration yields a pair: a frame of test rows and a prediction frame to fill in.
for (test_df, sample_prediction_df) in iter_test:
    # Write the averaged model predictions into the 'target' column.
    sample_prediction_df['target'] = inference(models, test_df)
    # Hand the filled-in frame back to the environment.
    env.predict(sample_prediction_df) 