In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## About 
1. From [this](https://www.kaggle.com/kartushovdanil/ubiquant-market-prediction-eda#4.-Features) EDA notebook it is quite evident that some features appear roughly normally distributed, while other features have skewed distributions.
2. I apply sklearn's `StandardScaler()` to the features that look normally distributed and `RobustScaler()` to the features with skewed distributions, because `RobustScaler()` is less sensitive to outliers.
3. For training and validation I am not doing anything sophisticated: I use only the first 2.5 million rows of the training data for both.
4. Of the time_ids in those rows, the first 812 are used for training and the remaining ~200 for validation.
5. If you find this notebook useful, please drop an upvote; it will motivate me to produce more work in this competition :)
6. Any feedback is much appreciated.

In [None]:
# Load the training frame from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles from a trusted dataset.
train = pd.read_pickle("../input/speed-up-reading-csv-to-pickle/train.pkl")

In [None]:
# Keep only the first 2.5 million rows and renumber the index from zero.
train = train.iloc[:2_500_000].reset_index(drop=True)

In [None]:
# Distinct time_id values, in order of first appearance in the frame.
ids = train["time_id"].unique()

In [None]:
# The first 812 time_ids go to training; everything after them goes to validation.
train_ids, val_ids = ids[:812], ids[812:]

In [None]:
# Sanity check: the displayed set is empty when no time_id is in both splits.
set(train_ids) & set(val_ids)

## Validation Strategy 
1. There are 1013 unique time_ids in `train`.
2. Roughly the last 20% of them (about 200 time_ids) are used for validation.

In [None]:
# Preview the first rows of train before it is split into train/validation.
train.head()

In [None]:
# Split by time_id membership. `val` must be taken first because `train`
# is rebound to its own subset on the following line.
in_validation = train["time_id"].isin(val_ids)
val = train.loc[in_validation]
train = train.loc[train["time_id"].isin(train_ids)]


In [None]:
# Drop the id arrays now that the split has been made.
del train_ids , val_ids , ids

In [None]:
# Inspect the last rows of the training split.
train.tail()

In [None]:
# Inspect the first rows of the validation split.
val.head()

In [None]:
# Report the size of each split, then the time_ids shared by both (expected: empty set).
print(train.shape, val.shape, sep="\n")
shared_time_ids = set(train.time_id) & set(val.time_id)
print(shared_time_ids)

## Applying Preprocessing

In [None]:
## Separate Normally and Skewed Distributed Features
# Indices of the features treated as approximately normally distributed.
# NOTE(review): the original list contained 141 twice; the second occurrence
# (between 146 and 171) was removed here. If it was a typo for another feature
# index, add that index instead.
normal_features = [
    1, 2, 6, 9, 20, 21, 24,
    28, 35, 36, 40, 43,
    50, 51, 57, 67, 69, 72,
    75, 76, 82, 85, 86,
    90, 93, 94, 96, 98,
    103, 105, 109, 106, 114, 116,
    125, 126, 130, 133, 134, 135, 139, 140, 141, 144, 146,
    171,
    178, 180, 185, 189, 192, 194, 195, 199,
    205, 206, 212, 213, 217, 221, 222, 223,
    226, 230, 239, 242,
    252, 254, 256, 259, 261, 266, 273,
    276, 283, 285, 290, 297,
]
# A repeated index would select (and scale) the same column twice and inflate the count below.
assert len(normal_features) == len(set(normal_features)), "duplicate index in normal_features"

normal_features_list = ['f_'+str(i) for i in normal_features]


print("Number of Normally dist features approx are",len(normal_features_list))

all_features_list = ['f_'+str(i) for i in range(0 , 300)]
# Everything that is not in the "normal" group is treated as skewed. Filtering the
# ordered list (instead of converting a set difference to a list) keeps the column
# order identical from run to run.
normal_feature_set = set(normal_features_list)
skewed_features_list = [name for name in all_features_list if name not in normal_feature_set]
print("Number of Skewed dist features approx are",len(skewed_features_list))
## Check for mistakes
# The two groups must be disjoint, so this prints an empty set.
print(set(skewed_features_list).intersection(set(normal_features_list)))

In [None]:
# Learn the scaling statistics from the training split only:
# StandardScaler for the "normal" group, RobustScaler for the "skewed" group.
scaler_normal = StandardScaler().fit(train[normal_features_list])
scaler_outlier = RobustScaler().fit(train[skewed_features_list])

# Overwrite the training columns with their scaled values.
train[normal_features_list] = scaler_normal.transform(train[normal_features_list])
train[skewed_features_list] = scaler_outlier.transform(train[skewed_features_list])

In [None]:
# Preview train after the feature columns have been scaled.
train.head()

In [None]:
## Always apply scaler transform on validation data separately
# Only `transform` is called here (never `fit`), so validation rows are scaled
# with the statistics of the already-fitted scalers and do not influence them.
val[normal_features_list] = scaler_normal.transform(val[normal_features_list])
val[skewed_features_list] = scaler_outlier.transform(val[skewed_features_list])

### Lightgbm Model

In [None]:
def pearson_coef(data):
    """Return the Pearson correlation between the ``target`` and ``preds`` columns.

    Only those two columns are read, so any other column in ``data`` (a constant
    group key, non-numeric identifiers, ...) is ignored instead of being pulled
    into a full correlation matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric ``target`` and ``preds`` columns.

    Returns
    -------
    float
        The correlation coefficient (NaN when pandas cannot compute it,
        e.g. for a constant column).
    """
    return data['target'].corr(data['preds'])

def comp_metric(valid_df):
    """Average, over time_ids, of the per-time_id target/preds Pearson correlation.

    ``valid_df`` must provide the columns ``time_id``, ``target`` and ``preds``.
    """
    scoring_frame = valid_df[['time_id', 'target', 'preds']]
    per_time_id = scoring_frame.groupby('time_id').apply(pearson_coef)
    return np.mean(per_time_id)

In [None]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import warnings
# Silence all Python warnings from this point on.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: in recent pandas releases the short pattern
# 'max_columns' matches more than one option (e.g. 'styler.render.max_columns')
# and set_option raises OptionError.
pd.set_option('display.max_columns', 300)

In [None]:
def train_and_evaluate(train , val):
    """Fit one LightGBM regressor on ``train`` and score it on ``val``.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames containing the columns ``row_id``, ``target``, ``time_id`` and
        ``investment_id``; every remaining column is used as a model feature.

    Returns
    -------
    oof_predictions : numpy.ndarray
        Model predictions for the rows of ``val``.
    models : list
        One-element list holding the trained booster.

    Notes
    -----
    The competition metric is printed as a side effect. ``val`` is not modified:
    the predictions are scored on a separate three-column frame instead of being
    written into the caller's DataFrame.
    """
    params = {
      'objective': 'rmse',
      'boosting_type': 'gbdt',
      'num_leaves': 100,
      'n_jobs': -1,
      'learning_rate': 0.1,
      'feature_fraction': 0.8,
      # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
      # effect when bagging_freq is non-zero; it is left as-is here so the model
      # is unchanged - confirm the intent.
      'bagging_fraction': 0.8,
      'verbose': -1
    }
    # Identifier / label columns that must not be fed to the model.
    non_feature_cols = ['row_id', 'target', 'time_id', 'investment_id']

    x_train = train.drop(non_feature_cols, axis = 1)
    y_train = train['target']
    x_val = val.drop(non_feature_cols, axis = 1)
    y_val = val['target']

    train_dataset = lgb.Dataset(x_train, y_train)
    val_dataset = lgb.Dataset(x_val, y_val)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
    # were removed in LightGBM 4.0.
    model = lgb.train(params = params,
                      train_set = train_dataset,
                      valid_sets = [train_dataset, val_dataset],
                      num_boost_round = 500,
                      callbacks = [lgb.early_stopping(stopping_rounds = 50),
                                   lgb.log_evaluation(period = 50)])

    oof_predictions = model.predict(x_val)

    # Score on a small frame holding only what the metric needs.
    scored = val[['time_id', 'target']].assign(preds = oof_predictions)
    print("OOF PCC :")
    print(comp_metric(scored))

    models = [model]
    return oof_predictions , models


In [None]:
# Train on the training split and evaluate on the validation split; `models` is
# reused by the inference loop further down.
oof_predictions , models = train_and_evaluate(train , val)

In [None]:
# Drop the training and validation frames; the fitted scalers and `models` stay in scope.
del train ,val

In [None]:
# Inference: iterate over the (test_df, sample_prediction_df) pairs yielded by
# env.iter_test(), scale each batch with the already-fitted scalers, and hand the
# averaged model predictions back through env.predict().
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Drop identifier columns before predicting.
    # NOTE(review): training also dropped 'time_id' and 'target'; presumably test_df
    # carries neither column - confirm, otherwise the feature count will not match.
    X_test = test_df.drop(["row_id",'investment_id'], axis=1)
    # transform only (no re-fit): the scalers keep the statistics they were fitted with
    X_test[normal_features_list] = scaler_normal.transform(X_test[normal_features_list])
    X_test[skewed_features_list] = scaler_outlier.transform(X_test[skewed_features_list])
    # One prediction array per model, each limited to that model's best_iteration.
    y_preds = [model.predict(X_test, num_iteration=model.best_iteration) for model in models]
    # Element-wise mean across the models.
    sample_prediction_df["target"] = sum(y_preds) / len(y_preds)
    env.predict(sample_prediction_df)

In [None]:
# Display the prediction frame left over from the final iteration of the inference loop.
sample_prediction_df

## Acknowledgements
1. https://www.kaggle.com/kartushovdanil/ubiquant-market-prediction-eda#4.-Features
2. https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline
3. https://www.kaggle.com/ilialar/ubiquant-eda-and-baseline