In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        
import seaborn as sns
import matplotlib.pyplot as plt

This notebook contains an initial EDA of the train dataset for the [Ubiquant Market Prediction competition](https://www.kaggle.com/c/ubiquant-market-prediction), as well as training and inference for a simple baseline model. Here I analyze the overall structure of the dataset, the distributions of different features, and the correlation between the features and the target.

# Basic EDA

Let's load the data and look at the high-level data structure.

In [None]:
# Name of the label column and of the 300 feature columns f_0 ... f_299.
target = 'target'
features = [f'f_{i}' for i in range(300)]

# Compact dtypes handed to pd.read_csv; the keys also select which columns are loaded.
data_types_dict = {
#     'time_id': 'int32',
    'investment_id': 'int16',
    "target": 'float16',
    **{name: 'float16' for name in features},
}

In [None]:
# Load only the columns named in data_types_dict, parsed directly into their compact dtypes.
# (For a quick partial load while experimenting, add e.g. nrows=5 * 10**3.)
train_df = pd.read_csv(
    '/kaggle/input/ubiquant-market-prediction/train.csv',
    dtype=data_types_dict,
    usecols=data_types_dict.keys(),
)

In [None]:
# Show the loaded frame (the notebook renders the value of the cell's last expression).
train_df

Our dataset contains 300 anonymous features without any description, an `investment_id`, and a `target` that is also an anonymous float value.

## Target

In [None]:
# Histogram of the target over the whole training set (100 bins).
target_values = train_df['target']
target_values.hist(figsize=(20, 10), bins=100)

The target values look quite normal, without any outliers or long tails. We should not have any problems working with them. Let's also plot the target distributions of a few random investments:

In [None]:
# Overlay the target histograms of up to 10 randomly chosen investments.
investment_ids = train_df['investment_id'].unique()
# replace=False draws distinct ids (the default samples with replacement and can plot the
# same investment twice); the fixed seed makes the figure reproducible across runs, and
# the min() keeps the cell working when fewer than 10 investments are loaded.
chosen_ids = np.random.default_rng(0).choice(
    investment_ids, size=min(10, len(investment_ids)), replace=False
)
for investment_id in chosen_ids:
    rows = train_df['investment_id'] == investment_id
    # One .loc lookup instead of chained indexing (boolean filter, then column).
    train_df.loc[rows, 'target'].hist(bins=100, alpha=0.2, figsize=(20, 10))

At a high level, the target for each investment_id also looks OK.

## Investment_id

In [None]:
# Number of distinct investment_id values in the training data.
train_df['investment_id'].nunique()

In [None]:
# Count the rows belonging to each investment_id and draw one bar per investment.
rows_per_investment = train_df['investment_id'].value_counts()
rows_per_investment.plot(figsize=(20, 10), kind='bar')

We have 3579 different investments, and most of them have a substantial amount of data points and probably don't require any filtering so far.

# Features

It is hard to analyze all features one by one, but let's do some aggregated analysis. First of all, let's just look at the distributions of a few features.

In [None]:
# Histogram of feature f_67 (100 bins).
f = 'f_67'
feature_values = train_df[f]
feature_values.hist(figsize=(20, 10), bins=100)

In [None]:
# Histogram of feature f_109 (100 bins).
f = 'f_109'
feature_values = train_df[f]
feature_values.hist(figsize=(20, 10), bins=100)

In [None]:
# Histogram of feature f_62 (100 bins).
f = 'f_62'
feature_values = train_df[f]
feature_values.hist(figsize=(20, 10), bins=100)

In [None]:
# Histogram of feature f_234 (100 bins).
f = 'f_234'
feature_values = train_df[f]
feature_values.hist(figsize=(20, 10), bins=100)

In [None]:
# Histogram of feature f_164 (100 bins).
f = 'f_164'
feature_values = train_df[f]
feature_values.hist(figsize=(20, 10), bins=100)

Some features look normal, but most have outliers, skewed distribution, and multiple modes. Probably the analysis of features one by one will bring a lot of value later in the competition, but we will not go deep into it in this notebook.

In [None]:
# Count the distinct values of every feature column, then plot how those counts are distributed.
unique_counts = train_df[features].nunique()
unique_counts.hist()

All features have a lot of unique values, so they are either floats or have some added noise to hide their integer/categorical nature.

## Features interaction

We will do the analysis on a smaller random 1% sample of the dataset to speed up the process.

In [None]:
# Draw a 1% random sample of the training rows. random_state pins the draw so that
# everything computed from sample_df is the same on every run of the notebook.
sample_df = train_df.sample(frac = 0.01, random_state = 0)
sample_df

In [None]:
# Pairwise correlation matrix of the target (first row/column) and all feature columns,
# computed on sample_df.
corr_columns = [target] + features
correlation = sample_df[corr_columns].corr()

In [None]:
# Histogram of every feature's correlation with the target.
# iloc[1:] drops the first entry of the 'target' column of the matrix.
feature_target_corr = correlation['target'].iloc[1:]
feature_target_corr.hist(figsize=(20, 10), bins=20)

There is no strong correlation between features and target. Let's look at the correlation of features with each other.

In [None]:
# Hierarchically clustered heatmap of the correlation matrix.
sns.clustermap(data=correlation, figsize=(20, 20))

There are definitely some clusters of highly correlated features that can be later analyzed together.

## Feature engineering

Let's take some top features from the last run of the notebook, look at them and generate some interactions.

In [None]:
# Hand-picked feature columns used for the pair plot and for the generated interactions.
top_feautures = [
    'f_74',
    'f_153',
    'f_145',
    'f_108',
    'f_231',
]

In [None]:
from seaborn import pairplot
# 10,000 random rows for the pair plot of the top features and the target.
# A separate name is used so that sample_df is not silently rebound to a different sample;
# random_state makes the figure reproducible, and drop=True avoids adding the old index
# as an extra column.
pairplot_df = train_df.sample(10000, random_state=0).reset_index(drop=True)
pairplot(pairplot_df[top_feautures + ['target']])

In [None]:
# Build interaction features for every unordered pair of top features: the product
# "f1*f2" and the ratio "f1/f2" are added to train_df as new columns, and their names
# are collected in gen_features.
gen_features = []

for i, f1 in enumerate(top_feautures[:-1]):
    # Do the arithmetic in float32 rather than in a float16 column dtype: float16 tops
    # out at about 65504 and keeps roughly 3 significant digits, so products and
    # (especially) ratios would overflow to inf or lose precision.
    # The new columns therefore take twice the memory of a float16 column.
    left = train_df[f1].astype('float32')
    for f2 in top_feautures[i + 1:]:
        right = train_df[f2].astype('float32')
        product_name = f"{f1}*{f2}"
        ratio_name = f"{f1}/{f2}"

        train_df[product_name] = left * right
        # NOTE: a zero denominator still yields inf/NaN here.
        train_df[ratio_name] = left / right

        gen_features.append(product_name)
        gen_features.append(ratio_name)

# Model training

In [None]:
from lightgbm import LGBMRegressor

I will use LGBMRegressor to train a simple baseline model.

In [None]:
# Add the generated interaction columns to the list of model features.
# Skipping names that are already present keeps this cell idempotent: with a plain
# `features += gen_features`, re-running the cell would append the same column names again.
features = features + [name for name in gen_features if name not in features]

In [None]:
from sklearn.model_selection import StratifiedKFold 
from lightgbm import early_stopping

seed = 0
folds = 15
models = []  # one fitted LGBMRegressor per fold

# Shuffled K-fold split, stratified on investment_id.
skf = StratifiedKFold(folds, shuffle = True, random_state = seed)

for train_index, test_index in skf.split(train_df, train_df['investment_id']):
    train = train_df.iloc[train_index]
    valid = train_df.iloc[test_index]

    lgbm = LGBMRegressor(
        num_leaves=31,
        n_estimators = 1500,
        min_child_samples = 1000,
        subsample=0.7,
        subsample_freq=1,
        n_jobs= -1
    )

    # Stop a fold once the validation metric has not improved for 10 rounds.
    # Early stopping is passed as a callback: the `early_stopping_rounds` keyword of
    # fit() was deprecated in LightGBM 3.3 and removed in 4.0, while the callback
    # works on both.
    lgbm.fit(
        train[features],
        train[target],
        eval_set = [(valid[features], valid[target])],
        callbacks = [early_stopping(stopping_rounds = 10)],
    )
    models.append(lgbm)

In [None]:
import lightgbm
# Feature-importance chart for the model currently bound to `lgbm`; the tall figure
# leaves room for one bar per feature.
lightgbm.plot_importance(booster=lgbm, figsize=(20, 60))

In [None]:
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:

    # Recreate the product/ratio interaction columns on the test batch, using the same
    # column names as in the training frame.
    for start, left in enumerate(top_feautures[:-1], start=1):
        for right in top_feautures[start:]:
            test_df[f"{left}*{right}"] = test_df[left] * test_df[right]
            test_df[f"{left}/{right}"] = test_df[left] / test_df[right]

    # Prediction = plain average over all fold models.
    test_df['target'] = 0
    for fold_model in models:
        test_df['target'] += fold_model.predict(test_df[features])
    test_df['target'] /= len(models)

    # Submit this batch's predictions.
    env.predict(test_df[['row_id','target']])