In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the training set from the parquet file into a DataFrame
train = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')

In [None]:
def reduce_mem_usage(df, col_excluded=None, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Handling per dtype:

    * object / pandas string columns are converted to ``category``
    * signed integers get the smallest of int8/16/32/64 that covers [min, max]
    * unsigned integers get the smallest of uint8/16/32/64 that covers [min, max]
    * floats get float16 (optional) or float32 when the value range fits
    * every other dtype (bool, datetime, category, nullable extension types)
      is left untouched

    A column is never cast to a wider dtype than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    col_excluded : iterable of column labels, optional
        Columns that must keep their current dtype.
    use_float16 : bool, default True
        Allow float16 as a target. The check is range-based only, so float16
        can reduce precision; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    excluded = set() if col_excluded is None else set(col_excluded)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes, ordered from narrowest to widest
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        if col in excluded:
            continue

        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy int / uint / float columns are downcast; dispatching on
        # dtype.kind keeps bools and unsigned ints out of the float branch
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # An empty or all-NaN column has no range to check
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates[col_type.kind], np.iinfo
            # Python ints compare safely against both signed and unsigned limits
            c_min, c_max = int(c_min), int(c_max)

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Not smaller than the current dtype: nothing to gain
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Identifier columns keep their dtypes; all remaining columns are downcast
id_columns = ['row_id', 'time_id', 'investment_id']
train = reduce_mem_usage(train, id_columns)

# Extensive EDA

In [None]:
# General information: dtypes and non-null counts; "deep" also measures the memory held by object columns
train.info(memory_usage = "deep")

__Rough Overview: There are 5 Types of columns__

* __Investment id:__ The statistical unit to analyze. There are 3579 different investments, each with its own range of time ids
* __time ids:__ There are 1211 different time ids, and each investment id has on average 877 different time ids
* __row_id:__ Combination of the time_id and the investment_id that identifies a specific record
* __f_x:__ 300 anonymized features derived from the market data for each specific record
* __target:__ The investment's return rate for a specific time_id

In [None]:
# Preview the first rows of the memory-reduced training frame
train.head()

In [None]:
# Count missing cells across all columns, target and features alike (the author's run found none)
n_missing = train.isnull().sum().sum()
print("Number of null values in dataset: {} samples".format(n_missing))

## Investment_id and time_id

In [None]:
n_investments = train["investment_id"].nunique()
n_time_ids = train["time_id"].nunique()
print("There are {} different investment_ids".format(n_investments))
print("There are {} different time_ids".format(n_time_ids))

# Rows per investment, averaged over all investments
rows_per_investment = train.groupby("investment_id")["time_id"].count()
mean_cnt_time_id = np.round(rows_per_investment.mean(), 2)

print("There are on average {} different time id per investment_id".format(mean_cnt_time_id))

However, not every investment has exactly this average of 877.73 time steps: some have more and some have fewer (the red line below marks the average).

In [None]:
# Number of rows per investment id, in the descending order value_counts returns
iid = train["investment_id"].value_counts()

fig, sub = plt.subplots(figsize=(20, 5))
sns.barplot(x=iid.index, y=iid.values, order=iid.index, palette="cividis", ax=sub)
# Dashed red reference line at the mean row count per investment
sub.axhline(y=mean_cnt_time_id, color="red", ls="--", lw=3.0, alpha=0.7)
sub.set(xlabel="Different investment ids", ylabel="Different Time ids");

Looking at the count of investment ids grouped by time id, we see that the relationship exhibits a trend and an unusual behavior:

* Trend: More investments have been executed in the later time ids than in the earlier ones
* Unusual behavior: Around time id 400, there are fewer investment ids than in other time_id ranges

In [None]:
#credits: https://www.kaggle.com/allunia/ubiquant-eda
# Number of investment rows recorded at each time id
tid_iid = train.groupby("time_id")["investment_id"].count()

fig, sub = plt.subplots(figsize=(30, 5))
sns.scatterplot(ax=sub, x=tid_iid.index, y=tid_iid.values)
sub.grid()

In [None]:
# Last observed time id of every investment (computed once and reused below)
time_id_max = train.groupby("investment_id")["time_id"].max()

# Most common last time id: idxmax on the counts returns the value that occurs most
# often, whereas index.max() would return the largest id regardless of frequency
freq_timeid = time_id_max.value_counts().idxmax()

# Split on the computed value instead of a hard-coded id
tid_max_high_freq = time_id_max[time_id_max == freq_timeid]
tid_max_outlier = time_id_max[time_id_max != freq_timeid]

print("{} of the {} investment_ids have a max time id of {}".format(tid_max_high_freq.shape[0], time_id_max.shape[0], freq_timeid))
print("So there are {} outliers with different max time ids".format(tid_max_outlier.shape[0]))

In [None]:
samples = 15

# Never request more ids than there are outliers; sample() raises otherwise
n_draw = min(samples, len(tid_max_outlier))
tid_max_outlier_samples = tid_max_outlier.sample(n_draw).index

fig, sub = plt.subplots(1,1,figsize = (16,8))

for invest in tid_max_outlier_samples:
    # Filter the rows of this investment once and reuse them for both axes
    invest_rows = train[train["investment_id"] == invest]
    # Cumulative target over time_id, drawn as points
    sub.plot(invest_rows["time_id"], invest_rows["target"].cumsum(), ".")

# Axis decoration does not depend on the loop, so it is applied once
sub.set_xlim([0,1220])
sub.set_title("Outlier investment ids (# timeids != 1219)")
sub.grid()

But there are apparently "holes": listing all unique time ids in sorted order does not give a clean sequence of consecutive ids. A small fraction of ids is missing from the set.

In [None]:
# Sort the unique time ids once, then compare each id with its predecessor
sorted_time_ids = np.array(sorted(train["time_id"].unique()))
shifted = sorted_time_ids[1:]
original = sorted_time_ids[:-1]

# Frequency table of step sizes between consecutive ids (a step of 1 means no gap)
unique, counts = np.unique(shifted - original, return_counts = True)
pd.Series(counts, index = unique)

## Target distribution

The target distribution itself looks fine: it is not strongly skewed, and no large outliers are observable.

In [None]:
fig, sub = plt.subplots(figsize=(12, 4))
train["target"].hist(ax=sub, bins=100, edgecolor="black")

# Show the y ticks as integers with thousands separators
thousands_formatter = plt.matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ','))
sub.yaxis.set_major_formatter(thousands_formatter)
sub.set_ylabel("Count")
sub.set_xlabel("Target");

In [None]:
# Use time_id as the index so the per-investment plots below get time on the x-axis.
# Guarded so that re-running this cell does not raise a KeyError once time_id has
# already been moved from the columns into the index.
if train.index.name != "time_id":
    train = train.set_index("time_id", drop = True)

To get an impression of the different return rates of the respective investments, some plots follow below. <br />
The different investments seem to develop individually, and in the plots several gaps between some time_ids become evident

In [None]:
def plot_target_analysis(sample_ls=None, col="target"):
    """Draw a three-panel figure of ``col`` for each investment id in ``sample_ls``.

    Panels, left to right: the raw series with its 10-row rolling mean and
    standard deviation, the cumulative sum of the series, and a histogram of
    its values. The x-axis of the first two panels is the index of the global
    ``train`` frame.

    Parameters
    ----------
    sample_ls : iterable of investment ids, optional
        Investments to plot. Any iterable is accepted (list, Series, generator).
        When missing or empty, a message is printed and nothing is drawn.
    col : str, default "target"
        Column of ``train`` to analyse.
    """
    # Materialise once so generators work and emptiness can be tested
    invest_ids = [] if sample_ls is None else list(sample_ls)

    if not invest_ids:
        print("no invest_ls given.")
        return

    for invest in invest_ids:
        fig, sub = plt.subplots(1,3,figsize = (30,5))

        # Select the single column first so rolling only touches the data it needs
        series = train.loc[train["investment_id"] == invest, col]

        #general
        rolling = series.rolling(window = 10)
        series_mean = rolling.mean()
        series_std = rolling.std()

        #development
        series_cum = series.cumsum()

        sub[0].plot(series.index, series, alpha = 0.3)
        sub[0].plot(series_mean.index, series_mean, color = "red", label = "10d mean")
        sub[0].plot(series_std.index, series_std, color = "green", ls = "--", label = "10d std")
        sub[0].set_title(f"{col} over time_id (Investment id {invest})")
        sub[0].set_xlim((-50,1250))
        sub[0].set_ylabel(f"{col}")
        sub[0].legend(loc = "upper right")
        sub[0].grid()

        sub[1].plot(series_cum.index, series_cum)
        sub[1].set_title(f"{col} development (Investment id {invest})")
        sub[1].set_xlim((-50,1250))
        sub[1].set_ylabel(f"{col}")
        sub[1].grid()

        sub[2].hist(series, bins = 30, edgecolor = "black")
        sub[2].set_title(f"{col} distribution (Investment id {invest})")
        sub[2].set_xlabel(f"{col}")
        sub[2].grid()

        fig.tight_layout()
        
# Draw three distinct investment ids. Sampling from the unique ids (rather than
# from the rows) rules out picking the same investment twice, and the fixed
# seed keeps the plotted investments stable across runs.
invests = pd.Series(train["investment_id"].unique()).sample(3, random_state = 42)
plot_target_analysis(invests,"target")

## fx-distribution

In [None]:
# Same sampled investments as in the target plots, now shown for the feature f_2
plot_target_analysis(invests,"f_2")

__The fx features are all numerical (continuous) variables__

In [None]:
# 10% row sample for the cardinality and correlation checks below; the seed is
# fixed so the reported findings can be reproduced on a fresh run
target_sample = train.sample(frac = 0.10, random_state = 42)

In [None]:
# Everything after the first three columns of the sample
cat_chk = target_sample[target_sample.columns[3:]]

# Columns with fewer than 50 distinct values would hint at categorical features
low_cardinality = cat_chk.nunique() < 50
cat_chk.columns[low_cardinality].tolist()

## Relationships 

There does not seem to be a strong linear correlation between the features and the target

In [None]:
# Pairwise correlation of every column from the third one onwards
correlation = target_sample.iloc[:, 2:].corr()

fig = plt.figure(figsize=(10, 4))
ax = plt.gca()

# The first entry of the "target" column is skipped
ax.hist(correlation["target"].iloc[1:], bins=25, edgecolor="black")
ax.set_ylabel("Count")
ax.set_xlabel("Correlation")
ax.grid()

In [None]:
# Column name -> absolute correlation with the target, in ascending order of correlation
abs_target_corr = correlation["target"].abs()
corr_dict = dict(sorted(abs_target_corr.items(), key = lambda pair: pair[1]))

In [None]:
# corr_dict is ordered ascending; the very last key is left out of the "highest" slice
ranked_features = list(corr_dict)
report_sections = (
    ("-----the five features with the lowest linear correlation with the target-----\n", ranked_features[:5]),
    ("\n-----the five features with the highest linear correlation with the target-----\n", ranked_features[-6:-1]),
)

for header, section_features in report_sections:
    print(header)
    for feat in section_features:
        print("Feature {} correlation: {}".format(feat, corr_dict[feat]))

__Looking at the linear correlation between the fx features, we can see that there are several highly correlated features. Highly correlated features often contain the same information and we could drop some features without information loss__

In [None]:
# Clustered heatmap of the absolute correlations, so strongly related columns end up next to each other
sns.clustermap(abs(correlation), figsize = (15,15), cmap = "mako");

In [None]:
# gc is used to trigger explicit garbage-collection passes between memory-heavy steps
import gc

# Run a full collection now
gc.collect()

In [None]:
# VIF computation, plus the joblib/dill setup for the parallel variant that is
# commented out further below
from statsmodels.stats.outliers_influence import variance_inflation_factor
from joblib import Parallel, delayed
import dill as pickle
from joblib.externals.loky import set_loky_pickler
# Make loky serialise with dill
set_loky_pickler("dill")

In [None]:
# Much smaller sample (0.5% of the rows) for the VIF computation; this replaces the
# 10% sample held in target_sample. The seed is fixed because the resulting VIF
# values decide which columns are dropped from train afterwards.
target_sample = train.sample(frac = 0.005, random_state = 42)

In [None]:
# Build the feature matrix once; slicing and casting it inside the comprehension
# would repeat the same copy for every single feature
vif_features = target_sample.columns[3:]
vif_matrix = target_sample[vif_features].values.astype("float32")

vif_data = pd.DataFrame()
vif_data["feature"] = vif_features

# One VIF per column index of vif_matrix, in the same order as vif_features
vif_data["VIF"] = [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]
#Parallel processing doesn't work on kaggle due to 'PicklingError' problems
#vif_list = Parallel(n_jobs = 3, verbose = 5)(delayed(variance_inflation_factor)(vif_matrix, i) for i in range(vif_matrix.shape[1]))

The features with a Variance Inflation Factor > 5 can be dropped due to the risk of multicollinearity (some books give a VIF of 10 as the rule-of-thumb threshold, but we are a bit more conservative here)

In [None]:
# Names of the features whose VIF exceeds 5
vif_data.loc[vif_data["VIF"] > 5, "feature"].values

In [None]:
# Remove every feature whose VIF exceeds 5 from the training frame
high_vif_features = vif_data.loc[vif_data["VIF"] > 5, "feature"].values
train = train.drop(columns = high_vif_features)

In [None]:
# Collect garbage after the column drop replaced the previous train frame
gc.collect()

In addition to the feature selection based on the VIF, we can also select features using the embedded feature importance calculation of the LightGBM model

In [None]:
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb

# Upper bound on the number of features the selector may keep
featureCutoff = 100

# Hyper-parameters of the LightGBM regressor that supplies the feature importances
lgb_params = dict(
    n_estimators=50,
    learning_rate=0.05,
    num_leaves=32,
    colsample_bytree=0.2,
    reg_alpha=3,
    reg_lambda=1,
    min_split_gain=0.01,
    min_child_weight=40,
)
lgbc = lgb.LGBMRegressor(**lgb_params)

# NOTE(review): threshold is left at its default, so max_features acts only as a cap
# and fewer than featureCutoff features may be kept - confirm this is intended
embeded_lgb_selector = SelectFromModel(lgbc, max_features=featureCutoff)
embeded_lgb_selector.fit(X=train.iloc[:, 3:], y=train["target"])

In [None]:
# Boolean mask over the candidate columns train.columns[3:]. It is named
# support_mask so that the builtin `filter` is not shadowed for the rest of the session.
support_mask = embeded_lgb_selector.get_support()

# First three columns of train followed by the features the selector kept
selected_feat = list(train.columns[:3]) + list(train.columns[3:][support_mask])

To enrich the dataset, further information can be engineered from the existing features

In [None]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, giving NaN instead of +/-inf for a zero denominator."""
    return (numerator / denominator).replace([np.inf, -np.inf], np.nan)


def feature_engineering(df, features):
    """Add row-wise summary statistics of ``features`` to ``df`` as new columns.

    New columns: ``mean``, ``median``, ``q25``, ``q75``, ``q90``, ``q95``,
    ``max``, ``min``, ``std``, ``range``, ``iqr``, ``tails``, ``dispersion``,
    ``dispersion_2``, ``skew``, ``kurt``, ``median-max`` and ``median-min``.

    The ratio columns (``tails``, ``dispersion``, ``dispersion_2``) hold NaN
    where the denominator is zero, so no infinite values are produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    features : list-like of column labels
        Columns of ``df`` that the statistics are computed over.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the additional columns.

    Notes
    -----
    Uses the ``gc`` module from the notebook's global namespace.
    """
    # Slice the feature block once; every statistic below reads from this copy,
    # so columns added to df along the way cannot leak into later statistics
    feats = df[features]

    df['mean'] = feats.mean(axis=1)
    df['median'] = feats.median(axis=1)
    for name, q in (('q25', 0.25), ('q75', 0.75), ('q90', 0.90), ('q95', 0.95)):
        df[name] = feats.quantile(q=q, axis=1)
    df['max'] = feats.max(axis=1)
    df['min'] = feats.min(axis=1)

    df['std'] = feats.std(axis=1)
    df['range'] = df['max'] - df['min']
    df['iqr'] = df['q75'] - df['q25']
    # Ratio features: a zero iqr / mean / median would otherwise yield +/-inf
    df['tails'] = _safe_ratio(df['range'], df['iqr'])
    df['dispersion'] = _safe_ratio(df['std'], df['mean'])
    df['dispersion_2'] = _safe_ratio(df['iqr'], df['median'])
    df['skew'] = feats.skew(axis=1)
    df['kurt'] = feats.kurt(axis=1)

    df['median-max'] = df['median'] - df['max']
    df['median-min'] = df['median'] - df['min']

    # Release the temporary feature copy before returning
    del feats
    gc.collect()

    return df

In [None]:
# Add the row-wise statistics over every column from the fourth one onwards.
# NOTE(review): selected_feat from the LightGBM selection is not used here - confirm
# whether the statistics should be restricted to the selected features.
train = feature_engineering(train, train.columns[3:])

In [None]:
# Preview the first rows, now including the engineered statistic columns
train.head()