# Exploratory Data Analysis Jane Street

<img src="https://www.europol.europa.eu/sites/default/files/images/finance_budget.jpg">


## Hi all! I have decided to try my best at exploring the data. I'm eager to share with you what I've found and to receive your feedback even more! 

# Imports and Data Loading

Here we install the necessary packages (datatable and mlxtend), import them, define helper functions for later use, and load the train.csv data.

**The float64 columns of train are downcast to float32 to reduce RAM usage**


In [None]:
!pip install datatable
!pip install MLXtend

In [None]:
import numpy as np
import pandas as pd
import tqdm
import itertools
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
plt.style.use('fivethirtyeight')
import seaborn as sns
pd.options.display.max_columns = 200
import os
import gc
import re
import datatable as dt
def chunks(l, n):
    """Split ``l`` into ``n`` consecutive slices and yield them in order.

    The first ``n - 1`` slices each hold ``int(len(l) / n)`` items; the last
    slice takes everything that remains, so it absorbs the remainder when
    ``len(l)`` is not a multiple of ``n``.
    """
    size = int(len(l) / n)
    # Equal-sized leading slices.
    yield from (l[k * size:(k + 1) * size] for k in range(n - 1))
    # Final slice: from the end of the last full slice to the end of ``l``.
    yield l[(n - 1) * size:]
    
# Locate and load the competition's training file.
input_path = '/kaggle/input/'
root_path = os.path.join(input_path, 'jane-street-market-prediction')
train_csv_path = os.path.join(root_path, "train.csv")
train = dt.fread(train_csv_path).to_pandas()

# Store every float64 column as float32 to cut memory use.
float64_cols = train.select_dtypes('float64').columns
train[float64_cols] = train[float64_cols].astype('float32')

# Response columns are those whose name contains 'resp'.
resp_cols = [col for col in train.columns if 'resp' in col]

# Everything that is neither a response nor weight/ts_id/date is a feature_<k> column.
features_names = list(set(train.columns) - set(resp_cols) - {'weight', 'ts_id', 'date'})
features_index = [int(re.sub("feature_", "", name)) for name in features_names]

# Order the feature columns by their numeric suffix, then append the responses.
ordered_pairs = sorted(zip(features_names, features_index), key=lambda pair: pair[1])
features = [name for name, _ in ordered_pairs] + resp_cols

# Basic Train Exploration

Here I start by looking at what train.csv looks like:

- dtypes and memory usage
- nan values distribution
- unique values distribution

- feature density/boxplot distribution
- feature distribution through time



In [None]:
# DataFrame.info() prints its report and returns None, so call it on its own
# rather than handing that None to display().
train.info()
# How many columns there are of each dtype.
display(train.dtypes.value_counts().to_frame().rename(columns = {0: 'dtype'}))

In [None]:
# Number of NaN entries in every column, computed one column at a time.
nan_counts = pd.Series({col: train[col].isna().sum() for col in train.columns})

# The same counts as a fraction of all rows, largest first.
nan_values_train = (
    (nan_counts / len(train))
    .to_frame()
    .rename(columns={0: 'percentage_nan_values'})
    .sort_values('percentage_nan_values', ascending=False)
)

nan_counts_table = (
    nan_counts
    .to_frame()
    .rename(columns={0: 'count_nan_values'})
    .sort_values('count_nan_values', ascending=False)
)

# Show both tables with one column per train column.
display(nan_counts_table.transpose(), nan_values_train.transpose())

In [None]:
fig, ax = plt.subplots(figsize = (20, 12))

sns.set_palette("RdBu", 10)
# Horizontal bars for the 40 columns with the largest NaN fraction.
ax = sns.barplot(x='percentage_nan_values',
            y='feature',
            palette = 'GnBu_r',
            data=nan_values_train.reset_index().rename(columns = {'index': 'feature'}).head(40),
            ax = ax)

# Write each bar's value (4 decimals) on the chart.
for p in ax.patches:
    width = p.get_width()                        # bar length
    y_center = p.get_y() + p.get_height() / 2    # vertical middle of the bar
    label = '{:1.4f}'.format(width)
    if width < 0.01:
        # Very short bar: start the label at the end of the bar.
        ax.text(width, y_center, label, ha = 'left', va = 'center')
    else:
        # Longer bar: start the label at the bar's midpoint. The colour was
        # previously computed but a hard-coded 'white' was passed instead.
        color_text = 'black' if width < 0.03 else 'white'
        ax.text(width / 2, y_center, label,
                ha = 'left',
                va = 'center',
                color = color_text,
                fontsize = 11)

ax.set_title('Top 40 Features for percentage of NaN Values')



<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="50" height="50" style="top:03%; left:80%"> 

Some features have almost the same number of nan values. Let's plot their distribution over time.


In [None]:
# The 30 columns with the largest NaN fraction; the full table is not needed afterwards.
top_nan_features = nan_values_train.head(30).index.tolist()
del nan_values_train
fig, axes = plt.subplots(10, 3, figsize = (40, 30))
ax = axes.ravel()

ts_id = train.ts_id
# 0/1 NaN indicators next to the date column. The axis must be given by keyword:
# pandas 2 no longer accepts it positionally in concat.
mini_df = pd.concat([train[top_nan_features].isna().astype(int), train[['date']]], axis = 1)
# Number of NaN rows per date for each of the selected columns.
mini_df = mini_df.groupby('date').sum().reset_index()

# One panel per column: NaN count against date.
for panel, feature_name in zip(ax, top_nan_features):
    mini_df.plot(x = 'date', y = feature_name, kind = 'line',
                 xlabel = 'date',
                 ylabel = feature_name + "_nans", linewidth = 0.3,
                 legend = False,
                 ax = panel)

In [None]:
# Row counter within each date: 0 for the first row of a date, 1 for the next, and so on.
train['daily_ts_id'] = train.groupby('date').cumcount()

In [None]:
fig, axes = plt.subplots(10, 3, figsize = (40, 30))
ax = axes.ravel()

ts_id = train.ts_id
# Only the first 50000 rows are plotted, so slice before building the NaN
# indicators instead of computing them for the whole frame and discarding most.
first_rows = train.iloc[:50000, :]
# The concat axis must be given by keyword: pandas 2 no longer accepts it positionally.
mini_df = pd.concat([first_rows[top_nan_features].isna().astype(int), first_rows[['ts_id']]], axis = 1)
# ts_id of the first row of every date that falls inside the slice.
new_day = first_rows.query("daily_ts_id == 0").ts_id.tolist()

for i, feature_name in enumerate(top_nan_features):

    # 0/1 NaN indicator of this column against ts_id.
    mini_df.plot(x = 'ts_id', y = feature_name, kind = 'line',
                 xlabel = 'ts_id',
                 ylabel = feature_name + "_nans", linewidth = 0.3,
                 legend = False,
                 ax = ax[i])
    for m, day_start in enumerate(new_day):
        # Dotted vertical line where a new date starts.
        ax[i].axvline(day_start, alpha = 0.5, ymin = 0, ymax = 1, linestyle = ":", color = 'blue')
        if i == 0:
            # Day labels only on the first panel; the label for m == 2 is placed
            # to the left of its line, all others to the right.
            offset = -1500 if m == 2 else 200
            ax[i].text(day_start + offset, 1.1, "day {}".format(m), size = 7, alpha = 0.8)

<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="50" height="50" style="top:03%; left:80%"> 

It seems that, for most of the features, NaN values occur in the first part of the day (each blue line marks the beginning of a new day).


Unique values per column in Train

In [None]:
# Number of distinct values in every column, smallest first.
train_unique_values = train.nunique().to_frame()
train_unique_values = train_unique_values.rename(columns={0: 'number_unique_values'})
train_unique_values = train_unique_values.sort_values('number_unique_values', ascending=True)

# Show the table with one column per train column, then release it.
display(train_unique_values.transpose())
del train_unique_values

Let's plot all the features and then dive deeper into the relationships between them

<h4> Features Distributions </h4>

In [None]:
# Every figure is a 3x3 grid, so group the column positions nine at a time.
# Deriving the groups from the grid size (instead of a fixed count of 15)
# keeps each group within one figure for any number of columns; with 135
# columns the groups are the same as before.
PLOTS_PER_FIGURE = 9
feature_chunks = [
    range(start, min(start + PLOTS_PER_FIGURE, len(features)))
    for start in range(0, len(features), PLOTS_PER_FIGURE)
]

for j in feature_chunks:

    fig, axes = plt.subplots(3, 3, figsize = (20, 12))
    ax = axes.ravel()

    for i in j:

        feature = train[features[i]]
        panel = ax[i % PLOTS_PER_FIGURE]

        # Histogram with a KDE curve on top for this column.
        sns.distplot(feature, hist=True, kde=True,
         color = 'blue',
         hist_kws={'edgecolor':'black'},
         kde_kws={'linewidth': 2}, ax = panel)
        panel.grid(True)

Boxplots to further understand each feature distribution

In [None]:
# One 3x3 figure of boxplots per group of column positions.
for j in feature_chunks:

    fig, axes = plt.subplots(3, 3, figsize = (20, 12))
    ax = axes.ravel()

    for i in j:

        feature = train[features[i]]
        panel = ax[i%9]

        # Pass the values as ``x`` explicitly: the meaning of the first
        # positional argument of boxplot differs between seaborn releases.
        sns.boxplot(x = feature,
         color = 'blue', ax = panel)
        panel.grid(True)

<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="20" height="20" style="left"> 

*Most of the features have long-tailed distributions, some of them in both directions, while others just on the positive axis. 
There are also some bimodal distributions, it would be nice to inspect their relationship with time*


In [None]:
# Group by date once and reuse the grouping for every column.
daily_groups = train.groupby('date')

# One 3x3 figure per group of column positions: mean of the column per date.
for j in feature_chunks:

    fig, axes = plt.subplots(3, 3, figsize = (20, 12))
    ax = axes.ravel()

    for i in j:

        feature_name = features[i]
        # Only the mean is plotted, so the per-date std is no longer computed.
        daily_mean = daily_groups[feature_name].mean()
        daily_mean.plot(kind = 'line', xlabel = 'date',
                        ylabel = feature_name, linewidth=1,
                        ax = ax[i%9])

<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="20" height="20" style="left"> 

*Most of the features have a different behaviour before and after date 200, being more spiky before and almost stationary after.*


# Features relationships

Here I try to go a little bit deeper in inspecting the relationships between features and between feature and time. 

- Features Correlations
- Features AutoCorrelation
- Features CrossCorrelation

Let's create a new column, daily_ts_id, which numbers the rows within each date starting from 0 (the counter is not rescaled to the 0-1 range here).

In [None]:
# Row counter within each date: 0 for the first row of a date, 1 for the next, and so on.
train['daily_ts_id'] = train.groupby('date').cumcount()

In [None]:
# Reclaim memory before the correlation computation.
gc.collect()
# Columns to leave out of the correlation matrix (none at the moment).
created_cols = []
# Pairwise correlation between all remaining columns of train.
correlation_matrix = train.drop(columns=created_cols, errors='ignore').corr()

In [None]:
plt.figure(figsize = (30, 12))

# Heatmap of the full correlation matrix on a fixed -1..1 colour scale centred at 0.
ax = sns.heatmap(
    correlation_matrix,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(220, 20, n=200),
    square=True
)
# Small, vertical tick labels. ``fontdict`` is keyword-only in current
# matplotlib, so it must not be passed positionally.
ax.set_xticklabels(
    ax.get_xticklabels(),
    fontdict={'fontsize': 5},
    rotation=90,
    horizontalalignment='right'
);

<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="20" height="20" style="top:03%; left:80%"> 

*It seems like there are groups of highly correlated features; most of them are also adjacent to each other, as we can see around the diagonal.
Date and ts_id are very strongly correlated, but I don't see any other feature particularly correlated with them.*

In [None]:
# Work on a copy restricted to the columns listed in ``features``.
corrMatrix = correlation_matrix.loc[features, features].copy()

# Keep only the part strictly below the diagonal; everything else becomes 0
# (borrowed from Karl D's answer).
corrMatrix.loc[:, :] = np.tril(corrMatrix, k=-1)
CORRELATION_THRESHOLD = 0.65
already_in = set()
positive_correlated_features = []
for col in corrMatrix:
    column_values = corrMatrix[col]
    # Rows whose correlation with ``col`` exceeds the threshold.
    partners = column_values[column_values > CORRELATION_THRESHOLD].index.tolist()
    # Skip columns without partners and columns already picked up as a partner.
    if not partners or col in already_in:
        continue
    already_in.update(partners)
    positive_correlated_features.append(partners + [col])

# One-hot encode the groups (one row per group, one column per feature name).
te = TransactionEncoder()
te.fit(positive_correlated_features)
te_ary = te.transform(positive_correlated_features)

df = pd.DataFrame(te_ary, columns=te.columns_)

# Frequent item sets over the groups; keep those with more than two members.
frequent_itemsets = apriori(df, min_support=0.03, use_colnames=True)
has_more_than_two = frequent_itemsets.itemsets.apply(lambda items: len(items) > 2)
frequent_itemsets = frequent_itemsets.loc[has_more_than_two].reset_index(drop=True)
pd.options.display.max_colwidth = 300

In [None]:
plt.figure(figsize=(30, 10))

# Correlation of the three time columns (rows) with every column in ``features``.
time_columns = ['date', 'ts_id', 'daily_ts_id']
time_vs_features = correlation_matrix.loc[time_columns, features]

ax = sns.heatmap(
    time_vs_features,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=sns.diverging_palette(220, 20, n=200),
    square=False,
)

<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="20" height="20" style="top:03%; left:80%"> 

*There are definitely some features very correlated with daily_ts_id. Let's plot a few of them*

In [None]:
# Correlation of every column in ``features`` with daily_ts_id, plus its absolute value.
# The absolute value is taken from the 'correlation' column itself rather than
# by assigning abs() of the whole one-column frame.
abs_daily = (correlation_matrix.loc[features, 'daily_ts_id'].to_frame()
             .rename(columns = {'daily_ts_id' : 'correlation'})
             .assign(abs_corr=lambda x: x['correlation'].abs()))
# Names of the five columns with the largest absolute correlation, in alphabetical order.
features_daily = sorted(abs_daily.sort_values('abs_corr', ascending = False).head(5).index.tolist())

fig, axes = plt.subplots(5, 1, figsize = (15, 10), sharex = True)
# ts_id of the first row of every date within the first 50000 rows.
new_day = train.iloc[:50000, :].query("daily_ts_id == 0").ts_id.tolist()
ax = axes.ravel()
for j, feature_name in enumerate(features_daily):
    feature = train[feature_name]
    feature.iloc[:50000].plot(linewidth = 0.5, ax = ax[j], xlabel='ts_id', ylabel = feature_name)
    for m, day_start in enumerate(new_day):
        # Full-height dotted line where a new date starts. ymin/ymax of axvline
        # are fractions of the axes height (0..1), so the previous -5/5 values
        # were out of range; the defaults already span the whole panel.
        ax[j].axvline(day_start, alpha = 0.5, linestyle = ":", color = 'blue')
        if j == 0:
            # Day labels only on the first panel; the label for m == 2 is placed
            # to the left of its line, all others to the right.
            offset = -1500 if m == 2 else 200
            ax[j].text(day_start + offset, 5.1, "day {}".format(m), size = 7, alpha = 0.8)

<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="20" height="20" style="top:03%; left:80%"> 

*Vertical blue lines indicate a new day. These features are supersimilar and definitely seem to have a positive correlation with daily_ts_id. I don't have a clue about their actual meaning*

In [None]:
def get_redundant_pairs(df):
    '''Return the column-label pairs on or below the diagonal.

    For columns c0, c1, ... the result is the set of tuples (ci, cj)
    with j <= i, i.e. every column paired with itself and with each
    column that comes before it.
    '''
    labels = df.columns
    return {
        (labels[i], labels[j])
        for i in range(df.shape[1])
        for j in range(i + 1)
    }

def get_top_correlations(df, percentage=0.1):
    '''Return the leading share of column-pair correlations of ``df``.

    Correlations are computed with ``df.corr()``, the pairs listed by
    ``get_redundant_pairs`` are removed, and the rest is sorted from the
    largest value down. ``percentage`` is the fraction of the remaining
    pairs to return (1 returns all of them).
    '''
    pairwise = df.corr().unstack()
    redundant = get_redundant_pairs(df)
    ranked = pairwise.drop(labels=redundant).sort_values(ascending=False)
    keep = int(len(ranked) * percentage)
    return ranked.iloc[:keep]

# Feature columns only, kept in the order of ``features`` so that the pair
# labels built below come out the same on every run (a set has no fixed order).
features_without_resp = [name for name in features if name not in resp_cols]
feature_correlations = correlation_matrix.loc[features_without_resp, features_without_resp]

# ``correlation_matrix`` already holds the pairwise correlations, so read them
# straight from it. Passing this matrix to get_top_correlations() would call
# .corr() a second time and return correlations between the matrix columns.
df_correlation_plot = (feature_correlations
                       .unstack()
                       # drop self-pairs and the mirrored half of the matrix
                       .drop(labels=list(get_redundant_pairs(feature_correlations)))
                       .sort_values(ascending=False)
                       .reset_index(drop = False)
                       .rename(columns = {0: 'correlation'}))

# Numeric suffixes of the two feature names in each pair.
df_correlation_plot['index_1'] = df_correlation_plot.level_0.str.replace("feature_", "").astype(int)
df_correlation_plot['index_2'] = df_correlation_plot.level_1.str.replace("feature_", "").astype(int)
df_correlation_plot['absolute_correlation'] = df_correlation_plot['correlation'].abs()
# 'positive' for correlations above 0, 'negative' otherwise.
df_correlation_plot['sign_correlation'] = np.where(df_correlation_plot['correlation'] > 0, 'positive', 'negative')

# Label used on the plots, e.g. "3-17".
df_correlation_plot['pair_of_features'] = df_correlation_plot['index_1'].astype(str) + "-" + df_correlation_plot['index_2'].astype(str)
gc.collect()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 12))
ax = axes.ravel()
sns.set_palette("RdBu", 10)
positive_dict = {0: 'positive', 1: 'negative'}

# Left panel: first 40 rows of the table; right panel: last 40 rows.
panels = [df_correlation_plot.head(40), df_correlation_plot.tail(40)]

for j, panel_data in enumerate(panels):
    sns.barplot(x='correlation',
                y='pair_of_features',
                ax=ax[j],
                data=panel_data)

    # Write each bar's value (4 decimals), starting at the bar's midpoint.
    for bar in ax[j].patches:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax[j].text(bar_length / 2,
                   bar_middle,
                   '{:1.4f}'.format(bar_length),
                   ha='left',
                   va='center',
                   color='black',
                   fontsize=11)

    ax[j].set_title('Top 40 pair of features for {} correlation'.format(positive_dict[j]))

<img src="https://images.emojiterra.com/google/android-10/512px/1f914.png" width="20" height="20" style="top:03%; left:80%"> 

*So many features are highly correlated, let's see the correlation distribution*

In [None]:
fig, ax = plt.subplots(1, 1, figsize = (12, 8))
quantiles = np.quantile(df_correlation_plot.correlation, [0.05, 0.25, 0.5, 0.75, 0.95])
# Density curve only. kdeplot is what the deprecated distplot(hist=False)
# drew, so the histogram options that had no effect are gone.
sns.kdeplot(df_correlation_plot.correlation, color = 'blue', linewidth = 1, ax = ax)
# Dotted full-height line at every quantile. ymax of axvline is a fraction of
# the axes height, so the default (1) replaces the out-of-range value 2.
for quantile in quantiles:
    ax.axvline(quantile, alpha = 0.3, linestyle = ":")
ax.grid(True)
# Label each quantile line slightly to its left: (text, text alpha).
quantile_labels = [("5th", 0.8), ("25th", 0.85), ("50th", 1), ("75th", 0.85), ("95th", .8)]
for quantile, (label, label_alpha) in zip(quantiles, quantile_labels):
    ax.text(quantile - .1, 0.37, label, size = 10, alpha = label_alpha)
ax.set_title('Correlation distribution')

Distribution seems pretty symmetric. 

Autocorrelation at lag 1 (the default lag of `Series.autocorr`)

In [None]:
gc.collect()
# Autocorrelation of every column of train, using autocorr's default lag.
column_autocorr = train.apply(pd.Series.autocorr, axis=0)
# One row per column, strongest autocorrelation first.
autocorr_dataframe = (
    column_autocorr
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'feature', 0: 'autocorrelation'})
    .sort_values('autocorrelation', ascending=False)
)
gc.collect()

In [None]:
fig, ax = plt.subplots(figsize=(20, 12))

sns.set_palette("RdBu", 10)
# Horizontal bars for the first 40 rows of the autocorrelation table.
top_autocorr = autocorr_dataframe.head(40)
ax = sns.barplot(x='autocorrelation',
                 y='feature',
                 data=top_autocorr)

# Write each bar's value (4 decimals) on the chart.
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    caption = '{:1.4f}'.format(bar_length)
    if bar_length < 0.01:
        # Very short bar: start the label at the end of the bar.
        ax.text(bar_length, bar_middle, caption, ha='left', va='center')
    else:
        # Otherwise start the label at the bar's midpoint.
        ax.text(bar_length / 2, bar_middle, caption,
                ha='left',
                va='center',
                color='black',
                fontsize=11)

ax.set_title('Top 40 Features for autocorrelation')

In [None]:
fig, ax = plt.subplots(1, 1, figsize = (12, 8))
quantiles = np.quantile(autocorr_dataframe.autocorrelation, [0.05, 0.25, 0.5, 0.75, 0.95])
# Density curve only. kdeplot is what the deprecated distplot(hist=False)
# drew, so the histogram options that had no effect are gone.
sns.kdeplot(autocorr_dataframe.autocorrelation, color = 'blue', linewidth = 1, ax = ax)
# Dotted full-height line at every quantile. ymax of axvline is a fraction of
# the axes height, so the default (1) replaces the out-of-range value 2.
for quantile in quantiles:
    ax.axvline(quantile, alpha = 0.3, linestyle = ":")
ax.grid(True)
# Label each quantile line slightly to its left: (text, height, text alpha).
quantile_labels = [("5th", 0.27, 0.8), ("25th", 0.37, 0.85), ("50th", 0.37, 1),
                   ("75th", 0.37, 0.85), ("95th", 0.37, .8)]
for quantile, (label, height, label_alpha) in zip(quantiles, quantile_labels):
    ax.text(quantile - .05, height, label, size = 10, alpha = label_alpha)
ax.set_title('Autocorrelation Lag 1 distribution')

Cross Correlation

In [None]:
def crosscorr(datax, datay, lag=0):
    """Correlate ``datax`` with ``datay`` shifted by ``lag`` positions.

    Parameters
    ----------
    datax, datay : pandas.Series objects of equal length
    lag : int, default 0
        Number of positions passed to ``datay.shift`` before correlating.

    Returns
    ----------
    crosscorr : float
    """
    shifted_y = datay.shift(lag)
    return datax.corr(shifted_y)

In [None]:
# Set to True to recompute the lagged cross-correlations from ``train``
# instead of loading the stored table.
RECALCULATE = False
if RECALCULATE:

    total_lags = range(1, 3)

    cross_corr = {}

    # Every ordered pair of columns in ``features`` (including a column with itself).
    combinations = list(itertools.product(features, features))

    for j in total_lags:
        cross_corr[j] = [
            crosscorr(train[first], train[second], lag = j)
            for first, second in tqdm.tqdm(combinations)
        ]

    cross_correlations = (pd.DataFrame(combinations)
                          .rename(columns = {0: 'first_feature', 1: 'second_feature'}))

    # Attach one column of results per lag. These columns were never created
    # before, so the melt below failed on the missing 'cross_correlation_lag_*' names.
    lag_columns = []
    for j in total_lags:
        lag_column = 'cross_correlation_lag_{}'.format(j)
        cross_correlations[lag_column] = cross_corr[j]
        lag_columns.append(lag_column)

    # Long format: one row per (first_feature, second_feature, lag).
    cross_correlations_melt = (pd.melt(cross_correlations, id_vars=['first_feature', 'second_feature'],
                               value_vars=lag_columns,
                               var_name = 'lag',
                               value_name = 'cross_correlation')
                              .assign(lag=lambda x: x.lag.str.replace('cross_correlation_lag_', "")))

else:
    # NOTE(review): unpickling can execute arbitrary code; only load this file
    # from a dataset you trust.
    cross_correlations_melt = pd.read_pickle(os.path.join(input_path, 'crosscorrelation/lag1and2crosscorrelations_melted.pickle'))

# Plot label "<first>_<second>_lag<k>", built the same way for both branches.
cross_correlations_melt['pair_of_features'] = (cross_correlations_melt['first_feature'].str.replace("feature_", "") +
                                          "_"  + cross_correlations_melt['second_feature'].str.replace("feature_", "") + "_lag" +
                                               cross_correlations_melt['lag']
                                              ).astype(str)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 12))
ax = axes.ravel()
sns.set_palette("RdBu", 10)
positive_dict = {0: 'negative', 1: 'positive'}

for j in range(2):
    # Panel 0 takes the 40 smallest cross-correlations, panel 1 the 40 largest.
    sort_ascending = (j == 0)
    extreme_pairs = (cross_correlations_melt
                     .sort_values('cross_correlation', ascending=sort_ascending)
                     .head(40))
    sns.barplot(x='cross_correlation',
                y='pair_of_features',
                data=extreme_pairs,
                ax=ax[j])

    # Write each bar's value (4 decimals), starting at the bar's midpoint.
    for bar in ax[j].patches:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax[j].text(bar_length / 2,
                   bar_middle,
                   '{:1.4f}'.format(bar_length),
                   ha='left',
                   va='center',
                   color='black',
                   fontsize=11)

    ax[j].set_title('Top 40 Features for {} crosscorrelation'.format(positive_dict[j]))