# Jane St. Market Prediction Competition

## A brief exploratory data analysis

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Show up to 200 rows when a frame/series is rendered. The fully qualified
# key is used because the abbreviated 'max_rows' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_rows'.
pd.set_option('display.max_rows', 200)

# Activate seaborn's default plot styling for every figure drawn below.
sns.set()
# Alternative looks, intentionally left disabled:
#sns.set(style="ticks", context="talk")
#plt.style.use("dark_background")

# 1. Metadata - features.csv

There are 130 features in the training dataset; metadata is a 130 features x 30 tags boolean matrix

In [None]:
# Read the feature metadata table, report its dimensions, and preview the
# first two rows as the cell output.
meta = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/features.csv'
)
print(meta.shape)
meta.head(n=2)

In [None]:
# Draw the metadata as a heatmap: every column other than the "feature"
# name column is cast to int8 so the true/false flags become 1/0 cells.
meta_int = meta.loc[:, meta.columns != "feature"].astype('int8')
plt.figure(figsize=(10, 8))
p = sns.heatmap(data=meta_int, cbar=False)

# 2. Training data

In [None]:
# Load the training set and summarise it. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called bare; wrapping it in
# print() only appended a stray "None" line to the output.
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
train.info()
display(train.head())

# 2.1 Number of trades per day

In [None]:
# Count the non-null `resp` entries for each date and keep the result as a
# two-column frame (date, count_).
daily_count = (
    train.groupby('date', as_index=False)['resp']
    .count()
    .rename(columns={'resp': 'count_'})
)

# Left panel: histogram of the daily counts. Right panel: count per date.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
sns.distplot(daily_count['count_'], kde=False, ax=ax[0])
sns.lineplot(data=daily_count, x='date', y='count_', ax=ax[1])

## Number of trades per day according to feature_0

The column feature_0 is the only one with a binary/categorical value. It may encode a buy/sell order or a long/short position.

In [None]:
# Count the non-null `resp` entries for each (date, feature_0) pair.
dcount = (
    train.groupby(['date', 'feature_0'], as_index=False)['resp']
    .count()
    .rename(columns={'resp': 'count_'})
)

# Left panel: density of the daily counts for feature_0 == -1 and == +1.
# Right panel: the same two count series plotted against date.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
sns.distplot(dcount.loc[dcount['feature_0'] == -1, 'count_'], hist=False, ax=ax[0])
sns.distplot(dcount.loc[dcount['feature_0'] == +1, 'count_'], hist=False, ax=ax[0])
sns.lineplot(data=dcount.loc[dcount['feature_0'] == -1], x='date', y='count_', ax=ax[1])
sns.lineplot(data=dcount.loc[dcount['feature_0'] == +1], x='date', y='count_', ax=ax[1])

In [None]:
# Frequency of each feature_0 value, with NaN kept as its own bucket.
train['feature_0'].value_counts(dropna=False)

# 2.2 Returns (resp) and weights

In [None]:
# Pairwise view of resp and resp_1..resp_4, coloured by feature_0:
# scatter plots off the diagonal, KDE curves on the diagonal.
f = ['resp'] + [f'resp_{i}' for i in range(1, 5)] + ['feature_0']
g = sns.PairGrid(data=train.loc[:, f], hue='feature_0')
g.map_offdiag(sns.scatterplot)
g.map_diag(sns.kdeplot)

In [None]:
# Distribution of `weight`: all rows on the left, only rows with
# weight < 10 on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
sns.distplot(train['weight'], ax=ax[0])
sns.distplot(train.loc[train['weight'] < 10, 'weight'], ax=ax[1])

## Average weight per day


The average weight (grouped by day) is slightly increasing

In [None]:
# Mean weight per date. The x axis comes from the grouped `date` key itself
# instead of a hard-coded range(500), so the plot stays correct for any
# number of days (and the axis is labelled with the actual date values).
plt.figure(figsize=(10, 5))
sns.lineplot(x='date', y='weight',
             data=train.groupby('date', as_index=False)['weight'].mean())

## Average return (resp) per day

In [None]:
# Mean resp per date, one line for feature_0 == -1 and one for feature_0 == 1.
# Each line is plotted against its own grouped `date` key rather than a
# hard-coded range(500): a date with no rows for one feature_0 value would
# otherwise make x and y differ in length or shift the points.
plt.figure(figsize=(10, 5))
sns.lineplot(x='date', y='resp',
             data=train[train.feature_0 == -1].groupby('date', as_index=False)['resp'].mean())
sns.lineplot(x='date', y='resp',
             data=train[train.feature_0 == 1].groupby('date', as_index=False)['resp'].mean())

# 2.3 Missing pattern analysis

In [None]:
def plot_missing_pattern(start_day, end_day, size=(14, 6)):
    """Draw a heatmap of missing feature values for a span of days.

    Rows of the global ``train`` frame whose ``date`` lies between
    ``start_day`` and ``end_day`` (both inclusive) are selected, restricted
    to the columns ``feature_0`` .. ``feature_129``, and rendered as a 0/1
    matrix in which 1 marks a missing entry.

    Parameters
    ----------
    start_day : first ``date`` value to include.
    end_day : last ``date`` value to include.
    size : figure size passed to ``plt.figure``.
    """
    feature_cols = [f'feature_{i}' for i in range(130)]
    in_window = (train.date >= start_day) & (train.date <= end_day)
    missing = train.loc[in_window, feature_cols].isna().astype('int8')

    plt.figure(figsize=size)
    plt.title("Missing pattern - day {} to {}".format(start_day, end_day))
    sns.heatmap(missing, cbar=False)
               
# Missing-value heatmaps for two individual days: day 0 and day 320.
for single_day in (0, 320):
    plot_missing_pattern(single_day, single_day)

In [None]:
# Missing-value heatmap across days 100 through 108, on a taller figure.
plot_missing_pattern(start_day=100, end_day=108, size=(14, 8))

# 2.4 Plot each feature (histogram + time series)

In [None]:
# Frequency of each feature_0 value, with NaN kept as its own bucket.
train['feature_0'].value_counts(dropna=False)

In [None]:
def plot_feat(feature, day=0, line=True, use_sample=False):
    """Plot one column of the global ``train`` frame in two panels.

    The left panel is a histogram of the column; the right panel plots the
    column against ``ts_id``.

    Parameters
    ----------
    feature : name of the column to plot.
    day : ``date`` value to restrict to; only used when ``use_sample`` is True.
    line : draw the right panel with ``sns.lineplot`` when True, otherwise
        with ``sns.scatterplot``.
    use_sample : when True plot only the rows with ``date == day``; when
        False plot every row.
    """
    frame = train
    if use_sample:
        frame = train[train.date == day]

    fig, (hist_ax, series_ax) = plt.subplots(1, 2, figsize=(12, 3))
    sns.distplot(frame[feature], kde=False, ax=hist_ax)

    draw = sns.lineplot if line else sns.scatterplot
    draw(x=frame.ts_id, y=frame[feature], ax=series_ax)

In [None]:
# Histogram and ts_id plots for feature_1 through feature_3.
for feat_no in range(1, 4):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_4 through feature_6.
for feat_no in range(4, 7):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_7 through feature_9.
for feat_no in range(7, 10):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_10 through feature_12.
for feat_no in range(10, 13):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_13 through feature_15.
for feat_no in range(13, 16):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_16 through feature_18.
for feat_no in range(16, 19):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_19 through feature_21. This cell
# previously plotted feature_10 a second time, so feature_19 (the one that
# follows feature_18 in the sequence) was never shown.
plot_feat('feature_19')
plot_feat('feature_20')
plot_feat('feature_21')

In [None]:
# Histogram and ts_id plots for feature_22 through feature_24.
for feat_no in range(22, 25):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_25 through feature_27.
for feat_no in range(25, 28):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_28 through feature_30.
for feat_no in range(28, 31):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_31 through feature_33.
for feat_no in range(31, 34):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_34 through feature_36.
for feat_no in range(34, 37):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_37 through feature_39.
for feat_no in range(37, 40):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_40 through feature_42.
for feat_no in range(40, 43):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_43 through feature_45.
for feat_no in range(43, 46):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_46 through feature_48.
for feat_no in range(46, 49):
    plot_feat(f'feature_{feat_no}')

In [None]:
# Histogram and ts_id plots for feature_49 through feature_51.
for feat_no in range(49, 52):
    plot_feat(f'feature_{feat_no}')

# Mean values

In [None]:
# Distribution of the per-column means of feature_0 .. feature_129.
p = sns.distplot(
    train.loc[:, [f'feature_{i}' for i in range(130)]].mean(axis=0)
)

## Work in progress...