### Jane Street Market Prediction | EDA

Jane Street Market Prediction <br>
Test your model against future real market data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import data.
# All competition files live in one directory; keep it in a single place so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/kaggle/input/jane-street-market-prediction'

# A mock sample submission file in the correct format.
example_sample_submission = pd.read_csv(f'{DATA_DIR}/example_sample_submission.csv')
# Metadata pertaining to the anonymized features.
features = pd.read_csv(f'{DATA_DIR}/features.csv')
# A mock test set which represents the structure of the unseen test set. You will not
# be directly using the test set or sample submission in this competition, as the
# time-series API will get/set the test set and predictions.
example_test = pd.read_csv(f'{DATA_DIR}/example_test.csv')
# The training set; contains historical data and returns.
train = pd.read_csv(f'{DATA_DIR}/train.csv')


In [None]:
# Print pandas' concise summary of the training frame.
train.info()

In [None]:
# resp_{1,2,3,4} are values that represent returns over different time horizons.
# Trades with weight = 0 were intentionally included in the dataset for completeness,
# although such trades will not contribute towards the scoring evaluation.
train.head()

### Understanding Resp


... you are provided a resp value, as well as several other resp_{1,2,3,4} values that represent returns over different time horizons. <i><b>These variables are not included in the test set.</b></i> Trades with weight = 0 were intentionally included in the dataset for completeness, although such trades will not contribute towards the scoring evaluation.

In [None]:
def histo(col, data=None, bins=100):
    """Plot a histogram of one column with summary statistics in the title.

    The title shows the column's mean, skew and kurtosis, each rounded to two
    decimals.

    Parameters
    ----------
    col : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame that holds ``col``. Defaults to the notebook-level ``train``.
    bins : int, default 100
        Number of histogram bins.

    Notes
    -----
    Calls ``plt.style.use('ggplot')``, which changes the matplotlib style for
    every figure drawn afterwards, not only this one.
    """
    frame = train if data is None else data
    values = frame[col]

    avg = np.round(values.mean(), 2)
    skew = np.round(values.skew(), 2)
    kurt = np.round(values.kurtosis(), 2)

    plt.style.use('ggplot')
    # pandas statistics skip NaN by default, but histogram binning cannot.
    plt.hist(values.dropna(), bins=bins)
    # `np.str` was only an alias of the builtin `str` and has been removed from
    # NumPy; an f-string produces the same text without depending on it.
    plt.title(f'{col} Avg {avg} Skew {skew} Kurt {kurt}')
    plt.show()

In [None]:
# Distribution of resp; the title reports its mean, skew and kurtosis.
histo('resp')

In [None]:
# Distribution of resp_1; the title reports its mean, skew and kurtosis.
histo('resp_1')

In [None]:
# Distribution of resp_2; the title reports its mean, skew and kurtosis.
histo('resp_2')

In [None]:
# Distribution of resp_3; the title reports its mean, skew and kurtosis.
histo('resp_3')

In [None]:
# Distribution of resp_4; the title reports its mean, skew and kurtosis.
histo('resp_4')

### Understanding Weight vs Resp

Each trade has an associated weight and resp, which together represent a return on the trade. Trades with weight = 0 were intentionally included in the dataset for completeness, although such trades will not contribute towards the scoring evaluation.

In [None]:
# Distribution of weight; the title reports its mean, skew and kurtosis.
histo('weight')

In [None]:
# Element-wise product of weight and resp, stored on `train` as a new column.
train['weight_resp'] = train['weight'].mul(train['resp'])

In [None]:
# Distribution of weight_resp; the title reports its mean, skew and kurtosis.
histo('weight_resp')

In [None]:
# Joint plot of weight (x) against resp (y).
sns.jointplot(
    data=train,
    x="weight",
    y="resp",
    height=10,
    ratio=3,
    color="r",
)
plt.show()

When the weight is high, the variance of resp is low: high-weight trades look like more confident bets, taking small gains at the risk of making small losses.

In [None]:
# Joint plot of weight (x) against resp_1 (y).
sns.jointplot(
    data=train,
    x="weight",
    y="resp_1",
    height=10,
    ratio=3,
    color="r",
)
plt.show()

In [None]:
# Joint plot of weight (x) against resp_2 (y).
sns.jointplot(
    data=train,
    x="weight",
    y="resp_2",
    height=10,
    ratio=3,
    color="r",
)
plt.show()

In [None]:
# Joint plot of weight (x) against resp_3 (y).
sns.jointplot(
    data=train,
    x="weight",
    y="resp_3",
    height=10,
    ratio=3,
    color="r",
)
plt.show()

In [None]:
# Joint plot of weight (x) against resp_4 (y).
sns.jointplot(
    data=train,
    x="weight",
    y="resp_4",
    height=10,
    ratio=3,
    color="r",
)
plt.show()

In [None]:


# Cumulative sum of weight_resp and of resp, in row order, on one set of axes.
fig, ax = plt.subplots(figsize=(15, 5))
v1 = train['weight_resp'].cumsum()
v2 = train['resp'].cumsum()
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("Cumulative", fontsize=18)
# Draw on `ax` explicitly and attach each label to its own curve. The previous
# legend listed a third entry ('weight') for a line that is never drawn.
v1.plot(ax=ax, lw=3, label='weight_resp')
v2.plot(ax=ax, lw=3, label='resp')
ax.legend();




Small, sudden movements in resp appear to be magnified by the weight, as seen in the weight_resp curve.

In [None]:
# Cumulative sum of resp and of each horizon return, drawn on one set of axes.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Trade", fontsize=18)
ax.set_title("Cumulative resp and time horizons 1, 2, 3, and 4 (500 days)", fontsize=18)
balance = train['resp'].cumsum()
balance.plot(lw=3)
for horizon in ('resp_1', 'resp_2', 'resp_3', 'resp_4'):
    # Compute and draw each curve without binding it to a name, so there is
    # nothing to delete afterwards.
    train[horizon].cumsum().plot(lw=3)
plt.legend(loc="upper left");


In [None]:
# Pairwise correlation between the four horizon returns.
horizon_columns = ['resp_1', 'resp_2', 'resp_3', 'resp_4']
horizon_corr = train[horizon_columns].corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(horizon_corr, annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

In [None]:
# Joint plot of resp_1 (x) against resp_4 (y).
sns.jointplot(
    data=train,
    x="resp_1",
    y="resp_4",
    height=10,
    ratio=3,
    color="r",
)
plt.show()

In [None]:
# Select column names by substring. The 'resp' match also picks up any column
# whose name merely contains it (for example weight_resp, if it has been added).
train_features = [name for name in train.columns if 'feature' in name]
train_response = [name for name in train.columns if 'resp' in name]
# Response columns first, then features.
train_heatmap = [*train_response, *train_features]

In [None]:
# Display the combined list of response and feature column names.
train_heatmap

### Understanding features

In [None]:
# Correlation between features and weight_resp:
# the 20 columns with the largest correlation to weight_resp, as a one-column heatmap.
fig, ax = plt.subplots(figsize=(12, 12))
weight_resp_corr = train[train_heatmap].corr()[['weight_resp']]
top_weight_resp_corr = weight_resp_corr.sort_values('weight_resp').tail(20)
sns.heatmap(top_weight_resp_corr, vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
ax.invert_yaxis()

...developing good models will be challenging for many reasons, including a very low signal-to-noise ratio, potential redundancy, strong feature correlation...

In [None]:
# fig, ax = plt.subplots(figsize=(12,12))
# sns.heatmap(train[train_heatmap].corr()[['weight']].sort_values('weight').tail(20),
#  vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
# ax.invert_yaxis()

In [None]:
# The 20 columns with the largest correlation to resp, as a one-column heatmap.
fig, ax = plt.subplots(figsize=(12, 12))
resp_corr = train[train_heatmap].corr()[['resp']]
top_resp_corr = resp_corr.sort_values('resp').tail(20)
sns.heatmap(top_resp_corr, vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
ax.invert_yaxis()

In [None]:
# fig, ax = plt.subplots(figsize=(12,12))
# sns.heatmap(train[train_heatmap].corr()[['resp_1']].sort_values('resp_1').tail(20),
#  vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
# ax.invert_yaxis()

In [None]:
# fig, ax = plt.subplots(figsize=(12,12))
# sns.heatmap(train[train_heatmap].corr()[['resp_2']].sort_values('resp_2').tail(20),
#  vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
# ax.invert_yaxis()

In [None]:
# fig, ax = plt.subplots(figsize=(12,12))
# sns.heatmap(train[train_heatmap].corr()[['resp_3']].sort_values('resp_3').tail(20),
#  vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
# ax.invert_yaxis()

In [None]:
# fig, ax = plt.subplots(figsize=(12,12))
# sns.heatmap(train[train_heatmap].corr()[['resp_4']].sort_values('resp_4').tail(20),
#  vmax=1, vmin=-1, cmap='YlGnBu', annot=True, ax=ax);
# ax.invert_yaxis()

In [None]:
# Trim the extreme tails of resp_1 and plot what remains.
# NOTE(review): the original cell applied the identical `< quantile(0.99)` filter
# twice and then plotted `heals` / `boosts` / `winPlacePerc`. Those columns are
# not created anywhere in this notebook, so the cell could not run. Assumed
# intent: clip both tails of resp_1 and inspect the remaining distribution.
# Confirm, and swap in the intended plot if it was something else.
lower_cut, upper_cut = train['resp_1'].quantile([0.01, 0.99])
# Boolean indexing already returns a new frame, so no up-front train.copy() is needed.
data = train[(train['resp_1'] > lower_cut) & (train['resp_1'] < upper_cut)]

f, ax1 = plt.subplots(figsize=(20, 10))
ax1.hist(data['resp_1'], bins=100, color='blue', alpha=0.8)
ax1.set_xlabel('resp_1', fontsize=15, color='blue')
ax1.set_ylabel('Number of trades', fontsize=15, color='blue')
ax1.set_title('resp_1 between its 1st and 99th percentiles', fontsize=20, color='blue')
# Explicit True: a bare grid() call toggles the current state instead of setting it.
ax1.grid(True)
plt.show()

In [None]:
# Preview the first rows of the mock sample submission.
example_sample_submission.head()

In [None]:
# Preview the first rows of the feature metadata.
features.head()

In [None]:
# Preview the first rows of the mock test set.
example_test.head()

This competition is evaluated on a utility score. Each row in the test set represents a trading opportunity for which you will be predicting an action value, 1 to make the trade and 0 to pass on it. Each trade j has an associated weight and resp, which represents a return.

For each date i, we define:

$$p_i = \sum_j \left(\mathrm{weight}_{ij} \ast \mathrm{resp}_{ij} \ast \mathrm{action}_{ij}\right),$$
<br>
$$t = \frac{\sum p_i}{\sqrt{\sum p_i^2}} \ast \sqrt{\frac{250}{|i|}},$$

where |i| is the number of unique dates in the test set. The utility is then defined as:
<br>
$$u = \min(\max(t, 0), 6) \sum p_i.$$