In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from collections import defaultdict
# Warning
import warnings
warnings.filterwarnings("ignore")

In [None]:
import cufflinks
import cufflinks as cf

import janestreet
# Competition API bootstrap: build the environment and get its test-set iterator.
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# NOTE(review): the frame read here is rebound by the loop variable of the same
# name on the first iteration below, so this read has no effect on the predictions.
sample_prediction_df = pd.read_csv('../input/jane-street-market-prediction/example_sample_submission.csv')
# Baseline submission: set `action` to 0 on every prediction frame the iterator
# yields and hand it back to the environment. `test_df` is not used.
for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df.action = 0 #make your 0/1 prediction here
    env.predict(sample_prediction_df)

## 导引目录:
* [总体构造 Overall](#first-bullet)
* [数据分布 Distribution](#second-bullet)
* [相关性与分组 Correlation](#third-bullet) 
* [分组实例 Group 0](#forth-bullet) 


## 总体构造 Overall <a class="anchor" id="first-bullet"></a>

In [None]:
# Read every competition file from one relative directory. The original mixed
# an absolute '/kaggle/input' root for train.csv with '../input' for the others.
DATA_DIR = '../input/jane-street-market-prediction'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
meta_data = pd.read_csv(f'{DATA_DIR}/features.csv')
test = pd.read_csv(f'{DATA_DIR}/example_test.csv')
train.head()

In [None]:
# Confirm the number of rows/columns of each loaded table
shape_reports = (
    ('train data shape is {}', train),
    ('metadata features shape is {}', meta_data),
    ('test data shape is {}', test),
)
for template, frame in shape_reports:
    print(template.format(frame.shape))

如果觉得数据不是按照逻辑排列，想让数据表达更清晰，可以再次sort一下。

`df.sort_values(by=['date', 'ts_id'], inplace=True)` 

In [None]:
# Distinct ts_id values present in the training table
train['ts_id'].unique()

In [None]:
# Missing-value count for every column of the training table
train.isna().sum()

## 数据分布 Distribution <a class="anchor" id="second-bullet"></a>

可以看出数据是用天数`date`，加权`weight`，多个`resp`，多个数据特征`features`，和`ts_id`构成的，我们需要对中间的weight和resp做出分析。经由图像显示，他们多呈现正态分布且数据标准化，对我们接下来的数据分析意义重大。

The distributions in the training dataset appear to be approximately normal (standardized) and mean-reverting around zero. 

同时可以看出有很多缺失的数据，所以接下来我们需要进行数据填充。因为数据是呈正态分布的，所以我也选择了使用平均数来替代NA。

In [None]:
# Features Plotting
# Plot one distribution per feature column. The grid is sized from the data
# rather than hard-coded, so the cell still works when extra columns have been
# appended to `train` (a later cell adds resp_trend / weight_trend).
non_feature_columns = {'ts_id', 'resp_trend', 'weight_trend'}
feature_columns = [c for c in train.columns[7:] if c not in non_feature_columns]

n_plot_cols = 3
n_plot_rows = max(1, -(-len(feature_columns) // n_plot_cols))  # ceiling division
# 250 was the height used for 44 rows; keep the same height per row.
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                         figsize=(25, 250 * n_plot_rows / 44), squeeze=False)
flat_axes = axes.ravel()
for ax, column in zip(flat_axes, feature_columns):
    sns.distplot(train[column], ax=ax)
# Hide the grid cells left over when the count is not a multiple of 3.
for ax in flat_axes[len(feature_columns):]:
    ax.set_visible(False)

In [None]:
# resp Plotting
# Sum each resp column per date, accumulate over dates, and draw the curves.
resp_columns = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
fig, ax = plt.subplots(figsize=(16,6))
daily_resp = train.groupby('date')[resp_columns].sum()
daily_resp.cumsum().plot(ax=ax)
ax.set_title('Cumulative Sum Return of resp\'s', fontsize=18)
ax.set_xlabel('Date', fontsize=14)
# Shade the span x = 0..150 so it stands out from the rest of the series.
ax.axvspan(0, 150, linestyle=':', linewidth=2, label='first 150 days',
           color='yellow', alpha=.2)
ax.legend(fontsize=12, ncol=2, loc=2);

In [None]:
# Weight Plotting
# Histogram of `weight` restricted to the range (0.001, 1.4), with every bar
# coloured according to its height.
plt.figure(figsize = (12,5))
ax = sns.distplot(
    train['weight'],
    bins=1400,
    kde=False,
    color='darkcyan',
    kde_kws={"clip":(0.001,1.4)},
    hist_kws={"range":(0.001,1.4)},
);

bars = ax.patches
bar_heights = np.array([bar.get_height() for bar in bars])
# Map bar heights onto the jet colour map, from the smallest to the tallest bar.
norm = plt.Normalize(bar_heights.min(), bar_heights.max())
colors = plt.cm.jet(norm(bar_heights))
for bar, bar_color in zip(bars, colors):
    bar.set_color(bar_color)

plt.xlabel("Histogram of non-zero weights", size=14)
plt.show();
del bar_heights

In [None]:
# Another weight trend
# Running totals of resp and of weight * resp over the rows of `train`.
# Both columns are stored on `train`; later cells slice columns with `[7:-2]`
# to skip exactly these two trailing columns, so they must stay on the frame.
train['resp_trend'] = train['resp'].cumsum()
train['weight_trend'] = (train['weight'] * train['resp']).cumsum()

# The title announces both curves, but only weight_trend used to be drawn and
# resp_trend was computed without being used. Plot both.
trend_ax = train.plot(x='ts_id', y=['resp_trend', 'weight_trend'], figsize=(15,5))
trend_ax.set_xlabel('Trade', size=18)
trend_ax.set_title('Cumulative return of resp and weight trend', size=18)
trend_ax.legend(loc=2)
plt.show()

In [None]:
# Box plot stacked above the distribution plot of `weight`, sharing the x axis.
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={'height_ratios': (.15, .85)},
)
weight_series = train['weight']
sns.boxplot(weight_series, ax=ax_box)
sns.distplot(weight_series, ax=ax_hist)

# Joint view of weight (x) against resp (y): scatter in the centre,
# distribution plots on the margins.
joint_grid = sns.JointGrid(data=train, x='weight', y='resp')
joint_grid.plot_joint(sns.scatterplot, s=100, alpha=.5)
joint_grid.plot_marginals(sns.distplot, kde=True, color='green')

#### 数据缺失 Null Values 

因为高频交易的规律与特性，我们会只取单独一天的数据来看EDA。同时接下来的数据填充，是由平均数mean来填充的。

In [None]:
# Keep only the rows whose `date` equals 1 and summarise them.
day_filter = 'date == 1'
sample_df = train.query(day_filter)
sample_df.describe()

In [None]:
# Columns that contain at least one missing value, most affected first.
na_counts = train.isna().sum()
nan_val = na_counts[na_counts > 0].sort_values(ascending=False)

# Bar chart of the first 62 entries of that ranking.
top_missing = nan_val.iloc[0:62]
fig, axs = plt.subplots(figsize=(10,10))
sns.barplot(y=top_missing.index, x=top_missing.values, alpha=0.8)
plt.title('Missing Values of Train Dataset')
plt.xlabel('# of NA')
plt.show()

In [None]:
# Replace every missing value with the mean of its own column.
sample_df = sample_df.apply(lambda col: col.fillna(col.mean()), axis=0)

# The message reports a number of *features*, so count the columns that still
# contain a null. The previous `.isna().sum().sum()` counted null cells instead.
features_with_nulls = int(sample_df.isna().any().sum())
print('After replacing, number of features with null values: ', features_with_nulls)

In [None]:
# Histogram grid for every column except the first seven and the last two.
hist_columns = sample_df.columns[7:-2]
sample_df[hist_columns].hist(bins=100, figsize=(30,74), layout=(35,4));

在填充完缺失数据之后，我们也来比较一下resp之于weight的散点图的区别。可以发现：
* weight更低的时候，resp的离散程度就更高一些。
* **Lower weight trades have a much higher dispersion in resp.** 
* weight可能可以作为一种未来收益波动的预测量。
* **Weight appears to be some kind of predictor of future return volatility.** 


In [None]:
# resp against weight for the single-day sample; the title carries their
# correlation rounded to three decimals.
sample_corr = round(sample_df['weight'].corr(sample_df['resp']), 3)
sns.scatterplot(data=sample_df, x='resp', y='weight', color='green', alpha=.3)
plt.title('resp vs weight \ncorrelation={}'.format(sample_corr));

In [None]:
# Same scatter for the full training table, titled with its own correlation.
train_corr = round(train['weight'].corr(train['resp']), 3)
sns.scatterplot(data=train, x='resp', y='weight', alpha=.3)
plt.title('resp vs weight \ncorrelation={}'.format(train_corr));

In [None]:
# Pairwise relationships between the resp columns (lower triangle only).
pairplot_columns = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
sns.pairplot(sample_df[pairplot_columns], corner=True)

In [None]:
import gc

# One box plot per feature column of the single-day sample.
# Columns are selected by name rather than with `[7:-2]`, so the cell does not
# depend on resp_trend / weight_trend having been appended to `train` first,
# and ts_id (an identifier, not a feature) is left out.
non_feature_columns = {'ts_id', 'resp_trend', 'weight_trend'}
featstr = [c for c in train.columns[7:] if c not in non_feature_columns]

# Size the grid from the number of features. The previous fixed 29 x 4 loop
# stopped after 116 plots and silently dropped every later feature.
n_box_cols = 4
n_box_rows = max(1, -(-len(featstr) // n_box_cols))  # ceiling division
# 80 was the height used for 29 rows; keep the same height per row.
fig = plt.figure(figsize=(20, 80 * n_box_rows / 29))
fig.suptitle('Features Box plot with 0.1% 99.9% whiskers',fontsize=22, y=.89)
grid = gridspec.GridSpec(n_box_rows, n_box_cols, figure=fig, hspace=.5, wspace=.05)

for position, feature in enumerate(featstr):
    subf = fig.add_subplot(grid[position // n_box_cols, position % n_box_cols])
    # Whiskers at the 0.1th and 99.9th percentiles, as announced in the title.
    sns.boxplot(x=sample_df[feature], saturation=.5, color='blue', ax=subf,
                width=.5, whis=(.1, 99.9))
    subf.set_xlabel('')
    subf.set_title('{}'.format(feature), fontsize=16)

# A single collection after the loop is enough; collecting after every
# subplot only slowed the cell down.
gc.collect();

In [None]:
from scipy import stats 
def r2(x, y):
    """Return the square of the Pearson correlation coefficient of x and y."""
    pearson_result = stats.pearsonr(x, y)
    coefficient = pearson_result[0]
    return coefficient ** 2

# Module-level layout state for the joint-plot grid.
# `ratio` sizes each nested sub-grid as (ratio+1) x (ratio+1) in the loop below,
# `f` is the figure that `myjoint` adds its axes to, and `outer_grid` provides
# one slot (7 rows x 3 columns) per joint plot.
ratio=4
f = plt.figure(figsize=(25,60))
outer_grid = gridspec.GridSpec(7, 3, wspace=0.3, hspace=0.3)

class myjoint(sns.JointGrid):
    """JointGrid whose joint and marginal axes live inside the shared figure ``f``.

    The parent constructor builds its own figure; that figure is closed and the
    three axes are recreated on the module-level figure ``f``, at positions
    taken from the module-level nested grid ``gs``. Both ``f`` and ``gs`` must
    therefore be assigned before an instance is created.

    Parameters
    ----------
    x, y : column names (or data) forwarded to ``sns.JointGrid``.
    data : optional DataFrame forwarded to ``sns.JointGrid``.
    height, ratio, space, dropna, xlim, ylim : forwarded to ``sns.JointGrid``.
    size : legacy alias of ``height``; when given it overrides ``height``.
    """

    def __init__(self, x, y, data=None,height=7, ratio=5, space=.2,
                 dropna=True, xlim=None, ylim=None, size=None):
        # Resolve the legacy `size` alias here instead of forwarding it, so the
        # parent is never handed an argument it may not accept.
        if size is not None:
            height = size

        # Pass everything by keyword: a positional call only works while the
        # parent's parameter order happens to match this one, and fails
        # outright where the parent's parameters are keyword-only.
        super(myjoint, self).__init__(x=x, y=y, data=data, height=height,
                                      ratio=ratio, space=space, dropna=dropna,
                                      xlim=xlim, ylim=ylim)

        # Close the figure the parent just created. Closing by object instead
        # of the hard-coded number 2 does not depend on how many figures are
        # currently open.
        own_figure = getattr(self, "figure", None)
        if own_figure is None:
            own_figure = self.fig
        plt.close(own_figure)

        # Set up the subplot grid
        self.ax_joint = f.add_subplot(gs[1:, :-1])
        self.ax_marg_x = f.add_subplot(gs[0, :-1], sharex=self.ax_joint)
        self.ax_marg_y = f.add_subplot(gs[1:, -1], sharey=self.ax_joint)

        # The limits were applied to the parent's discarded axes; re-apply them
        # to the replacement joint axes (the marginals share these limits).
        if xlim is not None:
            self.ax_joint.set_xlim(xlim)
        if ylim is not None:
            self.ax_joint.set_ylim(ylim)

        # Turn off tick visibility for the measure axis on the marginal plots
        plt.setp(self.ax_marg_x.get_xticklabels(), visible=False)
        plt.setp(self.ax_marg_y.get_yticklabels(), visible=False)
# One joint plot (regression + marginal distributions) of each resp column
# against weight, each placed in its own slot of `outer_grid`.
for i, column in enumerate(['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']):
    # `gs` has to stay a module-level name: `myjoint.__init__` reads it to
    # position its axes.
    gs = gridspec.GridSpecFromSubplotSpec(ratio+1, ratio+1,
            subplot_spec=outer_grid[i], wspace=0.3, hspace=0.3)
    g = myjoint(y="weight", x=column, data=train, ratio=ratio)
    g = g.plot(sns.regplot, sns.distplot)
    r2_score = r2(x=train[column],y=train["weight"])
    # Label the joint axes explicitly. `plt.xlabel` writes to whichever axes
    # is current, which is not guaranteed to be the joint axes once the
    # marginal plots have been drawn.
    g.ax_joint.set_xlabel(f"{column} R2 score:{round(r2_score,4)}")

## 相关性与分组 Correlation <a class="anchor" id="third-bullet"></a>
**元数据**：竞赛主办方对此有所描述。这里有一个假设：元数据中的标签（tag）代表用来构造这些特征的某种概念（例如成交量不平衡）。由此引出另一个**假设**：由相同概念构造的特征会有相似的表现，并且这些特征之间也具有相关性。因此，为了检验该假设的有效性，我们将尝试把这些特征划分为不同的*类别*。

相关性图表表明，resp_4 相对于 resp 拥有最高的正相关性，而 resp_1 拥有最低的正相关性。

In [None]:
# Correlation matrix of the columns at positions 2..6 of the sample frame
# (presumably resp_1..resp — confirm against train.head()).
corr = sample_df.iloc[:,2:7].corr()
f, ax = plt.subplots(figsize=(15,10))
# Correlations lie in [-1, 1]. The previous vmin=1 / vmax=1 collapsed the
# colour scale to a single point; use the full symmetric range instead.
sns.heatmap(corr, cmap='BrBG', center=0, vmin=-1, vmax=1, annot=True,
            square=True, linewidths=.5, cbar_kws={'shrink': .5}, ax=ax)

## 分组实例 Group 0 <a class="anchor" id="forth-bullet"></a>
Distributions : Group 0 features with 1st concept 

In [None]:
# Bucket every row of the sample into one of three tiers by its weight.
conditions = [
    (sample_df['weight'] <= 40),
    (sample_df['weight'] > 40) & (sample_df['weight'] <= 80),
    (sample_df['weight'] > 80)
]
values = ['tier1', 'tier2', 'tier3']
# Give np.select a string default: its implicit default is the integer 0,
# which does not share a dtype with the string choices. The default is only
# used for rows that match none of the conditions (e.g. a NaN weight).
sample_df['weight_tier'] = np.select(conditions, values, default='unknown')

# Map every tag column of the metadata to the features flagged True for it.
# Each entry is a one-element list holding the feature list, hence the `[0]`.
categories = defaultdict(list)
for tag in meta_data.columns[1:]:
    tagged_features = meta_data.query(f'{tag} == True')['feature'].to_list()
    categories[f'{tag}'].append(tagged_features)

# Sample restricted to the features carrying tag_0.
tag_0 = sample_df[list(categories['tag_0'][0])]

In [None]:
# Distribution of every tag_0 feature. The grid is sized from the number of
# columns instead of a fixed 6 x 3, which raised IndexError past 18 features.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(tag_0.columns) // n_plot_cols))  # ceiling division
# 50 was the height used for 6 rows; keep the same height per row.
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                         figsize=(25, 50 * n_plot_rows / 6), squeeze=False)
flat_axes = axes.ravel()
for ax, column in zip(flat_axes, tag_0.columns):
    sns.distplot(tag_0[column], ax=ax, color='Green')
# Hide the grid cells that have no feature to show.
for ax in flat_axes[len(tag_0.columns):]:
    ax.set_visible(False)

#### Group 0 Correlation Analysis 

In [None]:
# Annotated correlation matrix of the tag_0 features on a symmetric -1..1 scale.
f, ax = plt.subplots(figsize=(45, 20))
tag_0_corr = tag_0.corr()
sns.heatmap(
    tag_0_corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='BrBG',
    annot=True,
    square=True,
    linewidths=.5,
    cbar_kws={'shrink': .5},
)

#### Group 0 Linear Regression Analysis 

In [None]:
# Regression joint plot of every tag_0 feature against resp.
ratio = 4
# Size the outer grid from the number of tag_0 features rather than a fixed
# 6 x 3, so `outer_grid[i]` cannot run out of slots.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(tag_0.columns) // n_grid_cols))  # ceiling division
# 60 was the height used for 6 rows; keep the same height per row.
f = plt.figure(figsize=(25, 60 * n_grid_rows / 6))
outer_grid = gridspec.GridSpec(n_grid_rows, n_grid_cols, wspace=.3, hspace=.3)
for i, column in enumerate(tag_0.columns):
    # `gs` has to stay a module-level name: `myjoint.__init__` reads it.
    gs = gridspec.GridSpecFromSubplotSpec(ratio+1, ratio+1,
                                         subplot_spec=outer_grid[i], wspace=.3, hspace=.3)
    g = myjoint(y='resp', x=column, data=sample_df, ratio=ratio)
    g = g.plot(sns.regplot, sns.distplot)
    r2_score = r2(x=sample_df[column].values, y=sample_df['resp'].values)
    # Label the joint axes explicitly. `plt.xlabel` writes to whichever axes
    # is current, which is not guaranteed to be the joint axes once the
    # marginal plots have been drawn.
    g.ax_joint.set_xlabel(f'{column} R2 score: {round(r2_score, 4)}')
f.tight_layout()

#### Group 0 Scatter Plot

In [None]:
# Scatter joint plot of every tag_0 feature against resp.
ratio = 4
# Size the outer grid from the number of tag_0 features rather than a fixed
# 6 x 3, so `outer_grid[i]` cannot run out of slots.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(tag_0.columns) // n_grid_cols))  # ceiling division
# 60 was the height used for 6 rows; keep the same height per row.
f = plt.figure(figsize=(25, 60 * n_grid_rows / 6))
outer_grid = gridspec.GridSpec(n_grid_rows, n_grid_cols, wspace=.3, hspace=.3)
for i, column in enumerate(tag_0.columns):
    # `gs` has to stay a module-level name: `myjoint.__init__` reads it.
    gs = gridspec.GridSpecFromSubplotSpec(ratio+1, ratio+1,
                                         subplot_spec=outer_grid[i], wspace=.3, hspace=.3)
    g = myjoint(y='resp', x=column, data=sample_df, ratio=ratio)
    g = g.plot(sns.scatterplot, sns.distplot)
    r2_score = r2(x=sample_df[column].values, y=sample_df['resp'].values)
    # Label the joint axes explicitly. `plt.xlabel` writes to whichever axes
    # is current, which is not guaranteed to be the joint axes once the
    # marginal plots have been drawn.
    g.ax_joint.set_xlabel(f'{column} R2 score: {round(r2_score, 4)}')