## This notebook is for exploring the dataset, including data understanding, distributions, correlations, missing values etc

In [None]:
# import packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Let pandas render up to 100 columns and 100 rows before truncating output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

## Fetch data

In [None]:
import os
import dask.dataframe as dd
from dask.distributed import Client, progress

# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
%time
train = dd.read_csv("../input/jane-street-market-prediction/train.csv")
#features = dd.read_csv("../input/jane-street-market-prediction/features.csv")
#example_test = dd.read_csv("../input/jane-street-market-prediction/example_test.csv")
# transform train from dask format to pandas dataframe
train = train.compute()

In [None]:
# Summary statistics for every numeric column of the train set.
train.describe()

In [None]:
#print(train.head())

### There are 500 days in total in train dataset

In [None]:
#train['date'].unique()

### The number of transactions/trades each day is different.

In [None]:
# Count plot: number of rows in `train` for each value of `date`.
sns.catplot(x="date", kind="count", data=train)

## Plot resp_1, resp_2, resp_3, resp_4, resp and weight over time for days 0, 1 and 2


In [None]:
# Rows belonging to day 0. `date_0` stays defined because the feature_0 box
# plot further down reads it.
date_0 = train.loc[train['date'] == 0]
x = range(date_0.shape[0])

# One panel per series: (column name, matplotlib colour code).
panels = [
    ('resp_1', 'r'),
    ('resp_2', 'b'),
    ('resp_3', 'g'),
    ('resp_4', 'y'),
    ('resp', 'y'),
    ('weight', 'y'),
]

fig, axs = plt.subplots(len(panels))
# Title previously listed "Resp_3" twice.
fig.suptitle('Resp_1, Resp_2, Resp_3, Resp_4, Resp and Weight for Day 0')
for ax, (column, colour) in zip(axs, panels):
    # Plotting only reads the data, so no defensive copy of the slice is needed.
    ax.plot(x, date_0[column], colour)
    ax.set_ylabel(column)  # label each panel so the series can be told apart

plt.show()

In [None]:
# Rows belonging to day 1.
date_1 = train.loc[train['date'] == 1]
x = range(date_1.shape[0])

# One panel per series: (column name, matplotlib colour code).
panels = [
    ('resp_1', 'r'),
    ('resp_2', 'b'),
    ('resp_3', 'g'),
    ('resp_4', 'y'),
    ('resp', 'y'),
    ('weight', 'y'),
]

fig, axs = plt.subplots(len(panels))
# Title previously listed "Resp_3" twice.
fig.suptitle('Resp_1, Resp_2, Resp_3, Resp_4, Resp and Weight for Day 1')
for ax, (column, colour) in zip(axs, panels):
    # Plotting only reads the data, so no defensive copy of the slice is needed.
    ax.plot(x, date_1[column], colour)
    ax.set_ylabel(column)  # label each panel so the series can be told apart

plt.show()

In [None]:
# Rows belonging to day 2.
date_2 = train.loc[train['date'] == 2]
x = range(date_2.shape[0])

# One panel per series: (column name, matplotlib colour code).
panels = [
    ('resp_1', 'r'),
    ('resp_2', 'b'),
    ('resp_3', 'g'),
    ('resp_4', 'y'),
    ('resp', 'y'),
    ('weight', 'y'),
]

fig, axs = plt.subplots(len(panels))
# Title previously listed "Resp_3" twice.
fig.suptitle('Resp_1, Resp_2, Resp_3, Resp_4, Resp and Weight for Day 2')
for ax, (column, colour) in zip(axs, panels):
    # Plotting only reads the data, so no defensive copy of the slice is needed.
    ax.plot(x, date_2[column], colour)
    ax.set_ylabel(column)  # label each panel so the series can be told apart

plt.show()

### What is the resp like when weight = 0

In [None]:
# Partition the train set on whether a row's weight is exactly zero.
has_zero_weight = train['weight'] == 0
train_weight_0 = train.loc[has_zero_weight]
train_weight_positive = train.loc[~has_zero_weight]

# Distribution of resp among the zero-weight rows.
# NOTE(review): seaborn marks `distplot` as deprecated; consider `histplot`/`displot`
# once the seaborn version in use is confirmed.
sns.distplot(train_weight_0['resp'])

### What is the percentage of weight = 0

## Can we find out from the features why these weights are 0?

## Feature_0

In [None]:
# Distinct values taken by feature_0.
train['feature_0'].unique()

### Plot Feature_0 vs Resp for Day 0

In [None]:
# Box plot of resp grouped by feature_0, using only the day-0 rows (`date_0`).
sns.catplot(x="feature_0",y="resp", kind="box", data=date_0)

Contingency table of Feature_0 with Resp > 0.

It shows that feature_0 does not have a clear association with positive return

In [None]:
# Row counts for each feature_0 value, split by whether resp is positive.
pd.crosstab(train['feature_0'], train['resp'] > 0,  margins = False) 

### The distribution of feature_0 for weight = 0 or not

In [None]:
# Row counts for each feature_0 value, split by whether weight is positive.
pd.crosstab(train['feature_0'], train['weight'] > 0,  margins = False) 

## Number of missing values for each feature

For the whole train dataset

In [None]:
# Count the NaNs in each column of the full train set, largest count first,
# and show the result as a single wide row.
missing_counts = train.isna().sum().sort_values(ascending=False)
feature_missing_values = missing_counts.to_frame(name='number of missings')
feature_missing_values.T


For train dataset when the weight is positive

In [None]:
# Same NaN count per column, restricted to the rows with non-zero weight.
missing_counts = train_weight_positive.isna().sum().sort_values(ascending=False)
feature_missing_values = missing_counts.to_frame(name='number of missings')
feature_missing_values.T


### Correlations of features

Feature correlations for the whole train dataset.

Some features are almost perfectly correlated. Some correlations are even 1.

In [None]:
# Pairwise correlation of all columns, rendered as a colour-graded table
# (one gradient over the whole table because axis=None) with 2-decimal precision.
# NOTE(review): `Styler.set_precision` is deprecated in newer pandas releases in
# favour of `.format(precision=2)` — confirm the pandas version before upgrading.
train.corr().style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)

### Plot some highly correlated features as time series, e.g. feature 60 vs feature 61

In [None]:
# Overlay both features against ts_id: feature_60 in red, feature_61 in green.
ts_id = train['ts_id']
plt.plot(ts_id, train['feature_60'], 'r')
plt.plot(ts_id, train['feature_61'], 'green')
plt.xlabel("transactions/trades along time")
plt.title("Feature 60 and 61 for whole train dataset", fontsize=16, fontweight='bold')
plt.show()

### The Pearson correlation coefficient for Feature 60 and 61 is 0.997, which indicates an almost perfect positive linear correlation

In [None]:
from scipy.stats import pearsonr

feature_60 = train['feature_60']
feature_61 = train['feature_61']

# Element-wise difference between the two features and its summary statistics.
diff = feature_60 - feature_61
print(f'The difference between feature 60 and 61 is normally distributed. \n The range is \n{diff.describe()}')

# 2x2 covariance matrix of the two features.
covariance = np.cov(feature_60, feature_61)
print(f'The covariance matrix is \n {covariance}')

# Pearson correlation coefficient; the accompanying p-value is not used.
corr, _p_value = pearsonr(feature_60, feature_61)
print('Pearsons correlation: %.3f' % corr)

# Histogram of the differences.
plt.hist(diff, bins = 1000)
plt.show()

### The distribution of all features, and the distribution of all features for positive weight

In [None]:
# Taken from this notebook: https://www.kaggle.com/blurredmachine/jane-street-market-eda-viz-prediction

import plotly.express as px

date = 0
n_features = 130

# feature_1 .. feature_129 (feature_0 is left out).
cols = [f'feature_{i}' for i in range(1, n_features)]

# Draw the same animated histogram twice: first for all rows of the chosen
# day, then for the non-zero-weight rows only. Each animation frame shows one feature.
for frame in (train, train_weight_positive):
    hist = px.histogram(
        frame[frame["date"] == date],
        x=cols,
        animation_frame='variable',
        range_y=[0, 600],
        range_x=[-7, 7]
    )
    hist.show()


### Using PCA to deal with multicollinearity

In [None]:
# remove the returns and ts_id for PCA analysis
# Columns that are not model features (day index, weight, returns, row id)
# and must be left out of the PCA input.
non_feature_columns = ('date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp', 'ts_id')

cols = train.columns.tolist()
for column_name in non_feature_columns:
    # list.remove raises ValueError if an expected column is absent.
    cols.remove(column_name)

train_features = train.loc[:, cols]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise every feature so that no feature dominates the principal
# components purely because of its scale.
# NOTE(review): the scaler is fitted on all rows, while PCA below only sees
# the rows without NaN, so the PCA input is not exactly standardised.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)

# PCA does not accept NaN, so keep only the rows with no missing feature.
complete_rows = ~np.isnan(train_features_scaled).any(axis=1)

# random_state is pinned so the components are reproducible across runs if
# scikit-learn picks a randomized SVD solver for data of this size.
pca = PCA(n_components=50, random_state=0)
train_features_scaled_transformed = pca.fit_transform(train_features_scaled[complete_rows])

# When testing, transform new data with the already fitted scaler and pca
# (don't re-fit either of them):
# newdata_transformed = pca.transform(scaler.transform(newdata))

# Share of the total variance explained by each principal component.
var = pca.explained_variance_ratio_
print(var)

# Cumulative explained variance, in percent.
var1 = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
print(var1)

plt.plot(var1)
plt.xlabel("number of principal components (0-based index)")
plt.ylabel("cumulative explained variance (%)")
plt.show()



In [None]:
# Transpose the fitted components so that each row is an input feature and
# each column is a principal component, then display them as a table.
loadings = np.transpose(pca.components_)
df_loadings = pd.DataFrame(data=loadings)
df_loadings