The purpose of this notebook is to perform an exploratory data analysis of the data provided for the Jane Street Market Prediction competition.

### Import

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA



### DataSet 

In [None]:
# Note: datatable's fread(path).to_pandas() is usually faster than pd.read_csv() on large CSV files

In [None]:
# Directory holding the input CSV files. The trailing slash is required because
# file paths are built by plain string concatenation (folder_path + 'train.csv').
folder_path = '../input/jane-street-market-prediction/'

In [None]:
%%time
train_df = pd.read_csv(folder_path +'train.csv')
features_df = pd.read_csv(folder_path + 'features.csv')
sample_df = pd.read_csv(folder_path + 'example_sample_submission.csv')
test_data_df = pd.read_csv(folder_path + 'example_test.csv')

In [None]:
# Show the freshly loaded training frame as the cell output.
train_df

In [None]:
# Downcast every float64 column to float32 to limit memory use (8 -> 4 bytes per value).
# Each frame is inspected for its OWN float64 columns: once train_df has been
# converted it no longer has any float64 columns, so deriving the test mapping
# from train_df would produce an empty mapping and leave test_data_df untouched.
train_df = train_df.astype({c: np.float32 for c in train_df.select_dtypes(include='float64').columns})
test_data_df = test_data_df.astype({c: np.float32 for c in test_data_df.select_dtypes(include='float64').columns})

In [None]:
# Summary statistics of the training frame, as returned by describe().
train_df.describe()

In [None]:
# Summary statistics of the example test frame, as returned by describe().
test_data_df.describe()

### Target (`action`) and feature / response column lists

In [None]:
# Binary label stored as an integer column: 1 where weight * resp is strictly
# positive, 0 otherwise. Works on the raw numpy arrays of both columns.
train_df['action'] = (
    0 < train_df['resp'].values * train_df['weight'].values
).astype('int')

In [None]:
# Names of all train_df columns whose label contains the substring 'feature'.
features = list(filter(lambda column_name: 'feature' in column_name, train_df.columns))

In [None]:
# Names of all train_df columns whose label contains the substring 'resp'.
resps = list(filter(lambda column_name: 'resp' in column_name, train_df.columns))

In [None]:
# Show the list of selected feature column names.
features

### Compute the Correlation

In [None]:
# Heatmap of the pairwise correlation matrix of the feature columns,
# with the colour scale pinned to the full [-1, 1] range.
plt.figure(figsize=(40, 40))
ax = sns.heatmap(
    train_df[features].corr(),
    cmap='RdBu',
    vmin=-1,
    vmax=1,
)
ax.set_title('Features Correlation', pad=16, fontdict={'fontsize': 18});

There is a strong correlation among the features.

### Number of null values per feature

In [None]:
# Per-feature count of missing values, ordered from fewest to most
# (sort_values sorts in ascending order by default).
null_count = (
    train_df[features]
    .isna()
    .sum()
    .sort_values()
)

In [None]:
# Show the sorted per-feature null counts.
null_count

In [None]:
# Turn the Series into a two-column frame: the former index (feature names)
# becomes the 'index' column and the unnamed values land in column 0.
null_count_df = null_count.reset_index()

In [None]:
# Inspect the column labels of the frame before they are renamed.
null_count_df.columns

In [None]:
# Give the two columns meaningful names (modifies null_count_df in place).
null_count_df.rename({'index': 'feature', 0: 'count'}, axis='columns', inplace=True)

In [None]:
# Show only the features that have at least one missing value.
null_count_df.loc[lambda frame: frame['count'] > 0]

In [None]:
# Bar chart with one bar per feature that has at least one missing value;
# bar length is that feature's null count.
plt.figure(figsize=(30, 30))
ax = sns.barplot(x="count", y="feature", data=null_count_df[null_count_df['count']>0])
# Label the chart for what it shows: missing-value counts per feature,
# not a count of "empty features".
ax.set_xlabel('Number of missing values')
ax.set_title('Number of missing values per feature', fontdict={'fontsize':18}, pad=16);

In [None]:
# Missing feature values are filled with the per-column median (not the mean):
# the medians are computed here and applied with fillna in a following cell.

train_df_median = train_df[features].median()

In [None]:
# Show the per-feature medians.
train_df_median

In [None]:
# Feature matrix with every missing value replaced by its column's median;
# train_df itself is left unchanged.
train_x_no_nan = train_df[features].fillna(value=train_df_median)

In [None]:
# Show the imputed feature frame.
train_x_no_nan

In [None]:
# Annotated heatmap of the pairwise correlations between the 'resp' columns,
# with the colour scale pinned to the full [-1, 1] range.
plt.figure(figsize=(8, 8))
bx = sns.heatmap(
    train_df[resps].corr(),
    annot=True,
    cmap='RdBu',
    vmin=-1,
    vmax=1,
)
bx.set_title('Resp Correlation', pad=16, fontdict={'fontsize': 18});

In [None]:
# Count plot of the 'action' column: how many rows fall in each action value.
plt.figure(figsize=(8, 8))
bx = sns.countplot(data=train_df, x="action")
bx.set_title('Number of actions', pad=16, fontdict={'fontsize': 18})
# Axis labels are set directly on the axes returned by countplot.
bx.set_xlabel("Type of actions")
bx.set_ylabel("Number of actions")