## Installing packages

In [None]:
!pip install plotly

## Importing all the necessary packages

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
import sklearn
import janestreet
import os, sys
import gc
import math
import random
import pathlib
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostClassifier
import cv2
import plotly.express as px
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Loading the dataset

In [None]:
# Read the competition training set into a DataFrame (path is relative to the working directory).
train_df=pd.read_csv('../input/jane-street-market-prediction/train.csv')

In [None]:
# Report the row count, then show the first rows as the cell output.
n_train_rows = train_df.shape[0]
print('Total number of entries in the train dataset are:', n_train_rows)
del n_train_rows
train_df.head()

In [None]:
# Load the feature/tag table, using its first column as the index.
features_df = pd.read_csv(
    '/kaggle/input/jane-street-market-prediction/features.csv',
    index_col=0,
)
print('Total number of features are:', features_df.shape[0])
features_df.head()


# Exploratory Data Analysis

In [None]:
# DataFrame.info() prints its report itself and returns None, so it must be
# called on its own line; wrapping it in print() emitted "Properties None".
print("Properties")
train_df.info()
print("Shape:", train_df.shape)

In [None]:
# Summary statistics of train_df, shown as the cell output.
train_df.describe()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it must be
# called on its own line; wrapping it in print() emitted "Properties None".
print("Properties")
features_df.info()
print("Shape:", features_df.shape)

In [None]:
# Summary statistics of features_df, shown as the cell output.
features_df.describe()

## Training set

### Missing Data

In [None]:
# Number of missing values in each column of train_df.
train_df.isnull().sum()

In [None]:
# Boolean missing-value mask of train_df, previewed for the first rows only.
train_df.isna().head()

In [None]:
# Boolean missing-value mask of features_df, previewed for the first rows only.
features_df.isna().head()

In [None]:
# Bar chart of the number of missing values per column of train_df.
missing_counts = train_df.isnull().sum()
fig = px.bar(
    x=missing_counts.index,
    y=missing_counts.values,
    labels=dict(x="Attributes", y="Number of Missing Values"),
    title='Missing Data',
)
fig.show()
del missing_counts

### Tags

In [None]:
# Row-wise sum of features_df gives one tag count per feature.
tags = features_df.sum(axis=1)
tags_df = pd.DataFrame({'Features': tags.index.values, 'Tag Count': tags.values})

# Very wide figure with large fonts so every feature label stays legible.
plt.figure(figsize=(130, 25))
plt.xlabel('Features', fontsize=100)
plt.ylabel('Tag Count', fontsize=100)
plt.title('Tag Counts of Features', fontsize=120)
plt.xticks(rotation='vertical', fontsize=50)
plt.yticks(fontsize=50)
sns.barplot(data=tags_df, x='Features', y='Tag Count', palette="viridis")
plt.show()

# The plotting frame is no longer needed; `tags` is kept.
del tags_df

**ANALYSIS**
* feature_0 has no tags
* Features 79 to 119 all have 4 tags
* Features 7 to 36 alternate periodically between 3 and 4 tags
* A similar trend appears for features 2 to 7, 37 to 40, and 120 to 129


**Feature_0 Analysis**

In [None]:
# Frequency of each distinct value taken by feature_0.
train_df['feature_0'].value_counts()

In [None]:
# Cumulative weight * resp, drawn separately for each feature_0 value.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))
panel_specs = (
    (ax1, 1, "purple", "feature_0 = 1", "lower left"),
    (ax2, -1, "violet", "feature_0 = -1", "lower right"),
)
for panel, side, colour, heading, legend_loc in panel_specs:
    subset = train_df[train_df['feature_0'] == side].reset_index(drop=True)
    weighted_return = subset['resp'] * subset['weight']
    panel.plot(weighted_return.cumsum(), lw=3, label='return', color=colour)
    panel.set_title(heading, fontsize=18)
    panel.legend(loc=legend_loc)

# Release the per-side copies before moving on.
del panel_specs, panel, side, colour, heading, legend_loc, subset, weighted_return
gc.collect();

**ANALYSIS:** When feature_0 is 1 the plot has a negative slope, whereas when feature_0 is -1 it has a positive slope. My guess is that feature_0 corresponds to Buy (1) and Sell (-1), or vice versa. So if we set action to 1 when feature_0 = 1 we are selling, and if we set action to 1 when feature_0 = -1 we are buying (or the other way round). This makes sense, since whether we are buying or selling we can still make a loss or a profit.

### Weights

In [None]:

# Share of rows whose weight is exactly zero versus all other rows.
zero_weight_mask = train_df['weight'] == 0
null_weights = zero_weight_mask.sum()
total_weights = len(zero_weight_mask)
null_weights_per = null_weights / total_weights * 100

zero_share = zero_weight_mask.mean()
nonzero_share = 1 - zero_share
slice_labels = (
    f'Null Weights\n{round(zero_share*100,3)}%',
    f'Considerable Weights\n{round(nonzero_share*100,3)}%',
)

plt.figure(figsize=(15, 6))
plt.pie(
    (zero_share, nonzero_share),
    explode=(0, 0.1),
    shadow=True,
    labels=slice_labels,
    colors=['pink', 'green'],
)
plt.legend(title='Weights')
plt.show()
del zero_weight_mask, zero_share, nonzero_share, slice_labels

**ANALYSIS:** The dataset contains many rows with zero ("NULL") weight; these can be removed for memory efficiency.

In [None]:
# Per-day counts of zero-weight and non-zero-weight rows.
# Both counts are derived from the same per-date grouping, so the columns are
# always aligned and equally long; a day with no zero-weight (or no non-zero)
# rows gets a 0 instead of breaking the DataFrame constructor.
is_zero_weight = (train_df['weight'] == 0.0).astype('int64')
rows_per_day = is_zero_weight.groupby(train_df['date']).size()
zero_rows_per_day = is_zero_weight.groupby(train_df['date']).sum()
date_weight_df = pd.DataFrame({
    'Date': rows_per_day.index.values,
    'NULL_Weights': zero_rows_per_day.values,
    'NOT_NULL_Weights': (rows_per_day - zero_rows_per_day).values,
})
del is_zero_weight, rows_per_day, zero_rows_per_day
date_weight_df.head()

In [None]:
# Side-by-side daily bars: non-zero-weight rows on the left y-axis and
# zero-weight rows on a twin right y-axis.
# The axes are created first and styled through the Axes objects; the original
# called plt.xticks/plt.yticks before any axes existed, which styled an
# implicit axes instead of the ones that are actually drawn on.
fig, ax = plt.subplots(figsize=(505, 100))
ax2 = ax.twinx()

date_weight_df.NOT_NULL_Weights.plot(kind='bar', color='green', ax=ax, position=0)
date_weight_df.NULL_Weights.plot(kind='bar', color='pink', ax=ax2, position=1)

# Tick styling is applied after plotting so it is not reset by the bar plots.
ax.tick_params(axis='x', labelrotation=90, labelsize=60)
ax.tick_params(axis='y', labelsize=200)
ax2.tick_params(axis='y', labelsize=200)

# grid(False) switches the grid off explicitly; grid(None) only toggles it.
ax.grid(False)
ax2.grid(False)

ax.set_ylabel('NOT NULL Weights', fontsize=300)
ax2.set_ylabel('NULL Weights', fontsize=300)
ax.set_xlabel('Time (in Days)', fontsize=300)
fig.suptitle('NULL Weights Vs NOT NULL Weights per Day', fontsize=450)

ax.set_xlim(-1, 505)

plt.show()

In [None]:
# Histogram of the 'weight' column. hist_kws restricts the binned range to
# (0.001, 1), so zero weights and weights above 1 are not drawn.
plt.figure(figsize = (12,5))
ax = sns.distplot(train_df['weight'], 
             bins=1000, 
             kde_kws={"clip":(0.001,1)},  # NOTE(review): presumably inert because kde=False below - confirm
             hist_kws={"range":(0.001,1)},
             color='purple', 
             kde=False);
# Bar heights and a Normalize built from them; no later line in this cell reads `norm`.
values = np.array([rec.get_height() for rec in ax.patches])
norm = plt.Normalize(values.min(), values.max())
plt.xlabel("Histogram of non-zero weights", size=10)
plt.show();
del values
gc.collect();

**ANALYSIS** We can see that most weights are around 0.2, with two 'peaks' at roughly 0.2 and 0.3. Note that the histogram only covers weights between 0.001 and 1; larger weights (the maximum weight was 167.29) are not shown.

In [None]:
# Running sum of resp and of each resp_1..resp_4 column over all rows.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel ("Trade", fontsize=18)
ax.set_title ("Cumulative resp and time horizons 1, 2, 3, and 4 (500 days)", fontsize=18)

# `balance` is kept after the cell; the horizon curves are temporaries.
balance = train_df['resp'].cumsum()
balance.plot(lw=3)
for horizon in ('resp_1', 'resp_2', 'resp_3', 'resp_4'):
    train_df[horizon].cumsum().plot(lw=3)

plt.legend(loc="upper left")
del horizon
gc.collect();

**ANALYSIS:** 
We can see that resp is closely related to resp_4 (blue and purple). Resp_1 and resp_2 also seem to be closely related to each other, but are much more linear. Resp_3 seems to be in the middle: its shape is closer to the upper group, but its position is slightly closer to green and orange.


**POINT OF IMPORTANCE:** Weight and resp multiplied together represents a return on the trade.

In [None]:
# Add one weight * resp column per response column to train_df.
resp_columns = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']
for name in resp_columns:
    train_df[f'weight_{name}'] = train_df['weight'] * train_df[name]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel ("Day", fontsize=18)
ax.set_title ("Cumulative daily return(500 days)", fontsize=18)

# Compound (1 + mean daily weighted return) across days for each column.
daily_curves = {
    name: (1 + train_df.groupby('date')[f'weight_{name}'].mean()).cumprod()
    for name in resp_columns
}
for name, curve in daily_curves.items():
    curve.plot(lw=3, label=f'{name} x weight')
plt.legend(loc="lower left")

# Only the `resp` curve stays defined after the cell.
resp = daily_curves['resp']
del daily_curves, resp_columns, name, curve
gc.collect()

**ANALYSIS:** We can see that there were 'bigger' gains in the beginning, and that the gains become smaller as time approaches day 500. In conclusion, the earlier trades are much bigger, but we don't know what they will look like in the competition test set.

## Features

### Visualization

**Correlation Between Features**

In [None]:
# Pairwise correlation of columns 7..136 of train_df
# (presumably the feature_* columns - confirm against the CSV column order).
corr = train_df.iloc[:, 7 : 137].corr()
# Interactive heatmap of that correlation matrix; `corr` is reused by later cells.
px.imshow(corr, labels = dict(x = "Features", y = "Features"), width = 1000, height = 1000, title = "Correlation between Features",color_continuous_scale='algae')

**ANALYSIS:** Features seem to form clusters in the above correlation matrix. Features 17 to 26, 27 to 36, and 120 to 129 are some of the many examples shown. The features within such a cluster are positively correlated with each other: in a cluster, the intra-cluster distance is lower than the inter-cluster distance. Similarly, certain features are clearly negatively related to other features. Against the neutral grid, the positive and negative associations stand out!

**Correlation Between features and resps**

In [None]:
# Heatmap of the correlation between the resp columns and the features.
# The slice takes rows 2..6 and columns 7..-7 of the full correlation matrix,
# so the resps run down the y-axis and the features along the x-axis
# (assumes the column order date, weight, resp_1..resp_4, resp, feature_* - confirm).
# The axis labels were previously swapped.
plt.figure(figsize = (20, 5))
heatmap_ax = sns.heatmap(train_df.corr().iloc[2 : 7, 7 : -6], cmap = 'Paired')
heatmap_ax.set(xlabel = 'Features', ylabel = 'Resps')
plt.show()

**ANALYSIS:** Features are either positively, negatively, or neutrally correlated with the resps. A pattern can be observed in the above heatmap, which allows us to explore and dig deeper into their distributions. Greens indicate negative association, blues indicate very negative association, reds indicate no association, oranges indicate positive association, and purples, yellow, and brown indicate increasingly positive association, in that order.

**Correlation matrix is really large and confusing but there are clearly some patterns. I will cut it in parts for easier understanding and compare it to features dataframe.**

In [None]:
def show_corr_heatmap(df, method="pearson", width=10, calc_corr=False, annot=True):
    """Draw a lower-triangle heatmap of a correlation matrix.

    Parameters
    ----------
    df : DataFrame
        Raw data when ``calc_corr`` is True, otherwise an already computed
        square correlation matrix.
    method : str
        ``"MI"`` delegates to ``MI_correlations``; any other value is passed
        to ``DataFrame.corr``. Only used when ``calc_corr`` is True.
    width : number
        Side length of the square figure.
    calc_corr : bool
        Whether the correlation matrix has to be computed from ``df``.
    annot : bool
        Forwarded to ``sns.heatmap`` to write the value in each cell.

    Returns
    -------
    The computed correlation matrix when ``calc_corr`` is True, else None.
    """
    compute_here = calc_corr == True

    if not compute_here:
        corr = df
    elif method == "MI":
        corr = MI_correlations(df)
    else:
        corr = df.corr(method)

    # Hide the diagonal and everything above it.
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

    # A fresh square figure becomes the current axes for the heatmap.
    plt.subplots(figsize=(width, width))

    diverging_cmap = sns.diverging_palette(230, 20, as_cmap=True)

    sns.heatmap(
        corr,
        mask=upper_triangle,
        cmap=diverging_cmap,
        vmax=1,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        annot=annot,
        fmt=".2f",
    )

    return corr if compute_here else None


def MI_correlations(df):
    """Build a square matrix of pairwise scores between the columns of ``df``.

    Every off-diagonal cell holds ``calc_MI`` of the two columns; the diagonal
    is left as NaN because a column is never scored against itself.

    The result is explicitly reindexed so that rows and columns both follow
    the column order of ``df``. Without this, the frame built from the nested
    dict has a row order that differs from its column order, and a triangular
    mask applied to it hides the wrong cells.

    Parameters
    ----------
    df : DataFrame
        Input data; each column is compared with every other column.

    Returns
    -------
    DataFrame
        ``len(df.columns)`` x ``len(df.columns)`` matrix of scores.
    """
    columns = list(df.columns)
    scores = {
        first: {
            second: calc_MI(df[first], df[second])
            for second in columns
            if second != first
        }
        for first in columns
    }
    return pd.DataFrame(scores).reindex(index=columns, columns=columns)

def _as_discrete_labels(col):
    """Return ``col`` with categorical/object/string values replaced by integer category codes."""
    if col.dtype.name == "category":
        return col.cat.codes
    # `np.object` was removed from NumPy; the builtin `object` is the same dtype.
    if col.dtype == object or pd.api.types.is_string_dtype(col.dtype):
        return col.astype('category').cat.codes
    return col


def calc_MI(col_init, col_corr):
    """Mutual information score between two columns.

    Object, string and categorical columns are converted to integer category
    codes first; any other column is passed through unchanged.

    Parameters
    ----------
    col_init, col_corr : Series
        The two columns to compare.

    Returns
    -------
    The value returned by ``sklearn.metrics.mutual_info_score``.
    """
    # Imported here because the notebook's import cell only does `import sklearn`,
    # which does not bring `mutual_info_score` into scope.
    from sklearn.metrics import mutual_info_score

    return mutual_info_score(
        _as_discrete_labels(col_init),
        _as_discrete_labels(col_corr),
    )

**Features 0-50**

In [None]:
unnamed_features = [x for x in train_df.columns if "feature" in x]
# Select every row and only slice the columns: `.iloc[0:50, 0:50]` also cut the
# data down to its first 50 rows, so the correlation was estimated from 50
# samples. The column slice is 0:51 so it covers features 0-50 as in the heading.
corr_matrix = show_corr_heatmap(train_df[unnamed_features].iloc[:, 0:51], method="spearman", width=30, calc_corr=True, annot=True)

**Features 51-100**

In [None]:
# Select every row and only slice the columns: `.iloc[51:100, 51:100]` also cut
# the data down to 49 rows, so the correlation was estimated from 49 samples.
# The column slice is 51:101 so it covers features 51-100 as in the heading.
corr_matrix = show_corr_heatmap(train_df[unnamed_features].iloc[:, 51:101], method="spearman", width=30, calc_corr=True, annot=True)
gc.collect()

**ANALYSIS:** Features 73-95 are highly correlated. Features 85-95 are so closely related that they may have a linear relationship.

**Features 110-120**

In [None]:
# Pair plot of a 10x10 sub-block of the correlation matrix `corr`.
# NOTE(review): the points plotted are correlation coefficients, not raw
# feature values from train_df - confirm this is what was intended.
sns.pairplot(corr.iloc[110 : 120, 110 : 120])
plt.show()

**Features 120-130**

In [None]:
# Pair plot of a 10x10 sub-block of the correlation matrix `corr`.
# NOTE(review): the points plotted are correlation coefficients, not raw
# feature values from train_df - confirm this is what was intended.
sns.pairplot(corr.iloc[120 : 130, 120 : 130])
plt.show()

**Features with resps**

In [None]:
# One 2x2 figure per feature: the feature plotted against resp, resp_1, resp_2 and resp_3.
panel_specs = (
    ("resp", "Resp", 'red'),
    ("resp_1", "Resp 1", 'blue'),
    ("resp_2", "Resp 2", 'green'),
    ("resp_3", "Resp 3", 'yellow'),
)
for i in range(120, 122):

    fig, axes = plt.subplots(2, 2, figsize=(12,12))

    # axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
    for panel, (target, target_label, colour) in zip(axes.flat, panel_specs):
        sns.scatterplot(data=train_df, x=f"feature_{i}", y=target, ax=panel, color=colour)
        panel.set_title(f"Feature {i} and {target_label}", fontsize=12)
        panel.legend(labels=[f'Feature {i}'])

    plt.show()
del panel_specs, panel, target, target_label, colour
gc.collect()

### ts_ids

In [None]:
# Number of ts_id entries recorded on each date.
fig, ax = plt.subplots(figsize=(15, 5))
daily_trade_counts = train_df.groupby('date')['ts_id'].count()
ax.plot(daily_trade_counts, color="purple")
ax.set_xlabel ("Day", fontsize=18)
ax.set_title ("Total number of ts_id for each day", fontsize=18)
# Only days 0 to 200 are shown on the x-axis.
ax.set_xlim(left=0, right=200)
plt.show()
del daily_trade_counts
gc.collect()

# Preprocessing

In [None]:
# Names of every column whose label contains 'feature'.
features = [name for name in train_df.columns if 'feature' in name]

# Keep only the rows whose weight is not zero.
has_weight = train_df['weight'] != 0
train_df = train_df.loc[has_weight]
del has_weight

### Cleaning the dataset

In [None]:
# Binary target: 1 where resp is strictly positive, otherwise 0.
train_df['action'] = np.where(train_df['resp'].to_numpy() > 0, 1, 0)

Imputing NaN values with the column mean

In [None]:
NAN_VALUE = -999

f_mean = train_df.mean()
train_df.fillna(f_mean)

X_train = train_df.loc[:, features]
y_train = train_df.loc[:, 'action']

del train_df
gc.collect()

## Using Catboost Classifier

In [None]:

print('Creating classifier...', end='')
clf = CatBoostClassifier(loss_function = 'Logloss',
                         task_type="GPU",
                         learning_rate = 0.1)


clf.fit(X_train, y_train)

print('Finished.')

del X_train, y_train
gc.collect()

In [None]:

env = janestreet.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:    
    test_weight = test_df.iloc[0].weight
    if test_weight > 0:
        sample_prediction_df.action = clf.predict(test_df.loc[:, features].fillna(NAN_VALUE))[0]
    else:
        sample_prediction_df.action = 0
    env.predict(sample_prediction_df)