In [None]:
import gc

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from tensorflow import random

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sn


# Seed every RNG the notebook relies on, not only TensorFlow's: scikit-learn
# falls back to NumPy's global RNG whenever `random_state` is left unset, so
# without the second call stochastic estimators differ between runs.
random.set_seed(5577)
np.random.seed(5577)

## Data Loading

In [None]:
%%time
trainDf = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')

## Reducing Memory Usage
I had trouble using the dataset because it takes around 5 GB of RAM right after being loaded. I found this function from [sbunzini](https://www.kaggle.com/sbunzini/reduce-memory-usage-by-75) to mitigate the issue.

In [None]:
def reduce_memory_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    Each column is handled according to its dtype:

    * object / string columns are converted to ``category``;
    * signed integers are narrowed to the smallest of int8/int16/int32 that
      holds the column's min and max;
    * unsigned integers are narrowed the same way within uint8/uint16/uint32;
    * floats are narrowed to float16 (when allowed) or float32;
    * everything else (bool, datetime, timedelta, complex, categorical and
      other extension dtypes) is left untouched.

    A column is only ever narrowed, never widened. Columns whose min/max is
    NaN (empty or all-missing) or infinite fail the range check and keep
    their dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default True
        When True, floats whose range fits are stored as float16. float16
        has an 11-bit significand (roughly 3 significant decimal digits), so
        this trades precision for memory; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")

    # Candidate dtypes per family, ordered from narrowest to widest.
    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, pd.StringDtype) or col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable ints, ...) have no NumPy
        # `kind`; leave them alone instead of comparing them to float limits.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            candidates, limits_of = signed_candidates, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_candidates, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            # bool, datetime, timedelta, complex: nothing sensible to narrow to.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ascending, so once one is no smaller than the
            # current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    # Guard against a frame that reports zero bytes.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print(f"Reduced by {reduction} % ")
    return df

# Shrink the freshly loaded frame before doing anything else with it.
trainDf = trainDf.pipe(reduce_memory_usage)

## Data Preparation

Removing the columns that will not be used as features during the training phase.

In [None]:
# Columns excluded from the feature set.
dropCols = [
    "resp",
    "resp_1",
    "resp_2",
    "resp_3",
    "resp_4",
    "ts_id",
]
trainDf = trainDf.drop(dropCols, axis="columns")

Filling missing values ("na") with 0 for starters. It might be wiser to use another technique (e.g. mean imputation), but for a first version this will do the job.

In [None]:
# Number of missing values in each column.
trainDf.isna().sum()

In [None]:
# Replace every missing value with 0.
trainDf = trainDf.fillna(0)

According to the data tab of the competition :
> Trades with weight = 0 were intentionally included in the dataset for completeness, although such trades will not contribute towards the scoring evaluation.

So, I make a slice without those rows before looking into the details.

In [None]:
# Keep only the rows with a strictly positive weight, then preview the slice.
trainDfW = trainDf.loc[trainDf["weight"].gt(0)]
trainDfW.head()

## Data Exploration
### Basics

In [None]:
# (rows, columns) of the full training frame, zero-weight rows included.
trainDf.shape

In [None]:
# (rows, columns) of the slice restricted to rows with weight > 0.
trainDfW.shape

In [None]:
# First rows of the positive-weight slice.
trainDfW.head()

In [None]:
# Per-column summary statistics of the positive-weight slice.
# NOTE(review): columns may have been downcast to float16 by
# reduce_memory_usage; aggregates computed in half precision can overflow or
# lose accuracy -- confirm the means/stds shown here look sane.
trainDfW.describe()

## Data Understanding

### Correlation Matrix

In [None]:
%%time
corrDfW = trainDfW.corr()
fig, ax = plt.subplots(figsize=(25,25)) 
sn.heatmap(corrDfW, linewidths=.5, annot=False, ax=ax)
plt.show()

Although the matrix is quite dense, it makes it possible to identify some interesting clusters. Some features seem highly (positively or negatively) correlated. The next step would be to pinpoint those features and check in features.csv whether they share the same tags... ToDo

### PCA

In [None]:
%%time
scaler = MinMaxScaler()
scaledTrain = scaler.fit_transform(trainDfW)

pca = PCA().fit(scaledTrain)
exCumul = np.cumsum(pca.explained_variance_ratio_)
px.area(
    x=range(1, exCumul.shape[0] + 1),
    y=exCumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

Here, we can see that:
* One of the components accounts for more than a third (36.9%) of the total variance
* 8 components are enough to explain 90% of the variance
* 11 components are enough to explain 95% of the variance

In [None]:
# Project the scaled data onto its first two principal components.
# `random_state` is pinned because, with so few components requested,
# scikit-learn may pick its randomized SVD solver, whose output otherwise
# depends on NumPy's global RNG state and changes between runs.
pca = PCA(n_components=2, random_state=5577)
dfComp = pca.fit_transform(scaledTrain)

# Share of the total variance captured by the two components, in percent.
total_var = pca.explained_variance_ratio_.sum() * 100
fig = px.scatter(
    dfComp,
    x=0,
    y=1,
    color=trainDfW['weight'],
    title=f'Total Explained Variance: {total_var:.3f}%',
    labels={'0': 'PC 1', '1': 'PC 2'},
)
fig.show()

Now let's take a look at the first two principal components when we remove feature_0 from the dataset.

In [None]:
# Same 2-D projection as above, this time without `feature_0`.
# `columns=` is used because passing the axis positionally (`drop(label, 1)`)
# is rejected by pandas 2.x.
dfNoF0 = trainDfW.drop(columns="feature_0")
# Re-fits the shared `scaler` on the reduced column set.
scaledTrainNoF0 = scaler.fit_transform(dfNoF0)
# `random_state` is pinned so the projection is identical on every run even
# when scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=2, random_state=5577)
dfComp = pca.fit_transform(scaledTrainNoF0)

# Share of the total variance captured by the two components, in percent.
total_var = pca.explained_variance_ratio_.sum() * 100
fig = px.scatter(
    dfComp,
    x=0,
    y=1,
    color=trainDfW['weight'],
    title=f'Total Explained Variance: {total_var:.3f}%',
    labels={'0': 'PC 1', '1': 'PC 2'},
)
fig.show()