In [None]:
# Install datatable 0.11.0 from a local wheel file under ../input; both stdout and stderr are discarded.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
import os
import gc
import sys
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
import datatable as dt
import matplotlib.pyplot as plt
# Switch matplotlib to the 'tableau-colorblind10' style sheet for the figures drawn below.
plt.style.use('tableau-colorblind10')

In [None]:
# Directory holding the competition CSV files.
folder_path = '../input/jane-street-market-prediction/'

# Parse each CSV with datatable's reader, then convert the result to a pandas
# DataFrame. The files are read in order: train.csv first, features.csv second.
train_data, features = (
    dt.fread(os.path.join(folder_path, csv_name)).to_pandas()
    for csv_name in ('train.csv', 'features.csv')
)

In [None]:
# Rank every column by its number of missing values, largest count first.
# Missing count = total rows minus the non-null count reported by DataFrame.count().
top_nan_features_vals = (
    train_data.shape[0] - train_data.count()
).sort_values(ascending=False)

# Column names in that same "most missing first" order.
top_nan_features = top_nan_features_vals.index

In [None]:
def _nan_count_per_column(day_rows):
    """Return, for one date's rows, the number of NaNs in each column."""
    return day_rows.isna().sum()


# One row per date, one column per original column: NaN count on that date.
date_feature_nan = train_data.groupby('date').apply(_nan_count_per_column)

In [None]:
# One panel per feature: per-date NaN count of the ten columns with the most NaNs.
date_feature_nan.loc[:, top_nan_features[:10]].plot(subplots=True, figsize=(12, 6));

# Why do features miss values together?

In [None]:
# The ten columns with the most NaNs, plus 'date' so each day can be selected.
features_and_date = list(top_nan_features[:10]) + ['date']

# Draw the same figure for dates 0..5 (previously six copy-pasted cells that
# differed only in the date literal). One subplot per column; gaps in a line
# are the missing values.
for day in range(6):
    axes = (
        train_data.loc[train_data['date'] == day, features_and_date]
        .plot(subplots=True, figsize=(12, 8))
    )
    # Label the figure so the days remain distinguishable when rendered together.
    axes[0].get_figure().suptitle(f'Top 10 NaN features, date == {day}')

By visualizing the 10 features with the highest NaN counts over the first six days (dates 0–5), we can see that they show a consistent pattern of missing values: one gap is always present at the beginning of the day, and another one at the beginning of the last third of the day.

If we keep visualizing different dates we would find the same pattern happening. But why does that happen? Is it just noise or is this some meaningful aspect of the data?

Let's take a full view of the missing-data pattern of all the features over the first day to get a clearer picture.

In [None]:
# Display the NaN counts of the 60 columns with the most missing values.
top_nan_features_vals.head(60)

If we just look at the number of missing values, we can see that the features cluster with each other. And if we visualize them in this order, we can see the clusters clearly.

In [None]:
# The sixty columns with the most NaNs, plus 'date' so each day can be selected.
features_and_date = list(top_nan_features[:60]) + ['date']

# Same figure for three sample dates (previously three copy-pasted cells that
# differed only in the date literal). Legends are hidden because there is one
# subplot per column.
for day in (0, 1, 136):
    axes = (
        train_data.loc[train_data['date'] == day, features_and_date]
        .plot(subplots=True, figsize=(12, 8), legend=False)
    )
    # Label the figure so the days remain distinguishable when rendered together.
    axes[0].get_figure().suptitle(f'Top 60 NaN features, date == {day}')

# Testing PCA of feature clusters

I'll define the clusters based on similar patterns in NaN values, and I'll first test the idea with the top 14 features.

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import numpy as np

# First cluster: the 14 columns with the most NaNs.
cluster_1_features = top_nan_features[:14]

# Impute in place: forward-fill, then set any NaN that has no earlier value to 0.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` with the same result.
# NOTE(review): the forward-fill is not grouped by date, so (assuming rows are
# ordered by date) a gap at the start of a day is filled from the previous day -
# confirm this is intended.
train_data.loc[:, cluster_1_features] = train_data.loc[:, cluster_1_features].ffill().fillna(0)

# Pipeline: MinMaxScaler (default [0, 1] range per column) followed by a PCA
# that keeps all components.
steps = [('scaler', MinMaxScaler()), ('pca', PCA(random_state=42))]
pipeline = Pipeline(steps=steps)
pipeline.fit(train_data.loc[:, cluster_1_features])

# Cumulative share of variance captured by the first k principal components.
cumulative_variance = pipeline['pca'].explained_variance_ratio_.cumsum()

plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(cluster_1_features) + 1), cumulative_variance);
# The x-axis counts principal components, not original features.
plt.xlabel('No. of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
# Dashed line marks the component count discussed in the text below.
plt.axvline(5, color='r', linestyle='--');

We can use 5 principal components to explain more than 90% of the variance of the original cluster.

In [None]:
import numpy as np

# Second cluster: columns ranked 15th to 31st by NaN count (17 columns).
cluster_2_features = top_nan_features[14:31]

# Impute in place: forward-fill, then set any NaN that has no earlier value to 0.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` with the same result.
# NOTE(review): the forward-fill is not grouped by date, so (assuming rows are
# ordered by date) a gap at the start of a day is filled from the previous day -
# confirm this is intended.
train_data.loc[:, cluster_2_features] = train_data.loc[:, cluster_2_features].ffill().fillna(0)

# Pipeline: MinMaxScaler (default [0, 1] range per column) followed by a PCA
# that keeps all components.
steps = [('scaler', MinMaxScaler()), ('pca', PCA(random_state=42))]
pipeline = Pipeline(steps=steps)
pipeline.fit(train_data.loc[:, cluster_2_features])

# Cumulative share of variance captured by the first k principal components.
cumulative_variance = pipeline['pca'].explained_variance_ratio_.cumsum()

plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(cluster_2_features) + 1), cumulative_variance);
# The x-axis counts principal components, not original features.
plt.xlabel('No. of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
# Dashed line marks the component count discussed in the text below.
plt.axvline(7, color='r', linestyle='--');

7 components in the second cluster is suitable for explaining 95% of the variance.

In [None]:
# Third cluster: columns ranked 32nd to 47th by NaN count (16 columns).
cluster_3_features = top_nan_features[31:47]

# Impute in place: forward-fill, then set any NaN that has no earlier value to 0.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` with the same result.
# NOTE(review): the forward-fill is not grouped by date, so (assuming rows are
# ordered by date) a gap at the start of a day is filled from the previous day -
# confirm this is intended.
train_data.loc[:, cluster_3_features] = train_data.loc[:, cluster_3_features].ffill().fillna(0)

# Pipeline: MinMaxScaler (default [0, 1] range per column) followed by a PCA
# that keeps all components.
steps = [('scaler', MinMaxScaler()), ('pca', PCA(random_state=42))]
pipeline = Pipeline(steps=steps)
pipeline.fit(train_data.loc[:, cluster_3_features])

# Cumulative share of variance captured by the first k principal components.
cumulative_variance = pipeline['pca'].explained_variance_ratio_.cumsum()

plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(cluster_3_features) + 1), cumulative_variance);
# The x-axis counts principal components, not original features.
plt.xlabel('No. of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
# Dashed line marks the component count discussed in the text below.
plt.axvline(7, color='r', linestyle='--');

7 components in the third cluster is suitable for explaining 95% of the variance.

In [None]:
# Fourth cluster: columns ranked 48th to 63rd by NaN count (16 columns).
cluster_4_features = top_nan_features[47:63]

# Impute in place: forward-fill, then set any NaN that has no earlier value to 0.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` with the same result.
# NOTE(review): the forward-fill is not grouped by date, so (assuming rows are
# ordered by date) a gap at the start of a day is filled from the previous day -
# confirm this is intended.
train_data.loc[:, cluster_4_features] = train_data.loc[:, cluster_4_features].ffill().fillna(0)

# Pipeline: MinMaxScaler (default [0, 1] range per column) followed by a PCA
# that keeps all components.
steps = [('scaler', MinMaxScaler()), ('pca', PCA(random_state=42))]
pipeline = Pipeline(steps=steps)
pipeline.fit(train_data.loc[:, cluster_4_features])

# Cumulative share of variance captured by the first k principal components.
cumulative_variance = pipeline['pca'].explained_variance_ratio_.cumsum()

plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(cluster_4_features) + 1), cumulative_variance);
# The x-axis counts principal components, not original features.
plt.xlabel('No. of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
# Dashed line marks the component count discussed in the text below.
plt.axvline(6, color='r', linestyle='--');

6 components in the fourth cluster are almost enough, explaining around 94% of the variance.

In [None]:
# Look at the remaining NaN-ranked columns (positions 63..86) on a slice of day 0.
features_and_date = list(top_nan_features[63:87]) + ['date', 'ts_id']

# Rows of date 0 whose ts_id lies strictly between 3000 and 4000.
in_window = (
    (train_data['date'] == 0)
    & (train_data['ts_id'] > 3000)
    & (train_data['ts_id'] < 4000)
)
train_data.loc[in_window, features_and_date].plot(subplots=True, figsize=(12, 8), legend=False);

We can see that among these features, some are nearly identical, which could merit the use of PCA.

In [None]:
# Fifth cluster: columns ranked 64th to 87th by NaN count (24 columns).
cluster_5_features = top_nan_features[63:87]

# Impute in place: forward-fill, then set any NaN that has no earlier value to 0.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` with the same result.
# NOTE(review): the forward-fill is not grouped by date, so (assuming rows are
# ordered by date) a gap at the start of a day is filled from the previous day -
# confirm this is intended.
train_data.loc[:, cluster_5_features] = train_data.loc[:, cluster_5_features].ffill().fillna(0)

# Pipeline: MinMaxScaler (default [0, 1] range per column) followed by a PCA
# that keeps all components.
steps = [('scaler', MinMaxScaler()), ('pca', PCA(random_state=42))]
pipeline = Pipeline(steps=steps)
pipeline.fit(train_data.loc[:, cluster_5_features])

# Cumulative share of variance captured by the first k principal components.
cumulative_variance = pipeline['pca'].explained_variance_ratio_.cumsum()

plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(cluster_5_features) + 1), cumulative_variance);
# The x-axis counts principal components, not original features.
plt.xlabel('No. of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
# Dashed line marks the component count discussed in the text below.
plt.axvline(7, color='r', linestyle='--');

7 components in the fifth cluster are almost enough, explaining around 94% of the variance.

**Now maybe these extracted features could be used along all features or along the non NaN features.**