# This file provides some useful tools for looking at the numbered features. I'll also provide a quick analysis of the early numbered features.

In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns

# Print the full path of every file found under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Load the training data (from the competition CSV; the pre-saved feather is not available when the notebook is shared).

In [None]:
# The feather written by a separate notebook does not survive sharing (it would
# probably have to be published as a dataset instead of a file), so the CSV is
# loaded here even though that takes a few minutes.
# train = pd.read_feather('/kaggle/input/janestreet-data/train.feather')
train_csv_path = '/kaggle/input/jane-street-market-prediction/train.csv'
train = pd.read_csv(train_csv_path)
print(train.head())

# Calculate the cross-correlation matrix and plot it.

In [None]:
# Pairwise correlation of every column of train with every other column.
# NOTE(review): presumably all columns are numeric; recent pandas versions may
# need numeric_only=True otherwise -- confirm against the loaded data.
correlation_matrix = train.corr()

In [None]:
# Diverging palette, centred on zero in the heatmap below, so that positive and
# negative correlations are drawn in different hues.
cmap = sns.diverging_palette(250, 20, s=99, as_cmap=True)

# The upper triangle is deliberately not masked out: the full square is shown,
# so keep in mind that the plot is symmetric about the x=y diagonal.
plt.figure(figsize=(18, 16))
sns.heatmap(
    correlation_matrix,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=cmap,
    square=True,
    linewidths=0.02,
    cbar_kws={"shrink": 0.5},
)

# Plot the cross-correlation matrix for the first 42 numbered features.

1. Note that the features come in pairs.
2. There are related sets of features, including 3-6, 7-16, 17-26, and 27-36. These sets share similar patterns in NA values (not shown). These sets also show shared relationships to feature_0. 
3. If you look closely, the internal structure of the correlations between features 7-16 and 17-26 follows similar patterns. For example, feature_7 is more correlated with feature_11 than with feature_9. Following the same pattern, feature_17 is more correlated with feature_21 than with feature_19. Such patterns are consistent between the ranges 7-16, 17-26, and 27-36.
4. Partial speculation: features 3 to 36 are all closely related. I'm guessing that the earlier features represent something like 'raw data' and that the later (higher-numbered) features are calculated from the earlier (lower-numbered) features. Features 37-40 are the summary results.

In [None]:
# Position of feature_0 within the correlation matrix. It is looked up by name
# rather than hard-coded so the slice below stays correct if the column order
# of the data ever changes.
idx_0 = correlation_matrix.columns.get_loc('feature_0')

# These values can be changed to get different ranges of features. The range is
# end-exclusive: features min_feature .. max_feature - 1 are shown.
min_feature = 0
max_feature = 42

# The same positional range is used for rows and columns, giving a square
# sub-matrix of the full correlation matrix.
feature_range = slice(idx_0 + min_feature, idx_0 + max_feature)

plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix.iloc[feature_range, feature_range],
            cmap=cmap, vmax=1, vmin=-1, center=0, square=True, linewidths=.05,
            cbar_kws={"shrink": .5})

# This section provides some useful summary plots of a single feature.

In [None]:
# Number of the feature to study; change it to look at a different feature.
feat_no = 7
feature = f'feature_{feat_no}'

# Window for the time-series plot: `days` consecutive days starting at
# `first_day`. A few days are far easier to read than the whole dataset.
first_day = 100
days = 3

def plot_time_series(train, first_day, days, feature, ax=None):
    """Scatter-plot one feature over a window of consecutive days.

    Rows are kept when ``first_day <= train.date < first_day + days`` and are
    plotted in their existing order against a running row count, coloured by
    ``date``.

    Args:
        train: DataFrame with a ``date`` column and a column named ``feature``.
        first_day: First value of ``date`` to include.
        days: Number of days to include, starting at ``first_day``.
        feature: Name of the column plotted on the y-axis.
        ax: Matplotlib axes to draw on. When omitted, a new 20x6 figure is
            created and its axes are used.
    """
    if ax is None:
        # Only open a figure when the caller did not supply axes; otherwise
        # every call would leave an extra, empty figure behind.
        plt.figure(figsize=(20, 6))
        ax = plt.gca()
    in_window = (train.date >= first_day) & (train.date < first_day + days)
    window = train.loc[in_window]
    # x and y are passed by keyword because recent seaborn releases no longer
    # accept them positionally. size=0 is kept from the original call so the
    # rendering is unchanged.
    sns.scatterplot(x=range(len(window)), y=window[feature], hue=window.date,
                    size=0, ax=ax)
    
plot_time_series(train=train, first_day=first_day, days=days, feature=feature)

# Distribution of the selected feature over the whole training set.
plt.figure(figsize=(6, 6))
sns.distplot(train[feature])

# Using the feature selected in the last section, let's examine its relationship with our target variable, resp.

For the selected feature, you get very different results based on the value of feature_0. That's often true.

In [None]:
# Show which feature (chosen in the previous code cell) the plots refer to.
print(feature)

# Column to compare against, and the number of quantile buckets to average over.
target, buckets = 'resp', 20

def feature_vs_target(train, feature, target, buckets, ax=None):
    """Plot the mean target against the mean feature within quantile buckets.

    Rows are split into up to ``buckets`` quantile bins of ``feature`` (fewer
    when bin edges coincide, because duplicate edges are dropped). ``feature``
    and ``target`` are averaged within each bin and one point per bin is drawn.

    Args:
        train: DataFrame containing the ``feature`` and ``target`` columns.
        feature: Name of the column to bucket on (x-axis).
        target: Name of the column averaged per bucket (y-axis).
        buckets: Number of quantile bins requested from ``pd.qcut``.
        ax: Matplotlib axes to draw on, passed straight to seaborn.

    Returns:
        DataFrame indexed by bin, holding the per-bin means that were plotted.
    """
    # De-duplicate so that feature == target does not yield two identically
    # named columns, and copy so that adding 'bins' never writes into (or warns
    # about) a slice of the caller's frame.
    columns = list(dict.fromkeys([target, feature]))
    df = train.loc[:, columns].copy()
    # Column recording which quantile bucket each feature value falls into.
    df['bins'] = pd.qcut(df[feature], buckets, duplicates='drop')
    # Average feature and target within each bucket. observed=False states the
    # long-standing groupby default explicitly so the result does not depend on
    # the installed pandas version.
    plot_data = df.groupby('bins', observed=False).mean()
    # One point per bucket: mean feature value against mean target value.
    sns.scatterplot(x=plot_data[feature], y=plot_data[target], ax=ax)
    return plot_data

# The code that calls the above function.
# Draw the relationship separately for each sign of feature_0, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
negative_ax, positive_ax = axes

feature_vs_target(train.loc[train.feature_0 < 0], feature, target, buckets, ax=negative_ax)
negative_ax.set_title(f'{feature} vs. RESP, feature_0 = -1')

feature_vs_target(train.loc[train.feature_0 > 0], feature, target, buckets, ax=positive_ax)
positive_ax.set_title(f'{feature} vs. RESP, feature_0 = +1')

# This section allows comparison of two variables' relationships with the target variable and with each other.

1. For example, using features 37 and 39 will show that they both predict resp. Further, they are not independently distributed. In fact, they neatly split the examples (rows) into two different distributions in the joint distribution plot.
2. All of the features between 7 and 36 appear (to me) to be intermediate calculations that lead to features 37-40, which split the data into cleanly defined groups. These groups would be easily picked up by a machine learning algorithm.
3. In summary, I believe that you are looking at Jane Street's feature engineering effort, which created features 37-40 from lower-numbered features.
4. Feature 41 represents the start of a new type of data.

These conclusions are based on looking at a lot of combinations and individual features. Feel free to use the tools provided (or add new ones) to draw your own conclusions.

In [None]:
# Select the two features to be compared.
feat1_no = 37
feat2_no = 39

feature1 = f'feature_{feat1_no}'
feature2 = f'feature_{feat2_no}'

# Overlay both features' bucketed relationship with the target variable on one
# set of axes.
plt.figure(figsize=(8, 8))
feature_vs_target(train, feature1, target, buckets)
feature_vs_target(train, feature2, target, buckets)

# Joint distribution of the two selected features (not of the target). To save
# time, only a 20,000-row slice of the data is used. jointplot builds its own
# figure, so its size is set through `height`; a preceding plt.figure() call
# would only leave an empty figure behind.
sns.jointplot(data=train.iloc[100_000:120_000], x=feature1, y=feature2,
              kind='kde', height=8)

# Create summary plots for all features.

You aren't meant to study all of these plots in detail. They are more of a reference, but you can see some interesting things by scanning through them.

In [None]:
# Settings shared by the summary plots.
target, buckets = 'resp', 20
first_day, days = 100, 2


# One row of three panels for each of the first six (unnumbered) columns.
for col in train.columns[0:6]:
    fig, (dist_ax, resp_ax, series_ax) = plt.subplots(1, 3, figsize=(15, 4))
    fig.suptitle(col.capitalize())

    # Panel 1: distribution of the column.
    dist_ax.set_title('Distribution')
    sns.distplot(train[col], ax=dist_ax)

    # Panel 2: bucketed relationship with the target.
    feature_vs_target(train, col, target, buckets, ax=resp_ax)
    resp_ax.set_title("Relationship with RESP")

    # Panel 3: the column over the selected window of days.
    plot_time_series(train, first_day, days, col, ax=series_ax)
    series_ax.set_title('Two Days of Data')

In [None]:
# Plot the numbered columns.

# This loop opens many figures, which triggers matplotlib's warning about the
# number of open figures; setting the limit to 0 turns that warning off.
plt.rcParams.update({'figure.max_open_warning': 0})

# The feature_0 row filters do not depend on the column being plotted, so they
# are built once here instead of being recomputed on every iteration.
negative_rows = train.feature_0 < 0
positive_rows = train.feature_0 > 0

# One row of four panels for every column from position 8 onwards.
for col in train.columns[8:]:
    fig, axes = plt.subplots(1, 4, figsize=(18, 4))
    fig.suptitle(col.capitalize())
    axes[0].set_title('Distribution')
    sns.distplot(train[col], ax=axes[0])
    # feature_vs_target only reads the target and feature columns, so only
    # those two are selected; this avoids copying every column of train twice
    # per iteration.
    feature_vs_target(train.loc[negative_rows, [target, col]], col, target, buckets, ax=axes[1])
    axes[1].set_title('Feature vs. RESP, feature_0 = -1')
    feature_vs_target(train.loc[positive_rows, [target, col]], col, target, buckets, ax=axes[2])
    axes[2].set_title('Feature vs. RESP, feature_0 = +1')
    plot_time_series(train, first_day, days, col, ax=axes[3])
    axes[3].set_title('Two Days of Data')