In [None]:
# DEFAULT IMPORTS

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# CUSTOM IMPORTS

# Faster load packages

!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1
import datatable as dt

# Garbage collection

import gc

# For creating deep copies

import copy

# Charts

import matplotlib.pyplot as plt
%matplotlib inline
!pip install seaborn --upgrade
import seaborn as sns

# Statistics

from scipy.stats import pearsonr, spearmanr

In [None]:
# CONSTANTS AND VARIABLES

# Seaborn settings

sns.set(style="darkgrid")

# Colors for graphs

color_resp = ["#000080", "#0000EE", "#5190ED", "#88ACE0", "#B0C4DE"]

# Correlations p-values

def pearsonr_pval(x,y):
    """Return only the p-value (second element) of scipy's `pearsonr(x, y)` result."""
    _, p_value = pearsonr(x, y)
    return p_value

def spearmanr_pval(x,y):
    """Return only the p-value (second element) of scipy's `spearmanr(x, y)` result."""
    _, p_value = spearmanr(x, y)
    return p_value

# Load Data

To make loading faster and use less memory, I read the data with datatable and then convert it to a pandas DataFrame. This approach is inspired by [Jane Street: EDA of day 0 and feature importance](https://www.kaggle.com/carlmcbrideellis/jane-street-eda-of-day-0-and-feature-importance#train_csv) and [Tutorial on reading large datasets](https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets#Method:-Datatable). The datatable object is only temporary, so I delete it as soon as it is no longer needed in order to save memory.



In [None]:
# train.csv: parse with datatable and convert straight to pandas. The datatable
# frame is never bound to a name, so it is dropped as soon as the conversion returns.

df_full_train = dt.fread('../input/jane-street-market-prediction/train.csv').to_pandas()

# features.csv: same route.

df_full_features = dt.fread('../input/jane-street-market-prediction/features.csv').to_pandas()

gc.collect();

`df_full_train` - DataFrame with all the data from train.csv.

`df_full_features` - DataFrame with all the data from features.csv

To save even more memory without losing significant precision, it is fine to convert the float64 columns of the `df_full_train` DataFrame to float32. This is inspired by [One-liner to Halve Your Memory Usage](https://www.kaggle.com/jorijnsmit/one-liner-to-halve-your-memory-usage).

In [None]:
# Map every float64 column name to float32 and downcast them all in one astype call.
df_full_train = df_full_train.astype(
    dict.fromkeys(df_full_train.select_dtypes(include='float64').columns, np.float32)
)

# General dataset overview

## Meaning of the values in the dataset

This is what we know for now about the data:

* `date` - the day of the trade.
* `feature_{0...129}` - real stock market data features. Examples of such features could be volume over various time horizons, volatility over various time horizons, or indicators like RSI. The features are anonymized.
* `resp` - represents a return.
* `resp_1`, `resp_2`, `resp_3`, `resp_4` - it is a supplemental data of returns over specific but unspecified time horizons.
* `tag_{0...28}` - components/concepts used in future derivation.
* `ts_id` - time ordering.
* `weight` - the importance of the trade assigned by Jane Street which probably is some kind of ratio of transaction cost or in other words the capital invested in the trade.

`train.csv` consists of numerical data of resps, weights and features for specific trading opportunities. `features.csv` consists of information which tags are connected with the features (e.g. if feature_4 is the volatility of stock A in last 30 days and if tag_8 is volatility, tag_12 is 30 days, tag_16 is 5 days, then the intersection of the records mentioned above will be respectively True, True, False).

More information on the meaning of the values:
* [Data card in the competition page](https://www.kaggle.com/c/jane-street-market-prediction/data)
* [Discussion with janestreet-jjia](https://www.kaggle.com/c/jane-street-market-prediction/discussion/198965)
* [Metric de-anonymization by miguel perez](https://www.kaggle.com/c/jane-street-market-prediction/discussion/199107)
* [Question about resps answered by Will Cukierski](https://www.kaggle.com/c/jane-street-market-prediction/discussion/199478)

## train.csv

In [None]:
df_full_train.info()

In [None]:
df_full_train.head(20)

In [None]:
df_full_train.describe()

In [None]:
df_full_train.columns.values

In [None]:
df_full_train.groupby(['date']).count()

## features.csv

In [None]:
df_full_features.info()

In [None]:
df_full_features["feature"].values

In [None]:
df_full_features.head(20)

In [None]:
df_full_features.describe()

# Date

From the outputs below you can see that the date values range from 0 to 499. There are 500 distinct dates in 2390491 rows. The dates do not all occur the same number of times. Dates are monotonically increasing. There are no NaN values in the date column. Each value represents the day of the trade.

In [None]:
df_full_train.groupby(['date']).size().values

In [None]:
df_full_train["date"].describe().apply("{0:.0f}".format)

In [None]:
# `Series.is_monotonic` is deprecated and no longer exists in pandas 2.x;
# `is_monotonic_increasing` performs the same non-decreasing check.
df_full_train["date"].is_monotonic_increasing

In [None]:
df_full_train["date"].isna().values.any()

In [None]:
fig = plt.figure(constrained_layout=True, figsize=(12, 6))
fig.add_subplot()
fig.suptitle('Days and opportunities')
sns.lineplot(data=df_full_train["date"], ci=None);

In [None]:
del fig
gc.collect();

# ts_id

`ts_id` is an identifier of the time ordering. It turns out that it is identical to the DataFrame index, which means that the dataset is sorted by the time of the opportunity.

In [None]:
df_full_train["ts_id"].head(100)

In [None]:
df_full_train["ts_id"].tail(100)

In [None]:
df_full_train["ts_id"].describe().apply("{0:.1f}".format)

In [None]:
# `Series.is_monotonic` is deprecated and no longer exists in pandas 2.x;
# `is_monotonic_increasing` performs the same non-decreasing check.
df_full_train["ts_id"].is_monotonic_increasing

In [None]:
df_full_train["ts_id"].isna().values.any()

# resp

`resp`s are returns. Their values seem to be expressed as fractions, so for example (for the first `resp`) 0.006270 is 0.627%.

In [None]:
df_train_resps = df_full_train[["resp", "resp_1", "resp_2", "resp_3", "resp_4"]]

In [None]:
df_train_resps.head(100)

In [None]:
df_train_resps.tail(100)

We can see that the medians and means of all the `resp`s are above 0. The minimum values are larger in absolute terms than the maximum values.

In [None]:
df_train_resps.describe().applymap("{0:.5f}".format)

Time series for all of the `resp`s look random, but they do not need to be random. Charts confirm the values of standard deviations from the table.

In [None]:
# Layout: one wide panel for `resp` on the top two grid rows, then a 2x2 block
# of smaller panels for resp_1..resp_4.
fig = plt.figure(constrained_layout=True, figsize=(12, 15))
spec = fig.add_gridspec(4, 2)
fig.suptitle('resp values')
ax1 = fig.add_subplot(spec[:2, 0:])
ax2 = fig.add_subplot(spec[2, 0])
ax3 = fig.add_subplot(spec[2, 1])
ax4 = fig.add_subplot(spec[3, 0])
ax5 = fig.add_subplot(spec[3, 1])
# Each panel is titled after its column, shares the same y-range and uses its own shade.
for _panel, _column, _shade in zip(
    (ax1, ax2, ax3, ax4, ax5),
    ("resp", "resp_1", "resp_2", "resp_3", "resp_4"),
    color_resp,
):
    _panel.set_title(_column)
    _panel.set(ylim=(-1, 1))
    sns.lineplot(data=df_train_resps[_column], color=_shade, ax=_panel, ci=None)
# Loop helpers are removed so the cell leaves the same names behind as before.
del _panel, _column, _shade

In [None]:
del fig, spec, ax1, ax2, ax3, ax4, ax5
gc.collect();

It is clearly visible that none of the `resp`s follows a normal distribution. They are more sharply peaked and have long tails.

In [None]:
def mark_points(ax):
    """Mark fixed cumulative-probability levels on the first line drawn on `ax`.

    For each level the matching x position is found by interpolating the
    line's x data against its y data; a red dot and a red "x=...,y=..." label
    are then added. Labels alternate between the right and the left of the dot.
    """
    curve_x, curve_y = ax.get_lines()[0].get_data()
    levels = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
    for position, level in enumerate(levels):
        x_at_level = np.interp(level, curve_y, curve_x)
        # Even positions: label to the right of the dot; odd positions: to the left.
        if position % 2 == 0:
            x_offset, alignment = 0.01, "left"
        else:
            x_offset, alignment = -0.0075, "right"
        ax.plot([x_at_level], [level], 'o', color='r')
        ax.annotate(
            "x={0:.2f},y={1}".format(x_at_level, level),
            (x_at_level + x_offset, level),
            fontsize=8,
            fontweight="bold",
            color='r',
            horizontalalignment=alignment,
        )

def draw_dist(figure, specific, name, quantity):
    """Draw `quantity` stacked panels, each with a histogram and an ECDF overlay.

    Row 0 uses the column `name`; row i > 0 uses `name` + "_i". Data comes
    from the notebook-level `df_train_resps` and colours from `color_resp`.
    The ECDF is drawn on a twin y-axis and annotated through `mark_points`.
    """
    x_range = (-0.6, 0.6)
    for row in range(quantity):
        column = name if row == 0 else name + "_{}".format(row)
        hist_ax = figure.add_subplot(specific[row, 0:])
        hist_ax.set_title(column)
        hist_ax.set(ylim=(0, 0.012), xlim=x_range)
        # Second y-axis on the same panel carries the cumulative curve.
        ecdf_ax = hist_ax.twinx()
        ecdf_ax.set(ylim=(0, 1.2), xlim=x_range)
        hist_ax.set_zorder(1)
        ecdf_ax.set_zorder(1)
        ecdf_ax.yaxis.grid(False)
        sns.histplot(data=df_train_resps[column], color=color_resp[row], stat="probability", ax=hist_ax)
        mark_points(sns.ecdfplot(data=df_train_resps[column], color="#FF9999", ax=ecdf_ax, linewidth=3))

fig1 = plt.figure(constrained_layout=True, figsize=(12, 20))
spec1 = fig1.add_gridspec(5, 1)
draw_dist(fig1, spec1, "resp", 5)
fig1.suptitle('resp values')
plt.show()

In [None]:
del mark_points, draw_dist, fig1, spec1
gc.collect();

The boxplots below confirm the long tails by showing a large number of outliers. The whiskers extend to 1.5 times the IQR (interquartile range, Q3 - Q1), which means that outliers are the points lying more than 1.5 * IQR below Q1 (the 1st quartile) or above Q3 (the 3rd quartile).

In [None]:
# Create the axes explicitly on fig2 and hand them to seaborn, instead of
# letting seaborn pick the pyplot "current axes" and re-adding them to the figure.
fig2 = plt.figure(constrained_layout=True, figsize=(12, 20))
ax11 = fig2.add_subplot()
sns.boxplot(data=df_train_resps, ax=ax11);

In [None]:
del fig2, ax11
gc.collect();

Since `resp`s are returns expressed as fractions, the proper way to accumulate them is to multiply the values of (1 + `resp`) together (a cumulative product). In the case of `resp`s there are so many records that the result passes the limit for `float64` (the result is inf). The value fits within numpy's `float128`. The chart below does not show any significant results apart from the chaos and the very high values received. The table below the plots shows that the cumulative products are close to or above $10^{100}$.

In [None]:
# Running product of (1 + value) for every resp column, in column order.
ls_df_train_resps_cumprod = [
    df_train_resps[column].map(lambda x: x + 1).cumprod()
    for column in ("resp", "resp_1", "resp_2", "resp_3", "resp_4")
]

# Layout: combined panel, then a wide `resp` panel, then a 2x2 block for resp_1..resp_4.
fig3 = plt.figure(constrained_layout=True, figsize=(12, 21))
spec3 = fig3.add_gridspec(6, 2)
fig3.suptitle('resp cumulative product values')
ax21 = fig3.add_subplot(spec3[:2, 0:])
ax22 = fig3.add_subplot(spec3[2:4, 0:])
ax23 = fig3.add_subplot(spec3[4, 0])
ax24 = fig3.add_subplot(spec3[4, 1])
ax25 = fig3.add_subplot(spec3[5, 0])
ax26 = fig3.add_subplot(spec3[5, 1])

ax21.set_title('all')
sns.lineplot(data=ls_df_train_resps_cumprod, palette=color_resp, ax=ax21, ci=None)
for _panel, _title, _series, _shade in zip(
    (ax22, ax23, ax24, ax25, ax26),
    ("resp", "resp_1", "resp_2", "resp_3", "resp_4"),
    ls_df_train_resps_cumprod,
    color_resp,
):
    _panel.set_title(_title)
    sns.lineplot(data=_series, color=_shade, ax=_panel, ci=None)
# Loop helpers are removed so the cell leaves the same names behind as before.
del _panel, _title, _series, _shade

In [None]:
pd.concat(ls_df_train_resps_cumprod, axis=1).describe()

In [None]:
del ls_df_train_resps_cumprod, fig3, spec3, ax21, ax22, ax23, ax24, ax25, ax26
gc.collect();

The purpose of the charts below is to show the magnitude and importance of the long tails for the `resp`s. Each chart presents the ratio between the products of two ranges. Every barplot compares the product of the values in a tail with the product of the rest of the records below or above the median, for each of the resps. The boundaries of these ranges were found by trial and error. The larger product in each pair is shown as 1 and the other one as a fraction of it. They were calculated only for visualisation purposes. The green bars are for positive `resp` values and the red bars for negative `resp` values.

Because of very high values passing `float64`, the calculations were made using natural logarithms.

$ratio = \frac{productrange_{1}}{productrange_{2}} \Leftrightarrow \ln ratio = \ln productrange_{1} - \ln productrange_{2} \Leftrightarrow e^{\ln ratio} = e^{\ln productrange_{1} - \ln productrange_{2}} \Leftrightarrow e^{\ln ratio} = \frac{e^{\ln productrange_{1}}}{e^{\ln productrange_{2}}} \Leftrightarrow ratio = \frac{productrange_{1}}{productrange_{2}}$


In [None]:
def ln_quant_values(series, quantiles_values):
    """Sum ln(value + 1) over consecutive quantile ranges of a pandas Series.

    For each adjacent pair (q_low, q_high) in `quantiles_values`, the values
    lying in (quantile(q_low), quantile(q_high)] are selected and the natural
    logarithms of (value + 1) are summed. Returns one sum per pair.
    """
    ln_values_list = []
    adjacent_pairs = zip(quantiles_values, quantiles_values[1:])
    for position, (lower_q, upper_q) in enumerate(adjacent_pairs):
        lower_bound = series.quantile(q=lower_q)
        if position == 0:
            # The first range is widened downwards so its lower boundary value is included.
            lower_bound = lower_bound - 0.001
        upper_bound = series.quantile(q=upper_q)
        in_range = (series > lower_bound) & (series <= upper_bound)
        ln_values_list.append(series.loc[in_range].apply(lambda x: np.log(x + 1)).sum())
    return ln_values_list

def ln_normalized(ln_values_list):
    """Shift absolute log values so that the largest one becomes 0.

    Every entry is replaced by |entry| - max(|entries|); the entry with the
    largest magnitude therefore maps to 0 (argument 1 after exponentiation)
    and all others map to a non-positive number (a fraction of that maximum
    after exponentiation).

    The input list is left untouched (the previous implementation overwrote
    it with absolute values). An empty input returns an empty list instead of
    raising from max().
    """
    magnitudes = [abs(value) for value in ln_values_list]
    if not magnitudes:
        return []
    largest = max(magnitudes)
    # The explicit equality branch keeps the maximum at exactly 0 even when it is infinite.
    return [0.0 if magnitude == largest else magnitude - largest for magnitude in magnitudes]

def is_positive(ln_values_list):
    """Return one bool per value: True when the value is >= 0, False otherwise."""
    return [bool(value >= 0) for value in ln_values_list]

def conversion_to_values(ln_normalized_list):
    """Turn (normalized) natural logarithms back into their arguments via e**x."""
    return [np.exp(exponent) for exponent in ln_normalized_list]

def get_resp_prod_normalized(series, quantiles_values):
    """Combine the helpers above into one call.

    Returns a dict with "values_list" (the normalized range products) and
    "pos_neg_bool_list" (True where the raw log-sum of the range was >= 0).
    """
    log_sums = ln_quant_values(series, quantiles_values)
    # Signs are read from the raw sums before normalization, which works on absolute values.
    signs = is_positive(log_sums)
    return {
        "values_list": conversion_to_values(ln_normalized(log_sums)),
        "pos_neg_bool_list": signs,
    }

def fig5_color_generator(pos_neg_bool_list):
    """Map each flag to a bar colour: green for True, red for anything else."""
    colors = []
    for flag in pos_neg_bool_list:
        if flag == True:
            colors.append("#228B22")
        else:
            colors.append("#DC143C")
    return colors

def single_tail_chart_generator(figure, specific_array, s, rng, resp_string, tail_string):
    """Draw one two-bar chart comparing the products of two quantile ranges.

    `rng` holds three quantile levels; the bars are labelled with the two
    ranges they delimit and coloured green/red according to the sign flags
    returned by `get_resp_prod_normalized(s, rng)`.
    """
    panel = figure.add_subplot(specific_array)
    panel.set_title("{}: Ratio of quantiles products - {} tail".format(resp_string, tail_string))
    normalized = get_resp_prod_normalized(s, rng)
    low, middle, high = str(rng[0]), str(rng[1]), str(rng[2])
    range_labels = ["<{} ; {})".format(low, middle), "<{} ; {}>".format(middle, high)]
    table = pd.DataFrame(
        data={"Ranges": range_labels, "Fraction of max": normalized["values_list"]}
    ).set_index("Ranges")
    sns.barplot(
        x=table.index,
        y="Fraction of max",
        data=table,
        palette=fig5_color_generator(normalized["pos_neg_bool_list"]),
        ax=panel,
        ci=None,
    )

def tails_comparison_chart(figure, specific, data):
    """Fill the figure with one row per entry of `data`.

    Each entry is [suffix, left_quantiles, right_quantiles]; the column
    "resp" + suffix of the notebook-level `df_train_resps` is charted with
    the left tail in grid column 0 and the right tail in grid column 1.
    """
    for row, entry in enumerate(data):
        resp_name = "resp" + entry[0]
        resp_series = df_train_resps[resp_name]
        for grid_column, tail_string in enumerate(("left", "right")):
            single_tail_chart_generator(
                figure,
                specific[row, grid_column],
                resp_series,
                entry[grid_column + 1],
                resp_name,
                tail_string,
            )

# Found values (not optimal, they are just for an outline)
data_fig5 = [
    [
        "",
        [0, 0.0583175, 0.5],
        [0.5, 0.937985, 1]
    ],
    [
        "_1",
        [0, 0.04987, 0.5],
        [0.5, 0.948195, 1]
    ],
    [
        "_2",
        [0, 0.051, 0.5],
        [0.5, 0.946225, 1]
    ],
    [
        "_3",
        [0, 0.0576575, 0.5],
        [0.5, 0.937035, 1]
    ],
    [
        "_4",
        [0, 0.059975, 0.5],
        [0.5, 0.9343625, 1]
    ]
]

fig5 = plt.figure(constrained_layout=True, figsize=(12, 21))
# NOTE(review): the grid is created with 6 rows while data_fig5 lists 5 resp entries,
# so tails_comparison_chart only fills rows 0-4 - confirm the spare row is intended.
spec5 = fig5.add_gridspec(6, 2)
fig5.suptitle('Tails products compared to tails of other ranges for resps')
# One grid row per data_fig5 entry: left-tail chart in column 0, right-tail chart in column 1.
tails_comparison_chart(fig5, spec5, data_fig5)

In [None]:
del tails_comparison_chart, single_tail_chart_generator, fig5_color_generator, get_resp_prod_normalized, conversion_to_values, is_positive, ln_normalized, ln_quant_values, data_fig5, fig5, spec5
gc.collect();

In the chart below we can see the cumulative sum of the `resp`s, which is just an aggregation of the pure, unchanged `resp` values. It shows that the values do not seem to have constant variance, or in other words are not equally distributed over time, for all the `resp`s. E.g. for `resp` the plot looks exponential.

In [None]:
# Running sum of every resp column, in column order.
ls_df_train_resps_cumsum = [
    df_train_resps[column].cumsum()
    for column in ("resp", "resp_1", "resp_2", "resp_3", "resp_4")
]

# Layout: combined panel, then a wide `resp` panel, then a 2x2 block for resp_1..resp_4.
fig4 = plt.figure(constrained_layout=True, figsize=(12, 21))
spec4 = fig4.add_gridspec(6, 2)
fig4.suptitle('resp cumulative values')
ax31 = fig4.add_subplot(spec4[:2, 0:])
ax32 = fig4.add_subplot(spec4[2:4, 0:])
ax33 = fig4.add_subplot(spec4[4, 0])
ax34 = fig4.add_subplot(spec4[4, 1])
ax35 = fig4.add_subplot(spec4[5, 0])
ax36 = fig4.add_subplot(spec4[5, 1])

# All six panels share the same y-range so the curves can be compared directly.
for _panel, _title in zip(
    (ax31, ax32, ax33, ax34, ax35, ax36),
    ('all', 'resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'),
):
    _panel.set_title(_title)
    _panel.set(ylim=(-20, 1100))

sns.lineplot(data=ls_df_train_resps_cumsum, palette=color_resp, ax=ax31, ci=None)
for _panel, _series, _shade in zip(
    (ax32, ax33, ax34, ax35, ax36),
    ls_df_train_resps_cumsum,
    color_resp,
):
    sns.lineplot(data=_series, color=_shade, ax=_panel, ci=None)
# Loop helpers are removed so the cell leaves the same names behind as before.
del _panel, _title, _series, _shade

In [None]:
del ls_df_train_resps_cumsum, fig4, spec4, ax31, ax32, ax33, ax34, ax35, ax36
gc.collect();

In [None]:
del df_train_resps
gc.collect();

# features and tags

In [None]:
df_features = df_full_features.set_index("feature")

In [None]:
pd.set_option('display.max_rows', 130)

In the first table below there is the list of tags assigned to each feature, together with their count. In the second one we see which features occurred in specific tags.

In [None]:
def get_feature_tags(df):
    """List, for every row of `df`, the columns whose value equals True.

    Returns a dict of three parallel lists: "index" (row labels), "count"
    (number of matching columns) and "tags" (their names joined by ", ").
    """
    dict_features = {"index": [], "count": [], "tags": []}
    for _, row in df.iterrows():
        active_tags = [tag for tag, flag in row.items() if flag == True]
        dict_features["index"].append(row.name)
        dict_features["count"].append(len(active_tags))
        dict_features["tags"].append(", ".join(active_tags))
    return dict_features

dict_feature_tags = get_feature_tags(df_features)
df_feature_tags = pd.DataFrame(dict_feature_tags)
df_feature_tags = df_feature_tags.set_index("index")
df_feature_tags

In [None]:
df_feature_tags["tags_lists"] = df_feature_tags["tags"].apply(lambda x: x.split(", "))
df_feature_tags

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
def create_feature_names_dict_count(df):
    """Return a dict with one zero-valued entry per index label of `df`."""
    return {name: 0 for name in list(df.index)}

def create_feature_dict_tree(df):
    """Return a dict of independent zero-filled dicts, both levels keyed by the index labels of `df`."""
    return {
        name: copy.deepcopy(create_feature_names_dict_count(df))
        for name in list(df.index)
    }

def get_features_frequency(df):
    """Compute, for every pair of features, the share of tags they have in common.

    `df` must have a "tags_lists" column holding a list of tag names per row.
    result[a][b] is the number of a's tags that also appear in b's list,
    divided by the number of a's tags. Rows whose list is empty or [''] are
    skipped, so their entries stay at 0.
    """
    dict_features = create_feature_dict_tree(df)
    tagged_rows = [
        (feature, row["tags_lists"])
        for feature, row in df.iterrows()
        if len(row["tags_lists"]) != 0 and row["tags_lists"] != ['']
    ]
    for feature, tags in tagged_rows:
        for other_feature, other_tags in tagged_rows:
            shared = sum(1 for tag in tags if tag in other_tags)
            dict_features[feature][other_feature] = shared / len(tags)

    return dict_features

df_features_frequency = pd.DataFrame(get_features_frequency(df_feature_tags)).applymap(float)
df_features_frequency.applymap("{:.2f}".format)

In [None]:
sns.set(style="white")
fig9 = plt.figure(constrained_layout=True, figsize=(80, 80))
ax9 = fig9.add_subplot()
mask9 = np.triu(np.ones_like(df_features_frequency, dtype=bool))
sns.heatmap(data=df_features_frequency, mask=mask9, ax=ax9, annot=True, fmt=".2f", cmap="vlag", center=0, cbar=False);

In [None]:
del fig9, ax9, df_features_frequency

In [None]:
pd.set_option('display.max_colwidth', None)
sns.set(style="darkgrid")

In [None]:
def get_tag_features(df):
    """List, for every column of `df`, the rows whose value equals True.

    Returns a dict of three parallel lists: "index" (column names), "count"
    (number of matching rows) and "features" (their labels joined by ", ").

    Uses `.items()` rather than `.iteritems()`, which pandas 2.0 removed.
    """
    dict_tags = {"index": [], "count": [], "features": []}
    for tag_name, column in df.items():
        tagged_features = [feature for feature, flag in column.items() if flag == True]
        dict_tags["index"].append(tag_name)
        dict_tags["count"].append(len(tagged_features))
        dict_tags["features"].append(", ".join(tagged_features))
    return dict_tags
        
dict_tag_features = get_tag_features(df_features)
df_tag_features = pd.DataFrame(dict_tag_features)
df_tag_features.set_index("index", inplace=True)
df_tag_features

In [None]:
def create_tag_names_dict_count(df):
    """Return a dict with one zero-valued entry per column name of `df`."""
    return dict.fromkeys(list(df.columns), 0)

def create_tag_dict_tree(df):
    """Return a dict of independent zero-filled dicts, both levels keyed by the column names of `df`."""
    # Each call builds a fresh inner dict, so no copying is needed to keep them independent.
    return {tag: create_tag_names_dict_count(df) for tag in list(df.columns)}

def get_tags_common(df):
    """Count how often each pair of columns is True in the same row.

    result[a][b] is the number of rows in which both column a and column b
    equal True; result[a][a] is therefore the number of True values in a.

    The counters are built from `df` itself (previously they were built from
    the notebook-level `df_features`, so the function only worked for that
    one frame), and `.items()` replaces `.iteritems()`, which pandas 2.0
    removed. Each row is scanned once instead of once per column.
    """
    dict_tags = create_tag_dict_tree(df)
    for _, row in df.iterrows():
        active_tags = [tag for tag, flag in row.items() if flag == True]
        for tag in active_tags:
            for other_tag in active_tags:
                dict_tags[tag][other_tag] += 1
    return dict_tags

df_tags_frequency = pd.DataFrame(get_tags_common(df_features))

True values for tags appear together with other tags. Some of them can be interconnected. The first table below presents the co-occurrence counts of the tags, while the second one shows the frequency as ratios (divided by the total number of occurrences of the tag).

In [None]:
df_tags_frequency

In [None]:
# Divide every column by its own maximum in one vectorised step; the nested
# lambdas recomputed the column maximum for every single cell.
df_tags_frequency = df_tags_frequency.div(df_tags_frequency.max(), axis="columns").round(2)
df_tags_frequency

The charts below show the ratio tables as barplots. If the color of the bar is red, it means that the tag from the x-axis is True in all the cases where the analysed tag is True. The orange value happens in at least 75% of these cases, the yellow one in at least 50% of cases and the blue one when it is between 0% and 50%. Gray values are the values for the analysed tags themselves and always amount to 100%.

In [None]:
def color_generator_fig8(s):
    """Return one bar colour per entry of the Series `s`.

    The entry whose label equals `s.name` is grey; otherwise the colour is
    red for a value of exactly 1, orange for >= 0.75, yellow for >= 0.5 and
    blue for anything lower.

    Uses `.items()` rather than `.iteritems()`, which pandas 2.0 removed.
    """
    color_list = []
    for index, value in s.items():
        if index == s.name:
            color_list.append("#9B9898")
        elif value == 1:
            color_list.append("#FF0000")
        elif value >= 0.75:
            color_list.append("#FF7400")
        elif value >= 0.5:
            color_list.append("#FFC100")
        else:
            color_list.append("#0080FF")
    return color_list

def barchart_generator_fig8(df, figure, specific):
    """Draw one full-width bar chart per column of `df`, stacked in grid rows.

    Row i of the grid `specific` shows column i of `df`, with `df.index` on
    the x-axis and bar colours chosen by `color_generator_fig8`.

    The data is now read from the `df` argument; previously the body used the
    notebook-level `df_tags_frequency` regardless of what was passed in.
    """
    for i, v in enumerate(df):
        ax8 = figure.add_subplot(specific[i, 0:])
        ax8.set_title(v)
        barplot8 = sns.barplot(x=df.index, y=df[v], palette=color_generator_fig8(df[v]), ax=ax8, ci=None)
        ax8.set_ylabel("")
        barplot8.set_xticklabels(barplot8.get_xticklabels(), rotation=90)

fig8 = plt.figure(constrained_layout=True, figsize=(12, 80))
spec8 = fig8.add_gridspec(29, 2)
barchart_generator_fig8(df_tags_frequency, fig8, spec8)

In [None]:
del get_feature_tags, get_tag_features, create_tag_names_dict_count, create_tag_dict_tree, get_tags_common, df_features, color_generator_fig8, barchart_generator_fig8, df_feature_tags, df_tag_features, df_tags_frequency, dict_feature_tags, dict_tag_features, fig8, spec8
gc.collect();

In [None]:
df_full_train_features = df_full_train.iloc[:, 7:137]
df_full_train_features

In [None]:
pd.set_option('display.max_columns', 140)

In [None]:
df_full_train_features.describe().applymap("{0:.8f}".format)

In [None]:
df_full_train_features.isna().sum().T.to_frame().transpose()

In [None]:
df_full_train_features.isna().any(axis=1).sum()

In [None]:
gc.collect();

In [None]:
def draw_feature_charts(s, figure, specific, i):
    """Draw the four overview charts of one feature Series into the figure grid.

    Grid row 2*i holds a full-width line plot titled with `s.name`; grid row
    2*i + 1 holds a scatter plot, a probability histogram and a box plot in
    columns 0, 1 and 2.

    `ci=None` is no longer passed to `sns.scatterplot`: a scatter plot draws
    no confidence interval, and current seaborn releases do not accept the
    keyword there.
    """
    ax1 = figure.add_subplot(specific[i*2, 0:])
    ax1.set_title(s.name)
    sns.lineplot(data=s, ax=ax1, ci=None)
    ax1.set_ylabel("")
    ax2 = figure.add_subplot(specific[(i*2)+1, 0])
    sns.scatterplot(data=s, ax=ax2)
    ax2.set_ylabel("")
    ax3 = figure.add_subplot(specific[(i*2)+1, 1])
    sns.histplot(data=s, stat="probability", ax=ax3)
    ax3.set_ylabel("")
    ax3.set_xlabel("")
    ax4 = figure.add_subplot(specific[(i*2)+1, 2])
    sns.boxplot(data=s, ax=ax4)

def draw_feature_figure(df):
    """Create one tall figure with the `draw_feature_charts` panels of every column of `df`.

    The figure is 5 inches high per column and its grid has two rows per column.
    Uses `.items()` rather than `.iteritems()`, which pandas 2.0 removed.
    """
    n_columns = len(df.columns)
    fig7 = plt.figure(constrained_layout=True, figsize=(12, 5*n_columns))
    spec7 = fig7.add_gridspec(2*n_columns, 3)
    for index, (_, column) in enumerate(df.items()):
        draw_feature_charts(column, fig7, spec7, index)

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, :10])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 10:20])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 20:30])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 30:40])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 40:50])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 50:60])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 60:70])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 70:80])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 80:90])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 90:100])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 100:110])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 110:120])
gc.collect();

In [None]:
draw_feature_figure(df_full_train_features.iloc[:, 120:130])
gc.collect();

In [None]:
del draw_feature_figure, draw_feature_charts
gc.collect();

In [None]:
sns.set(style="white")

In [None]:
df_full_features_corr = df_full_train_features.corr(method="pearson")
df_full_features_corr

In [None]:
fig10 = plt.figure(constrained_layout=True, figsize=(80, 80))
fig10.suptitle('Pearsons correlation coefficients', fontsize=100)
ax10 = fig10.add_subplot()
mask10 = np.triu(np.ones_like(df_full_features_corr, dtype=bool))
sns.heatmap(data=df_full_features_corr, mask=mask10, ax=ax10, annot=True, fmt=".2f", cmap="vlag", center=0, cbar=False);

In [None]:
del df_full_features_corr, fig10, ax10, mask10
gc.collect();

In [None]:
df_full_features_corr_p = df_full_train_features.corr(method=pearsonr_pval)
df_full_features_corr_p

In [None]:
fig11 = plt.figure(constrained_layout=True, figsize=(80, 80))
fig11.suptitle('Pearsons correlation p-value', fontsize=100)
ax11 = fig11.add_subplot()
mask11 = np.triu(np.ones_like(df_full_features_corr_p, dtype=bool))
cmap11 = sns.diverging_palette(145, 300, s=60, as_cmap=True)
sns.heatmap(data=df_full_features_corr_p, mask=mask11, ax=ax11, annot=True, fmt=".2f", cmap=cmap11, center=0.05, cbar=False);

In [None]:
del df_full_train_features, df_full_features_corr_p, fig11, ax11, mask11, cmap11
gc.collect();

# weight

In [None]:
df_full_train["weight"].isna().sum()

In [None]:
# Weight over the row index: line plot on top, scatter plot below.
fig12 = plt.figure(constrained_layout=True, figsize=(12, 12))
spec12 = fig12.add_gridspec(2, 1)
fig12.suptitle('Weight')
ax12 = fig12.add_subplot(spec12[0,0])
sns.lineplot(data=df_full_train["weight"], ax=ax12, ci=None)
ax13 = fig12.add_subplot(spec12[1,0])
# No `ci` argument here: a scatter plot draws no confidence interval and
# current seaborn releases do not accept the keyword for scatterplot.
sns.scatterplot(data=df_full_train["weight"], ax=ax13);

In [None]:
del fig12, ax12, ax13
gc.collect();

In [None]:
df_weights_0 = df_full_train[df_full_train["weight"] == 0]

In [None]:
len(df_weights_0)

In [None]:
df_weights_0[["resp", "resp_1", "resp_2", "resp_3", "resp_4"]]

In [None]:
df_weights_0[["resp", "resp_1", "resp_2", "resp_3", "resp_4"]].describe()

In [None]:
fig14 = plt.figure(constrained_layout=True, figsize=(12, 6))
fig14.suptitle('Resp for weights values 0')
ax14 = fig14.add_subplot()
# No `ci` argument here: a scatter plot draws no confidence interval and
# current seaborn releases do not accept the keyword for scatterplot.
sns.scatterplot(data=df_weights_0["resp"], ax=ax14);

In [None]:
del df_weights_0, fig14, ax14
gc.collect();

# resps and other values

In [None]:
pd.set_option('display.max_columns', 140)

In [None]:
# Copy of the training frame in which `resp` is replaced by the flag resp >= 0.
# `assign` returns a new frame, so df_full_train itself is left unchanged, and the
# comparison is vectorised instead of calling a Python lambda once per row.
df_full_train_bool = df_full_train.assign(resp=df_full_train["resp"] >= 0)
df_full_train_bool

In [None]:
def histplot_resp_true_false(df, column_name, figure, specific, i):
    """Draw two probability histograms of one column, split by the `resp` flag.

    Grid cell (i, 0) shows the rows where df["resp"] equals True (green) and
    grid cell (i, 1) the rows where it equals False (red). Both panels are
    then given identical axis limits so they can be compared directly.

    The shared limits are the union of the two panels' ranges on both axes.
    Previously the y-limits used the union but the x-limits used the
    intersection, which could crop bars out of the wider panel.
    """
    panels = []
    variants = ((True, "#228B22", " for resp>=0"), (False, "#DC143C", " for resp<0"))
    for grid_column, (resp_flag, bar_color, label_suffix) in enumerate(variants):
        panel = figure.add_subplot(specific[i, grid_column])
        sns.histplot(
            data=df.loc[df["resp"] == resp_flag, [column_name]],
            palette=[bar_color],
            ax=panel,
            stat="probability",
        )
        panel.legend(labels=[column_name + label_suffix])
        panel.set_ylabel("")
        panels.append(panel)

    # Smallest lower bound and largest upper bound across both panels, per axis.
    x_bounds = [panel.get_xlim() for panel in panels]
    y_bounds = [panel.get_ylim() for panel in panels]
    x_range = (min(low for low, _ in x_bounds), max(high for _, high in x_bounds))
    y_range = (min(low for low, _ in y_bounds), max(high for _, high in y_bounds))
    for panel in panels:
        panel.set_xlim(x_range)
        panel.set_ylim(y_range)
        
def draw_bool_resp(df):
    """Create one figure with a `histplot_resp_true_false` row for every non-`resp` column of `df`.

    The figure is 2 inches high per plotted column. Rows are numbered over
    the plotted columns only, so the position of the `resp` column inside
    `df` no longer matters (the previous `index - 1` arithmetic required it
    to come first). Column names are read directly instead of going through
    `.iteritems()`, which pandas 2.0 removed.
    """
    plotted_columns = [name for name in df.columns if name != "resp"]
    n_columns = len(plotted_columns)
    fig13 = plt.figure(constrained_layout=True, figsize=(12, 2*n_columns))
    spec13 = fig13.add_gridspec(n_columns, 2)
    for row, column_name in enumerate(plotted_columns):
        histplot_resp_true_false(df, column_name, fig13, spec13, row)

features_columns_names = list(df_full_train_bool.columns)[7:137]

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[:10]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[10:20]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[20:30]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[30:40]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[40:50]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[50:60]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[60:70]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[70:80]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[80:90]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[90:100]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[100:110]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[110:120]])
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp"] + features_columns_names[120:130]])
gc.collect();

In [None]:
del features_columns_names
gc.collect();

In [None]:
draw_bool_resp(df_full_train_bool[["resp", "resp_1", "resp_2", "resp_3", "resp_4", "weight"]])
gc.collect();

In [None]:
del draw_bool_resp, histplot_resp_true_false, df_full_train_bool
gc.collect();