<font size=6>Predicting Future Sales</font>  
<font size=5>An analysis of features distribution</font>

This notebook analyses the distribution of the features created in the kernel https://www.kaggle.com/sylvainfriot/an-economic-oriented-feature-engineering  
I try out many plots and exploratory views.  

I notice two important facts :  
- a large imbalance in the target (item_cnt_month) with a lot of zero values. As a result, most features have a lot of 0 values or -1 values, according to the way they were calculated.  
- a lot of outliers (mostly in the high values) for a lot of features.  
  
Those two facts give some weird distribution curves.  
I also look at the distribution without the values (0 or -1) implied by the target zero values.  
**The main conclusion of this exploration is that models based on the normality assumption should be avoided for these data.**

# Notebook setup

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as st
import statsmodels.stats as sm
import statsmodels.stats.api as sma
import statsmodels.api as sa
import matplotlib.pyplot as plt
import seaborn as sns

**Graphs set-up**

In [None]:
# changes in matplotlib default parameters
def mydefault_plt_parameters(figsize=(12, 8), mult_param=1.0):
    """Override a fixed set of matplotlib rcParams for this notebook.

    Parameters
    ----------
    figsize : tuple
        Value stored in ``figure.figsize``.
    mult_param : float
        Scale factor applied to the base font size (18) and to the title
        and axis-label paddings (20 and 15); each product is rounded with
        ``np.around`` before being stored.

    Returns
    -------
    None
        The function only mutates the global ``plt.rcParams``.
    """
    legend_grey = (0.95, 0.95, 0.95)
    overrides = (
        ('figure.figsize', figsize),
        ('font.size', np.around(18 * mult_param)),
        ('axes.titlepad', np.around(20 * mult_param)),
        ('axes.labelpad', np.around(15 * mult_param)),
        ('figure.titleweight', 'bold'),
        ('axes.titleweight', 'bold'),
        ('legend.framealpha', 1),
        ('legend.facecolor', legend_grey),
        ('legend.edgecolor', legend_grey),
        ('savefig.orientation', 'landscape'),
        ('savefig.dpi', 300),
        ('savefig.bbox', 'tight'),
    )
    # Assign one key at a time, in the listed order.
    for rc_key, rc_value in overrides:
        plt.rcParams[rc_key] = rc_value

In [None]:
def set_sns_colors(is_mono=False, color_palette=None,
                   ncolors=None, desat=None):
    """Select the active seaborn palette and return it.

    Parameters
    ----------
    is_mono : bool
        Only used when ``color_palette`` is None: if true, a light "navy"
        palette is installed, otherwise the "Set2" palette.
    color_palette : optional
        When given, it is forwarded to ``sns.set_palette`` together with
        ``ncolors`` and ``desat``, and ``is_mono`` is ignored.
    ncolors, desat : optional
        Passed positionally to ``sns.set_palette`` with ``color_palette``.

    Returns
    -------
    The value of ``sns.color_palette()`` after the palette has been set.
    """
    if color_palette is not None:
        sns.set_palette(color_palette, ncolors, desat)
    elif is_mono:
        sns.set_palette(sns.light_palette("navy"))
    else:
        sns.set_palette("Set2")
    return sns.color_palette()

In [None]:
# Global plotting state for the whole notebook: seaborn grid style, the
# default palette from set_sns_colors() (kept in `list_colors`, which the
# graph_* helpers index for their colours), then the rcParams overrides.
sns.set_style("whitegrid")
list_colors = set_sns_colors()
mydefault_plt_parameters()

In [None]:
def test_normality(x, feature_name, alpha=0.05):
    """Run three normality tests on ``x`` and print a summary table.

    The first test depends on the sample size: Shapiro-Wilk when there are
    fewer than 5000 observations, otherwise a Kolmogorov-Smirnov test
    against a normal law whose mean and standard deviation (ddof=1) are
    taken from the sample. Jarque-Bera and Anderson-Darling are always run.

    Parameters
    ----------
    x : array-like
        Observations; converted with ``np.array``.
    feature_name : str
        Label printed in the report header.
    alpha : float
        Risk level; H0 (normality) is flagged as accepted when the p-value
        is greater than or equal to ``alpha``.

    Returns
    -------
    None
        The p-values (rounded to 4 decimals) and the accept flags are
        printed, not returned.
    """
    sample = np.array(x)
    if len(sample) < 5000:
        first_test = "Shapiro-Wilk"
        first_pvalue = st.shapiro(sample)[1]
    else:
        first_test = "Kolmogorov-Smirnov"
        sample_mean = sample.mean()
        sample_std = np.sqrt(sample.var(ddof=1))
        # NOTE(review): the normal parameters are estimated from the same
        # sample that is tested - TODO confirm a plain K-S p-value is what
        # is wanted here.
        first_pvalue = st.kstest(sample, cdf="norm",
                                 args=(sample_mean, sample_std))[1]

    df_normality = pd.DataFrame(
        index=[first_test, "Jarque-Bera", "Anderson-Darling"],
        columns=["p_value"])
    df_normality.iloc[0, 0] = np.round(first_pvalue, 4)
    df_normality.iloc[1, 0] = np.round(sm.stattools.jarque_bera(sample)[1], 4)
    df_normality.iloc[2, 0] = np.round(sm.diagnostic.normal_ad(sample)[1], 4)
    df_normality["Accept H0"] = [p >= alpha
                                 for p in df_normality["p_value"]]

    print("{} - Normality Test - Risk level = {:.0%}".format(
        feature_name, alpha))
    print("    H0 : Feature distribution is normal")
    print("    H1 : Feature distribution is not normal")
    print(df_normality.T)

In [None]:
def graph_boxplot(feature, feature_name, units, my_ax):
    """Draw a horizontal, colour-filled boxplot of ``feature`` on ``my_ax``.

    Parameters
    ----------
    feature : array-like
        Values handed to ``Axes.boxplot``.
    feature_name : str
        Text used as the single y tick label.
    units : str
        Text used as the x axis label.
    my_ax : matplotlib Axes
        Axes to draw on.

    The box and the outlier markers are filled with the first colour of the
    notebook-level ``list_colors``; the median line and the mean marker
    (a filled circle) are black.
    """
    artists = my_ax.boxplot(feature, showmeans=True, widths=0.7,
                            vert=False, patch_artist=True)
    main_color = list_colors[0]

    box = artists['boxes'][0]
    box.set_facecolor(main_color)
    box.set_alpha(1)

    artists['medians'][0].set_color('black')

    outliers = artists['fliers'][0]
    outliers.set_markeredgecolor('grey')
    outliers.set_markerfacecolor(main_color)
    outliers.set_alpha(1)

    mean_marker = artists['means'][0]
    mean_marker.set_marker('o')
    mean_marker.set_markeredgecolor('black')
    mean_marker.set_markerfacecolor('black')

    my_ax.set_yticklabels([feature_name])
    my_ax.set_xlabel(units)

In [None]:
def graph_henry_plot(feature, my_ax):
    """Draw a normal Q-Q ("Henry") plot of ``feature`` on ``my_ax``.

    The points come from statsmodels' ``qqplot`` called with ``fit=True``.
    A diagonal is then drawn from the smallest to the largest of the current
    axis limits, in the second colour of ``list_colors``, and the axes are
    rescaled tightly around the drawn data.

    Parameters
    ----------
    feature : array-like
        Observations forwarded to ``qqplot``.
    my_ax : matplotlib Axes
        Axes to draw on.
    """
    marker_color = list_colors[0]
    sa.qqplot(feature, fit=True, markeredgecolor=marker_color,
              markerfacecolor=marker_color, alpha=0.5, ax=my_ax)

    # The reference line spans the union of both axis ranges.
    y_low, y_high = my_ax.get_ylim()
    x_low, x_high = my_ax.get_xlim()
    diagonal = [min(x_low, y_low), max(x_high, y_high)]
    my_ax.plot(diagonal, diagonal, color=list_colors[1])

    my_ax.set_xlabel("Theorical quantiles of the Normal law")
    my_ax.set_ylabel("Observed quantiles of the feature")
    my_ax.autoscale(enable=True, axis='both', tight=True)

In [None]:
def graph_normal_distribution(feature, feature_name, my_ax, y_max=None,
                              legend_loc="upper right", bins=20):
    """Overlay a fitted normal density on a density histogram of ``feature``.

    Parameters
    ----------
    feature : object exposing ``min``, ``max``, ``mean`` and ``std(ddof=1)``
        Observations (the notebook passes pandas Series).
    feature_name : str
        Histogram legend label and x axis label.
    my_ax : matplotlib Axes
        Axes to draw on.
    y_max : float, optional
        When given, replaces the automatic upper y limit.
    legend_loc : str
        Legend location forwarded to ``Axes.legend``.
    bins : int
        Number of histogram bins.

    The normal curve uses the sample mean and the ddof=1 standard deviation
    and is evaluated on 100 evenly spaced points between the sample minimum
    and maximum.
    """
    my_ax.hist(feature, bins=bins, label=feature_name, density=True,
               color=list_colors[0], alpha=1)

    grid = np.linspace(feature.min(), feature.max(), 100)
    normal_pdf = st.norm.pdf(grid, loc=feature.mean(),
                             scale=feature.std(ddof=1))
    my_ax.plot(grid, normal_pdf, label="Normal law",
               color=list_colors[1], linewidth=1.5)

    my_ax.set_xlabel(feature_name)
    # Keep the automatic lower bound; only the upper bound can be forced.
    lower, upper = my_ax.get_ylim()
    my_ax.set_ylim([lower, upper if y_max is None else y_max])
    my_ax.set_ylabel("Distribution (%)")
    my_ax.legend(loc=legend_loc)

In [None]:
def graph_quant_feature(x, feature_name, units, figsize=(12,9),
                        ymax=None, bins=20, legend_loc="upper right"):
    """Show a three-panel distribution summary of a quantitative feature.

    The layout is a boxplot across the top row, with a normal Q-Q ("Henry")
    plot and a histogram with a fitted normal curve side by side underneath.

    Parameters
    ----------
    x : pandas Series (or any object accepted by the three graph_* helpers)
        Observations to plot. The histogram helper calls ``min``, ``max``,
        ``mean`` and ``std(ddof=1)`` on it.
    feature_name : str
        Name used in the figure title, the boxplot tick label and the
        histogram label.
    units : str
        x axis label of the boxplot.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    ymax : float, optional
        Forced upper y limit of the histogram panel.
    bins : int
        Number of histogram bins.
    legend_loc : str
        Legend location of the histogram panel.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    title = "{} : Feature distribution".format(feature_name)
    fig = plt.figure(figsize=figsize)
    fig.subplots_adjust(wspace=0.25, hspace=0.25)
    gs = plt.GridSpec(3, 2, figure=fig)
    fig.suptitle(title, y=1.05)
    ax_box = fig.add_subplot(gs[0, :])
    ax_qq = fig.add_subplot(gs[1:, 0])
    ax_hist = fig.add_subplot(gs[1:, 1])

    # Plot the argument `x`, not whatever global `feature` happens to hold.
    graph_boxplot(x, feature_name, units, ax_box)
    ax_box.set_title("Boxplot", fontweight='regular')

    graph_henry_plot(x, ax_qq)
    ax_qq.set_title("Henry plot", fontweight='regular')

    graph_normal_distribution(x, feature_name, ax_hist, y_max=ymax,
                              bins=bins, legend_loc=legend_loc)
    ax_hist.set_title("Distribution plot", fontweight='regular')
    fig.tight_layout()
    plt.show()

In [None]:
def graph_categ_feature(feature, feature_name, index_as_int=False,
                        ticklabels_rotation=None, labelsize=None,
                        force_pie=False, force_startangle=None,
                        figsize=(12,9)):
    """Plot the value counts of a categorical feature as a pie or bar chart.

    A pie chart is drawn when the feature has at most 5 distinct values or
    when ``force_pie`` is true; otherwise a seaborn bar plot is drawn.

    Parameters
    ----------
    feature : pandas Series
        Categorical observations; ``value_counts()`` is called on it.
    feature_name : str
        Name used in the figure title.
    index_as_int : bool
        If true, the category labels are cast to ``int`` before display.
    ticklabels_rotation : number, optional
        Rotation of the x tick labels on the bar plot (default 30).
    labelsize : number, optional
        Size of the x tick labels on the bar plot (default 15).
    force_pie : bool
        Draw a pie chart regardless of the number of categories.
    force_startangle : number, optional
        Start angle of the pie chart (default 90).
    figsize : tuple
        Figure size.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    distrib_values = feature.value_counts()
    if index_as_int:
        labels = distrib_values.index.values.astype(int)
    else:
        labels = distrib_values.index
    if force_startangle is None:
        force_startangle = 90
    if ticklabels_rotation is None:
        ticklabels_rotation = 30
    # Horizontal alignment chosen from the rotation; `ha` is computed as in
    # the original cell but is not passed to any plotting call below.
    if 0 < ticklabels_rotation < 90:
        ha = "right"
    else:
        ha = "center"
    if labelsize is None:
        labelsize = 15
    title = "{} : Feature distribution".format(feature_name)
    fig, ax = plt.subplots(figsize=figsize)
    fig.suptitle(title)
    # Logical `or` (not bitwise `|`): these are plain scalars, and a
    # non-bool `force_pie` must not raise or change the result.
    if len(distrib_values) <= 5 or force_pie:
        ax.pie(distrib_values, labels=labels, autopct="%1.1f%%",
               startangle=force_startangle, counterclock=False)
    else:
        sns.barplot(x=labels, y=distrib_values, ax=ax)
        ax.tick_params(axis="x", rotation=ticklabels_rotation,
                       labelsize=labelsize)
        ax.set_xlabel("")
        ax.set_ylabel("Number of occurrences")
    plt.show()

**Utility functions**

In [None]:
def my_signed_sqrt(x_serie):
    """Return the square root of ``|x|`` carrying the sign of ``x``.

    Works element-wise on anything numpy's ``abs``, ``sqrt`` and ``sign``
    accept (scalars, sequences, arrays, pandas Series).
    """
    magnitude = np.sqrt(np.abs(x_serie))
    return magnitude * np.sign(x_serie)

In [None]:
def downcast_df_int8(df):
    """Shrink the numeric columns of ``df`` in place and return it.

    - ``float64`` columns become ``float32``.
    - ``item_id`` becomes ``int16`` and ``shopitem_id`` becomes ``int32``
      whenever they are present, whatever their current dtype.
    - Every other ``int64``/``int32`` column is downcast to the smallest
      signed integer dtype that holds its values: ``int8`` when they fit,
      a wider dtype otherwise, so out-of-range values are never wrapped.

    The function is idempotent: calling it again on an already downcast
    frame leaves the dtypes unchanged instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # Identifier columns get a fixed dtype wider than the generic int8 target.
    id_dtypes = {"item_id": np.int16, "shopitem_id": np.int32}

    float_cols = [col for col in df.columns
                  if df[col].dtype == "float64"]
    # Excluding the id columns here (rather than list.remove afterwards)
    # avoids a ValueError when an id column is not int64/int32.
    int_cols = [col for col in df.columns
                if df[col].dtype in ("int64", "int32")
                and col not in id_dtypes]

    for col, dtype in id_dtypes.items():
        if col in df.columns:
            df[col] = df[col].astype(dtype)

    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)
    for col in int_cols:
        # Yields int8 when the values fit, and never overflows otherwise.
        df[col] = pd.to_numeric(df[col], downcast="integer")
    return df

# 1. Descriptive data

In [None]:
# Load the "descriptive" feature table, shrink its numeric dtypes
# (downcast_df_int8 modifies the frame and returns it), then show the
# resulting column dtypes and memory footprint.
data = pd.read_csv("../input/an-economic-oriented-feature-engineering/alldata_descriptive.csv")
data = downcast_df_int8(data)
data.info()

<a id="11"></a>
## 1.1. Item Count Month

In [None]:
feature = data.item_cnt_month
feature_name = "Item Count Month"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name, "Monthly Sales")

More than 75% of monthly sales are equal to 0. This is due to items that are no longer sold, or that a shop sells only from time to time.  
I focus on sold items, that is, on positive item_cnt_month.  
More generally, the distribution of several features is distorted by this imbalance. Sales values equal to 0, and change or relative-sales values equal to -1, should be set aside whenever a mean number of sales is involved.  
Along the same lines, there are often some large outliers among the high values. I tend to set aside the most extreme ones to focus on the distribution of the majority of the data.

In [None]:
feature = data[(data.item_cnt_month > 0) &
               (data.item_cnt_month <= 20)].item_cnt_month
feature_name = "Item Count Month"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name, "Monthly Sales")

When an item is sold, more than half of monthly counts are only one item sold by a given shop.  
Only one or two units are sold by a shop in a given month for most items.  
Let's clip the max number of sold items to 20 to get a better idea of distribution in small numbers of sales.

In [None]:
feature = np.log(
    data[(data.item_cnt_month > 0) &
         (data.item_cnt_month <= 20)].item_cnt_month)
feature_name = "Log of Item Count Month"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name, "Log of Monthly Sales")

The distribution of the log of positive item counts is still far from normality. It has a strong positive skewness. But it is more interesting than the raw positive item count because the number and the range of outliers are lower.

In [None]:
feature = data.item_cnt_month.clip(0, 20)
feature_name = "Clipped Item Cnt Month"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name, "Clipped Monthly Sales")

In [None]:
feature = data[data.item_cnt_month > 0].\
    item_cnt_month.clip(0, 20)
feature_name = "Clipped Item Cnt Month"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name, "Clipped Monthly Sales")

There's a huge positive skewness. There are almost no items with 10 to 19 sales. The most interesting items are those with 20 sales (remember that we've clipped the max number of sales to 20 for this graph), because they will probably count for a big part of the income of the company.

Conclusions to keep in mind when doing predictions :
- most monthly sales are equal to 0 ;
- when there are sales, most of them are equal to 1 or 2 ;
- there are a significant number of items with 20 or more sales by month and shop. They may be the most important items for the company from an economical point of view.

In [None]:
# Rebuild the positive, clipped counts from `data` instead of transforming
# the previous cell's `feature`, so re-running this cell never applies the
# log twice.
feature = np.log(
    data[data.item_cnt_month > 0].item_cnt_month.clip(0, 20))
feature_name = "Log of Clipped Values"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name, "Log of Clipped Monthly Sales")

The distribution of the log of positive clipped item counts is the closest to the normal distribution.

## 1.2. Seniority features

I don't analyse the features related to the month of the first or last sale. I prefer to look at the seniority of items or at the time elapsed since last sale.

In [None]:
feature = data.item_seniority
feature_name = "Item seniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name, "Seniority (in months)",
                    bins=34, ymax=0.06)

In [None]:
feature = data.shop_seniority
feature_name = "Shop seniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name, "Seniority (in months)",
                    bins=34, ymax=0.06)

In [None]:
feature = data.shopitem_seniority
feature_name = "Shop/Item seniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name, "Seniority (in months)",
                    bins=34, ymax=0.07)

Only small shops have been stopped. So the seniority curve is not affected when a shop stops.  
The curve of items seniority is explained by the fact that data is only for the active shop/item pairs. The shape of the curve is due to the fact that an item is not sold by all shops at the month of its launch.

## 1.3. "Stopped" features

In [None]:
feature = data.item_stopped
feature_name = "Item stopped"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Months since stopped (in months)",
                    bins=34, ymax=0.06)

In [None]:
feature = data.shop_stopped
feature_name = "Shop stopped"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Months since stopped (in months)",
                    bins=34, ymax=0.06)

In [None]:
feature = data.shopitem_stopped
feature_name = "Shop/Item stopped"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Months since stopped (in months)",
                    bins=34, ymax=0.06)

Statistical tests reject the normal distribution. However, the plots show that the distribution of the number of months since the last sale for a given item or a given shop/item pair is close to normality.  
It reflects that items stop selling on a regular basis. There is a regular turnover in sold items.

## 1.4. Item seniority when first sell occurs

In [None]:
feature = data.shop_avg_itemseniority_firstsell
feature_name = "Shop Avg Itemseniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "In months")

In [None]:
feature = np.log(data.shop_avg_itemseniority_firstsell)
feature_name = "Log of Shop Avg Itemseniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Log of months", ymax=1.4)

In [None]:
feature = data.shop_min_itemseniority_firstsell
feature_name = "Shop Min Itemseniority"
feature.describe()

In [None]:
feature = data.shop_max_itemseniority_firstsell
feature_name = "Shop Max Itemseniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "In months")

The most interesting feature is the average. It is the closest to normal distribution. Using its log, we obtain a distribution really close to normality, except for its min value.  
The min and max features are not interesting to analyse.  
Min value : all shops sell at least one item the first month it is sold (I should say month zero).  
Max value : most shops never sell an existing item. This is due to the fact that some shops don't sell some categories.

In [None]:
feature = data.category_avg_itemseniority_firstsell
feature_name = "Cat Avg Itemseniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "In months")

In [None]:
feature = data.category_min_itemseniority_firstsell
feature_name = "Cat Min Itemseniority"
feature.describe()

In [None]:
feature = data.category_max_itemseniority_firstsell
feature_name = "Cat Max Itemseniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "In months")

Again, the most interesting feature is the average one.

In [None]:
feature = data.maincategory_avg_itemseniority_firstsell
feature_name = "MainCat Avg Itemseniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "In months")

In [None]:
feature = data.maincategory_min_itemseniority_firstsell
feature_name = "MainCat Min Itemseniority"
feature.describe()

In [None]:
feature = data.maincategory_max_itemseniority_firstsell
feature_name = "MainCat Max Itemseniority"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "In months")

One more time, the only feature of interest seems to be the average value by main category.  
The distribution by main category is a bit weird because values from Q1 to Q3 are almost the same. I prefer the shape of the distribution by category.

## 1.5. Date features

In [None]:
feature = data.month
feature_name = "Month"
feature.describe()

In [None]:
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

We have fewer occurrences in months 11 and 12 because the data cover one year less for those two months.  
The number of occurrences increases month after month. This is logical as we keep stopped items with a sale number of 0.

In [None]:
feature = data.nb_days
feature_name = "Number of days in the month"
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

In [None]:
feature = data.nb_mondays
feature_name = "Number of mondays in the month"
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

In [None]:
# Plot the Tuesday counts to match the label below (the cell previously
# re-plotted nb_mondays). Assumes the column is named nb_tuesdays like its
# nb_<weekday>s siblings - TODO confirm against the CSV header.
feature = data.nb_tuesdays
feature_name = "Number of tuesdays in the month"
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

In [None]:
feature = data.nb_wednesdays
feature_name = "Number of wednesdays in the month"
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

In [None]:
feature = data.nb_thursdays
feature_name = "Number of thursdays in the month"
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

In [None]:
feature = data.nb_fridays
feature_name = "Number of fridays in the month"
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

In [None]:
feature = data.nb_saturdays
feature_name = "Number of saturdays in the month"

graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

In [None]:
feature = data.nb_sundays
feature_name = "Number of sundays in the month"
graph_categ_feature(feature, feature_name, index_as_int=True,
                   ticklabels_rotation=0)

Nothing special about the number of days in a month.

## 1.6. Categorical features

In [None]:
feature = data.shop_city
feature_name = "Shop City"
graph_categ_feature(feature, feature_name,
                   ticklabels_rotation=90)

A large part of sales occur in Moscow, where the company has several stores.

In [None]:
feature = data.shop_isonline
feature_name = "Online shops"
graph_categ_feature(feature, feature_name)

In [None]:
feature = data.item_category_id
feature_name = "Category ID"
graph_categ_feature(feature, feature_name,
                   ticklabels_rotation=90,
                   labelsize=11)

In [None]:
feature = data.main_category
feature_name = "Main Category"
graph_categ_feature(feature, feature_name,
                   ticklabels_rotation=90)

The sales between categories are unbalanced. One or two categories concentrate the sales within a given main category.

In [None]:
feature = data.category_online
feature_name = "Items sold only by the online warehouse"
graph_categ_feature(feature, feature_name)

In [None]:
feature = data.category_emergency
feature_name = "Items sold only by the online emergency"
graph_categ_feature(feature, feature_name)

Online stores don't account for a lot of sales.

I keep numeric features of interest to run a PCA analysis with other features.

In [None]:
# Keep the identifier columns and the numeric descriptive features of
# interest as an independent frame, and add the target clipped to [0, 20].
desc_data = data[[
    "date_block_num", "shop_id", "item_id",
    "item_cnt_month", "item_seniority", "shop_seniority",
    "shopitem_seniority", "item_stopped", "shopitem_stopped",
    "shop_avg_itemseniority_firstsell",
    "category_avg_itemseniority_firstsell",
    "maincategory_avg_itemseniority_firstsell",
]].assign(
    clipped_item_cnt_month=lambda frame: frame.item_cnt_month.clip(0, 20))

# 2. Quantitative data

In [None]:
# Load the "no lags" feature table and shrink its numeric dtypes.
# NOTE: this rebinds `data`; the descriptive table loaded earlier is no
# longer reachable under that name (only the copy kept in `desc_data` is).
data = pd.read_csv("../input/an-economic-oriented-feature-engineering/alldata_nolags.csv")
data = downcast_df_int8(data)
data.info()

## 2.1. Shop/Item data

In [None]:
feature = data.month_shopitem_price
feature_name = "Shop/Item Price"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
feature = np.log(data.month_shopitem_price)
feature_name = "Log of Shop/Item Price"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Log of Rubles")

The distribution of the log of shop/item price is pretty close to the normal distribution.

In [None]:
feature = data.month_shopitem_price_change
feature_name = "Shop/Item Price Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "In percentage")

A price variation of more than 200% is pretty rare. So I limit the range analysis.

In [None]:
feature = data[data.month_shopitem_price_change < 2].\
    month_shopitem_price_change
feature_name = "Shop/Item Price Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "In percentage")

In [None]:
feature = data.month_shopitem_price_change.clip(-1, 2)
feature_name = "Shop/Item Price Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "In percentage")

In [None]:
feature = np.cbrt(
    data.month_shopitem_price_change.clip(-1, 2))
feature_name = "Cbrt of Price Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "In percentage")

It's difficult to get a close-to-normality distribution by transforming the shop/item price change. This is due to the fact that Q1 to Q3 values are equal to 0.

I remove -1 values of item count month change because they are linked to an item count moving from a positive value to a zero value.

In [None]:
feature = data[data.item_cnt_month_change > -1].item_cnt_month_change
feature_name = "Item Count Month Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

Monthly increases of over 200% are rare. I remove them to analyse the distribution.

In [None]:
feature = data[(data.item_cnt_month_change > -1) &
    (data.item_cnt_month_change < 2)].item_cnt_month_change
feature_name = "Item Count Month Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data[data.item_cnt_month_change > -1].\
    item_cnt_month_change.clip(-1, 2)
feature_name = "Item Count Month Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

Once again, a lot of 0 values give a weird distribution curve. We also have a lot of 100% increases, which correspond to a product being sold again after a month without sales.  
I look at the distribution if I keep -1 values of Item Count Month Change.

In [None]:
feature = data.item_cnt_month_change.clip(-1, 2)
feature_name = "Item Count Month Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopitem_relative_price
feature_name = "Shop/Item Relative Price"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data[data.month_shopitem_relative_price < 2].\
    month_shopitem_relative_price
feature_name = "Shop/Item Relative Price"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = \
    data.month_shopitem_relative_price.clip(-1, 2)
feature_name = "Shop/Item Relative Price"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# The transform applied here is a cube root, so the label says "Cbrt"
# (as in the matching price-change cell) rather than "Log".
feature = np.cbrt(
    data.month_shopitem_relative_price.clip(-1, 2))
feature_name = "Cbrt of Relative Price"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

Still a lot of zero values (from Q1 to Q3). And the same conclusions.  
I prefer not to apply the log transformation because its distribution plot has several local maxima.

In [None]:
feature = data[data.month_shopitem_relative_sales > -1].\
    month_shopitem_relative_sales
feature_name = "Shop/Item Relative Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data[(data.month_shopitem_relative_sales > -1) &
               (data.month_shopitem_relative_sales < 2)].\
    month_shopitem_relative_sales
feature_name = "Shop/Item Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data[data.month_shopitem_relative_sales > -1].\
    month_shopitem_relative_sales.clip(-1, 2)
feature_name = "Shop/Item Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopitem_relative_sales.clip(-1, 2)
feature_name = "Shop/Item Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = np.log1p(
    data[data.month_shopitem_relative_sales > -1].\
    month_shopitem_relative_sales.clip(-1, 2))
feature_name = "Log of Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

Still a lot of 0 values.

In [None]:
feature = data.month_shopitem_compared_price
feature_name = "Shop/Item Compared Price"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data[data.month_shopitem_compared_price < 2].\
    month_shopitem_compared_price
feature_name = "Shop/Item Compared Price"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopitem_compared_price.clip(-1, 2)
feature_name = "Shop/Item Compared Price"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

Still a weird distribution with a lot of zero values.

In [None]:
feature = data[data.month_shopitem_compared_sales > -1].\
    month_shopitem_compared_sales
feature_name = "Shop/Item Compared Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data[(data.month_shopitem_compared_sales > -1) &
    (data.month_shopitem_compared_sales < 2)].\
    month_shopitem_compared_sales
feature_name = "Shop/Item Compared Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data[data.month_shopitem_compared_sales > -1].\
    month_shopitem_compared_sales.clip(-1, 2)
feature_name = "Shop/Item Compared Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopitem_compared_sales.clip(-1, 2)
feature_name = "Shop/Item Compared Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = np.log1p(
    data[data.month_shopitem_compared_sales > -1].\
    month_shopitem_compared_sales.clip(-1, 2))
feature_name = "Log of Compared Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

Same weird distribution with a lot of zero values and high outliers.

## 2.2. Global data

In [None]:
feature = data.month_global_sumsales
feature_name = "Global Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_global_revenue
feature_name = "Global Revenue"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
feature = np.log(data.month_global_revenue)
feature_name = "Log of Global Revenue"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
feature = data.month_global_sumsales_change
feature_name = "Global Sales Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_global_revenue_change
feature_name = "Global Revenue Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_global_relative_sumsales
feature_name = "Global Relative Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=3.5)

In [None]:
feature = data.month_global_relative_revenue
feature_name = "Global Relative Revenue"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = np.log1p(data.month_global_relative_revenue)
feature_name = "Log of Global Relative Revenue"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

Global data don't necessarily need transformation, except the global relative revenue. Extreme values always make it harder to get close to normality.

## 2.3. Item data

In [None]:
feature = data.month_item_price
feature_name = "Item Price"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
feature = np.log(data.month_item_price)
feature_name = "Log of Item Price"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles", ymax=0.5)

In [None]:
# Item sales restricted to rows with strictly positive sales.
feature_name = "Item Sales"
feature = data.loc[data["month_item_sales"] > 0, "month_item_sales"]
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
# Item sales restricted to the range (0, 20]: rows outside it are dropped,
# not clipped.
feature_name = "Item Sales"
feature = data.loc[
    (data["month_item_sales"] > 0) & (data["month_item_sales"] <= 20),
    "month_item_sales"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
# Strictly positive item sales, with values above 20 clipped down to 20.
feature_name = "Item Sales"
feature = data.loc[
    data["month_item_sales"] > 0, "month_item_sales"].clip(0, 20)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_item_sales.clip(0, 20)
feature_name = "Item Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = np.log1p(
    data.month_item_sales.clip(0, 20))
feature_name = "Log of Item Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
# Natural log of strictly positive item sales, clipped to [0, 20] first.
feature_name = "Log of Item Sales"
feature = np.log(
    data.loc[data["month_item_sales"] > 0,
             "month_item_sales"].clip(0, 20))
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_item_price_change
feature_name = "Item Price Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Item price change, keeping only rows below 2 (rows at or above 2 are dropped).
feature_name = "Item Price Change"
feature = data.loc[
    data["month_item_price_change"] < 2, "month_item_price_change"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_item_price_change.clip(-1, 2)
feature_name = "Item Price Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Item sales change, keeping only rows strictly above -1.
feature_name = "Item Sales Change"
feature = data.loc[
    data["month_item_sales_change"] > -1, "month_item_sales_change"]
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Item sales change restricted to the open interval (-1, 2).
feature_name = "Item Sales Change"
feature = data.loc[
    (data["month_item_sales_change"] > -1)
    & (data["month_item_sales_change"] < 2),
    "month_item_sales_change"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Item sales change strictly above -1, with the upper tail clipped to 2.
feature_name = "Item Sales Change"
feature = data.loc[
    data["month_item_sales_change"] > -1,
    "month_item_sales_change"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_item_sales_change.clip(-1, 2)
feature_name = "Item Sales Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_item_relative_price
feature_name = "Item Relative Price"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Item relative price, keeping only rows below 2.
feature_name = "Item Relative Price"
feature = data.loc[
    data["month_item_relative_price"] < 2, "month_item_relative_price"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_item_relative_price.clip(-1, 2)
feature_name = "Item Relative Price"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = my_signed_sqrt(
    data.month_item_relative_price.clip(-1, 2))
feature_name = "Sqrt of Item Relative Price"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2.5)

The sign-corrected square root has a distribution closer to the normal distribution according to the Henry plot (normal probability plot), but not according to the distribution plot, which shows a multi-modal distribution.

In [None]:
# Item relative sales, keeping only rows strictly above -1.
feature_name = "Item Relative Sales"
feature = data.loc[
    data["month_item_relative_sales"] > -1, "month_item_relative_sales"]
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Item relative sales restricted to the open interval (-1, 2).
feature_name = "Item Relative Sales"
feature = data.loc[
    (data["month_item_relative_sales"] > -1)
    & (data["month_item_relative_sales"] < 2),
    "month_item_relative_sales"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2.5)

In [None]:
# Item relative sales strictly above -1, with the upper tail clipped to 2.
feature_name = "Item Relative Sales"
feature = data.loc[
    data["month_item_relative_sales"] > -1,
    "month_item_relative_sales"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2.5)

In [None]:
feature = data.month_item_relative_sales.clip(-1, 2)
feature_name = "Item Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2.5)

Same shape of distribution as Item Relative Prices.

## 2.4. Shop data

In [None]:
feature = data.month_shop_sumsales
feature_name = "Shop Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
# Shop sales restricted to rows with strictly positive values.
feature_name = "Shop Sales"
feature = data.loc[data["month_shop_sumsales"] > 0, "month_shop_sumsales"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
# Natural log of the strictly positive shop sales.
feature_name = "Log of Shop Sales"
feature = np.log(
    data.loc[data["month_shop_sumsales"] > 0, "month_shop_sumsales"])
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = np.log1p(
    data.month_shop_sumsales)
feature_name = "Log of Shop Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_shop_revenue
feature_name = "Shop Revenue"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
# Shop revenue restricted to rows with strictly positive values.
feature_name = "Shop Revenue"
feature = data.loc[data["month_shop_revenue"] > 0, "month_shop_revenue"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
# Natural log of the strictly positive shop revenue.
feature_name = "Log of Shop Revenue"
feature = np.log(
    data.loc[data["month_shop_revenue"] > 0, "month_shop_revenue"])
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
feature = np.log1p(
    data.month_shop_revenue)
feature_name = "Log of Shop Revenue"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Rubles")

In [None]:
# Shop sales change, keeping only rows strictly above -1.
feature_name = "Shop sales Change"
feature = data.loc[
    data["month_shop_sumsales_change"] > -1,
    "month_shop_sumsales_change"]
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Shop sales change restricted to the open interval (-1, 2).
feature_name = "Shop sales Change"
feature = data.loc[
    (data["month_shop_sumsales_change"] > -1)
    & (data["month_shop_sumsales_change"] < 2),
    "month_shop_sumsales_change"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Shop sales change strictly above -1, with the upper tail clipped to 2.
feature_name = "Shop sales Change"
feature = data.loc[
    data["month_shop_sumsales_change"] > -1,
    "month_shop_sumsales_change"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=3.0)

In [None]:
feature = data.month_shop_sumsales_change.clip(-1, 2)
feature_name = "Shop sales Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=3.0)

In [None]:
# Shop revenue change, keeping only rows strictly above -1.
feature_name = "Shop Revenue Change"
feature = data.loc[
    data["month_shop_revenue_change"] > -1, "month_shop_revenue_change"]
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Shop revenue change restricted to the open interval (-1, 2).
feature_name = "Shop Revenue Change"
feature = data.loc[
    (data["month_shop_revenue_change"] > -1)
    & (data["month_shop_revenue_change"] < 2),
    "month_shop_revenue_change"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Shop revenue change strictly above -1, with the upper tail clipped to 2.
feature_name = "Shop Revenue Change"
feature = data.loc[
    data["month_shop_revenue_change"] > -1,
    "month_shop_revenue_change"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shop_revenue_change.clip(-1, 2)
feature_name = "Shop Revenue Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Shop relative sales, keeping only rows strictly above -1.
feature_name = "Shop Relative Sales"
feature = data.loc[
    data["month_shop_relative_sumsales"] > -1,
    "month_shop_relative_sumsales"]
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Shop relative sales restricted to the open interval (-1, 2).
feature_name = "Shop Relative Sales"
feature = data.loc[
    (data["month_shop_relative_sumsales"] > -1)
    & (data["month_shop_relative_sumsales"] < 2),
    "month_shop_relative_sumsales"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2.25)

In [None]:
# Shop relative sales strictly above -1, with the upper tail clipped to 2.
feature_name = "Shop Relative Sales"
feature = data.loc[
    data["month_shop_relative_sumsales"] > -1,
    "month_shop_relative_sumsales"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2.25)

In [None]:
feature = data.month_shop_relative_sumsales.clip(-1, 2)
feature_name = "Shop Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2.25)

In [None]:
# Shop relative revenue, keeping only rows strictly above -1.
feature_name = "Shop Relative Revenue"
feature = data.loc[
    data["month_shop_relative_revenue"] > -1,
    "month_shop_relative_revenue"]
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Shop relative revenue restricted to the open interval (-1, 2).
feature_name = "Shop Relative Revenue"
feature = data.loc[
    (data["month_shop_relative_revenue"] > -1)
    & (data["month_shop_relative_revenue"] < 2),
    "month_shop_relative_revenue"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=1.8)

In [None]:
# Shop relative revenue strictly above -1, with the upper tail clipped to 2.
feature_name = "Shop Relative Revenue"
feature = data.loc[
    data["month_shop_relative_revenue"] > -1,
    "month_shop_relative_revenue"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2)

In [None]:
feature = data.month_shop_relative_revenue.clip(-1, 2)
feature_name = "Shop Relative Revenue"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=2)

In [None]:
# log(1 + x) of the shop relative revenue, after dropping rows at or below -1
# and clipping the upper tail to 2.
feature_name = "Log of Shop Relative Revenue"
feature = np.log1p(
    data.loc[data["month_shop_relative_revenue"] > -1,
             "month_shop_relative_revenue"].clip(-1, 2))
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage", ymax=1.75)

## 2.5. Category Data

In [None]:
feature = data.month_category_sales
feature_name = "Category Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
# Category sales, keeping only rows strictly below 20.
feature_name = "Category Sales"
feature = data.loc[
    data["month_category_sales"] < 20, "month_category_sales"]
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_category_sales.clip(0, 20)
feature_name = "Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = np.log1p(data.month_category_sales.clip(0, 20))
feature_name = "Log of Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units", ymax=3)

In [None]:
feature = data.month_category_sales_change
feature_name = "Category Sales Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Category sales change strictly above -1, with the upper tail clipped to 2.
feature_name = "Category Sales Change"
feature = data.loc[
    data["month_category_sales_change"] > -1,
    "month_category_sales_change"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_category_sales_change.clip(-1, 2)
feature_name = "Category Sales Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_category_relative_sales
feature_name = "Category Relative Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
# Category relative sales strictly above -1, with the upper tail clipped to 2.
feature_name = "Category Relative Sales"
feature = data.loc[
    data["month_category_relative_sales"] > -1,
    "month_category_relative_sales"].clip(-1, 2)
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_category_relative_sales.clip(-1, 2)
feature_name = "Category Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

## 2.6. Main Category Data

In [None]:
feature = data.month_maincategory_sales
feature_name = "Main Category Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_maincategory_sales.clip(0, 20)
feature_name = "Main Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = np.log1p(
    data.month_maincategory_sales.clip(0, 20))
feature_name = "Log of Main Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_maincategory_sales_change
feature_name = "Main Category Sales Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_maincategory_sales_change.clip(-1, 2)
feature_name = "Main Category Sales Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_maincategory_relative_sales
feature_name = "Main Category Relative Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_maincategory_relative_sales.clip(-1, 2)
feature_name = "Main Category Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

## 2.7. Shop/Category Data

In [None]:
feature = data.month_shopcategory_sales
feature_name = "Shop/Category Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_shopcategory_sales.clip(0, 20)
feature_name = "Shop/Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = np.log1p(
    data.month_shopcategory_sales.clip(0, 20))
feature_name = "Log of Shop/Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_shopcategory_sales_change
feature_name = "Shop/Category Sales Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopcategory_sales_change.clip(-1, 2)
feature_name = "Shop/Category Sales Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopcategory_relative_sales
feature_name = "Shop Category Relative Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopcategory_relative_sales.clip(-1, 2)
feature_name = "Shop Category Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

## 2.8. Shop/Main Category Data

In [None]:
feature = data.month_shopmaincategory_sales
feature_name = "Shop/Main Category Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_shopmaincategory_sales.clip(0, 20)
feature_name = "Shop/Main Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = np.log1p(
    data.month_shopmaincategory_sales.clip(0, 20))
feature_name = "Log of Shop/Main Category Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Units")

In [None]:
feature = data.month_shopmaincategory_sales_change
feature_name = "Shop/Main Category Sales Change"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopmaincategory_sales_change.clip(-1, 2)
feature_name = "Shop/Main Category Sales Change"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopmaincategory_relative_sales
feature_name = "Shop/Main Category Relative Sales"
feature.describe()

In [None]:
test_normality(feature, feature_name)

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

In [None]:
feature = data.month_shopmaincategory_relative_sales.clip(-1, 2)
feature_name = "Shop/Main Category Relative Sales"
feature.describe()

In [None]:
graph_quant_feature(feature, feature_name,
                    "Percentage")

## 2.9. Suggestion of treatment for the quantitative features

There are a lot of outliers in the high values, as we've seen during the analysis.  
Change and relative features cannot go below -100%. On the upside, they sometimes reach very high values. An increase of 200% is already a big increase, just as a relative value of 200% compared to the mean is already a large value. I decided to cap them all at 200%, in the same way that the predictions are capped at 20 units. This amounts to grouping all large outlier values into the single value of +200%.  
I am open to discuss the logic of this decision in the comments.

In [None]:
# Remove the columns that are not kept for the final feature set.
# errors="ignore" makes the cell safe to re-run: with the previous
# `inplace=True` call, a second execution raised KeyError because the
# columns were already gone. The trade-off is that a misspelled column name
# is silently skipped, so the names below must match the frame exactly.
data = data.drop(
    columns=["item_name", "month_shopitem_revenue",
             "month_shopitem_hassales"],
    errors="ignore")

In [None]:
# Final treatment of the quantitative features, written back into `data`.
#
# NOTE: this cell is NOT idempotent. The log / cube-root transforms overwrite
# their own input columns, so running it twice without reloading `data`
# applies them twice. Re-run the notebook from the data-loading cell instead.
#
# Columns deliberately left as they are (the previous version re-assigned
# them to themselves, which had no effect):
#   month_global_sumsales, month_global_sumsales_change,
#   month_global_revenue_change, month_global_relative_sumsales,
#   month_shop_sumsales, month_shop_revenue

RATIO_RANGE = (-1, 2)  # change / relative / compared features: -100% to +200%
SALES_RANGE = (0, 20)  # sales volumes are capped at 20 units

# Ratio-like features: only clipped to [-1, 2].
for column in [
        "item_cnt_month_change",
        "month_shopitem_relative_sales",
        "month_shopitem_relative_price",
        "month_shopitem_compared_sales",
        "month_shopitem_compared_price",
        "month_item_price_change",
        "month_item_sales_change",
        "month_item_relative_price",
        "month_item_relative_sales",
        "month_shop_sumsales_change",
        "month_shop_revenue_change",
        "month_shop_relative_sumsales",
        "month_shop_relative_revenue",
        "month_category_sales_change",
        "month_category_relative_sales",
        "month_maincategory_sales_change",
        "month_maincategory_relative_sales",
        "month_shopcategory_sales_change",
        "month_shopcategory_relative_sales",
        "month_shopmaincategory_sales_change",
        "month_shopmaincategory_relative_sales"]:
    data[column] = data[column].clip(*RATIO_RANGE)

# Price and revenue levels: natural log.
for column in [
        "month_shopitem_price",
        "month_global_revenue",
        "month_item_price"]:
    data[column] = np.log(data[column])

# Sales volumes: clipped to [0, 20], then log(1 + x).
for column in [
        "month_item_sales",
        "month_category_sales",
        "month_maincategory_sales",
        "month_shopcategory_sales",
        "month_shopmaincategory_sales"]:
    data[column] = np.log1p(data[column].clip(*SALES_RANGE))

# One-off treatments.
# Global relative revenue: log(1 + x), without clipping.
data["month_global_relative_revenue"] = np.log1p(
    data["month_global_relative_revenue"])
# Shop/item price change: clipped to [-1, 2], then cube root (keeps the sign).
data["month_shopitem_price_change"] = np.cbrt(
    data["month_shopitem_price_change"].clip(*RATIO_RANGE))

Last conclusion: category-based data seem to be a better option than main-category-based data from the distribution point of view.