In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
print('Done!')

In [None]:
# Optional PySpark setup, kept in case it is needed later.
# The constant `if False:` guard means nothing in this cell runs as written.
if False:
    !pip install --quiet --upgrade pip
    !pip install --quiet pyspark

    import pyspark
    from pyspark.sql import SparkSession
    # getOrCreate() builds the session configured below (or reuses an existing one).
    spark = SparkSession.builder \
            .master("local[*]") \
            .appName('pred_nn_model') \
            .config('spark.executor.memory','12gb') \
            .getOrCreate()

    sc = spark.sparkContext
    print('Spark Cluster Initialized Successfully!')

## 1. Import relevant libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import tensorflow as tf
import tqdm
import time
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler  # for median/range or Robust Scaling
from sklearn.preprocessing import Normalizer  # for vector unit-length-norm

import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
# for the Q-Q plots
import scipy.stats as stats

!pip install --upgrade --quiet pip
!pip install --quiet feature-engine
from feature_engine.outliers import Winsorizer

!pip install --quiet gswrap
import gswrap


# to display the total number columns present in the dataset
pd.set_option('display.max_columns', None)

print('imported!')

## 2. TPU Check...

In [None]:
# Pick a distribution strategy: try to set up a TPU first and fall back to
# MirroredStrategy when any step of the TPU setup raises.
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception:
    # TPU setup failed -> use the strategy meant for GPU or multi-GPU machines.
    strategy = tf.distribute.MirroredStrategy()
    # Alternatives kept for reference:
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)

## 3. Read in The Data Files...

In [None]:
train = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv', index_col=['id'])
print('Train data shape is:',train.shape)
train.head(3)

In [None]:
test = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv', index_col=['id'])
print('Test data shape is:',test.shape)
test.head(3)

In [None]:
sample_sub = pd.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')
print('Sample data shape is:',sample_sub.shape)
sample_sub.head(3)

## 4. Checking for missing Values

In [None]:
train.isna().any().sum()

In [None]:
test.isna().any().sum()

## 5. Let's see the summary info of the data

In [None]:
train.info()

## 6. Check Cardinality of Categorical Variables..

**Let's check the cardinality of each Cat variable and delete those with high cardinality...**

In [None]:
cat_vars = train.select_dtypes('object')
cat_vars.shape

**Count the cardinality values per categorical column**

In [None]:
cat_vars.nunique()

**Let's see one of the variables**

In [None]:
cat_vars['cat0'].unique()

**Let's make a plot with the cardinality of each categorical variable**

In [None]:
cat_vars.nunique().plot.bar(figsize=(12,6))
plt.ylabel('Number of unique categories')
plt.xlabel('Variables')
plt.title('Cardinality of Categorical Variables', fontweight='bold', fontsize=16)
plt.show()

**Let's define a method that plots each Cat feature and a threshold of percentage importance per Cat feature-value to each Cat feature.**

In [None]:
def category_freq(df, x, y, thresh):
    """Plot the label frequencies of two columns of ``df`` side by side.

    Parameters
    ----------
    df : DataFrame whose columns are inspected.
    x, y : positional indices (into ``df.columns``) of the two columns to plot.
    thresh : height at which a horizontal red reference line is drawn.

    Each panel shows, per label, its count divided by ``len(df)``, sorted in
    descending order. The figure is shown with ``plt.show()``; nothing is returned.
    """
    label_font = {'fontweight':'bold','fontsize':14}
    names = list(df.columns)
    left_name, right_name = names[x], names[y]
    n_rows = len(df)

    # Share of rows carried by each label, per column.
    left_share = df[left_name].value_counts() / n_rows
    right_share = df[right_name].value_counts() / n_rows

    fig, ax = plt.subplots(1, 2, figsize=(16,5))

    # (column name, label shares, target axes, extra bar options) for each panel.
    panels = (
        (left_name, left_share, ax[0], {}),
        (right_name, right_share, ax[1], {'color': 'y'}),
    )
    for name, share, axes, bar_options in panels:
        bars = share.sort_values(ascending=False).plot.bar(ax=axes, **bar_options)
        bars.axhline(y=thresh, color='red')
        bars.set_ylabel('Pct. values per category', fontdict=label_font)
        bars.set_xlabel(f'Variable: {name}', fontdict=label_font)
        bars.set_title('Identifying Rare Categories', fontdict=label_font)

    plt.show()

## 7. Pinpointing rare categories in categorical variables

Categories that appear in a tiny proportion of the observations are rare. Typically, we consider a label to be rare when it appears in less than 5% or 1% of the population

In [None]:
# For the 1st and 2nd cat vars
category_freq(cat_vars, 0, 1, 0.1)

In [None]:
# For the 3rd and 4th cat vars
category_freq(cat_vars, 2, 3, 0.1)

In [None]:
# For the 5th and 6th cat vars
category_freq(cat_vars, 4, 5, 0.1)

In [None]:
# For the 7th and 8th cat vars
category_freq(cat_vars, 6, 7, 0.1)

In [None]:
# For the 9th and 10th cat vars
category_freq(cat_vars, 8, 9, 0.1)

## 8. Visualize the relationship between Categorical variables and Target

**Let's investigate the relationship between cat0 and Target**

In [None]:
def box_plots(df, col):
    """Show a box plot of the ``target`` column grouped by the values of ``col``.

    Parameters
    ----------
    df : DataFrame containing both ``col`` and a ``target`` column.
    col : name of the column whose values form the x-axis groups.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(10,8))
    sns.set_style('ticks')

    plt.title(f'Boxplot showing relations between {col} and Target', fontsize=14, fontweight='bold')
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, so the old positional call fails there.
    sns.boxplot(x=col, y='target', data=df)
    plt.xticks(color='red', fontsize=12, fontweight='bold')
    plt.yticks(color='red', fontsize=12, fontweight='bold')

    plt.show()

In [None]:
for col in cat_vars.columns:
    box_plots(train, col)

As much as possible, we want to have categorical variables in a column that seem to have distinct relationship with target, not too similar, hence they lose their predictive power.<br>
Let's see the average target score per categorical variable per column

In [None]:
# Average target for cat0

train[['cat0', 'target']].groupby('cat0').mean()

In [None]:
# Average target for cat1

train[['cat1', 'target']].groupby('cat1').mean()

In [None]:
# Average target for cat2

train[['cat2', 'target']].groupby('cat2').mean()

In [None]:
# Average target for cat3

train[['cat3', 'target']].groupby('cat3').mean()

In [None]:
# Average target for cat4

train[['cat4', 'target']].groupby('cat4').mean()

In [None]:
# Average target for cat5

train[['cat5', 'target']].groupby('cat5').mean()

## 9. One-Hot Encoding Categorical Variables

**Each column of values seem too alike and it makes no sense to keep all. We observe this through the boxplot as well as the average comparison to the target variable done above for each cat column. we'd drop Cat O-H-E columns that don't have significance up to a certain threshold we choose.**

In [None]:
# The original column names of Categorical columns
obj_cols = cat_vars.columns
print('Done!')

In [None]:
print('Before O-H-E, cat-vars has shape:',cat_vars.shape)

In [None]:
def drop_rare_cat_cols(df=None, thresh=0.1):
    """Name the one-hot columns that would belong to rare labels.

    For every column of ``df``, each label whose share of the rows is
    ``<= thresh`` is reported as ``'<column>_<label>'``, i.e. the name that
    ``pd.get_dummies`` gives the matching One-Hot-Encoded (O-H-E) column,
    so the result can be dropped right after encoding.

    Parameters
    ----------
    df : DataFrame of categorical columns. When omitted, the module-level
        ``cat_vars`` frame is used, looked up at call time.
    thresh : maximum row share (0-1) for a label to count as rare.

    Returns
    -------
    list of str : the O-H-E column names of all rare labels.
    """
    if df is None:
        # Late lookup keeps the zero-argument call working without binding a
        # (possibly stale) frame at definition time.
        df = cat_vars

    drop_cols = []
    for col in df.columns:
        # Use the frame that was passed in, not the global one.
        shares = df[col].value_counts() / len(df)
        rare_labels = shares[shares <= thresh].index
        drop_cols.extend(f'{col}_{label}' for label in rare_labels)

    return drop_cols

Here, we want to grab the rare cat-column names, just as they would appear after the O-H-E, so that we can drop them off as individual columns from cat_vars dataframe, immediately after applying O-H-E.

In [None]:
rare_categories = drop_rare_cat_cols()
print(len(rare_categories))
rare_categories[-5:]

In [None]:
cat_vars = pd.get_dummies(cat_vars)
print('After O-H-E, cat-vars has shape:',cat_vars.shape)
cat_vars.head(3)

## 10. Drop categorical columns with 10% or less value contribution per variable

**Let's first make a copy of the original train set and use this for the transformations**

In [None]:
train_copy = deepcopy(train)
print(train_copy.shape)

In [None]:
cat_vars.drop(rare_categories, axis=1, inplace=True)
train_copy.drop(list(obj_cols), axis=1, inplace=True)  # drop former cat column names since O-H-E

print('Done!')

**Merge both dataframes...**

In [None]:
print('After dropping rare-categories, cat_vars shape is now:', cat_vars.shape)
cat_vars.head(3)

In [None]:
train_copy.head(3)

**Since both dataframes have different columns, using pd.concat will return NAN values. So let's use a simple join, since they both have the same index col.**

In [None]:
# Both frames are indexed by `id`, so a plain index-on-index join lines the
# rows up; `on=` is meant for column names of the caller and is not needed.
# NOTE: re-running this cell on an already-joined frame raises on the
# overlapping column names.
train_copy = train_copy.join(cat_vars)

print('Train copy data shape is:', train_copy.shape)
train_copy.head(3)

**Double-check for possible NAN values after join**

In [None]:
# Double check for NAN values after join
train_copy.isna().any().sum()

In [None]:
# Let's delete cat_vars from memory
del cat_vars
print('Deleted!')

## 11. Visualizing The Relationship between Numerical Variables and Target

In [None]:
num_cols = [col for col in train_copy.columns if not col.startswith('cat')]
len(num_cols)

In [None]:
# plot both together to compare

def reg_plots(df, col1, col2):
    """Draw two regression plots of ``target`` against ``col1`` and ``col2``.

    Parameters
    ----------
    df : DataFrame containing ``col1``, ``col2`` and a ``target`` column.
    col1, col2 : names of the columns plotted on the x-axis of the left and
        right panel respectively.

    Prints the time spent building the plots, then shows the figure.
    Nothing is returned.
    """
    # No tf.distribute strategy scope here: plotting does not create any
    # TensorFlow variables, so the scope had no effect.
    st = time.time()
    fig, ax = plt.subplots(1, 2, figsize=(10,5))

    # (column, scatter colour, axes) for each panel.
    panels = ((col1, 'r', ax[0]), (col2, 'yellow', ax[1]))
    for col, color, axes in panels:
        # x/y by keyword: recent seaborn releases reject positional data vectors.
        sns.regplot(x=df[col], y=df['target'], color=color,
                    line_kws={'color':'navy','linewidth':2.5}, ax=axes)
        axes.set_title(f"RegPlot: Target and {col}")

    print(f'Took {time.time()-st} secs!')
    plt.show()

In [None]:
# For cont0 and cont1

reg_plots(train_copy, num_cols[0], num_cols[1])

In [None]:
# For cont2 and cont3

reg_plots(train_copy, num_cols[2], num_cols[3])

In [None]:
# For cont4 and cont5

reg_plots(train_copy, num_cols[4], num_cols[5])

In [None]:
# For cont6 and cont7

reg_plots(train_copy, num_cols[6], num_cols[7])

In [None]:
# For cont8 and cont9

reg_plots(train_copy, num_cols[8], num_cols[9])

In [None]:
# For cont10 and cont11

reg_plots(train_copy, num_cols[10], num_cols[11])

In [None]:
# For cont12 and cont13

reg_plots(train_copy, num_cols[12], num_cols[13])

## 12. Correlation Strength

**Looking at the regplot for each numerical variable and Target, there is a general weak linear relationship.<br>Let's investigate further with a correlation matrix**

In [None]:
corr_df = train_copy[num_cols]
corr_data = corr_df.corr()

sns.set_style('ticks')
plt.figure(figsize=(14,10))
plt.title('Numerical Variables Correlation Matrix', fontsize=16)

sns.heatmap(corr_data, annot=True)

plt.show()

**Clearly there is no linear relationship between each numerical variable and Target<br>The only way to learn any meaningful representation is to use a non-linear style regression**

**One more important thing we can learn from the corr-matrix is that some columns may be highly correlated with each other, i.e. there may be multi-collinearity issues. We can identify these by the much lighter colors in the matrix. We need to treat all such columns**

## 13. Checking for Multi-collinearity

In [None]:
corr_data

In [None]:
# Collect every pair of distinct columns whose absolute correlation is >= 0.5
# as (row column, other column, correlation rounded to 2 decimals).
high_corrs = []
col_names = list(corr_data.columns)

for index, row in corr_data.iterrows():
    for other, r in row.items():
        # Skip the diagonal by name. Filtering on the rounded value == 1.0
        # would also discard distinct columns that are almost perfectly
        # correlated, which are exactly the pairs we are looking for.
        if other == index:
            continue
        if abs(r) >= 0.5:
            high_corrs.append((index, other, round(r, 2)))

In [None]:
high_corrs

## 14. Applying Variance-Inflation-Factor

**We shall use VIF to determine the overall columns with high multi-collinearity and sieve them out...**

**[Link](https://github.com/Lawrence-Krukrubo/Understanding_Multiple_Linear_Regression/blob/master/coefficients_of_multiple_linear_regression.ipynb)**

In [None]:
corr_cols=set()
for i in high_corrs:
    corr_cols.add(i[0])
    corr_cols.add(i[1])
corr_cols

**First we make the unique high-corr data a dataframe**

In [None]:
# corr_cols is a set: recent pandas refuses a set as a column indexer, and a
# set has no stable order. A sorted list gives a valid, reproducible selection.
data = train_copy[sorted(corr_cols)]
data.head(3)

**Next, we standardize the data**

In [None]:
def standardize(data_features):
    """Return ``data_features`` centred on its mean and divided by its std.

    Uses the object's own ``.mean()`` and ``.std()`` methods, so it works on
    anything that provides them and supports arithmetic (e.g. a pandas Series).
    """
    centred = data_features - data_features.mean()
    return centred / data_features.std()

In [None]:
data = data.apply(standardize, axis=0)
data.head()

**Next, we apply the VIF**

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
vif["features"] = data.columns

vif

**The VIF is a measure of collinearity among predictor variables within a multiple regression. If the outcome is 1, it’s okay. If it’s between 1 and 5, it shows low to average collinearity, and above 5 generally means highly redundant and the variable should be dropped.<br>In this case all VIF scores are just between 1 and 2.5 and this is not enough to drop the columns, so we continue...**

## 15. Dropping Target Variable

Most machine learning models work better with a normalized data set. We shall use the box-cox normalization for the numerical columns

In [None]:
# Let's remove the target column from num_cols.
# Remove it by name rather than by position: this does not depend on where
# 'target' sits in the list, and re-running the cell cannot pop a real feature.
if 'target' in num_cols:
    num_cols.remove('target')
len(num_cols)

In [None]:
target = train_copy.pop('target')
train.drop(['target'], axis=1, inplace=True)

train_copy.head(3)

## 16. Applying Box-Cox Transformation to Original Dataset...

**Here we use the Power Transform Function to Normalize the data**

First let's visualize the current shapes

In [None]:
def plot_hist(df,
              df_name,
              color=None,
              suptitle=None,
              size=(16,12),
             linewidth=1.5,
             edgecolor='black',
             density=True):
    """Plot a grid of histograms for the ``num_cols`` columns of ``df``.

    ``num_cols`` is the module-level list of numeric column names.

    Parameters
    ----------
    df : DataFrame holding the ``num_cols`` columns.
    df_name : label for the frame, used in the figure title.
    color, linewidth, edgecolor, density : forwarded to ``DataFrame.hist``.
    suptitle : optional text; when truthy the title becomes
        ``'<df_name>: <suptitle>'`` instead of the default one.
    size : figure size forwarded as ``figsize``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    if suptitle:
        heading = df_name + ': ' + suptitle
    else:
        heading = f'Numerical Values Distribution: {df_name}'

    hist_options = dict(figsize=size,
                        linewidth=linewidth,
                        edgecolor=edgecolor,
                        color=color,
                        density=density)
    df[num_cols].hist(**hist_options)

    plt.suptitle(heading, fontweight='bold', fontsize=16, y=0.95)
    plt.show()

In [None]:
plot_hist(train_copy, 'train-copy-Default')

Box_cox can't work with negative values, so let's confirm if we have negative values in the training set

In [None]:
check = train_copy[num_cols] < 0
check_dict = {col:sum(check[col]) for col in check.columns}
check_dict

**So we shall first use a min-max scaler to move every value into a strictly positive range, and then apply the box-cox transformation**

In [None]:
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
print('Done!')

In [None]:
def apply_boxcox_scaler(dataset, test=None):
    """Box-Cox transform the ``num_cols`` columns of ``dataset`` (and ``test``).

    The numeric columns go through a three-step pipeline fitted on ``dataset``:
    min-max scale to [1, 2] (so every training value is positive), apply the
    Box-Cox power transform, then min-max scale the result to [0, 1].
    ``num_cols`` is the module-level list of numeric column names; all other
    columns are carried over unchanged.

    Parameters
    ----------
    dataset : training DataFrame; it is deep-copied, never modified.
    test : optional DataFrame transformed with the pipeline fitted on
        ``dataset``; it is deep-copied as well.

    Returns
    -------
    The transformed copy of ``dataset``, or the tuple
    ``(transformed dataset, transformed test)`` when ``test`` is given.
    """
    dataset_copy = deepcopy(dataset)

    pipeline = Pipeline(steps=[
        ('s1', MinMaxScaler(feature_range=(1, 2))),   # shift into a positive range
        ('p', PowerTransformer(method='box-cox')),
        ('s2', MinMaxScaler(feature_range=(0, 1))),   # back to the unit range
    ])

    # The pipeline returns a plain array in num_cols order; write it back
    # column-for-column, keeping the frame's own index.
    dataset_copy[num_cols] = pipeline.fit_transform(dataset_copy[num_cols])

    if test is None:
        return dataset_copy

    # NOTE(review): a test value far below the training minimum could end up
    # non-positive after the first scaler, which Box-Cox presumably rejects -
    # confirm against the data before relying on this path.
    test_copy = deepcopy(test)
    # Transform ALL numeric test columns before returning. Returning from
    # inside the per-column write-back loop handed back a frame in which only
    # the first column had been transformed.
    test_copy[num_cols] = pipeline.transform(test_copy[num_cols])

    return dataset_copy, test_copy

**Let's see the normalized features**

In [None]:
train_boxcox = apply_boxcox_scaler(train_copy)
plot_hist(train_boxcox, 'BoxCox', color='green')

In [None]:
train_boxcox.head(3)

In [None]:
# confirm no missing values from transformation

train_boxcox.isna().any().sum() == train_copy.isna().any().sum() == 0

**Box-Cox has done a good transformation with the shape of the data, but looking at the numeric variables, they are not centred around zero. This data does not have a general MEAN of ~0 and STD of ~1. Let's consider the Z-score or Standardization method**

In [None]:
def mean_std_distance(df):
    """Measure how far the columns of ``df`` are from mean 0 and std 1.

    Returns a dictionary with two totals, each rounded to 4 decimals:

    * ``'MEAN-Distance'``: sum over columns of ``|mean|``
    * ``'STD-Distance'``: sum over columns of ``|1 - std|``

    Both are sums of absolute distances, so a column with std above 1 cannot
    cancel out a column with std below 1.
    """
    mean_serie = df.apply(np.mean, axis=0)
    std_serie = df.apply(np.std, axis=0)

    mean_sum = np.sum(np.abs(mean_serie))
    # abs() must wrap the difference: `1 - abs(std)` is a signed value that
    # goes negative whenever a column's std exceeds 1.
    std_sum = np.sum(np.abs(1 - std_serie))

    distance_dict = {'MEAN-Distance': np.round(mean_sum, 4),
                     'STD-Distance': np.round(std_sum, 4)}

    return distance_dict

In [None]:
train_boxcox_norm_dist =  mean_std_distance(train_boxcox[num_cols])
train_boxcox_norm_dist

**The MEAN-distance and STD-distance above show, summed over all numeric features in the DataFrame (in this case `train_boxcox`), how far the feature means are from 0 and how far the feature standard deviations are from 1**

**Let's see the distribution range of values after box-cox**

In [None]:
def plot_range(df, df_name):
    """Bar-plot the value range (max minus min) of every numeric column of ``df``.

    ``df_name`` is only used in the plot title. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    numeric = df.select_dtypes(['number'])
    spread = numeric.max() - numeric.min()
    spread.plot.bar(figsize=(14,7))
    plt.title(f'Range of Numerical Values: {df_name}', fontweight='bold', fontsize=16)
    plt.show()

In [None]:
plot_range(train_boxcox, 'train_boxcox')

**The range of distribution above, from the box-cox transformation is just awesomely perfect! Even though the MEAN and STD are not around 0 or 1**

## 17. Visualizing Variable Normality using a Q-Q Plot

**Normality can be also assessed by Q-Q plots. In a Q-Q plot we plot the quantiles of the variable in the y-axis and the expected quantiles of the normal distribution in the x-axis. If the variable follows a normal distribution, the dots in the Q-Q plot should fall in a 45 degree diagonal line.**

In [None]:
# For example, let's see the cont10 variable if it's normally distributed

# The blue dots should adjust to the 45 degree line

fig1 = stats.probplot(train_boxcox['cont10'], dist="norm", plot=plt)

plt.show()

In [None]:
# As another example, let's see if the cont5 variable is normally distributed

# The blue dots should adjust to the 45 degree line

fig1 = stats.probplot(train_boxcox['cont5'], dist="norm", plot=plt)

plt.show()

## 18. Exploring Outliers...

**An outlier is a data point that is significantly different from the remaining data. On occasions, outliers are very informative; for example, when looking for credit card transactions, an outlier may be an indication of fraud. In other cases, outliers are rare observations that do not add any additional value.**

**We'd apply the `inter-quartile range (IQR) proximity rule`. According to the IQR proximity rule, a value is an outlier if it falls outside these boundaries:**

```
Upper boundary = 75th quantile + (IQR * 1.5)

Lower boundary = 25th quantile - (IQR * 1.5)

Here, IQR is given by the following equation:

IQR = 75th quantile - 25th quantile
```

**Let's randomly plot two variables, we might see some outliers.<br>Running the cell below repeatedly plots different pairs of variables.**

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12,5))

# Pick two distinct numeric columns by name. Drawing positions bounded by
# len(num_cols) and then indexing train_boxcox.columns only hits numeric
# columns while they happen to come first in the frame, and it can select the
# same column twice.
col1, col2 = (str(name) for name in np.random.choice(num_cols, size=2, replace=False))

fig1 = sns.boxplot(y=train_boxcox[col1], ax=ax[0], color='brown')
fig1.set_xlabel(f'{col1}',fontweight='bold', fontsize=12)
fig1.set_title(f'Box-Plot of {col1}',fontweight='bold', fontsize=14)

fig2 = sns.boxplot(y=train_boxcox[col2], ax=ax[1])
fig2.set_xlabel(f'{col2}',fontweight='bold', fontsize=12)
fig2.set_title(f'Box-Plot of {col2}',fontweight='bold', fontsize=14)

plt.show()

**I'd create a function that takes a dataframe and the factor (default is 1.5 but can be changed) to use in the IQR calculation and returns the IQR proximity rule boundaries:**

In [None]:
def find_boundaries(variable, df, distance=1.5):
    """Return the IQR-proximity-rule outlier boundaries of one column.

    Parameters
    ----------
    variable : name of the column of ``df`` to inspect.
    df : DataFrame holding that column.
    distance : multiple of the inter-quartile range added beyond the
        quartiles (default 1.5).

    Returns
    -------
    (upper_boundary, lower_boundary) where
    ``upper = Q3 + IQR * distance`` and ``lower = Q1 - IQR * distance``.
    Note the order: the upper boundary comes first.
    """
    # Compute each quartile once and reuse it.
    q1 = df[variable].quantile(0.25)
    q3 = df[variable].quantile(0.75)
    IQR = q3 - q1

    lower_boundary = q1 - (IQR * distance)
    upper_boundary = q3 + (IQR * distance)

    return upper_boundary, lower_boundary

In [None]:
def outliers_toDF(df, distance):
    """Count, per column of ``df``, the values outside the IQR boundaries.

    For every column the boundaries come from
    ``find_boundaries(col, df, distance)``; a value counts as an outlier when
    it is strictly above the upper or strictly below the lower boundary.

    Returns a DataFrame indexed by column name with a single ``count`` column.
    """
    counts = {}
    for name in df.columns:
        high, low = find_boundaries(name, df, distance)
        values = df[name]
        beyond = ((values > high) | (values < low)).to_numpy()
        counts[name] = [np.sum(beyond)]

    summary = pd.DataFrame(counts).T
    summary.columns = ['count']
    return summary

**Let's find the extreme outliers for train-boxcox. These are values lying more than 3 times the IQR beyond the quartiles, while normal outliers lie more than 1.5 times the IQR beyond the quartiles**

In [None]:
outliers_trainBoxCox = outliers_toDF(train_boxcox, 3)
print(f'Train-BoxCox Extreme Outliers: {np.sum(outliers_trainBoxCox)}')
print()

outliers_to_data_size_percent = (np.divide(np.sum(outliers_trainBoxCox), train_boxcox.size))*100 
print('Extreme Outliers Pct to Data-Size:',outliers_to_data_size_percent)
outliers_trainBoxCox

**Well, it turns out that the percent of outliers to total data size is < 3% for `train-boxcox`.**

## 19. Applying Standardization to Train-Copy dataset...

**Let's try to reposition the data to have mean ~0 and Std ~1. Standardization is also called Z-Score-Norm**

In [None]:
def standardize_numCols(train_copy, test=None):
    """Z-score the ``num_cols`` columns using statistics of the training frame.

    ``num_cols`` is the module-level list of numeric column names. The mean
    and std of each of those columns are taken from ``train_copy`` and applied
    to a deep copy of ``train_copy`` and, when given, to a deep copy of
    ``test``. Other columns are left as they are.

    Returns the standardized training copy, or the tuple
    ``(standardized train, standardized test)`` when ``test`` is not None.
    """
    def _zscore_copy(frame, centre, spread):
        # Deep copy so the caller's frame is never modified.
        out = deepcopy(frame)
        scaled = (out[num_cols] - centre) / spread
        for name in scaled.columns:
            out[name] = list(scaled[name])
        return out

    numeric_train = train_copy[num_cols]
    means = numeric_train.mean()
    stds = numeric_train.std()

    scaled_train = _zscore_copy(train_copy, means, stds)
    if test is None:
        return scaled_train

    return scaled_train, _zscore_copy(test, means, stds)

In [None]:
train_stdize = standardize_numCols(train_copy)
plot_hist(train_stdize, 'Standardize-DF', color='y')

In [None]:
train_stdize.head(3)

In [None]:
# confirm no missing values from transformation

train_stdize.isna().any().sum() == train_copy.isna().any().sum() == 0

**Let's see how far away each numeric feature's MEAN and STD is from 0 and 1**

In [None]:
train_stdize_norm_dist =  mean_std_distance(train_stdize[num_cols])
train_stdize_norm_dist

In [None]:
# For example, let's see if the cont5 variable is normally distributed after standardization

# The blue dots should adjust to the 45 degree line

fig1 = stats.probplot(train_stdize['cont5'], dist="norm", plot=plt)

plt.show()

**We can see that the MEAN and STD of the `train_stdize` features are virtually no distance away from 0 and 1 respectively. As impressive as this is, let's see the range of distribution.**

In [None]:
plot_range(train_stdize, 'train_stdize')

**Though the data is more central around the mean, the range is uneven and chaotic. Let's see the outliers**

In [None]:
outliers_train_stdize = outliers_toDF(train_stdize, 3)
print(f'Total Extreme Outliers for Train-Stdize: {np.sum(outliers_train_stdize)}')
print()

outliers_to_data_size_percent = (np.divide(np.sum(outliers_train_stdize), train_stdize.size))*100 
print('Extreme Outliers Pct to Data-Size:',outliers_to_data_size_percent)
outliers_train_stdize

#### train_stdize data has less than 3% outliers to dataset ratio...Let's try mean-normalization.

## 20. Applying Mean-Normalization to Original Dataset

**In mean normalization, we center the variable at zero and rescale the distribution to the value range. This procedure involves subtracting the MEAN from each observation and then dividing the result by the difference between the minimum and maximum values:**

In [None]:
def mean_norm(train_copy, test=None):
    """Mean-normalize the ``num_cols`` columns feature by feature.

    For each numeric column the training mean is subtracted and the result is
    divided by the training range (max - min). ``num_cols`` is the module-level
    list of numeric column names. The statistics always come from
    ``train_copy``; they are applied to a deep copy of ``train_copy`` and,
    when given, to a deep copy of ``test``.

    Returns the normalized training copy, or the tuple
    ``(normalized train, normalized test)`` when ``test`` is not None.
    """
    def _rescaled_copy(frame, centre, width):
        # Deep copy so the caller's frame is left untouched.
        result = deepcopy(frame)
        scaled = (result[num_cols] - centre) / width
        for name in scaled:
            result[name] = list(scaled[name])
        return result

    numeric_train = train_copy[num_cols]

    # Statistics learned from the training set only.
    means = numeric_train.mean(axis=0)
    ranges = numeric_train.max(axis=0) - numeric_train.min(axis=0)

    normalized_train = _rescaled_copy(train_copy, means, ranges)
    if test is None:
        return normalized_train

    return normalized_train, _rescaled_copy(test, means, ranges)

In [None]:
mean_norm_df = mean_norm(train_copy)
plot_hist(mean_norm_df, 'Mean-Norm-DF', color='aqua')

In [None]:
mean_norm_df.head(3)

In [None]:
# confirm no missing values from transformation

mean_norm_df.isna().any().sum() == train_copy.isna().any().sum() == 0

In [None]:
plot_range(mean_norm_df, 'mean_norm_df')

**Let's see how far the numeric values are from the MEAN and STD**

In [None]:
mean_norm_dist =  mean_std_distance(mean_norm_df[num_cols])
mean_norm_dist

In [None]:
outliers_mean_norm = outliers_toDF(mean_norm_df, 3)
print(f'Total Extreme Outliers, Mean-Norm: {np.sum(outliers_mean_norm)}')
print()

outliers_to_data_size_percent = (np.divide(np.sum(outliers_mean_norm), mean_norm_df.size))*100 
print('Extreme Outliers Pct to Data-Size:',outliers_to_data_size_percent)
outliers_mean_norm

#### The decision is between Box-Cox and Mean-Norm. They both have a perfect range across all features between 0 and 1. But Mean-Norm balances the data better with zero deviation from the mean of 0 and slightly lower deviation from the std of 1 than Box-Cox. Also Box-cox is just slightly better on outliers as compared to Mean-norm. But since I'd treat outliers soon, I so far prefer the centralised distribution of Mean-norm.

#### Standardization is not an option because even though it perfectly balances the distribution of all variables at a MEAN of 0 and STD of 1, it causes a chaotic range of distribution and an outlier ratio worse than Box-cox and similar to Mean-norm... Let's try a couple more.

## 21. Applying Robust-Scaling to Train-copy

**This is also called scaling with median and quantiles. When scaling variables to the median and quantiles, the median value is deducted from the observations and the result is divided by the inter-quartile range (IQR). Robust scaling produces more robust estimates for the center and value range of the variable, and is recommended if the data contains outliers, just like our present data**

### X_scaled = ( X - X_median ) / ( X.quantile(0.75) - X.quantile(0.25) )

In [None]:
def robust_scaler(trainset, test=None):
    """Scale the ``num_cols`` columns with a ``RobustScaler`` fitted on ``trainset``.

    ``num_cols`` is the module-level list of numeric column names; all other
    columns are carried over unchanged. Inputs are deep-copied, never modified.

    Parameters
    ----------
    trainset : DataFrame the scaler is fitted on.
    test : optional DataFrame transformed with the scaler fitted on ``trainset``.

    Returns
    -------
    The scaled copy of ``trainset``, or the tuple
    ``(scaled trainset, scaled test)`` when ``test`` is given.

    Errors raised while transforming ``test`` (for example missing columns)
    now propagate instead of being swallowed and silently turning the result
    into a single frame.
    """
    trainset_copy = deepcopy(trainset)
    scaler = RobustScaler()

    # Fit on the numeric training columns only and write the scaled values
    # back column-for-column, keeping the frame's own index.
    trainset_copy[num_cols] = scaler.fit_transform(trainset_copy[num_cols])

    # Explicit "no test set" check, in line with the other scaling helpers,
    # instead of relying on a bare `except:` around the test branch.
    if test is None:
        return trainset_copy

    test_copy = deepcopy(test)
    test_copy[num_cols] = scaler.transform(test_copy[num_cols])

    return trainset_copy, test_copy

In [None]:
robust_norm_df = robust_scaler(train_copy)
plot_hist(robust_norm_df, 'Robust-Scaler-DF', color='pink')

In [None]:
robust_norm_df.head(3)

In [None]:
# confirm no missing values from transformation

robust_norm_df.isna().any().sum() == train_copy.isna().any().sum() == 0

In [None]:
plot_range(robust_norm_df, 'robust_norm_df')

**Let's see how far away values are from the MEAN and STD**

In [None]:
mean_robust_norm_dist =  mean_std_distance(robust_norm_df[num_cols])
mean_robust_norm_dist

In [None]:
outliers_robust_norm = outliers_toDF(robust_norm_df, 3)
print(f'Total Extreme Outliers, Robust-Norm: {np.sum(outliers_robust_norm)}')
print()

outliers_to_data_size_percent = (np.divide(np.sum(outliers_robust_norm), robust_norm_df.size))*100 
print('Extreme Outliers Pct to Data-Size:',outliers_to_data_size_percent)
outliers_robust_norm

**We Can see that the distribution range of pure numerical values for Robust-norm is uneven and chaotic like Z-Score norm, going from 0 up to almost 5. While those of categorical values are within 0 to 1. Plus the MEAN-dist and STD-dist are worse than Mean-norm, but better than Boxcox. This makes Robust-norm not yet an ideal choice over Mean-Norm.**

## 22. Applying Scaling to Vector Unit-Length: L1-Norm.

**When scaling to vector unit length, we transform the components of a feature vector so that the transformed vector has a length of 1, or in other words, a norm of 1. Note that this scaling technique scales the feature vector, as opposed to each individual variable. A feature vector contains the values of each variable for a single observation. When scaling to vector unit length, we divide each feature vector by its norm, using either `l1` (manhattan-dist) or `l2` (euclidean-dist) norm.**

In [None]:
# norm takes a str value of: l1 or l2

def vector_unit_scaler(trainset, norm, test=None):
    """Scale the numeric columns of each row to unit length.

    Works on deep copies, so neither input frame is modified. Only the
    columns listed in the notebook-level ``num_cols`` are rescaled; every
    other column is passed through unchanged.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Training data containing every column in ``num_cols``.
    norm : str
        Norm handed to ``sklearn.preprocessing.Normalizer`` ('l1' or 'l2').
    test : pandas.DataFrame, optional
        Test data to scale the same way. When omitted, only the scaled
        training frame is returned.

    Returns
    -------
    pandas.DataFrame
        The scaled training copy, when ``test`` is None.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_copy, test_copy)`` when ``test`` is given.

    Raises
    ------
    Whatever the scaler or the column selection raises when ``test`` is given
    but cannot be transformed (e.g. a missing numeric column). Previously such
    failures were swallowed and a single frame was returned, which broke
    callers that unpack two values.
    """
    train_copy = deepcopy(trainset)
    scaler = Normalizer(norm=norm)

    # Only the numeric columns are scaled; results are written back
    # positionally so the original index of the frame is kept.
    scaled_train = scaler.fit_transform(train_copy[num_cols])
    for position, col in enumerate(num_cols):
        train_copy[col] = scaled_train[:, position]

    # Explicit "no test set" branch instead of relying on an exception.
    if test is None:
        return train_copy

    test_copy = deepcopy(test)
    scaled_test = scaler.transform(test_copy[num_cols])
    for position, col in enumerate(num_cols):
        test_copy[col] = scaled_test[:, position]

    return train_copy, test_copy

In [None]:
vectorl1_df = vector_unit_scaler(train_copy, 'l1')
plot_hist(vectorl1_df, 'Vector Unit-Length L1-Norm', color='lime')

In [None]:
vectorl1_df.head(3)

In [None]:
# confirm no missing values from transformation

vectorl1_df.isna().any().sum() == train_copy.isna().any().sum() == 0

In [None]:
plot_range(vectorl1_df, 'vectorl1_df')

In [None]:
mean_vectorl1_df_dist =  mean_std_distance(vectorl1_df[num_cols])
mean_vectorl1_df_dist

In [None]:
# Outlier counts for the L1 unit-length frame; 3 is passed as the threshold
# argument (presumably the IQR multiplier for "extreme" outliers -- confirm
# against outliers_toDF).
outliers_vectorl1_df_norm = outliers_toDF(vectorl1_df, 3)
print(f'Total Extreme Outliers, vectorl1_df_norm: {np.sum(outliers_vectorl1_df_norm)}')
print()

# Express the count relative to the total number of cells (DataFrame.size).
# NOTE: `outliers_to_data_size_percent` is re-assigned by the other outlier
# cells in this notebook, so it always holds the most recently run result.
outliers_to_data_size_percent = (np.divide(np.sum(outliers_vectorl1_df_norm), vectorl1_df.size))*100 
print('Extreme Outliers Pct to Data-Size:',outliers_to_data_size_percent)
outliers_vectorl1_df_norm

## 23. Applying Scaling to Vector Unit-Length: L2-Norm.

**When scaling to vector unit length, we transform the components of a feature vector so that the transformed vector has a length of 1, or in other words, a norm of 1. Note that this scaling technique scales the feature vector, as opposed to each individual variable. A feature vector contains the values of each variable for a single observation. When scaling to vector unit length, we divide each feature vector by its norm, using either `l1` (manhattan-dist) or `l2` (euclidean-dist) norm.**

In [None]:
vectorl2_df = vector_unit_scaler(train_copy, 'l2')
plot_hist(vectorl2_df, 'Vector Unit-Length L2-Norm', color='gold')

In [None]:
vectorl2_df.head(3)

In [None]:
# confirm no missing values from transformation

vectorl2_df.isna().any().sum() == train_copy.isna().any().sum() == 0

In [None]:
plot_range(vectorl2_df, 'vectorl2_df')

In [None]:
mean_vectorl2_df_dist =  mean_std_distance(vectorl2_df[num_cols])
mean_vectorl2_df_dist

In [None]:
# Outlier counts for the L2 unit-length frame; 3 is passed as the threshold
# argument (presumably the IQR multiplier for "extreme" outliers -- confirm
# against outliers_toDF).
outliers_vectorl2_df_norm = outliers_toDF(vectorl2_df, 3)
print(f'Total Extreme Outliers, vectorl2_df_norm: {np.sum(outliers_vectorl2_df_norm)}')
print()

# Express the count relative to the total number of cells (DataFrame.size).
# NOTE: `outliers_to_data_size_percent` is re-assigned by the other outlier
# cells in this notebook, so it always holds the most recently run result.
outliers_to_data_size_percent = (np.divide(np.sum(outliers_vectorl2_df_norm), vectorl2_df.size))*100 
print('Extreme Outliers Pct to Data-Size:',outliers_to_data_size_percent)
outliers_vectorl2_df_norm

## 24. Summarizing Feature Scaling/Normalization Choices

**Looking at the above charts for choice of Feature-Norm/Scaling let's create a function that basically summarizes the key similarities or differences of each choice in a table...**

In [None]:
# Frames to compare, one per scaling method. The ORDER matters:
# norm_scale_summary pairs them positionally with its column labels
# BoxCox, Standardize, MeanNorm, RobustScale, VectorUnitL1, VectorUnitL2.
df_list = [train_boxcox,
          train_stdize,
          mean_norm_df,
          robust_norm_df,
          vectorl1_df,
          vectorl2_df]

def norm_scale_summary(df_list):
    """Tabulate outlier, mean/std-distance and range metrics per scaling method.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Scaled frames in the fixed order BoxCox, Standardize, MeanNorm,
        RobustScale, VectorUnitL1, VectorUnitL2. Frames are matched to those
        labels by position; entries beyond the sixth are ignored.

    Returns
    -------
    pandas.DataFrame
        One column per scaling method and one row per metric, indexed by
        'Metrics'. The final 'Uniform_Range' row is a hand-written judgement,
        not a computed value.

    Notes
    -----
    Relies on the notebook-level ``num_cols``, ``outliers_toDF`` and
    ``mean_std_distance``. Outliers are counted with a threshold argument of
    1.5 (presumably the IQR multiplier -- confirm against outliers_toDF).
    """
    method_names = ['BoxCox',
                    'Standardize',
                    'MeanNorm',
                    'RobustScale',
                    'VectorUnitL1',
                    'VectorUnitL2']
    metric_names = ['Total_Outliers',
                    'Pct_Outliers',
                    'Sum_MEAN_Dist',
                    'Sum_STD_Dist',
                    'Min_Value',
                    'Max_Value',
                    'Range']

    summary_dict = {name: [] for name in method_names}

    # Pair each frame with its label by position.
    for name, df in zip(method_names, df_list):
        tot_outliers = np.sum(outliers_toDF(df, 1.5))[0]
        pct_outliers = np.round(np.divide(tot_outliers, df.size)*100, 2)

        # Compute the helper result and the global extremes once per frame.
        distances = list(mean_std_distance(df[num_cols]).values())
        min_value = np.min(df).min()
        max_value = np.max(df).max()

        summary_dict[name].extend([tot_outliers,
                                   pct_outliers,
                                   np.round(distances[0], 2),
                                   np.round(distances[1], 2),
                                   np.round(min_value, 2),
                                   np.round(max_value, 2),
                                   np.round(max_value - min_value, 2)])

    summary_df = pd.DataFrame(summary_dict, index=metric_names)

    # Manual assessment row; its six entries follow the order of method_names.
    summary_df.loc['Uniform_Range'] = ['Yes', 'No', 'Yes', 'No', 'Almost','Almost']
    summary_df.index.name='Metrics'

    return summary_df

In [None]:
summary_df = norm_scale_summary(df_list)
summary_df

## 25. Winsorization to Address Outliers...

**Winsorization, or winsorizing, is the process of transforming the data by limiting the extreme values, that is, the outliers, to a certain arbitrary value, closer to the mean of the distribution. Winsorizing is different from trimming because the extreme values are not removed, but are instead replaced by other values. A typical strategy involves setting outliers to a specified percentile.**

**Let's create a Winsorizer object to Cap outliers based on the same Inter-Quantile-Range we specified earlier**

In [None]:
# Shared winsorizer: IQR-based capping on both tails with fold=3.
# NOTE(review): Winsorizer is not among the imports in the notebook's import
# cell; presumably it is imported (feature_engine?) in an earlier cell -- confirm.
winsorizer = Winsorizer(capping_method='iqr', fold=3, tail='both')
print('Done!')

**Next, we'd fit_transform the training data using the winsorizer and then transform the test data soon, with the learned parameters from the training data using the transform function of the Winsorizer**

**But first let's sample a few variables and see the outliers before and after applying winsorization. Let's look at variables `cont8` and `cont2` of mean_norm_df.**

In [None]:
# function to create histogram, Q-Q plot and
# boxplot of specific variables

def diagnostic_plots(df, var_name, when):
    """Show a histogram, a Q-Q plot and a boxplot of one column side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    var_name : str
        Name of the column to plot.
    when : str
        Word inserted into the figure title before "Winsorization"
        (the notebook passes 'before' or 'after').

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    title_style = {'fontweight':'bold', 'fontsize':'14'}
    values = df[var_name]

    plt.figure(figsize=(16, 4))

    # Panel 1 of 3: histogram.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept as-is so the rendered figure does not change.
    plt.subplot(1, 3, 1)
    sns.distplot(values, bins=30)
    plt.title('Histogram', fontdict=title_style)

    # Panel 2 of 3: Q-Q plot against a normal distribution.
    plt.subplot(1, 3, 2)
    stats.probplot(values, dist="norm", plot=plt)
    plt.ylabel(f'{var_name} quantiles')
    plt.title('Probability Plot', fontdict=title_style)

    # Panel 3 of 3: boxplot.
    plt.subplot(1, 3, 3)
    sns.boxplot(y=values)
    plt.title('Boxplot', fontdict=title_style)

    plt.suptitle(
        f'Histogram, Q-Q-Plot and Box-plot of Variable: {var_name} {when} Winsorization',
        fontweight='bold',
        fontsize=16,
        y=1.08,
    )
    plt.show()

In [None]:
# Let's look at variable cont8 before applying winsorization

diagnostic_plots(vectorl1_df, 'cont8', when='before')

**We can see the general bell-curve shape of the distribution from the histogram. This is the result of the mean-norm we did earlier. We can also see the 45-degree line and the blue dots that roughly follow the red line of the Q-Q plot (probability plot), which also indicates a roughly normal distribution. Finally, we can see the thick dotted outliers above point 4 on the y-axis of the box-plot, indicating the presence of outliers.**

In [None]:
chck = winsorizer.fit_transform(vectorl1_df)
print('Done!')

In [None]:
# Let's look at variable cont8 after applying winsorization

diagnostic_plots(chck, 'cont8', when='after')

**We can see that the dotted outliers have disappeared after applying winsorization to the `vectorl1_df` data**

## 26. Kurtosis AKA The 4th Statistical-Moment

**Now, just before we fit the Winsorizer on the dataset, let's compute the kurtosis score of the data, which is a score that indicates how many outliers are in the dataset. Kurtosis indicates the outlier content within the data.
The higher the kurtosis measure is, the more outliers are present and the longer the tails of the distribution in the histogram are.**

In [None]:
def calc_kurtosis(df):
    """Return the summed kurtosis of a frame, rounded to 4 decimals.

    For every column the fourth power of the deviation from the column mean
    is divided by the fourth power of the column's standard deviation (pandas
    default, i.e. sample std). All of these terms are summed over every
    column and the total is divided by the number of rows, so the result is
    the sum of the per-column (non-excess) kurtosis values, not an average.
    """
    row_count = len(df)
    centered = df - df.mean()
    standardized_fourth = (centered**4) / (df.std())**4
    total = np.sum(standardized_fourth).sum()

    return np.round(total / row_count, 4)

**Let's return to the `mean_norm_df` data and apply kurtosis**

In [None]:
kurtosis_before_winsorizer = calc_kurtosis(mean_norm_df)
kurtosis_before_winsorizer

**Now, let's apply the winsorizer**

In [None]:
# Fit the winsorizer on the mean-normalised frame and cap its outliers.
# NOTE: this overwrites `mean_norm_df` with the capped result, so re-running
# the cell re-fits on already-capped data; re-run the cell that builds
# `mean_norm_df` first if the un-winsorized frame is needed again.
mean_norm_df = winsorizer.fit_transform(mean_norm_df)
print('Done!')

**First let's calculate the kurtosis score and see if it's gone down...**

In [None]:
kurtosis_after_winsorizer = calc_kurtosis(mean_norm_df)
kurtosis_after_winsorizer

**Yep!! It's gone down from 245 to a mere 41, because the winsorizer has fixed a lot of the outliers in the data! The only outliers left may warrant us being stricter with our IQR fold, lowering it from 3.0 to 1.5, to capture them all.**

In [None]:
# Let's see the data
mean_norm_df.head(3)

In [None]:
# Let's see the normalized data
summary = 'Numerical Values Distribution after Winsorizer'
plot_hist(mean_norm_df, 'Mean-Norm', color='aqua', suptitle=summary)

In [None]:
plot_range(mean_norm_df, 'mean_norm_df')

**We can see that by removing outliers, certain categorical variables basically have no more distribution. It seems these variables are just around 0 all through. Let's print their min and max values to be certain.**

In [None]:
# One-hot columns that appear to have no spread left after winsorization.
queer_vars = ['cat0_A', 'cat2_A', 'cat4_B', 'cat6_A', 'cat7_E', 'cat8_A', 'cat8_G', 'cat9_I', 'cat9_L']
# `df` is only a short alias for mean_norm_df (same object, not a copy).
df = mean_norm_df
# (min, max) per column; identical values mean the column is constant.
[(min(df[i]), max(df[i])) for i in queer_vars]

**Just as suspected, each of these variables has equal min and max values. These are the values whose relative values were dropped for being smaller than the threshold we set earlier. So let's continue by keeping these variables....**

In [None]:
print(f'mean-norm-df shape is {mean_norm_df.shape}')

**Let's re-check the distance of means from 0 and stds from 1 in mean-norm-df**

In [None]:
mean_norm_dist =  mean_std_distance(mean_norm_df[num_cols])
mean_norm_dist

In [None]:
# Outlier counts for the winsorized mean-norm frame; 1.5 is passed as the
# threshold argument (presumably the IQR multiplier -- confirm against
# outliers_toDF), i.e. a stricter setting than the 3 used in earlier cells.
outliers_mean_norm = outliers_toDF(mean_norm_df, 1.5)
print(f'Total Outliers, Mean-Norm: {np.sum(outliers_mean_norm)}')
print()

# Express the count relative to the total number of cells (DataFrame.size).
# NOTE: `outliers_to_data_size_percent` is re-assigned by the other outlier
# cells in this notebook, so it always holds the most recently run result.
outliers_to_data_size_percent = (np.divide(np.sum(outliers_mean_norm), mean_norm_df.size))*100 
print('Outliers Pct to Data-Size:',outliers_to_data_size_percent)
outliers_mean_norm

**The percentage of outliers to data size has also reduced from 3.01 to 0.33. This balance of 0.33 refers to the few original numerical variables that have moderate outliers around 1.5 times IQR.**

In [None]:
print(mean_norm_df.shape)
mean_norm_df.head(3)

## 27. Performing Data Transformations on Test set

#### We also need to transform the test set on values learnt on the training set.<br>The functions below perform all the transformations we've been doing for specific activation functions in addition to reducing the train, test and target datasets and returning these fit for machine learning....

### Note that each transformation name must contain exactly one of the key-words below, possibly amongst other words (matching is case-insensitive).<br>Key words are:- 

1. BOX (for boxcox activation)
2. MEAN (for mean-norm activation)
3. ROBUST (for robust scaling)
4. L1 (for vector unit-length L1)
5. L2 (for vector unit-length L2)
6. STANDARD (for standardization)

In [None]:
class FinalPrep(object):
    """Build model-ready train and test frames in a single call.

    ``train_test()`` runs the whole pipeline:
    align test columns to train -> scale/normalise -> winsorize -> downcast.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features used when ``OHE`` is False.
    test : pandas.DataFrame
        Raw test features.
    train_copy : pandas.DataFrame
        Training features used when ``OHE`` is True (presumably the one-hot
        encoded version of ``train`` -- confirm where it is built).
    num_cols : list
        Names of the numeric columns.
    OHE : bool
        If True, the test set is one-hot encoded with ``pd.get_dummies`` and
        ``train_copy`` is used as the training frame.
    transform : str
        Name containing one of the keywords box / mean / standard / robust /
        l1 / l2 (case-insensitive). Names matching none of them fall back to
        L2 with a printed warning.
    method : str
        ``capping_method`` handed to ``Winsorizer``.
    fold : int or float
        ``fold`` handed to ``Winsorizer``.

    Notes
    -----
    The frame defaults are bound to the notebook globals at the moment this
    cell is executed; re-run the cell after rebuilding those globals, or pass
    the frames explicitly.

    Validation failures raise ``AssertionError`` (prefixed with the stage
    name) instead of returning the exception object to the next stage.
    """

    def __init__(self, 
                 train=train, 
                 test=test, 
                 train_copy=train_copy, 
                 num_cols=num_cols, 
                 OHE=False, 
                 transform='MEAN', 
                 method='iqr', 
                 fold=3):
        self.train = train
        self.test = test
        self.train_copy = train_copy
        self.num_cols = num_cols
        self.OHE = OHE
        self.transform = transform
        self.method = method
        self.fold = fold

    ######################################

    @staticmethod
    def _check_same_columns(left, right, stage):
        """Raise AssertionError unless both frames share the same column list."""
        if left.shape[1] != right.shape[1] or list(left.columns) != list(right.columns):
            raise AssertionError(
                f'ERROR: {stage}: the two frames do not have identical columns')

    ######################################

    def _copy_test(self):
        """Make a deep copy of
            the test set
        """
        test_copy = deepcopy(self.test)

        if test_copy.shape != self.test.shape:
            raise AssertionError('ERROR: Copy-Test: copied frame has a different shape')
        self._check_same_columns(test_copy, self.test, 'Copy-Test')

        return test_copy

    ######################################

    def _keep_same_cols(self):
        """Align test set columns 
            to trainset columns
        """
        test_copy = self._copy_test()

        if self.OHE:
            # One-hot encode the test set, then keep the training columns in
            # training order; a column missing from the test set raises KeyError.
            train = self.train_copy
            df = pd.get_dummies(test_copy)[train.columns]
        else:
            train = self.train
            df = test_copy[train.columns]

        self._check_same_columns(df, train, 'Keep-Same-Cols')

        return df

    ######################################

    def _apply_transform(self):
        """Apply specific scaling or
            normalizing method to the
            trainset and transform the
            test set with same params.

        Returns the pair (scaled_train, scaled_test).
        """
        test_copy = self._keep_same_cols()
        train = self.train_copy if self.OHE else self.train

        # keyword matching is case-insensitive
        activation = self.transform.lower()

        if 'box' in activation:
            print('Applying Boxcox transformation...')
            scaled_train, scaled_test = apply_boxcox_scaler(train, test_copy)
        elif 'mean' in activation:
            print('Applying Mean-norm transformation...')
            scaled_train, scaled_test = mean_norm(train, test_copy)
        elif 'standard' in activation:
            print('Applying Standard transformation...')
            scaled_train, scaled_test = standardize_numCols(train, test_copy)
        elif 'robust' in activation:
            print('Applying Robust transformation...')
            scaled_train, scaled_test = robust_scaler(train, test_copy)
        elif 'l1' in activation:
            print('Applying Vector Unit-Scaler L1 transformation...')
            scaled_train, scaled_test = vector_unit_scaler(train, 'l1', test_copy)
        else:
            # L2 remains the fallback for backward compatibility, but a name
            # without any known keyword (e.g. a typo) is no longer silent.
            if 'l2' not in activation:
                print(f'WARNING: unrecognised transform {self.transform!r}; '
                      'falling back to Vector Unit-Scaler L2.')
            print('Applying Vector Unit-Scaler L2 transformation...')
            scaled_train, scaled_test = vector_unit_scaler(train, 'l2', test_copy)

        self._check_same_columns(scaled_train, scaled_test, 'Apply-Scale-Norm')

        return scaled_train, scaled_test

    ######################################

    @staticmethod
    def clean_winsorizer(df):
        """Helper function for
            winsorization function.

        Returns the names of the columns whose min and max are equal after
        rounding to one decimal, i.e. columns that are (almost) constant.
        """
        queer_vars = [col for col in df.columns if round(df[col].min(),1) == round(df[col].max(),1)]
        return queer_vars

    def _winsorization(self):
        """Fit a Winsorizer on the scaled train set and cap both sets with it.

        Near-constant columns (see ``clean_winsorizer``) are intentionally
        kept, so train and test retain the full column set.
        """
        scaled_train, scaled_test = self._apply_transform()

        winsorizer = Winsorizer(capping_method=self.method, 
                                fold=self.fold, 
                                tail='both')

        # Capping limits are learnt from the training data only.
        X_winsored = winsorizer.fit_transform(scaled_train)
        y_winsored = winsorizer.transform(scaled_test)

        self._check_same_columns(X_winsored, y_winsored, 'Winsorization')

        return X_winsored, y_winsored

    ######################################

    def _reduce_datasets(self):
        """Downcast column dtypes to shrink memory and report the saving.

        Numeric columns become float32; with ``OHE`` every remaining column
        becomes int32.
        """
        train_winsored, test_winsored = self._winsorization()
        num_cols = self.num_cols

        # Baseline sizes for the percentage printed below.
        train_size = train_winsored.memory_usage().sum()
        test_size = test_winsored.memory_usage().sum()

        # Reducing Num_cols to Float 32
        for col in num_cols:
            train_winsored[col] = train_winsored[col].astype('float32')
            test_winsored[col] = test_winsored[col].astype('float32')

        # Reducing Cat_cols to Int 32 if OHE
        if self.OHE:
            for col in set(train_winsored.columns) - set(num_cols):
                train_winsored[col] = train_winsored[col].astype('int32')
                test_winsored[col] = test_winsored[col].astype('int32')

        print(f'train memory reduced by {100-((train_winsored.memory_usage().sum()/train_size)*100)} Pct!')
        print(f'test memory reduced by {100-((test_winsored.memory_usage().sum()/test_size)*100)} Pct!')
        print()

        return train_winsored, test_winsored

    def train_test(self):
        """Run the full pipeline and return ``(train, test)``."""
        return self._reduce_datasets()

### Instantiate an instance of the FinalPrep class, passing if we want One-Hot-Encoding or Not and the type of transformation for the final test and train data sets.

In [None]:
transformations = ['boxcox', 'mean-norm', 'robust-scale', 'standardize', 'L1', 'L2']
datasets = []

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every (train, test) pair.
datasets = []
for transform in transformations:
    # One full preparation run (align, scale, winsorize, downcast) per name.
    final_prep = FinalPrep(OHE=True, transform=transform)
    (train_, test_) = final_prep.train_test()
    datasets.append((train_, test_))

## 28. Saving the pre-processed train, target and test sets...

In [None]:
# File-name tags, listed in the same order as `transformations`
# (and therefore as the entries of `datasets`).
file_tags = ['boxcox', 'meannorm', 'robust', 'standard', 'l1', 'l2']

# Each entry of `datasets` is a (train, test) pair; write the train file
# first, then the matching test file.
for position, tag in enumerate(file_tags):
    datasets[position][0].to_csv(f'train_{tag}.csv')
    datasets[position][1].to_csv(f'test_{tag}.csv')

print('Datasets Saved!')

Finally reduce the target size and save it..

In [None]:
target = target.astype('float32')
target.to_csv('target.csv')

In [None]:
!ls

In [None]:
pd.read_csv('train_standard.csv', index_col='id').head(3)