# Introduction

In this challenge, Kaggle invites competitors to solve a regression problem: predict the continuous `target` column from the continuous features `cont1`–`cont14`.

# Prepare the analysis


## Load packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Load the data

In [None]:
# Read the competition's train and test tables from the Kaggle input folder.
train_df, test_df = (
    pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/train.csv"),
    pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/test.csv"),
)

## Glimpse the data

In [None]:
# Print the column summary (names, dtypes, non-null counts) of the train set.
train_df.info()

In [None]:
# Print the column summary (names, dtypes, non-null counts) of the test set.
test_df.info()

In [None]:
# Show the first rows of the train set.
train_df.head()

In [None]:
# Show the first rows of the test set.
test_df.head()

In [None]:
# Summary statistics of the train set columns.
train_df.describe()

In [None]:
# Summary statistics of the test set columns.
test_df.describe()

# Data exploration


## Scatter plot of features


For each feature, scatter the (subsampled) train values on the x axis against the (subsampled) test values on the y axis.

In [None]:
def plot_feature_scatter(df1, df2, features):
    """Scatter the values of ``df1`` against ``df2``, one subplot per feature.

    For every name in ``features`` the column taken from ``df1`` is drawn on
    the x axis and the same-named column from ``df2`` on the y axis, so the
    rows of the two frames are paired by position.

    Args:
        df1: Frame supplying the x values; must contain every column listed
            in ``features``.
        df2: Frame supplying the y values; must contain the same columns and
            have as many rows as ``df1`` (matplotlib's ``scatter`` rejects
            x and y of different sizes).
        features: Iterable of column names to plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid asking matplotlib for a zero-row grid.
        return

    n_cols = 3
    # Ceiling division: the grid grows with the feature list instead of
    # failing once more than 15 features are passed.
    n_rows = (len(features) + n_cols - 1) // n_cols

    sns.set_style('whitegrid')
    # Height scales so that a 5-row grid keeps the original 14x24 inch size.
    # No separate plt.figure() call: it only produced an extra empty figure.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(14, 24 * n_rows / 5), squeeze=False
    )
    panels = axes.ravel()

    for ax, feature in zip(panels, features):
        ax.scatter(df1[feature], df2[feature], marker='+', color='green')
        ax.set_xlabel(feature, fontsize=9)

    # Hide the grid cells that have no feature assigned to them.
    for ax in panels[len(features):]:
        ax.set_visible(False)

    plt.show()


In [None]:
# The fourteen continuous feature columns used throughout the notebook.
features = [
    'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
    'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
]
# Thin both frames (every 15th train row, every 10th test row) before plotting.
plot_feature_scatter(
    df1=train_df.iloc[::15],
    df2=test_df.iloc[::10],
    features=features,
)


## Scatter plot of features vs. target

Scatter plot of each feature in train vs. target values.

In [None]:
def plot_feature_target_scatter(df1, features):
    """Scatter each feature of ``df1`` against its ``'target'`` column.

    One subplot is drawn per name in ``features``, with the feature on the
    x axis and ``df1['target']`` on the y axis.

    Args:
        df1: Frame containing every column in ``features`` plus ``'target'``.
        features: Iterable of column names to plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid asking matplotlib for a zero-row grid.
        return

    n_cols = 3
    # Ceiling division: the grid grows with the feature list instead of
    # failing once more than 15 features are passed.
    n_rows = (len(features) + n_cols - 1) // n_cols

    sns.set_style('whitegrid')
    # Height scales so that a 5-row grid keeps the original 14x24 inch size.
    # No separate plt.figure() call: it only produced an extra empty figure.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(14, 24 * n_rows / 5), squeeze=False
    )
    panels = axes.ravel()

    target = df1['target']  # same y values for every panel
    for ax, feature in zip(panels, features):
        ax.scatter(df1[feature], target, marker='+', color='blue')
        ax.set_xlabel(feature, fontsize=9)

    # Hide the grid cells that have no feature assigned to them.
    for ax in panels[len(features):]:
        ax.set_visible(False)

    plt.show()

In [None]:
# Plot every 15th train row to keep the scatter panels light.
plot_feature_target_scatter(df1=train_df.iloc[::15], features=features)

## Features distribution

In [None]:
def plot_feature_distribution(df1, df2, features):
    """Overlay the distribution of each feature in ``df1`` and ``df2``.

    One subplot is drawn per name in ``features``; ``df1`` is plotted in
    orange and labelled ``'train'``, ``df2`` in dark blue and labelled
    ``'test'``.

    The frames passed by the caller are the ones plotted. The previous body
    read the notebook-level ``train_df`` / ``test_df`` and silently ignored
    both arguments.

    Args:
        df1: Frame plotted as the 'train' distribution.
        df2: Frame plotted as the 'test' distribution.
        features: Iterable of column names present in both frames.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    features = list(features)
    if not features:
        # Nothing to draw; avoid asking matplotlib for a zero-row grid.
        return

    n_cols = 3
    # Ceiling division: the grid grows with the feature list instead of
    # failing once more than 15 features are passed.
    n_rows = (len(features) + n_cols - 1) // n_cols

    sns.set_style('whitegrid')
    # Height scales so that a 5-row grid keeps the original 14x24 inch size.
    # No separate plt.figure() call: it only produced an extra empty figure.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(14, 24 * n_rows / 5), squeeze=False
    )
    panels = axes.ravel()

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here because the installed seaborn version is not visible.
    for ax, feature in zip(panels, features):
        sns.distplot(df1[feature], color="orange", kde=True, bins=120,
                     label='train', ax=ax)
        sns.distplot(df2[feature], color="darkblue", kde=True, bins=120,
                     label='test', ax=ax)
        ax.set_xlabel(feature, fontsize=9)
        ax.legend()

    # Hide the grid cells that have no feature assigned to them.
    for ax in panels[len(features):]:
        ax.set_visible(False)

    plt.show()


In [None]:
# Compare train and test feature distributions on thinned copies of the data.
plot_feature_distribution(
    df1=train_df.iloc[::15],
    df2=test_df.iloc[::10],
    features=features,
)

## Target distribution

In [None]:
# Histogram and KDE of the regression target in the train set.
plt.figure(figsize=(16, 6))
sns.distplot(
    train_df['target'],
    color="darkblue",
    kde=True,
    bins=120,
    label='target',
)
plt.title("Distribution of target values")
plt.legend()
plt.show()

## Features aggregated functions distribution

In [None]:
# Per-row sum of the features, train (orange) vs. test (dark blue).
plt.figure(figsize=(16, 6))
plt.title("Distribution of sum values per row of the train and test set")
for split_df, split_color, split_label in (
    (train_df, "orange", 'train'),
    (test_df, "darkblue", 'test'),
):
    sns.distplot(split_df[features].sum(axis=1), color=split_color,
                 kde=True, bins=120, label=split_label)
plt.legend()
plt.show()

In [None]:
# Per-row minimum of the features, train (orange) vs. test (dark blue).
plt.figure(figsize=(16, 6))
plt.title("Distribution of min values per row of the train and test set")
for split_df, split_color, split_label in (
    (train_df, "orange", 'train'),
    (test_df, "darkblue", 'test'),
):
    sns.distplot(split_df[features].min(axis=1), color=split_color,
                 kde=True, bins=120, label=split_label)
plt.legend()
plt.show()

In [None]:
# Per-row maximum of the features, train (orange) vs. test (dark blue).
plt.figure(figsize=(16, 6))
plt.title("Distribution of max values per row of the train and test set")
for split_df, split_color, split_label in (
    (train_df, "orange", 'train'),
    (test_df, "darkblue", 'test'),
):
    sns.distplot(split_df[features].max(axis=1), color=split_color,
                 kde=True, bins=120, label=split_label)
plt.legend()
plt.show()

In [None]:
# Per-row mean of the features, train (orange) vs. test (dark blue).
plt.figure(figsize=(16, 6))
plt.title("Distribution of mean values per row of the train and test set")
for split_df, split_color, split_label in (
    (train_df, "orange", 'train'),
    (test_df, "darkblue", 'test'),
):
    sns.distplot(split_df[features].mean(axis=1), color=split_color,
                 kde=True, bins=120, label=split_label)
plt.legend()
plt.show()

In [None]:
# Per-row median of the features, train (orange) vs. test (dark blue).
plt.figure(figsize=(16, 6))
plt.title("Distribution of median values per row of the train and test set")
for split_df, split_color, split_label in (
    (train_df, "orange", 'train'),
    (test_df, "darkblue", 'test'),
):
    sns.distplot(split_df[features].median(axis=1), color=split_color,
                 kde=True, bins=120, label=split_label)
plt.legend()
plt.show()

In [None]:
# Per-row skewness of the features, train (orange) vs. test (dark blue).
plt.figure(figsize=(16, 6))
plt.title("Distribution of skew values per row of the train and test set")
for split_df, split_color, split_label in (
    (train_df, "orange", 'train'),
    (test_df, "darkblue", 'test'),
):
    sns.distplot(split_df[features].skew(axis=1), color=split_color,
                 kde=True, bins=120, label=split_label)
plt.legend()
plt.show()

In [None]:
# Per-row kurtosis of the features, train (orange) vs. test (dark blue).
plt.figure(figsize=(16, 6))
plt.title("Distribution of kurtosis values per row of the train and test set")
for split_df, split_color, split_label in (
    (train_df, "orange", 'train'),
    (test_df, "darkblue", 'test'),
):
    sns.distplot(split_df[features].kurtosis(axis=1), color=split_color,
                 kde=True, bins=120, label=split_label)
plt.legend()
plt.show()

## Features correlation

In [None]:
# Correlation heatmap of the raw features in the test set.
# The plot is titled "Spearman", but DataFrame.corr() computes Pearson by
# default, so the method is requested explicitly to match the label.
plt.figure(figsize=(16, 16))
heatmap = sns.heatmap(
    test_df[features].corr(method='spearman').round(3),
    vmin=-1, vmax=1, annot=True, cmap='BrBG',
)
# Single title call: the former heatmap.set_title('Features correlation', ...)
# was overwritten by plt.title on the same axes and never visible.
plt.title("Spearman correlation - test data")
plt.show()

In [None]:
# Correlation heatmap of the raw features plus the target in the train set.
features_target = features + ['target']
# The plot is titled "Spearman", but DataFrame.corr() computes Pearson by
# default, so the method is requested explicitly to match the label.
plt.figure(figsize=(16, 16))
heatmap = sns.heatmap(
    train_df[features_target].corr(method='spearman').round(3),
    vmin=-1, vmax=1, annot=True, cmap='BrBG',
)
# Single title call: the former heatmap.set_title('Features correlation', ...)
# was overwritten by plt.title on the same axes and never visible.
plt.title("Spearman correlation - train data")
plt.show()

## New features

In [None]:
# Add one column per row-wise statistic of the feature columns to both frames.
for df in (train_df, test_df):
    # Select the feature columns once; the new columns are not part of
    # `features`, so the selection is unaffected by the assignments below.
    feature_block = df[features]
    for stat_name in ('sum', 'min', 'max', 'mean', 'median', 'skew', 'kurtosis'):
        df[stat_name] = getattr(feature_block, stat_name)(axis=1)

In [None]:
# Names of the engineered row-wise aggregate columns.
new_features = [
    'sum',
    'min',
    'max',
    'mean',
    'median',
    'skew',
    'kurtosis',
]

In [None]:
# Correlation heatmap of raw features, engineered features and the target
# in the train set.
features_target = features + new_features + ['target']
# The plot is titled "Spearman", but DataFrame.corr() computes Pearson by
# default, so the method is requested explicitly to match the label.
plt.figure(figsize=(18, 18))
heatmap = sns.heatmap(
    train_df[features_target].corr(method='spearman').round(3),
    vmin=-1, vmax=1, annot=True, cmap='BrBG',
)
# Single title call: the former heatmap.set_title('Features correlation', ...)
# was overwritten by plt.title on the same axes and never visible.
plt.title("Spearman correlation - train data (including new features)")
plt.show()

In [None]:
# Correlation heatmap of raw and engineered features in the test set.
features_all = features + new_features
# The plot is titled "Spearman", but DataFrame.corr() computes Pearson by
# default, so the method is requested explicitly to match the label.
plt.figure(figsize=(18, 18))
heatmap = sns.heatmap(
    test_df[features_all].corr(method='spearman').round(3),
    vmin=-1, vmax=1, annot=True, cmap='BrBG',
)
# Single title call: the former heatmap.set_title('Features correlation', ...)
# was overwritten by plt.title on the same axes and never visible.
plt.title("Spearman correlation - test data (including new features)")
plt.show()