# Introduction

In this challenge, Kaggle invites competitors to solve a regression problem.

<center><img src="https://images.unsplash.com/photo-1579343580826-6323adb66545?ixid=MXwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHw%3D&ixlib=rb-1.2.1&auto=format&fit=crop&w=1050&q=80" width=600></img></center>

# Prepare the analysis


## Load packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")

## Load the data

In [None]:
# Read the February competition train and test tables in one pass.
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/tabular-playground-series-feb-2021/train.csv",
        "/kaggle/input/tabular-playground-series-feb-2021/test.csv",
    ),
)

## Glimpse the data

In [None]:
# Print the column summary of the training frame.
train_df.info()

In [None]:
# Print the column summary of the test frame.
test_df.info()

In [None]:
# Show the first rows of the training frame.
train_df.head()

In [None]:
# Show the first rows of the test frame.
test_df.head()

In [None]:
# Show descriptive statistics of the training frame.
train_df.describe()

In [None]:
# Show descriptive statistics of the test frame.
test_df.describe()

# Data exploration


Besides `id` and `target`, there are 10 categorical features and 14 continuous features.

For the categorical features, we will look at the count of each category in the train and test sets; for the continuous features, we will look at their distributions.



## Categorical features

We will use category count for the categorical features.

In [None]:
def feature_count(all_data, feature, title, size=3):
    """Draw a count plot of one categorical column, split by data set.

    Parameters
    ----------
    all_data : DataFrame
        Must contain the column named by ``feature`` and a ``'set'`` column,
        which is used as the hue of the bars.
    feature : str
        Name of the column whose categories are counted.
    title : str
        Title shown above the plot.
    size : int or float, default 3
        Width factor; the figure is ``2 * size`` wide and 4 high.
    """
    _, ax = plt.subplots(1, 1, figsize=(2 * size, 4))
    # Pass everything by keyword: recent seaborn releases no longer accept
    # the data vector as the first positional argument of countplot.
    sns.countplot(data=all_data, x=feature, hue='set', ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.set_title(title)
    plt.show()

In [None]:
# Stack the train and test rows into one frame and tag every row with the
# frame it came from.  The tag is attached before concatenating, so it does
# not depend on whether a row's target happens to be missing.
all_data = pd.concat(
    [train_df.assign(set='train'), test_df.assign(set='test')],
    axis=0,
)
print(all_data.shape, "\n", all_data['set'].value_counts())

In [None]:
# Width factor (the `size` argument of feature_count) for each categorical
# feature; one count plot is drawn per entry, titled with the feature name.
cat_plot_sizes = {
    'cat0': 2, 'cat1': 2, 'cat2': 2,
    'cat3': 3, 'cat4': 3, 'cat5': 3,
    'cat6': 4, 'cat7': 4, 'cat8': 4,
    'cat9': 6,
}
for cat_feature, plot_size in cat_plot_sizes.items():
    feature_count(all_data, cat_feature, cat_feature, plot_size)

## Scatter plot of continuous features (train/test)


We will draw a scatter plot of train values against test values, one feature at a time.

In [None]:
def plot_feature_scatter(df1, df2, features, n_cols=3):
    """Scatter each feature of ``df1`` against the same feature of ``df2``.

    One subplot is drawn per feature, laid out on a grid with ``n_cols``
    columns and as many rows as needed.

    Parameters
    ----------
    df1, df2 : DataFrame
        Frames holding the columns named in ``features``; ``df1`` values go
        on the x axis and ``df2`` values on the y axis.
    features : iterable of str
        Column names to plot.  Nothing is drawn when it is empty.
    n_cols : int, default 3
        Number of subplot columns.
    """
    features = list(features)
    if not features:
        return

    # Ceiling division: enough rows to hold every feature.
    n_rows = (len(features) + n_cols - 1) // n_cols
    # scatter() needs x and y of equal length, so trim to the shorter frame.
    n_points = min(len(df1), len(df2))

    sns.set_style('whitegrid')
    # 4.8 per row reproduces the 14 x 24 figure for a 5-row grid.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4.8 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, features):
        ax.scatter(
            df1[feature].values[:n_points],
            df2[feature].values[:n_points],
            marker='+',
            color='green',
        )
        ax.set_xlabel(feature, fontsize=9)

    # Hide grid cells that received no feature.
    for ax in axes[len(features):]:
        ax.set_visible(False)
    plt.show()


In [None]:
# Names of the 14 continuous columns of the February data: cont0 .. cont13.
features = [f'cont{idx}' for idx in range(14)]
# Every 15th train row against every 10th test row, one panel per feature.
plot_feature_scatter(df1=train_df[::15], df2=test_df[::10], features=features)


## Scatter plot of continuous features vs. target

Scatter plot of each feature in train vs. target values.

In [None]:
def plot_feature_target_scatter(df1, features, target='target', n_cols=3):
    """Scatter each feature of ``df1`` against its target column.

    One subplot is drawn per feature, laid out on a grid with ``n_cols``
    columns and as many rows as needed.

    Parameters
    ----------
    df1 : DataFrame
        Frame holding the columns named in ``features`` and ``target``.
    features : iterable of str
        Column names plotted on the x axis.  Nothing is drawn when empty.
    target : str, default 'target'
        Column plotted on the y axis of every panel.
    n_cols : int, default 3
        Number of subplot columns.
    """
    features = list(features)
    if not features:
        return

    # Ceiling division: enough rows to hold every feature.
    n_rows = (len(features) + n_cols - 1) // n_cols

    sns.set_style('whitegrid')
    # 4.8 per row reproduces the 14 x 24 figure for a 5-row grid.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4.8 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, features):
        ax.scatter(df1[feature], df1[target], marker='+', color='blue')
        ax.set_xlabel(feature, fontsize=9)

    # Hide grid cells that received no feature.
    for ax in axes[len(features):]:
        ax.set_visible(False)
    plt.show()

In [None]:
# Every 15th train row: each continuous feature against the target.
plot_feature_target_scatter(df1=train_df[::15], features=features)

## Target distribution

In [None]:
# Histogram with a KDE overlay of the training target.
plt.figure(figsize=(16, 6))
plt.title("Distribution of target values")
# histplot replaces the deprecated distplot; stat="density" puts the bars on
# the same density scale as the KDE curve.
sns.histplot(
    x=train_df['target'],
    color="darkblue",
    kde=True,
    bins=120,
    stat="density",
    label='target',
)
plt.legend()
plt.show()

## Continuous features distribution

In [None]:
def plot_feature_distribution(df1, df2, features, label1="train", label2="test", n_cols=3):
    """Overlay the distribution of each feature in two frames.

    One subplot is drawn per feature, laid out on a grid with ``n_cols``
    columns and as many rows as needed.  Each panel shows a 60-bin
    density histogram with a KDE curve for ``df1`` (orange) and ``df2``
    (dark blue).

    Parameters
    ----------
    df1, df2 : DataFrame
        Frames holding the columns named in ``features``.
    features : iterable of str
        Column names to plot.  Nothing is drawn when it is empty.
    label1, label2 : str
        Legend labels for ``df1`` and ``df2``.
    n_cols : int, default 3
        Number of subplot columns.
    """
    features = list(features)
    if not features:
        return

    # Ceiling division: enough rows to hold every feature.
    n_rows = (len(features) + n_cols - 1) // n_cols

    sns.set_style('whitegrid')
    # 4.8 per row reproduces the 14 x 24 figure for a 5-row grid.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4.8 * n_rows), squeeze=False)
    axes = axes.ravel()

    # histplot replaces the deprecated distplot.  Semi-transparent bars keep
    # both overlaid histograms visible.
    shared_kws = dict(kde=True, bins=60, stat="density", alpha=0.4)
    for ax, feature in zip(axes, features):
        sns.histplot(x=df1[feature], color="orange", label=label1, ax=ax, **shared_kws)
        sns.histplot(x=df2[feature], color="darkblue", label=label2, ax=ax, **shared_kws)
        ax.set_xlabel(feature, fontsize=9)
        ax.legend()

    # Hide grid cells that received no feature.
    for ax in axes[len(features):]:
        ax.set_visible(False)
    plt.show()

In [None]:
# February data: every 60th train row against every 40th test row.
plot_feature_distribution(
    df1=train_df[::60],
    df2=test_df[::40],
    features=features,
    label1='train-feb',
    label2='test-feb',
)

## Comparison with January competition data

### Features

In [None]:
# Continuous column names used with the January data: cont1 .. cont14.
new_features = [f'cont{idx}' for idx in range(1, 15)]

In [None]:
# Read the January competition train and test tables in one pass.
trj_df, tej_df = map(
    pd.read_csv,
    (
        "/kaggle/input/tabular-playground-series-jan-2021/train.csv",
        "/kaggle/input/tabular-playground-series-jan-2021/test.csv",
    ),
)

# January data: every 60th train row against every 40th test row.
plot_feature_distribution(
    df1=trj_df[::60],
    df2=tej_df[::40],
    features=new_features,
    label1='train-jan',
    label2='test-jan',
)

In [None]:
def plot_data(train_jan_sample_df, train_feb_sample_df, f_jan, f_feb, title="Jan vs. Feb data"):
    """Overlay the density curves of a January and a February sample.

    The January curve (red) uses the left y axis and the February curve
    (blue) uses a twin right y axis, so each curve has its own scale.
    The figure is left open; the function does not call ``plt.show()``.

    Parameters
    ----------
    train_jan_sample_df, train_feb_sample_df : array-like
        Values whose density is estimated for January and February.
    f_jan, f_feb : str
        Feature names used in the axis labels and legends.
    title : str, default "Jan vs. Feb data"
        Title of the figure.
    """
    _, ax1 = plt.subplots(figsize=(10, 6))
    ax1.set_title(title)

    # kdeplot replaces the deprecated distplot(hist=False).
    sns.kdeplot(x=train_jan_sample_df, color='r', ax=ax1)
    ax1.set_ylabel(f'{f_jan} (Jan)', color='r')
    ax1.legend([f'{f_jan} (Jan)'], loc=(0.01, 0.95))

    # Second y axis sharing the same x axis.
    ax2 = ax1.twinx()
    sns.kdeplot(x=train_feb_sample_df, color='b', ax=ax2)
    ax2.set_ylabel(f'{f_feb} (Feb)', color='b')
    ax2.legend([f'{f_feb} (Feb)'], loc=(0.01, 0.9))
    ax2.grid(True)



In [None]:
# Compare each February feature with the January feature whose index is one
# higher: Jan cont1..cont13 against Feb cont0..cont12.
# NOTE(review): the pair Jan 'cont14' / Feb 'cont13' is not plotted here;
# confirm whether it was left out on purpose.
for feb_idx in range(13):
    f_jan = f'cont{feb_idx + 1}'
    f_feb = f'cont{feb_idx}'
    # Only every 30th value of each column is plotted.
    plot_data(
        trj_df[f_jan].values[::30],
        train_df[f_feb].values[::30],
        f_jan,
        f_feb,
    )
    # Render this comparison before the next figure is created.
    plt.show()

### Target data

In [None]:
# Jan-vs-Feb density comparison of the target column, using every 30th value.
f_jan = f_feb = 'target'
plot_data(
    trj_df[f_jan].values[::30],
    train_df[f_feb].values[::30],
    f_jan,
    f_feb,
)

## Features correlation

In [None]:
# Correlation heatmap of the continuous features in the test set.
plt.figure(figsize=(16, 16))
# The title announces Spearman, so request it explicitly: DataFrame.corr()
# computes Pearson when no method is given.
heatmap = sns.heatmap(
    np.round(test_df[features].corr(method='spearman'), 3),
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='BrBG',
)
# One title only: a second title call on the same axes replaces the first.
heatmap.set_title("Spearman correlation - test data")
plt.show()

In [None]:
# Correlation heatmap of the continuous features plus the target in the
# training set.
features_target = features + ['target']
plt.figure(figsize=(16, 16))
# The title announces Spearman, so request it explicitly: DataFrame.corr()
# computes Pearson when no method is given.
heatmap = sns.heatmap(
    np.round(train_df[features_target].corr(method='spearman'), 3),
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='BrBG',
)
# One title only: a second title call on the same axes replaces the first.
heatmap.set_title("Spearman correlation - train data")
plt.show()