In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents
<a id = "table-of-contents"></a>
- [Preparations](#0)
    - [1 Importing Necessary Libraries](#0.1)
    - [2 Loading The Dataset](#0.2)
- [1 Dataset Overview](#1)
    - [1.1 Size of Dataset. Features, Target column.](#1.1)
    - [1.2 Brief look at Dataset](#1.2)
    - [1.3 Distribution of Target Column](#1.3)
    - [1.4 Unique Values in Each Column](#1.4)
- [2 Variable Analysis](#2)
    - [2.1 Distribution: Train Vs Test](#2.1)
    - [2.2 Correlation Analysis](#2.2)
- [3 Baseline Models](#3)
    - [3.1 Preprocessing](#3.1)
    - [3.2 Model Building](#3.2)
    - [3.3 Leaderboard Submission](#3.3)


<a id="0"></a>
# Preparations

<a id="0.1"></a>
### Importing Necessary Libraries

In [None]:
# Data Handling
import pandas as pd
pd.set_option('display.max_columns', 150)
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='dark')

# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Evaluation
from sklearn.metrics import mean_squared_error

seed = 1999

<a id="0.2"></a>
### Loading The Dataset

In [None]:
# Single source of truth for the competition data location, so pointing the
# notebook at another copy of the data means editing one line instead of three.
DATA_DIR = '../input/tabular-playground-series-aug-2021'

# Load the training set, the test set, and the submission template.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

---
[back to top](#table-of-contents)
<a id="1"></a>
# Dataset Overview


<a id="1.1"></a>
### 1.1 Size of Dataset. Features, Target column. 

In [None]:
# Report the dimensions of each loaded frame.
print(f"Shape of Train is : {train.shape}")
print(f"Shape of Test is : {test.shape}")
print(f"Shape of sample_submission is : {sample_submission.shape}")

# Column roles: every column that is neither the identifier nor the target
# is treated as a model feature.
id_col, target = 'id', 'loss'
non_feature_cols = {id_col, target}
features = list(filter(lambda name: name not in non_feature_cols, train.columns))

print(f"\nThe Dataset have total {len(features)} features")

<a id="1.2"></a>
### 1.2 Brief look at Dataset

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Summary statistics, transposed so each dataset column becomes one row,
# displayed through the pandas Styler.
train.describe().transpose().style

<a id="1.3"></a>
### 1.3 Distribution of Target Column

In [None]:
# Two stacked panels for the target: density estimate on top, box plot below.
fig, (kde_ax, box_ax) = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))

sns.kdeplot(data=train, x=target, ax=kde_ax, color='Blue', fill=True)
sns.boxplot(data=train, x=target, ax=box_ax, color='Blue')

Observations:
1. Target Variable is Highly Right Skewed.
2. Most of the losses lie between 0 and 10.

<a id="1.4"></a>
### 1.4 Unique Values in Each Column

In [None]:
# Count distinct values per column in each split. After transposing, the rows
# are the splits ("Train", "Test") and the columns are the dataset columns.
df = pd.concat(
    [train.nunique(), test.nunique()],
    axis=1,
    keys=["Train", "Test"],
).T
df

---
[back to top](#table-of-contents)
<a id="2"></a>
# Variable Analysis

<a id="2.1"></a>
### 2.1 Distribution: Train Vs Test

In [None]:
# Compare each feature's distribution in train (blue) and test (red) on two
# adjacent panels. The grid is sized from the number of features instead of
# being fixed at 10 x 20, so a different feature count neither raises an
# IndexError (too many features) nor leaves empty frames (too few).
n_cols = 20  # two panels per feature -> ten features per row
n_rows = max(1, int(np.ceil(2 * len(features) / n_cols)))

# squeeze=False keeps `ax` two-dimensional even when there is a single row.
f, ax = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axx = ax.flatten()

# Pair consecutive axes: (train panel, test panel) for each feature.
for col, (train_ax, test_ax) in zip(features, axx.reshape(-1, 2)):
    sns.kdeplot(data=train, x=col, ax=train_ax, color='Blue', fill=True)
    train_ax.set_title(f'Train {col}', loc='right', weight='bold', fontsize=12)
    sns.kdeplot(data=test, x=col, ax=test_ax, color='Red', fill=True)
    test_ax.set_title(f'Test {col}', loc='right', weight='bold', fontsize=12)

# Hide any panels left over when the features do not fill the last row.
for unused_ax in axx[2 * len(features):]:
    unused_ax.set_visible(False)

<a id="2.2"></a>
### 2.2 Correlation Analysis

In [None]:
plt.figure(figsize=(14, 14))

# Pairwise correlation between all columns of the training frame.
corr = train.corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric. Use the builtin `bool`: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, square=True, mask=mask, cmap='coolwarm_r', annot_kws={'size': 20})

---
[back to top](#table-of-contents)
<a id="3"></a>
# Baseline Models

<a id="3.1"></a>
### 3.1 Preprocessing

**1. Since our Target Column is Right Skewed, We will do Log Transformation.**

    Note: Do not forget to convert predicted values to normal form when using log transformations on target variable.

In [None]:
# Log-transform the right-skewed target as log(1 + loss). np.log1p is the
# single-call, numerically accurate form of np.log(x + 1).
# NOTE: this overwrites the column in place, so run this cell only once per
# kernel session; re-running it would apply the transform a second time.
train[target] = np.log1p(train[target])

In [None]:
# Hold out 25% of the training rows for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    train[target],
    test_size=0.25,
    random_state=seed,
)

<a id="3.2"></a>
### 3.2 Model Building

In [None]:
# Candidate baseline models, keyed by the label printed during evaluation.
model_dict = {
    'Linear Regression': LinearRegression(),
    'DecisionTree Regressor': DecisionTreeRegressor(random_state=seed),
    'LGBM Regressor': LGBMRegressor(random_state=seed),
}

In [None]:
def model_evaluation(X_trn, X_val, y_trn, y_val, model, model_name):
    """Fit ``model`` and report its validation RMSE on the original loss scale.

    Parameters
    ----------
    X_trn, y_trn : training features and log(1 + loss) targets used for fitting.
    X_val, y_val : validation features and log(1 + loss) targets used for scoring.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : label printed in the report banner.

    Returns
    -------
    float
        RMSE between back-transformed predictions and back-transformed
        validation targets. Callers that ignore the return value are unaffected.
    """
    model.fit(X_trn, y_trn)

    # The targets were log(1 + loss)-transformed before the split, so both the
    # predictions AND the validation targets must be mapped back with
    # exp(x) - 1 before scoring. The previous version scored against the global
    # `y_test` (ignoring `y_val`) and left it on the log scale, which compared
    # values on two different scales.
    y_pred = np.expm1(model.predict(X_val))
    y_true = np.expm1(y_val)

    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    print('======================================{}======================================='.format(model_name))
    print('RMSE is : {}'.format(RMSE))
    print()
    print()
    return RMSE

In [None]:
%%time
# Fit and score every candidate model on the same train/validation split.
for label, estimator in model_dict.items():
    model_evaluation(X_train, X_test, y_train, y_test, model=estimator, model_name=label)

**Because of limited computational power, I have evaluated only 3 models. You can evaluate as many models as you want.**

<a id="3.3"></a>
### 3.3 Leaderboard Submission

**I am going to Use LGBMRegressor to make a submission**

In [None]:
%%time
# Refit the chosen model on the full training data (log-transformed target).
model = LGBMRegressor(random_state = seed)
model.fit(train[features], train[target])

# Predictions are on the log(1 + loss) scale; invert with exp(x) - 1.
# The previous `np.exp(preds) + 1` shifted every submitted value up by 2.
preds = np.expm1(model.predict(test[features]))

# Write the predictions into the submission template and save it.
sample_submission[target] = preds
sample_submission.to_csv('sub1.csv', index = False)

# Please Do Upvote If You Like The Notebook. And Feel free to give suggestions about improving my work. Thank You. 

# Stay Tuned For Advance Model Building