In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Enumerate every file mounted under the Kaggle input directory and print
# its full path, one per line, in os.walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id = "table-of-contents"></a>
# Table of Contents

- [1 Introduction](#1)
    - [1.1 Problem Statement](#1.1)
    - [1.2 The Goal](#1.2)
- [2 Preparations ](#2)
    - [2.1 Importing Packages](#2.1)
    - [2.2 Loading The Dataset](#2.2)
- [3 Getting Basic Understanding of The Data](#3)
    - [3.1 Seeing the data and shape](#3.1)
    - [3.2 Statistics](#3.2)
    - [3.3 Number of Unique Values and Missing Values in Each Column](#3.3)
    - [3.4 Distribution of target Variable](#3.4)
    - [3.5 Observations](#3.5)
- [4 Univariate Analysis](#4)
- [5 Bivariate Analysis](#5)

<a id="1"></a>
# 1. Introduction

<a id="1.1"></a>
### 1.1 Problem Statement
As per the description,
The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with predicting whether a claim will be made on an insurance policy. Although the features are anonymized, they have properties relating to real-world features.

<a id="1.2"></a>
### 1.2 The Goal
In this competition, we have to build a model that predicts whether a claim will be made on an insurance policy (the `claim` target column).

The evaluation metric for this competition is the area under the ROC curve (AUC).

---

<a id="2"></a>
# 2. Preparations

Importing packages and loading the data that will be used in the analysis process. Packages that will be loaded are mainly for data manipulation and data visualization. 

<a id="2.1"></a>


### 2.1 Importing Packages

In [None]:
#### Data Manipulation
import pandas as pd
import numpy as np
import warnings

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

#### Data Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib import ticker
import seaborn as sns
sns.set(style = 'white')

############## Libraries for Machine Learning Modeling ###############

# Model Building
from sklearn.model_selection import train_test_split, KFold

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score


from IPython.core.display import display, HTML

def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space.

    Each frame is turned into a styled HTML table (inline display, a caption,
    in-cell bars and a row-wise 'Reds' gradient). The tables are concatenated
    with a run of non-breaking spaces between them and rendered through
    IPython's ``display(HTML(...))``.

    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions, one per frame, in the same order

    Raises:
        ValueError: if ``dfs`` and ``captions`` have different lengths.
    """
    if len(dfs) != len(captions):
        raise ValueError(
            f"Expected one caption per DataFrame, got {len(dfs)} frame(s) "
            f"and {len(captions)} caption(s)"
        )

    caption_css = [{
        'selector': 'caption',
        # A CSS hex colour needs the leading '#'; without it the rule is ignored.
        'props': [('color', '#6a63ea'), ('font-size', '16px')],
    }]
    spacer = 20 * "\xa0\xa0\xa0"

    output = ""
    # Pair captions and frames by position. Going through a dict would
    # silently drop tables whenever two captions are identical.
    for caption, df in zip(captions, dfs):
        styled = (
            df.style
            .set_table_attributes("style='display:inline'")
            .set_caption(caption)
            .set_table_styles(caption_css)
            .bar(color='#6a63ea')
            .background_gradient(cmap='Reds', axis=1)
        )
        output += styled._repr_html_() + spacer
    display(HTML(output))

<a id = "2.2"></a>

### 2.2 Loading The Dataset

In [None]:
# Read the three competition files (train, test, sample submission) in one
# pass; the tuple order below matches the names on the left-hand side.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-sep-2021/train.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
        "../input/tabular-playground-series-sep-2021/sample_solution.csv",
    )
)

---
<a id = '3'></a>
# 3. Getting Basic Understanding of The Dataset

[back to top](#table-of-contents)


<a id = "3.1"></a>
### 3.1 Seeing the data and shape

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Name of the label column; defined first so the summary below can use it.
target = 'claim'

# 'id' is a row identifier, not a predictor, so it is excluded from the
# feature count along with the target.
feature_count = len([col for col in train.columns if col not in ('id', target)])

print(f"Shape of the Train Set is: {train.shape}")
# Report the test frame's own shape (previously this printed train.shape again).
print(f"Shape of the Test Set is: {test.shape}")
print(f"The Dataset has {feature_count} features and 1 target variable: {target}")

<a id = "3.2"></a>

### 3.2 Statistics

In [None]:
# Summary statistics for every numeric column, rendered with in-cell bars
# and a column-wise colour gradient.
summary_stats = train.describe()
summary_stats.style.bar(color='#6a63ea').background_gradient(axis=0)

<a id = "3.3"></a>
### 3.3 Number of Unique Values and Missing Values in Each Column

In [None]:
# Per-column cardinality for train and test. Stacking the two Series as rows
# and transposing gives one row per column name; a column present in only one
# frame shows NaN for the other.
unique_counts = pd.DataFrame([train.nunique(), test.nunique()]).T
unique_counts.columns = ['Train', 'Test']

# Per-column count of missing values, built the same way.
missing_counts = pd.DataFrame([train.isna().sum(), test.isna().sum()]).T
missing_counts.columns = ['Train', 'Test']

display_side_by_side(
    [unique_counts, missing_counts],
    ['Number of Unique Values in Each Column', 'Number of Missing Values in Each Column'],
)

<a id = "3.4"></a>

### 3.4 Distribution of Target variable

In [None]:
# Shared colours; later plotting cells reuse both names.
background_color = "#eae9fc"
color_palette = ["#6a63ea", "#dee84a"]

fig = plt.figure(figsize=(22, 12))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(2, 1)

# Fig 1: absolute number of rows per target class
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

for s in ['right', 'top']:
    ax0.spines[s].set_visible(False)

ax0.grid(color='gray', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
# Pass the column as x=...: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=train[target], palette=color_palette, ax=ax0, zorder=3)
ax0.set_xlabel("")
ax0.set_ylabel("")

# Fig 2: share of each target class
ax1 = fig.add_subplot(gs[1, 0])
ax1.set_facecolor(background_color)

ax1.grid(color='gray', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
train[target].value_counts().plot.pie(autopct='%1.1f%%', colors = color_palette, ax = ax1)
ax1.set_xlabel("")
ax1.set_ylabel("")

# The title is attached to the lower axes explicitly (not via the implicit
# current axes); y > 1 is in ax1's axes coordinates and is meant to lift the
# text above the upper panel so it reads as a figure-level title.
_ = ax1.set_title('Target Distribution',fontsize=30, y = 2.24, x = 0.45, fontweight='bold', fontfamily='serif', color="#323232")

---
<a id = '4'></a>
# 4. Univariate Analysis

[back to top](#table-of-contents)

In [None]:
# Every column except the row identifier and the label is a model feature.
id_col = 'id'
non_feature_cols = {target, id_col}
features = [name for name in train.columns if name not in non_feature_cols]

In [None]:
# One histogram + KDE panel per feature, laid out 7 per row. The row count is
# derived from the feature list (17 rows for 118 features) rather than
# hard-coded, so the loop cannot run past the end of the grid.
n_cols = 7
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division

f, ax = plt.subplots(n_rows, n_cols, figsize = (23, 51))
axx = ax.flatten()
f.patch.set_facecolor(background_color)

for hist_ax, col in zip(axx, features):
    hist_ax.set_facecolor(background_color)
    # Format the ticks of *this* panel. plt.ticklabel_format() would act on
    # whatever axes is current, which is not the panel being drawn.
    hist_ax.ticklabel_format(style='plain')
    sns.histplot(ax = hist_ax, x = col, data = train, color = '#6a63ea')
    hist_ax.set_ylabel("")
    hist_ax.tick_params(labelsize=4, width=0.5)
    # The feature name goes on the primary axes: twinx() hides the x-axis of
    # the twin, so an xlabel set there is never drawn.
    hist_ax.set_xlabel(col, fontsize=4, fontweight='bold')

    # Density curve on a secondary y-axis sharing the same x-axis.
    kde_ax = hist_ax.twinx()
    kde_ax.ticklabel_format(style='plain')
    sns.kdeplot(ax = kde_ax, x=col, data = train, color='#6a63ea')
    kde_ax.set_xlabel("")
    kde_ax.set_ylabel("")
    kde_ax.tick_params(labelsize=0)

# Hide every grid cell that did not receive a feature (not just the last one).
for unused_ax in axx[len(features):]:
    unused_ax.set_visible(False)

---
<a id = '5'></a>
# 5. Bivariate Analysis

[back to top](#table-of-contents)

In [None]:
# One box plot per feature, split by target class, laid out 7 per row. The row
# count is derived from the feature list (17 rows for 118 features).
n_cols = 7
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division

f, ax = plt.subplots(n_rows, n_cols, figsize = (24, 57))
axx = ax.flatten()
f.patch.set_facecolor(background_color)

for box_ax, col in zip(axx, features):
    box_ax.set_facecolor(background_color)
    # Plain (non-scientific) tick labels on this panel's value axis.
    # plt.ticklabel_format() would only ever touch the current axes, which
    # here is the last subplot created, not the panel being drawn.
    box_ax.ticklabel_format(style='plain', axis='y')
    sns.boxplot(ax = box_ax, x = target, y = col,data = train, palette= color_palette)
    # Label each panel with its feature name, matching the univariate grid;
    # otherwise the panels cannot be told apart.
    box_ax.set_xlabel(col, fontsize=4, fontweight='bold')
    box_ax.set_ylabel("")
    box_ax.tick_params(labelsize=4, width=0.5)

# Hide every grid cell that did not receive a feature (not just the last one).
for unused_ax in axx[len(features):]:
    unused_ax.set_visible(False)

# Please Do Upvote If You Like The Notebook. And Feel free to give suggestions about improving my work. Thank You.

## Stay Tuned For Further Updates. 