In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, GroupKFold, GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import BayesianRidge,LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, OrthogonalMatchingPursuit
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.neighbors import KNeighborsRegressor, KernelDensity, KDTree
from sklearn.metrics import *

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import sys, os
import random 

# Silence every Python warning for the rest of the session, but only when no
# -W option was given on the command line (i.e. sys.warnoptions is empty).
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# data libraries used below, so API breakage can go unnoticed.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from IPython import display, utils

# Print the full path of every file found below the Kaggle input directory,
# so the available dataset files are visible in the cell output.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)


In [None]:
# Location of the training data, relative to the notebook's working directory.
TRAIN_CSV_PATH = '../input/imbalanced-data-practice/aug_train.csv'

# Load the training set and preview its first rows.
train = pd.read_csv(TRAIN_CSV_PATH)
train.head()

Those who bought the policy have **Response = 1**, and those who did not buy the policy have **Response = 0**.

In [None]:
# (number of rows, number of columns) of the training set.
train.shape

### Plot missing values (if any)

In [None]:
# Heatmap of the null mask: one row per sample, one column per feature.
# Any missing cell would show up as a differently coloured mark.
plt.figure(figsize=(17, 5))
null_mask_ax = sns.heatmap(train.isnull(), cbar=True, cmap='Set3')
null_mask_ax.set_xlabel("Column_Name", size=14, weight="bold")
null_mask_ax.set_title(
    "Places of missing values in column",
    fontweight="bold",
    size=14,
)
plt.show()

In [None]:
# Per-column summary of missing data: absolute count and fraction of rows,
# each sorted from most to least missing.
null_mask = train.isnull()
null_counts = null_mask.sum()
row_counts = null_mask.count()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / row_counts).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(20)

This shows that our training set doesn't contain any missing values.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

From the summary statistics above, we can observe that:
1. The **Age** ranges from 25 to 85
2. The **Annual premium** ranges from 2630 to 540165.

One of the most effective starting tools is the **pairs plot** (also called a **scatterplot matrix**). A pairs plot allows us to see both the distribution of each single variable and the relationships between pairs of variables.

In [None]:
# Scatterplot matrix of selected features, coloured by the target.
# Off-diagonal panels are scatter plots; the diagonal shows per-class KDE curves.
pairplot_columns = ['Age', 'Vintage', "Annual_Premium", "Policy_Sales_Channel"]
sns.pairplot(
    train,
    vars=pairplot_columns,
    hue='Response',
    diag_kind="kde",
    kind="scatter",
    palette="husl",
)



### The following figure shows the correlation between variables using the Kendall method:

In [None]:
# Kendall rank-correlation matrix of the features, drawn as an annotated heatmap.
#
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently dropped them). The frame presumably still holds string
# columns at this point (the notebook plans to one-hot encode Gender,
# Vehicle_Age and Vehicle_Damage later), so restrict it to numeric/bool columns
# explicitly. This reproduces the old behaviour on every pandas version.
numeric_features = train.select_dtypes(include=['number', 'bool'])

plt.subplots(figsize=(15, 6))
sns.heatmap(numeric_features.corr(method='kendall'), annot=True, cmap='Blues')
plt.title('Correlation between variables with kendall method')
# Render the figure explicitly, like the other plotting cells, instead of
# leaving the Text repr of plt.title() as the cell output.
plt.show()

### Check for duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row (all columns equal).
train.duplicated().sum()

### Distribution of Age

In [None]:
# Density-normalised histogram of Age with a KDE curve on top.
#
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal.
# histplot(kde=True, stat="density") is the documented replacement and draws
# the same histogram + KDE combination. Requires seaborn >= 0.11.
f, ax = plt.subplots(figsize=(10, 8))
sns.despine(f)  # remove the top and right spines of the axes
sns.histplot(x=train['Age'], bins=30, color="g", kde=True, stat="density", ax=ax)
plt.title('Distribution of Age')
plt.show()

The majority of people are between 20 and 50 years of age.

In [None]:
# Show how many customers do / do not hold a driving license, first as raw
# counts and then as a bar chart.
print(train['Driving_License'].value_counts())

plt.figure(figsize=(6, 10))
# Pass the column as `x=`: from seaborn 0.12 the only positional argument of
# countplot is `data`, so a positional Series is no longer read as the x
# variable. The keyword form works on old and new seaborn versions alike.
sns.countplot(x=train['Driving_License'], palette='Set2')
plt.xticks(rotation=90)
plt.title('Presence of driving license?')
plt.show()

Less than 1% of the samples are individuals without a license. People without a driver's license would not usually be interested in vehicle insurance, so I check whether any of these individuals actually have a positive response:

In [None]:
# Count the customers who have no driving license but still responded positively.
has_no_license = train['Driving_License'] == 0
responded_positive = train['Response'] == 1
int((has_no_license & responded_positive).sum())

Because only 37 of the people without a license actually want insurance, I don't see much use for this feature without something else to go along with it, such as a feature that indicates whether a person will be getting a new vehicle (or a driver's license) soon. Therefore, I opt to remove it.

In [None]:
# Remove the Driving_License feature from the training frame.
# Rebinding `train` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError.
train = train.drop(columns='Driving_License', errors='ignore')

We'll also one-hot encode **Gender**, **Vehicle_Age**, **Region_Code** and **Vehicle_Damage**.

In [None]:
# TODO: the one-hot encoding is not implemented in this cell; the lines below
# are a disabled placeholder and nothing is encoded when the cell runs.
#from sklearn.preprocessing import OneHotEncoder

#enc = OneHotEncoder(handle_unknown='ignore')


### Plot distribution of the target variable, i.e. Response

In [None]:
# Bar chart of the class balance of the target variable.
plt.figure(figsize=(8, 6))
# Pass the column as `x=`: from seaborn 0.12 the only positional argument of
# countplot is `data`, so a positional Series is no longer read as the x
# variable. The keyword form works on old and new seaborn versions alike.
sns.countplot(x=train['Response'], palette='Set2')
plt.xticks(rotation=90)
plt.title('Distribution of Response')
plt.show()

We can see that our target variable is highly imbalanced.