In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# VEHICLE INSURANCE PREDICTION

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

In [None]:
# Load the competition's train and test CSV files from the Kaggle input mount.
# NOTE(review): `test` is loaded here but is not referenced by any later cell
# visible in this notebook — confirm whether it is still needed.
train=pd.read_csv("/kaggle/input/health-insurance-cross-sell-prediction/train.csv")
test=pd.read_csv("/kaggle/input/health-insurance-cross-sell-prediction/test.csv")


In [None]:
train.head()

In [None]:
# Working frame without the 'id' column; drop() returns a new frame, so the
# raw `train` frame is left unchanged.
df=train.drop('id',axis=1)
df.head()

In [None]:
print(df.columns)

In [None]:
df.describe()

**Checking for null values in training data**
* **There is no null value in any column of the data**

In [None]:
train.isnull().sum()

**Looking for correlation between any two features**

*  **As we can see, there is no significant correlation between any two features, so we will go ahead with all the features we have**

In [None]:
# Correlate the numeric columns only: Gender, Vehicle_Age and Vehicle_Damage
# are still text at this point, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0. select_dtypes works on old and new pandas alike.
numeric_corr = df.select_dtypes(include="number").corr()

fig, corr_ax = plt.subplots(figsize=(20, 8))
sns.heatmap(data=numeric_corr, annot=True, cmap="Greens", ax=corr_ax)
corr_ax.set_title("Correlation between numeric features");

# **VISUALIZATION AND ANALYSIS**

**TARGET VARIABLE**

**Checking if data is imbalanced or not**
* **Target value is highly imbalanced i.e. 87.7% are -ve responses and 12.3% are +ve responses**

In [None]:
def with_hue(data,feature,ax):
    """Annotate every bar of a hue-split count plot with its share of its x category.

    Parameters
    ----------
    data : DataFrame-like
        Frame that was plotted; ``data[feature].unique()`` gives the x categories.
    feature : str
        Column shown on the x axis.
    ax : matplotlib Axes (or any object exposing ``patches`` and ``text``)
        Axes holding the bars; percentage labels are written onto it.

    Notes
    -----
    Assumes the bars in ``ax.patches`` are ordered hue by hue, i.e. all x
    categories of the first hue level, then all of the second, and so on
    (ex. 8 x categories, 4 hues => bars [0, 8, 16, 24] belong to the first
    x category) — TODO confirm for the installed seaborn version.
    Categories whose bars sum to zero, and bars with a NaN height, are left
    unlabelled instead of raising ZeroDivisionError or printing "nan%".
    """
    # Number of real categories: NaN != NaN, so missing values are filtered out
    num_of_cat = sum(1 for value in data[feature].unique() if value == value)

    # Materialise as a plain list so extended slicing is always available
    bars = list(ax.patches)

    for ind in range(num_of_cat):
        # Every num_of_cat-th bar starting at `ind` is the same x category
        hue_bars = bars[ind::num_of_cat]
        heights = [bar.get_height() for bar in hue_bars]

        # Total height of the category (denominator for the percentages)
        total = sum(height for height in heights if height == height)
        if total <= 0:
            continue

        # Print the percentage on top of each bar
        for bar, height in zip(hue_bars, heights):
            if height != height:
                continue
            percentage = '{:.1f}%'.format(100 * height / total)
            ax.text(bar.get_x() + bar.get_width() / 2.0,
                    height,
                    percentage,
                    ha="center", va="bottom", fontweight='bold')
    

    
def without_hue(data,feature,ax):
    """Annotate every bar of a plain count plot with its share of all rows and its count.

    Parameters
    ----------
    data : sized collection
        The plotted data; only ``len(data)`` is used, as the denominator.
    feature : str
        Unused; kept so the signature mirrors ``with_hue``.
    ax : matplotlib Axes (or any object exposing ``patches`` and ``text``)
        Axes holding the bars; labels are written onto it.

    Each label reads ``"<percent>% (<count>)"``. The label is built as a
    string because handing a tuple to ``ax.text`` renders its raw repr.
    Nothing is drawn when ``data`` is empty, and NaN-height bars are skipped.
    """
    total = float(len(data))
    if total == 0:
        # No rows -> no meaningful percentage (and avoids dividing by zero)
        return

    for bar in ax.patches:
        height = bar.get_height()
        if height != height:  # NaN height: nothing to label
            continue
        label = '{:.1f}% ({:,.0f})'.format(100 * height / total, height)
        ax.text(bar.get_x() + bar.get_width() / 2.0,
                height,
                label,
                ha='center', va='bottom', fontweight='bold')


In [None]:
# Apply the theme BEFORE the axes are created: style settings are picked up
# when axes are built, so setting the theme afterwards leaves this first
# figure unstyled on a fresh kernel.
sns.set_theme(context='notebook',style='darkgrid')

fig, target_ax = plt.subplots(figsize=(15,7))
# Plot from `df` (train without 'id') so the bars and the percentages
# computed by without_hue() come from the same frame.
sns.countplot(x=df["Response"],palette="gnuplot",ax=target_ax)
target_ax.text(0.7,220000,"Data is highly Imbalanced",fontweight='bold',fontsize=15)
without_hue(df,'Response',target_ax)

In [None]:
df.head()

**UNIVARIATE ANALYSIS**

**CATEGORICAL VALUES**
* **Gender,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage**

In [None]:
# One row per categorical feature: (column, palette of the plain count plot).
# Left panel = overall counts, right panel = counts split by Response.
categorical_panels = [
    ("Gender", "Set1"),
    ("Driving_License", "gnuplot"),
    ("Previously_Insured", "gnuplot"),
    ("Vehicle_Age", "gnuplot"),
    ("Vehicle_Damage", "Set1"),
]

f,ax=plt.subplots(nrows=len(categorical_panels),ncols=2,figsize=(20,50),
                  gridspec_kw={'width_ratios': [10,10],
                               'height_ratios': [10]*len(categorical_panels),
                               'wspace': 0.2,
                               'hspace': 0.4})

# Iterating the 2-D axes array yields one (left, right) pair per row
for (feature, count_palette), (count_ax, hue_ax) in zip(categorical_panels, ax):
    sns.countplot(data=df,x=feature,palette=count_palette,ax=count_ax)
    without_hue(df,feature,count_ax)

    sns.countplot(data=df,x=feature,hue='Response',palette="gnuplot",ax=hue_ax)
    with_hue(df,feature,hue_ax)


**CONCLUSIONS FROM CATEGORICAL FEATURES**

* **13.8% males and 10.4% females responded +ve**
* **People who don't have a driving license are not responding, and even among those with a driving license only 12.3% are responding**
* **People who were not previously insured are responding, i.e. 22.5%, but those who are already insured are not responding**
* **People whose vehicle age is greater than 1 year are responding more frequently**
* **23.8% of people whose Vehicle is Damaged are responding +ve**

In [None]:
df.head()

**CONTINUOUS VALUES**
* **Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage**

**AGE COLUMN**

In [None]:
f,ax=plt.subplots(nrows=1,ncols=2,figsize=(20,10))

# Left: overall Age distribution. No hue is assigned on this panel, so no
# palette is passed (seaborn ignores a palette when there is no hue).
sns.histplot(data=df,x="Age",kde=True,ax=ax[0],binwidth=1)
ax[0].text(50,17000,"Age data is right Skewed",fontweight='bold',fontsize=15)

# Right: the same distribution split by Response.
sns.histplot(data=df,x="Age",hue="Response",palette='gnuplot',kde=True,ax=ax[1],binwidth=1)
ax[1].text(50,17000,"Age data is little right Skewed\nwith hueness",fontweight='bold',fontsize=15);

**Let's see how many outliers are present in Age**

In [None]:
# Outlier fences for Age: 1.5 * IQR beyond the quartiles.
# np.percentile does not need pre-sorted input, so the column is passed
# directly instead of first building a sorted Python list of every row.
Q1,Q3=np.percentile(df['Age'],[25,75])
IQR= Q3-Q1
lower_range= Q1-(1.5*IQR)
upper_range=Q3+(1.5*IQR)

print("Lower Range : ",lower_range)
print("Upper Range : ",upper_range)

# Rows falling outside the fences (displayed by the next two cells)
df_lower_outliers=df[df.Age<lower_range]
df_upper_outliers=df[df.Age>upper_range]


In [None]:
#NO LOWER OUTLIERS
df_lower_outliers

In [None]:
#NO UPPER OUTLIERS
df_upper_outliers

In [None]:
# Box plot of Age on an explicit axes, with the conclusion annotated on it.
age_fig, age_ax = plt.subplots(figsize=(20,10))
age_ax.text(55,-0.2,"There is not outlier in age feature",fontsize='20',fontweight='bold')
sns.boxplot(data=df,x="Age",palette='gnuplot',ax=age_ax)

**Let's plot log distribution of age column to see if we can reduce the skewness**
* **Log Distribution doesn't make any specific change in the distribution so we will go ahead without doing any changes in Age column**

In [None]:
f,ax=plt.subplots(nrows=1,ncols=2,figsize=(20,10))
log_age=np.log(df["Age"])

# Left: overall log(Age) distribution (no hue, hence no palette).
sns.histplot(data=df,x=log_age,kde=True,ax=ax[0],binwidth=0.04)
# Right: the same distribution split by Response.
sns.histplot(data=df,x=log_age,hue="Response",palette='gnuplot',kde=True,ax=ax[1],binwidth=0.04)

# The plotted series is still named "Age"; label the axes with what is shown.
for panel in ax:
    panel.set_xlabel("log(Age)")

**REGION CODE**
* **This Column is randomly distributed , there is not any significant observation we can get from this feature**
* **In my opinion we should not work on this feature further**

In [None]:
f,ax=plt.subplots(nrows=1,ncols=2,figsize=(15,10))

# Left: overall Region_Code distribution (no hue, hence no palette).
sns.histplot(data=df,x="Region_Code",kde=True,ax=ax[0],binwidth=1)
ax[0].text(35,70000,"Region_Code is\nrandomly distributed",fontweight='bold',fontsize=12)

# Right: the same distribution split by Response.
sns.histplot(data=df,x="Region_Code",hue="Response",palette='gnuplot',kde=True,ax=ax[1],binwidth=1)
ax[1].text(35,70000,"Region_Code is\nrandomly distributed\nwith hueness",fontweight='bold',fontsize=12);

**ANNUAL PREMIUM**

In [None]:
f,ax=plt.subplots(nrows=1,ncols=2,figsize=(20,5))

# Left: overall Annual_Premium distribution (no hue, hence no palette).
sns.histplot(data=df,x="Annual_Premium",kde=True,ax=ax[0])
ax[0].text(300000,30000,"Normally distributed with\nlittle right skewed",fontweight='bold',fontsize=12)

# Right: the same distribution split by Response.
sns.histplot(data=df,x="Annual_Premium",hue="Response",palette='gnuplot',kde=True,ax=ax[1])
ax[1].text(300000,30000,"Normally distributed with\nlittle right skewed\nwith hueness",fontweight='bold',fontsize=12);

In [None]:
# Box plot of Annual_Premium on an explicit axes, with the outlier share annotated.
premium_fig, premium_ax = plt.subplots(figsize=(20,10))
premium_ax.text(250000,-0.2,"2.7% data points are upper outliers in Annual_Premium feature",fontsize=15,fontweight='bold')
sns.boxplot(data=df,x="Annual_Premium",palette='gnuplot',ax=premium_ax)

In [None]:
#IQR (Inter quartile range)
# IQR (inter-quartile range) outlier fences for Annual_Premium.
# np.percentile does not need pre-sorted input, so the column is passed
# directly instead of first building a sorted Python list of every row.
Q1,Q3=np.percentile(df['Annual_Premium'],[25,75])
IQR= Q3-Q1
lower_range= Q1-(1.5*IQR)
upper_range=Q3+(1.5*IQR)

print("Lower Range : ",lower_range)
print("Upper Range : ",upper_range)

# Rows falling outside the fences (displayed by the next two cells)
df_lower_outliers=df[df.Annual_Premium<lower_range]
df_upper_outliers=df[df.Annual_Premium>upper_range]


In [None]:
#There is not lower outliers
df_lower_outliers

In [None]:
#There are 10320 people in data whose annual_premium is greater than 61892.5 i.e. 2.7%
df_upper_outliers

* **The question here is whether we should remove these outliers from the Annual_Premium data. A person paying more than Rs 61892 annually may simply be a wealthy customer who can afford a much higher premium than others**
* **10320 people are outliers; if we remove them we lose that data from the table**
* **So the conclusion is that we won't change anything in the Annual_Premium data**

**VINTAGE**

**This column is uniformly distributed, so there is not much we can do with it**

In [None]:
f,ax=plt.subplots(nrows=1,ncols=2,figsize=(20,10))

# Left: overall Vintage distribution (no hue, hence no palette).
sns.histplot(data=df,x="Vintage",kde=True,ax=ax[0])
# Right: the same distribution split by Response.
sns.histplot(data=df,x="Vintage",hue="Response",palette='gnuplot',kde=True,ax=ax[1]);

**POLICY SALES CHANNEL**

In [None]:
f,ax=plt.subplots(nrows=1,ncols=2,figsize=(20,10))

# Left: overall Policy_Sales_Channel distribution (no hue, hence no palette).
sns.histplot(data=df,x="Policy_Sales_Channel",kde=True,ax=ax[0])
ax[0].text(40,100000,"Randomly Distributed",fontweight='bold',fontsize=15)

# Right: the same distribution split by Response.
sns.histplot(data=df,x="Policy_Sales_Channel",hue="Response",palette='gnuplot',kde=True,ax=ax[1])
ax[1].text(40,100000,"Randomly Distributed\nwith hueness",fontweight='bold',fontsize=15);

**BIVARIATE ANALYSIS**

In [None]:
# Encode Gender as 0/1. The mapping reads from the raw `train` frame (same
# row index as `df`, which is `train` without 'id'), so re-running this cell
# is safe. Mapping the already-encoded df["Gender"] a second time would give
# all-NaN and make astype('int') fail.
df["Gender"]=train["Gender"].map({"Female":0,"Male":1}).astype('int')

In [None]:
df1=df.copy()

In [None]:
df1=pd.get_dummies(df1,drop_first=True)
df1

In [None]:
# Rename the Vehicle_Age dummy columns so they no longer contain "<", ">" or
# spaces. NOTE(review): presumably done because some model libraries reject
# such characters in feature names — confirm. The new names drop the
# comparison signs: "Vehicle_Age_1_Year" means "< 1 Year" and
# "Vehicle_Age_2_Year" means "> 2 Years".
df1=df1.rename(columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_1_Year", 
                            "Vehicle_Age_> 2 Years": "Vehicle_Age_2_Year"})
df1

In [None]:
# Cast the three one-hot indicator columns to integer dtype in a single call.
indicator_columns=['Vehicle_Age_1_Year','Vehicle_Age_2_Year','Vehicle_Damage_Yes']
df1=df1.astype({column:'int' for column in indicator_columns})


In [None]:
# Correlation matrix of the fully encoded frame, drawn on an explicit axes.
encoded_corr=df1.corr()
encoded_fig, encoded_ax = plt.subplots(figsize=(20,10))
sns.heatmap(encoded_corr,annot=True,cmap="Greens",ax=encoded_ax)

**Pairplot**

In [None]:
# Pairwise plot grid over every column of the encoded frame `df1`.
# NOTE(review): no `hue` is passed, so `palette` presumably has no visible
# effect here — confirm. Plotting every row for every column pair may be slow.
sns.pairplot(data=df1,palette='gnuplot')

# **MODEL AND PREDICTION(WITHOUT OVERSAMPLING)**

**IMPORTING LIBRARIES**

In [None]:
from sklearn.model_selection import train_test_split , cross_val_score , RandomizedSearchCV,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report 
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import optuna


In [None]:
#SCALING CONTINUOUS FEATURES
'''from sklearn.preprocessing import MinMaxScaler , StandardScaler
ss=StandardScaler()
train[['Age']]=ss.fit_transform(train[['Age']])
train[['Vintage']]=ss.fit_transform(train[['Vintage']])
train[['Annual_Premium']] = ss.fit_transform(train[['Annual_Premium']])'''


In [None]:
# Separate the encoded frame into the feature matrix and the target vector.
target_column="Response"
X=df1.drop(columns=[target_column])
Y=df1[target_column]

# USING OVERSAMPLING TECHNIQUE(SMOTE)


In [None]:
df2=df1.copy()
df2

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
#SPLITTING BEFORE SAMPLING
# Stratify on the target: the classes are heavily imbalanced, so an
# unstratified split can leave the hold-out set with a different positive
# rate than the training set.
x_train_sam,x_test_sam,y_train_sam,y_test_sam=train_test_split(
    X,Y,test_size=0.2,random_state=42,stratify=Y)

In [None]:
y_train_sam.value_counts()

In [None]:
y_test_sam.value_counts()

In [None]:
#SAMPLING OF ONLY TRAIN DATA TO AVOID DATA LEAKAGE
# Fixed seed so the synthetic minority samples (and everything trained on
# them) are the same on every Restart & Run All.
sm=SMOTE(random_state=42)
# Resample the training split only; the test split keeps its real class balance.
x_train_sampling,y_train_sampling=sm.fit_resample(x_train_sam,y_train_sam)

In [None]:
y_train_sampling.value_counts()

**RANDOM FOREST CLASSIFIER**

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold F1 of a random forest on the resampled training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators`` (2-200) and ``max_depth``
        (1-40, sampled on a log scale).

    Returns
    -------
    float
        Mean cross-validated F1 score; the study maximises it.
    """
    n_estimators = trial.suggest_int('n_estimators', 2, 200)
    # Integer sampled on a log scale; replaces the deprecated
    # int(trial.suggest_loguniform(...)) so the stored parameter is a real int.
    max_depth = trial.suggest_int('max_depth', 1, 40, log=True)

    # Seeded so two trials with the same parameters score the same
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                 random_state=42)

    # NOTE(review): x_train_sampling is already SMOTE-resampled, so the
    # validation folds contain synthetic minority rows and this F1 is likely
    # optimistic compared with the untouched test split — confirm.
    return cross_val_score(clf, x_train_sampling, y_train_sampling,
           n_jobs=-1, cv=5,scoring='f1').mean()

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is reproducible.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=25)

In [None]:
# Best trial of the random-forest study: its score and the parameters behind it.
trial=study.best_trial
# Single-objective study, so print the scalar `.value` (as the LightGBM
# cell does) rather than the one-element `.values` list.
print(trial.value)
print(trial.params)

In [None]:
# Build the final forest from the study's best parameters instead of numbers
# copied by hand from an earlier run, so a fresh run stays consistent with
# the search it just performed.
best_rf_params=study.best_trial.params
clf=RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
                           # int() also covers a depth that was sampled as a float
                           max_depth=int(best_rf_params['max_depth']),
                           class_weight='balanced',
                           random_state=42)
clf.fit(x_train_sampling,y_train_sampling)

In [None]:
pred=clf.predict(x_test_sam)
print(accuracy_score(y_test_sam,pred))

In [None]:
print(classification_report(y_test_sam,pred))

In [None]:
# Binary F1 of the positive class (Response == 1). average='micro' on a
# binary target is identical to plain accuracy, which made the number printed
# under this label misleading on an imbalanced test set.
print("F1 Score with oversampling : ", f1_score(y_test_sam,pred))

In [None]:
plt.figure(figsize=(20,10))
# Probability of the positive class for every test row
y_score=clf.predict_proba(x_test_sam)[:,1]

fpr,tpr,_=roc_curve(y_test_sam,y_score)

# Title and labels corrected: this is the insurance cross-sell task (not
# credit-card fraud), and the false positive rate is not precision.
plt.title('Random Forest ROC curve: insurance cross-sell response')
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR / recall)')

plt.plot(fpr,tpr)
# Chance diagonal, with explicit x and y so it always spans (0,0)-(1,1)
plt.plot((0,1),(0,1), ls='dashed',color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))


**LGBM CLASSIFIER**

In [None]:
import lightgbm as lgb

In [None]:
def objective_lgbm(trial):
    """Optuna objective: mean 5-fold F1 of a LightGBM classifier on the resampled training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the LightGBM hyper-parameters. The parameter
        names equal the ``LGBMClassifier`` keyword names.

    Returns
    -------
    float
        Mean cross-validated F1 score; the study maximises it.
    """
    # suggest_int/suggest_float(log=True) replace the deprecated
    # suggest_loguniform; ranges and parameter names are unchanged.
    n_estimators = trial.suggest_int('n_estimators', 2, 300)
    max_depth = trial.suggest_int('max_depth', 2, 50, log=True)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log=True)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1, log=True)
    num_leaves = trial.suggest_int('num_leaves', 10, 300)
    reg_alpha = trial.suggest_float('reg_alpha', 0.1, 1, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 0.1, 1, log=True)
    min_split_gain = trial.suggest_float('min_split_gain', 0.1, 1, log=True)
    # NOTE(review): LightGBM documents that bagging (subsample) only takes
    # effect when subsample_freq is non-zero; it is left at its default here,
    # so this parameter may currently have no effect — confirm.
    subsample = trial.suggest_float('subsample', 0.1, 1, log=True)

    # Seeded so two trials with the same parameters score the same
    clf = lgb.LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth,
                            learning_rate=learning_rate,colsample_bytree=colsample_bytree,
                            num_leaves=num_leaves,reg_alpha=reg_alpha,reg_lambda=reg_lambda,
                            min_split_gain=min_split_gain,subsample=subsample,
                            random_state=42)

    # NOTE(review): x_train_sampling is already SMOTE-resampled, so the
    # validation folds contain synthetic minority rows and this F1 is likely
    # optimistic compared with the untouched test split — confirm.
    return cross_val_score(clf, x_train_sampling, y_train_sampling,
           n_jobs=-1, cv=5,scoring='f1').mean()


In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is reproducible.
study_lgbm= optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
study_lgbm.optimize(objective_lgbm, n_trials=25)

In [None]:
trial_lgbm= study_lgbm.best_trial
print(trial_lgbm.value)
print(trial_lgbm.params)

In [None]:
# Build the final model from the study's best parameters instead of values
# pasted from an earlier run. The trial's parameter names match the
# LGBMClassifier keyword names, so they can be unpacked directly.
best_lgbm_params=dict(study_lgbm.best_trial.params)
# int() also covers a depth that was sampled as a float
best_lgbm_params['max_depth']=int(best_lgbm_params['max_depth'])
model_lgbm=lgb.LGBMClassifier(**best_lgbm_params,class_weight='balanced',random_state=42)

In [None]:
model_lgbm.fit(x_train_sampling,y_train_sampling)

In [None]:
pred_lgbm=model_lgbm.predict(x_test_sam)
print(accuracy_score(y_test_sam,pred_lgbm))

In [None]:
print(classification_report(y_test_sam,pred_lgbm))

In [None]:
# Binary F1 of the positive class (Response == 1). average='micro' on a
# binary target is identical to plain accuracy, which made the number printed
# under this label misleading on an imbalanced test set.
print("F1 Score : ", f1_score(y_test_sam,pred_lgbm))

In [None]:
plt.figure(figsize=(20,10))
# Probability of the positive class for every test row
y_score2=model_lgbm.predict_proba(x_test_sam)[:,1]

fpr2,tpr2,_=roc_curve(y_test_sam,y_score2)

# Title and labels corrected: this curve belongs to the LightGBM model on the
# insurance cross-sell task, and the false positive rate is not precision.
plt.title('LightGBM ROC curve: insurance cross-sell response')
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR / recall)')

plt.plot(fpr2,tpr2)
# Chance diagonal, with explicit x and y so it always spans (0,0)-(1,1)
plt.plot((0,1),(0,1), ls='dashed',color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr2,tpr2))


In [None]:
roc_auc_score(y_test_sam,y_score2)