In [None]:
# Standard library
import random
import time

# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn (enable_iterative_imputer must be imported before IterativeImputer)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# imbalanced-learn: this Pipeline import must stay AFTER sklearn's so that the
# name `Pipeline` refers to the imblearn version, which accepts samplers as steps.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# LightGBM
from lightgbm import LGBMModel,LGBMClassifier

In [None]:
# Read the training data from the working directory and preview the first rows
df = pd.read_csv("train_auto.csv")
df.head()

In [None]:
# (rows, columns) of the freshly loaded training frame
df.shape

In [None]:
# Missing value analysis: percentage of null entries in each column
df.isnull().mean() * 100

In [None]:
#dropping na values
# Removes, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

#### Since at most about 6% of the values in any column are missing, we can drop these rows; while fine-tuning the model we could instead try to impute these values using IterativeImputer.

In [None]:
# Summary statistics of the frame after rows with missing values were dropped
df.describe()

# Data Exploration

In [None]:
# Inspect the dtype of every column
df.dtypes

In [None]:
# Percentage of rows in each TARGET_FLAG class
print((df.TARGET_FLAG.value_counts()/len(df))*100,"%")

#### Approximately 30% of customers have filed a claim at some point. As expected, the dataset is imbalanced.

## Distribution of target amount

In [None]:
# Box plot of the non-zero claim amounts (rows with TARGET_AMT == 0 are excluded).
sns.set(rc={'figure.figsize':(4.5,4.5)})
sns.set_theme(style="whitegrid")
# Keep the Axes in `ax`; chaining .set_title() onto the plotting call would
# bind the title's Text object to `ax` instead of the Axes.
ax = sns.boxplot(x=df[df["TARGET_AMT"]!=0].TARGET_AMT)
ax.set_title('TARGET_AMT distribution');

#### There are a lot of outliers present so we can visualize the same distribution on amounts greater than 20,000.


In [None]:
# Box plot restricted to claims above 20,000 to make the upper tail readable.
sns.set_theme(style="whitegrid")
# Keep the Axes in `ax`; chaining .set_title() onto the plotting call would
# bind the title's Text object to `ax` instead of the Axes.
ax = sns.boxplot(x=df[df["TARGET_AMT"]>20000].TARGET_AMT)
ax.set_title('TARGET_AMT distribution greater than 20,000');

## Overall amount distribution of all claims

In [None]:
# Keep only the rows where a claim was actually paid out
claims = df[df['TARGET_AMT'] != 0]
# Hand-picked bin edges for the count distribution
bins = [0, 1500, 5000, 7500, 10000, 100000]
amount_bands = pd.cut(claims.TARGET_AMT, bins=bins)
# Number of claims falling into each amount band
df1 = claims.groupby(amount_bands).TARGET_AMT.count()
df1.plot(kind='bar', title='Target amount distribution')

#### From the above graph we can see that most insurance claims lie between 1,500 and 5,000, and fewer than 200 people make the highest claims of >10K.

## Cleaning up the income and other monetary columns and plotting income distribution

In [None]:
# Clean the monetary columns: strip "," and "$" and convert the result to integer.
cols= ['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM']
for c in cols:
    # Already converted on a previous run of this cell: skip, so the cell can be
    # re-executed without failing on the missing .str accessor.
    if pd.api.types.is_numeric_dtype(df[c]):
        continue
    # regex=False makes "$" a literal character. Interpreted as a regular
    # expression it is the end-of-string anchor, so the dollar sign would be
    # left in place and the integer conversion below would fail.
    df[c] = df[c].str.replace(',', '', regex=False)
    df[c] = df[c].str.replace('$', '', regex=False)
    df[c]=df[c].astype(int)

In [None]:
# Rows where a claim was filed (TARGET_FLAG != 0); reused by the following plots.
df2=df[df["TARGET_FLAG"]!=0]
# Keep the Axes in `ax`; chaining .set_title() onto the plotting call would
# bind the title's Text object to `ax` instead of the Axes.
ax = sns.boxplot(x=df2['INCOME'])
ax.set_title('Income distribution of individuals making claims');

## Distribution of claim amount wrt the income and age

In [None]:
sns.set(rc={'figure.figsize': (7, 4.5)})

# Claim amount against income; both colour and marker size encode AGE.
income_scatter = sns.scatterplot(
    data=df2,
    x="INCOME",
    y="TARGET_AMT",
    hue="AGE",
    size="AGE",
    sizes=(20, 200),
)
income_scatter.set_title("Distribution of claim amount wrt the income and age")

#### The highest insurance claims come from individuals within the income bracket [0, 150,000], not from the top earners.
#### Their age ranges from 20 to 40. There are some points, e.g. a TARGET_AMT of 80,000 claimed by a 20-year-old, which could be suspicious.


In [None]:
# Claim amount against travel time; both colour and marker size encode CAR_USE.
travel_scatter = sns.scatterplot(
    data=df2,
    x="TRAVTIME",
    y="TARGET_AMT",
    hue="CAR_USE",
    size="CAR_USE",
    sizes=(20, 200),
)
travel_scatter.set_title('Travel time versus target amount')

#### Most claims are made by commercial vehicles as compared to private ones.


In [None]:
# Distribution of CAR_AGE among the rows where a claim was filed.
# Keep the Axes in `ax`; chaining .set_title() onto the plotting call would
# bind the title's Text object to `ax` instead of the Axes.
ax = sns.boxplot(x=df2['CAR_AGE'])
ax.set_title('Car age when claim was made');

#### Cars with a CAR_AGE between 1 and 10 are the most likely to have insurance claims made on them.


In [None]:
# Quick look at the first two rows after the monetary columns were converted
df.head(2)

### Distribution of car type and area wrt claim made

In [None]:
sns.set(rc={'figure.figsize': (24, 24)})

# One count panel per TARGET_FLAG value: bars per CAR_TYPE, coloured by URBANICITY.
g = sns.catplot(
    data=df,
    kind="count",
    x="CAR_TYPE",
    hue="URBANICITY",
    col="TARGET_FLAG",
)
# Rotate the car-type labels so they stay readable
g.set_xticklabels(rotation=90)


##### The ratio of SUVs to minivans is higher when claims are made than when they are not.
##### Significantly more claims are made in urban areas.
##### Most claims in rural areas are made by SUV owners, whereas none are made by van or panel truck owners.


In [None]:
sns.set(rc={'figure.figsize': (6, 6)})
# Previous claim amount against home value, coloured by whether a claim was filed
sns.scatterplot(
    data=df,
    x="HOME_VAL",
    y="OLDCLAIM",
    hue="TARGET_FLAG",
)


A large number of old claims are less than 10000 in value

In [None]:
sns.set_theme(style="white")

# Compute the correlation matrix on the numeric columns only. The frame still
# holds string-valued categorical columns at this point, and recent pandas
# versions raise on them instead of silently skipping them.
corr = df.select_dtypes(include="number").corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a mask for the upper triangle (the matrix is symmetric, so only the
# lower triangle is drawn)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr,  cmap=cmap, vmax=.3, center=0,mask=mask,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

### TARGET_FLAG is negatively correlated with HOME_VAL and INCOME, and positively correlated with claim frequency and MVR points.
### Among the variables, there is a strong correlation of INCOME, HOME_VAL and BLUEBOOK with car age. This could be taken into account while tuning the logistic regression model.

# Classification model

In [None]:
# One-hot encode the categorical columns
categorical_cols = [
    'PARENT1', 'MSTATUS', 'SEX', 'EDUCATION', 'JOB',
    'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY',
]
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# Feature matrix: everything except the two targets and the row identifier.
# errors='ignore' keeps this tolerant of any of the three columns being absent.
X = df.drop(columns=['TARGET_FLAG', 'INDEX', 'TARGET_AMT'], errors='ignore')
# Binary classification target
y = df['TARGET_FLAG']


In [None]:
#defining multiple models for classification
lreg=LogisticRegression(C= 100, penalty= 'l2', solver= 'newton-cg')
sgd=SGDClassifier(max_iter=1000, tol=1e-3)
neigh = KNeighborsClassifier(n_neighbors=3)
gnb=GaussianNB()
Lgb = LGBMClassifier(n_estimators=90, silent=False, random_state =94, max_depth=2,num_leaves=31,objective='binary')
ada=AdaBoostClassifier()
rfc_cv=RandomForestClassifier()

# One seeded splitter shared by all models, so every candidate is scored on
# exactly the same folds. It is loop-invariant, hence built once up front.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
scorings = ["recall", "precision", "accuracy"]

#creating a for loop for testing all models
models=[lreg,rfc_cv,sgd,neigh,gnb,Lgb,ada]
for m in models:
    print(m)
    # SMOTE is a pipeline step, so oversampling happens when the pipeline is
    # fitted on each training split; it is seeded for reproducible results.
    steps = [('over', SMOTE(random_state=0)), ('model', m)]
    pipeline = Pipeline(steps=steps)
    # Fit each fold once and compute all metrics on that same fit, instead of
    # re-running the whole cross-validation once per metric.
    results = cross_validate(pipeline, X, y, scoring=scorings, cv=cv, n_jobs=-1)
    for scoring in scorings:
        scores = results["test_" + scoring]
        print("Model", scoring, " mean=", scores.mean() , "stddev=", scores.std())
        

#### From the above pipeline we can observe that GaussianNB, LGBM and AdaBoost are the top three performers.

Since tuning LGBM raised a runtime error, I will instead tune the hyperparameters of AdaBoost, optimizing for recall in order to correctly classify as many claims as possible.

## Hyperparameter tuning of ADA-Boost

In [None]:
 # define model
# Untuned AdaBoost serves as the baseline
model = AdaBoostClassifier()
# Evaluation procedure: 10-fold stratified CV, repeated 3 times, seeded
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
baseline_scores = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1)
print("baseline", np.mean(baseline_scores))

### Confusion matrix on original ada boost

In [None]:
# Out-of-fold predictions of the plain AdaBoost model (10-fold CV)
y_pred = cross_val_predict(model, X, y, cv=10, n_jobs=-1)
CM = confusion_matrix(y, y_pred)
# CM is indexed [true class][predicted class]; flattening the 2x2 matrix
# row by row yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = CM.ravel()
positives_missed = 100*round(FN/(TP+FN),2)
negatives_missed = 100*round(FP/(TN+FP),2)
print("      Positives: ", positives_missed, "% misclassifed     ", FN, '/',TP+FN)
print("      Negatives: ", negatives_missed, "% misclassifed     ",FP, '/',TN+FP)

### Adding SMOTEENN resampling to the above

In [None]:
# define resampling
# SMOTEENN lives in imblearn.combine and has to be imported before this cell
# runs. It is seeded so that repeated runs resample identically.
resample = SMOTEENN(random_state=1)
# define pipeline: resampling step 'r' followed by the classifier step 'm'
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
# evaluate model with the same repeated stratified CV as the baseline
scores = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
# summarize performance
print('Score: %.3f' % np.mean(scores))

#### We can see that the score has improved, possibly due to proper sampling.

In [None]:
# Out-of-fold predictions of the resampling pipeline (10-fold CV)
y_pred = cross_val_predict(pipeline, X, y, cv=10, n_jobs=-1)
CM = confusion_matrix(y, y_pred)
# CM is indexed [true class][predicted class]; flattening the 2x2 matrix
# row by row yields TN, FP, FN, TP in that order.
TN, FP, FN, TP = CM.ravel()
# `scores` holds the recall values computed for this pipeline in the previous cell
mean_recall = np.round(np.mean(scores),3)
print("AdaBoost",mean_recall)
positives_missed = 100*round(FN/(TP+FN),2)
negatives_missed = 100*round(FP/(TN+FP),2)
print("      Positives: ", positives_missed, "% misclassifed     ", FN, '/',TP+FN)
print("      Negatives: ", negatives_missed, "% misclassifed     ",FP, '/',TN+FP)

### Using grid search CV to estimate the best parameters

In [None]:
# Stratified folds preserve the class ratio in every split, consistent with the
# stratified CV used elsewhere in this notebook on the imbalanced target.
crossvalidation=StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
#setting up the grid search parameters; the 'm__' prefix addresses the
#classifier step named 'm' inside the pipeline
search_grid={'m__n_estimators':[200,500,1000,2000],
             'm__learning_rate':[.001,.01,.1,.2]}
#applying grid search on the pipeline; n_jobs=-1 because the grid means
#16 parameter combinations x 10 folds of fitting
search=GridSearchCV(pipeline,param_grid=search_grid,scoring='recall',n_jobs=-1,cv=crossvalidation)
search.fit(X,y)
search.best_params_

### Running the model with updated parameters

In [None]:
# AdaBoost with the chosen hyperparameters, wrapped in the same resampling pipeline
model = AdaBoostClassifier(learning_rate=0.01,n_estimators=500)
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
# Recompute the recall scores for THIS pipeline. Without this, `scores` would
# still hold the values of the untuned pipeline evaluated earlier, and the
# report below would show the old number next to the new confusion matrix.
scores = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
y_pred = cross_val_predict(pipeline, X, y,  cv=10, n_jobs=-1)
CM = confusion_matrix(y, y_pred)
# CM is indexed [true class][predicted class]
TN = CM[0][0] 
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
print("AdaBoost",np.round(np.mean(scores),3))
print("      Positives: ", 100*round(FN/(TP+FN),2), "% misclassifed     ", FN, '/',TP+FN)
print("      Negatives: ", 100*round(FP/(TN+FP),2), "% misclassifed     ",FP, '/',TN+FP)

### We can see that the performance of the model has improved from the baseline. However, it is not significantly higher than that obtained after re-sampling.

### Getting feature importance

In [None]:
# Names of the feature columns, in the order the model sees them
features = X.columns
# Fit the tuned AdaBoost on X, y directly (no resampling step is applied here)
model = AdaBoostClassifier(learning_rate=0.01, n_estimators=500)
model.fit(X, y)
# Per-feature importance as exposed by the fitted estimator
importances = model.feature_importances_
# Ascending order, so the most important feature becomes the last (top) bar
indices = np.argsort(importances)

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8, 20)

bar_positions = range(len(indices))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

### OLDCLAIM, HOME_VAL and URBANICITY (urban/rural) are the most important features for classification.

# Storing model to pickle and using it to make predictions

In [None]:

import pickle

# Serialize the fitted model into a file in the current working directory
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    file.write(pickle.dumps(model))



In [None]:
# Load from file
# NOTE: unpickling can execute arbitrary code; only load pickle files that were
# produced by this notebook or come from an equally trusted source.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    

In [None]:
#loading Xtest csv and performing basic cleaning operations before making predictions
Xtest=pd.read_csv("test_auto.csv")
#dropping target columns
Xtest=Xtest[Xtest.columns[~Xtest.columns.isin(['TARGET_FLAG','INDEX','TARGET_AMT'])]]
#dropping na values
Xtest.dropna(inplace=True)
#converting monetary columns into int after cleaning
cols= ['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM']
for c in cols:
    # regex=False makes "$" a literal character. Interpreted as a regular
    # expression it is the end-of-string anchor, so the dollar sign would be
    # left in place and the integer conversion below would fail.
    Xtest[c] = Xtest[c].str.replace(',', '', regex=False)
    Xtest[c] = Xtest[c].str.replace('$', '', regex=False)
    Xtest[c]=Xtest[c].astype(int)
#one-hot encoding the categorical columns
Xtest=pd.get_dummies(Xtest, columns = ['PARENT1','MSTATUS','SEX','EDUCATION','JOB','CAR_USE','CAR_TYPE','RED_CAR','REVOKED','URBANICITY'])
# Align with the training feature matrix X: same columns, same order. A category
# level that does not occur in the test file gets an all-zero indicator column,
# and any column the model was never trained on is dropped.
Xtest=Xtest.reindex(columns=X.columns, fill_value=0)

Xtest.shape

In [None]:
#converting predictions into dataframe
# Drop a prediction column left over from an earlier run so this cell can be
# re-executed; the model must only see the feature columns.
test_features = Xtest.drop(columns=['TARGET_FLAGS'], errors='ignore')
predictions = pickle_model.predict(test_features)
# Give the predictions Xtest's index. Xtest keeps its original row labels after
# dropna, so a frame with a fresh 0..n-1 index would not line up with it.
Ypredict=pd.DataFrame(predictions, index=Xtest.index)
# Assign the raw array (positional) rather than a frame (index-aligned)
Xtest['TARGET_FLAGS'] = predictions

In [None]:
# Give the single prediction column a descriptive name (renamed in place)
Ypredict.rename(columns={ Ypredict.columns[0]: "TARGET_PREDICTIONS" }, inplace = True)

In [None]:
# Write the predictions next to the notebook, like every other file used here
# (train_auto.csv, test_auto.csv, pickle_model.pkl), instead of to an absolute
# path inside one user's home directory that does not exist on other machines.
Ypredict.to_csv('TARGET_PREDICTIONS.csv', index = False)


# Work in progress

### So far the predictions were made after dropping the rows with missing values; however, we can impute them instead by using the Bayesian imputer and the categorical imputer from scikit-learn.

### Model tuning can also be done on other better performing models