In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

## id 	Unique ID for the customer
## Gender 	Gender of the customer
## Age 	Age of the customer
## Driving_License 	0 : Customer does not have DL, 1 : Customer already has DL
## Region_Code 	Unique code for the region of the customer
## Previously_Insured 	1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance
## Vehicle_Age 	Age of the Vehicle
## Vehicle_Damage 	1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past.
## Annual_Premium 	The amount customer needs to pay as premium in the year
## Policy_Sales_Channel 	Anonymized code for the channel used to reach the customer, i.e. different agents, over mail, over phone, in person, etc.
## Vintage 	Number of Days, Customer has been associated with the company
## Response 	1 : Customer is interested, 0 : Customer is not interested

In [None]:
# Read the train and test CSV files into the `train` and `test` DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/health-insurance-cross-sell-prediction/train.csv',
        '../input/health-insurance-cross-sell-prediction/test.csv',
    )
)

In [None]:
# Print the dtype and non-null count of every column in the training frame.
train.info()
## Great: not a single null value

In [None]:
# Remove the row identifier and the sales-channel code from both frames,
# modifying each frame in place.
unused_columns = ['id', 'Policy_Sales_Channel']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

### Work With Gender: 

In [None]:
# Number of training rows for each distinct Gender value.
train['Gender'].value_counts()

In [None]:
# Box plot of the training frame's columns on a 20x12 inch figure.
train.plot.box(figsize=(20, 12))

## Remove Outliers:

In [None]:
# Cast Annual_Premium to an integer dtype in both frames.
for frame in (train, test):
    frame['Annual_Premium'] = frame['Annual_Premium'].astype('int')

In [None]:
# Print the Annual_Premium summary statistics for train, a separator line,
# then the same statistics for test (one item per line).
print(
    train['Annual_Premium'].describe(),
    '-----------------TEST------------------',
    test['Annual_Premium'].describe(),
    sep='\n',
)


In [None]:
# Keep only the rows whose Annual_Premium lies strictly inside a fixed window.
# NOTE(review): the two windows differ (the test lower bound is 2630 while the
# train lower bound is 24405) — confirm that this asymmetry is intentional.
train_premium = train['Annual_Premium']
test_premium = test['Annual_Premium']
train = train[(train_premium > 24405) & (train_premium < 39400)]
test = test[(test_premium > 2630) & (test_premium < 39408)]

In [None]:
# Bar chart of the number of training rows per Gender value.  The column is
# given by keyword (x=..., data=...), matching the other countplot calls in this
# notebook; seaborn 0.12+ only accepts `data` as a positional argument.
sns.countplot(x='Gender', data=train)

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the Age column.
train['Age'].describe()

# Remove Customers Who Do Not Have a Driving License

In [None]:
# Keep only the rows whose Driving_License flag is not 0.  Explicit copies are
# taken because later cells assign columns on train1/test1; assigning into a
# filtered slice triggers pandas' SettingWithCopyWarning.
train1 = train[train['Driving_License'] != 0].copy()
test1 = test[test['Driving_License'] != 0].copy()

In [None]:
# We have 53 regions
# Cast Region_Code to an integer dtype first, then leave the distinct-region
# count as the cell's last expression so that the notebook displays it
# (a value computed on an earlier line of a cell is silently discarded).
train1['Region_Code'] = train1['Region_Code'].astype('int')
train1['Region_Code'].nunique()

In [None]:
# Number of rows per Region_Code, with one bar colour per Gender.
fig, ax = plt.subplots(figsize=(20, 12))
sns.countplot(data=train1, x='Region_Code', hue='Gender', ax=ax)

In [None]:
# Number of rows per Previously_Insured value, with one bar colour per Gender.
fig, ax = plt.subplots(figsize=(20, 12))
sns.countplot(data=train1, x='Previously_Insured', hue='Gender', ax=ax)

In [None]:
# Cross-table with one row per Gender and one column per Region_Code;
# aggfunc='size' makes each cell the number of rows in that group.
how_g_r_p = train1.pivot_table(index='Gender', columns='Region_Code', values='Previously_Insured', aggfunc='size')

In [None]:
# Bar chart of the Gender x Region_Code counts on a 20x20 inch figure.
how_g_r_p.plot.bar(figsize=(20, 20))

In [None]:
# Number of rows for each distinct Vehicle_Age label.
train1['Vehicle_Age'].value_counts()

In [None]:
def age_car_categori(x):
    """Encode a Vehicle_Age label as an ordinal integer.

    Parameters
    ----------
    x : str
        Raw Vehicle_Age label.

    Returns
    -------
    int
        0 for '< 1 Year', 1 for '1-2 Year', and 2 for every other value.
    """
    # Compare for equality.  The earlier substring test (`x in '< 1 Year'`)
    # also matched fragments such as '', '1' or 'Year' and encoded them as 0.
    if x == '< 1 Year':
        return 0
    if x == '1-2 Year':
        return 1
    return 2
# Replace the Vehicle_Age labels by their integer codes in both frames.
for frame in (train1, test1):
    frame['Vehicle_Age'] = frame['Vehicle_Age'].apply(age_car_categori)

In [None]:
# Count rows per (Vehicle_Age, Gender) pair and draw them as grouped bars.
vehicle_age_by_gender = train1.pivot_table(
    index='Vehicle_Age',
    columns='Gender',
    values='Previously_Insured',
    aggfunc='size',
)
vehicle_age_by_gender.plot.bar()

In [None]:
# Count rows per (Vehicle_Age, Vehicle_Damage, Previously_Insured) combination
# and draw the counts as yellow bars.
group_sizes = train1.groupby(['Vehicle_Age', 'Vehicle_Damage', 'Previously_Insured']).size()
group_sizes.plot.bar(color='y')

In [None]:
# Summary statistics of the Vintage column.
train1['Vintage'].describe()

In [None]:
def convert_to_month(x):
    """Convert a duration in days to months, counting 30 days per month.

    The result is the true quotient (not rounded or truncated).
    """
    days_per_month = 30
    return x / days_per_month
# Express Vintage in months (30 days each) and cast the result to an integer
# dtype, for both frames.
for frame in (train1, test1):
    frame['Vintage'] = frame['Vintage'].apply(convert_to_month).astype('int')

In [None]:
# Histogram of Annual_Premium using 200 bins.
train1['Annual_Premium'].plot.hist(bins=200)

In [None]:
# Age summary statistics, taken from `train` rather than `train1`.
train['Age'].describe()

In [None]:
# Bucket Age into three labelled bands in both frames.  With pd.cut's defaults
# the intervals are open on the left and closed on the right:
# (19, 37], (37, 55], (55, 86].
bins = [19, 37, 55, 86]
age_labels = ['young', 'Middle age', 'senior']
for frame in (train1, test1):
    frame['Age'] = pd.cut(frame['Age'], bins, labels=age_labels)

In [None]:
# One-hot encode each frame independently with pandas' default settings.
train2, test2 = (pd.get_dummies(frame) for frame in (train1, test1))

In [None]:
# Preview the first rows of the encoded frame instead of displaying all of it.
train2.head()

In [None]:
# Remove the same four columns from the encoded train and test frames.
redundant_columns = ['Driving_License', 'Region_Code', 'Gender_Female', 'Vehicle_Damage_No']
train3 = train2.drop(columns=redundant_columns)
test3 = test2.drop(columns=redundant_columns)

In [None]:
# pd.get_dummies(...) builds one indicator column per distinct value, and [0]
# selects the indicator for the value 0.  After this cell the column therefore
# flags rows whose original Previously_Insured was 0 (assumes the column holds
# the integers 0/1 — TODO confirm).
# NOTE(review): this inverts the meaning suggested by the column name, and the
# [0] lookup raises KeyError if a frame contains no 0 value — confirm intent.
train3['Previously_Insured'] = pd.get_dummies(train3['Previously_Insured'])[0]
test3['Previously_Insured'] = pd.get_dummies(test3['Previously_Insured'])[0]

In [None]:
# Box plot of the prepared training frame's columns on a 20x12 inch figure.
train3.plot.box(figsize=(20, 12))

## Balance The Data

In [None]:
# Undersample the larger Response class so both classes end up with the same
# number of rows, then stack them into `main_train`.
train3_res_0 = train3[train3['Response'] == 0]
train3_res_1 = train3[train3['Response'] == 1]
# Count each class from its own subset.  Unpacking value_counts() binds the
# names in frequency order, which is only correct while class 0 is the majority.
count_class_0, count_class_1 = len(train3_res_0), len(train3_res_1)
print('befor balance data:')
print(train3_res_0.shape, train3_res_1.shape)
rows_per_class = min(count_class_0, count_class_1)
# A fixed seed keeps the sampled subset identical from run to run.
if count_class_0 > rows_per_class:
    train3_res_0 = train3_res_0.sample(rows_per_class, random_state=42)
if count_class_1 > rows_per_class:
    train3_res_1 = train3_res_1.sample(rows_per_class, random_state=42)
print('after balance data:')
print(train3_res_0.shape, train3_res_1.shape)
main_train = pd.concat([train3_res_1, train3_res_0], ignore_index=True)

In [None]:
# (rows, columns) of the prepared test frame and of the balanced training frame.
test3.shape, main_train.shape

In [None]:
# NOTE(review): this repeats the preceding cell's shape check unchanged.
test3.shape, main_train.shape

In [None]:
# Shuffle whole rows so that features and labels are reordered together.
# The previous call, np.random.shuffle(y.values), referenced the undefined name
# `y`; shuffling only the label values would also have detached every label
# from its feature row.  The seed makes the order reproducible.
main_train = main_train.sample(frac=1, random_state=42).reset_index(drop=True)
# Split the target off; `main_train` (and therefore `x_train`) keeps only the
# feature columns afterwards.
y_train = main_train['Response']
main_train.drop('Response', axis=1, inplace=True)
x_train = main_train
# NOTE(review): 43104 is a hard-coded row limit; any test rows beyond it are
# never predicted — confirm why the test frame is truncated here.
x_test = test3[:43104]

## RandomForest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 1000 trees of depth <= 5.  random_state fixes the forest's internal
# randomness so the fitted model and the printed score are reproducible.
rn = RandomForestClassifier(max_depth=5, n_estimators=1000, random_state=42)
rn.fit(x_train, y_train)
# This is accuracy on the rows the model was fitted on, not on held-out data.
print(rn.score(x_train, y_train))

## XGBoost:

In [None]:
from xgboost import XGBClassifier
# subsample=0.7 draws a random 70% of the rows for each tree, so random_state
# is fixed to make the fitted model and its predictions reproducible.
model = XGBClassifier(max_depth=8, eta=0.6, subsample=0.7, random_state=42)
model.fit(x_train, y_train)
# This is accuracy on the rows the model was fitted on, not on held-out data.
print(model.score(x_train, y_train))
# Predicted classes for the test feature rows.
y_predict = model.predict(x_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

## Accuracy is lower when the two classes are not the same size,
## so we balanced the data

In [None]:
# Score the model on the rows that y_train belongs to.  y_predict holds the
# predictions for x_test, which are different rows, so pairing it with y_train
# produced a meaningless report.  Note that this remains a training-set report.
print(classification_report(y_train, model.predict(x_train)))