In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

**Importing the training data and exploring it**

In [None]:
# Read the training split of the competition data into a DataFrame.
train_csv = '/kaggle/input/health-insurance-cross-sell-prediction/train.csv'
df = pd.read_csv(train_csv)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# (number of rows, number of columns) of the training frame.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Number of missing values in every column.
df.isna().sum()

**Finding Correlation**

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# At this point `df` still holds text columns (e.g. Gender); pandas >= 2.0 raises on
# them instead of silently skipping them, so restrict to numeric/bool columns first.
# On older pandas this yields the same matrix that `df.corr()` produced.
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, square=True,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap='coolwarm')

In [None]:
# Display the correlation matrix as a table (the numbers behind the heatmap).
corr

**Visualizing Data to find patterns**

In [None]:
# Distribution of the annual premium: density-normalised histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement and draws the same kind of figure.
sns.histplot(x=df['Annual_Premium'], bins=50, kde=True, stat='density')

In [None]:
# Horizontal box plot of the annual premium to expose outliers.
# Pass the Series as `x=` explicitly: a bare positional argument is interpreted as
# `x` by older seaborn but as `data` by seaborn >= 0.12, which changes the plot.
sns.boxplot(x=df['Annual_Premium'])

In [None]:
# Plain matplotlib histogram of the annual premium (raw counts, 50 bins).
fig, ax = plt.subplots()
ax.hist(df['Annual_Premium'], bins=50)
plt.show()

In [None]:
# Distribution of the sales-channel codes: density histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` replaces it.
sns.histplot(x=df['Policy_Sales_Channel'], bins=50, kde=True, stat='density')

In [None]:
# Number of rows per value of Previously_Insured.
# Name the column via `data=`/`x=`: a bare positional Series is read as `x` by older
# seaborn but as `data` by seaborn >= 0.12, so the keyword form is the portable one.
sns.countplot(data=df, x='Previously_Insured')

In [None]:
# Number of rows per value of Driving_License.
# Name the column via `data=`/`x=`: a bare positional Series is read as `x` by older
# seaborn but as `data` by seaborn >= 0.12, so the keyword form is the portable one.
sns.countplot(data=df, x='Driving_License')

In [None]:
# Distribution of customer age: density histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` replaces it.
sns.histplot(x=df['Age'], bins=50, kde=True, stat='density')

In [None]:
# Age distribution of men vs. women, overlaid on one axis.
# One `histplot` call with `hue` replaces the two deprecated `distplot` calls and
# draws the legend itself (the labels passed through `kde_kws` were never shown on
# current seaborn because no legend was requested).
# NOTE: must run before Gender is label-encoded; it relies on the text labels.
plt.figure(figsize=(15, 6))
sns.histplot(
    data=df, x='Age', hue='Gender',
    hue_order=['Male', 'Female'],
    palette={'Male': 'b', 'Female': 'g'},
    bins=50, stat='density',
    common_norm=False,  # normalise each group on its own, as separate distplots did
    kde=True, line_kws={'lw': 1},
)
plt.title('Age distribution by Gender', fontsize=15)
plt.show()

In [None]:
# Annual-premium distribution of men vs. women, overlaid on one axis.
# One `histplot` call with `hue` replaces the two deprecated `distplot` calls and
# draws the legend itself (the labels passed through `kde_kws` were never shown on
# current seaborn because no legend was requested).
# NOTE: must run before Gender is label-encoded; it relies on the text labels.
plt.figure(figsize=(15, 6))
sns.histplot(
    data=df, x='Annual_Premium', hue='Gender',
    hue_order=['Male', 'Female'],
    palette={'Male': 'b', 'Female': 'r'},
    bins=50, stat='density',
    common_norm=False,  # normalise each group on its own, as separate distplots did
    kde=True, line_kws={'lw': 1},
)
plt.title('Annual Premium distribution by Gender', fontsize=15)
plt.show()

In [None]:
# Age distribution of customers without vs. with a driving licence.
# One `histplot` call with `hue` replaces the two deprecated `distplot` calls and
# draws the legend itself (the labels passed through `kde_kws` were never shown on
# current seaborn because no legend was requested).
license_status = (
    df['Driving_License']
    .map({0: 'Not Licensed', 1: 'Licensed'})  # readable legend entries for 0 / 1
    .rename('Driving License')
)
plt.figure(figsize=(15, 6))
sns.histplot(
    data=df, x='Age', hue=license_status,
    hue_order=['Not Licensed', 'Licensed'],
    palette={'Not Licensed': 'b', 'Licensed': 'r'},
    bins=50, stat='density',
    common_norm=False,  # normalise each group on its own, as separate distplots did
    kde=True, line_kws={'lw': 1},
)
plt.title('Age distribution by Driving License', fontsize=15)
plt.show()

In [None]:
# Annual-premium distribution of customers without vs. with a driving licence.
# One `histplot` call with `hue` replaces the two deprecated `distplot` calls and
# draws the legend itself (the labels passed through `kde_kws` were never shown on
# current seaborn because no legend was requested).
license_status = (
    df['Driving_License']
    .map({0: 'Not Licensed', 1: 'Licensed'})  # readable legend entries for 0 / 1
    .rename('Driving License')
)
plt.figure(figsize=(15, 6))
sns.histplot(
    data=df, x='Annual_Premium', hue=license_status,
    hue_order=['Not Licensed', 'Licensed'],
    palette={'Not Licensed': 'b', 'Licensed': 'r'},
    bins=50, stat='density',
    common_norm=False,  # normalise each group on its own, as separate distplots did
    kde=True, line_kws={'lw': 1},
)
plt.title('Annual_Premium distribution by Driving License', fontsize=15)
plt.show()

**Importing different models and finding the best one**

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each text-valued column with integer codes, in place.
# A single encoder object is re-fitted per column, so after the loop `le` only
# remembers the classes of the last column processed.
le = LabelEncoder()
for column in ('Gender', 'Vehicle_Damage', 'Vehicle_Age'):
    df[column] = le.fit_transform(df[column])
df.head(1)

In [None]:
# Target is the Response column; every other column is used as a feature.
# NOTE(review): this keeps `id` among the features - confirm that is intended.
y = df['Response']
X = df.drop('Response', axis=1)

In [None]:
# Logistic regression baseline, scored by ROC AUC on a stratified hold-out set.
# Two fixes over scoring `predict(X)` on the training rows:
#   * ROC AUC needs a continuous score, so use the positive-class probability -
#     hard 0/1 labels collapse the ROC curve to a single operating point.
#   * Scoring on rows the model was fitted on overstates performance.
# The fixed seed makes the split identical wherever it is repeated, so model scores
# are comparable.
from sklearn.model_selection import train_test_split

X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

lr = LogisticRegression()
lr.fit(X_fit, y_fit)
lr_pred = lr.predict_proba(X_val)[:, 1]  # P(Response == 1) for the hold-out rows
print(roc_auc_score(y_val, lr_pred))

In [None]:
# Random forest, scored by ROC AUC on a stratified hold-out set.
# Two fixes over scoring `predict(X)` on the training rows:
#   * ROC AUC needs a continuous score, so use the positive-class probability -
#     hard 0/1 labels collapse the ROC curve to a single operating point.
#   * Scoring on rows the model was fitted on overstates performance.
# The fixed seeds make the split and the forest reproducible across runs.
from sklearn.model_selection import train_test_split

X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

rf = RandomForestClassifier(n_estimators=40, random_state=42)
rf.fit(X_fit, y_fit)
rf_pred = rf.predict_proba(X_val)[:, 1]  # P(Response == 1) for the hold-out rows
print(roc_auc_score(y_val, rf_pred))

# `rf` produces the final submission, so refit it on every labelled row once the
# hold-out score has been recorded.
rf.fit(X, y)

In [None]:
# Gaussian naive Bayes baseline, scored by ROC AUC on a stratified hold-out set.
# Two fixes over scoring `predict(X)` on the training rows:
#   * ROC AUC needs a continuous score, so use the positive-class probability -
#     hard 0/1 labels collapse the ROC curve to a single operating point.
#   * Scoring on rows the model was fitted on overstates performance.
# The fixed seed makes the split identical wherever it is repeated, so model scores
# are comparable.
from sklearn.model_selection import train_test_split

X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

gs = GaussianNB()
gs.fit(X_fit, y_fit)
gs_pred = gs.predict_proba(X_val)[:, 1]  # P(Response == 1) for the hold-out rows
print(roc_auc_score(y_val, gs_pred))

In [None]:
# Read the unlabelled test split of the competition data.
test_csv = '/kaggle/input/health-insurance-cross-sell-prediction/test.csv'
test = pd.read_csv(test_csv)

In [None]:
# Encode the test set's text columns with the mapping learned from the TRAINING
# labels. Re-fitting the encoder on the test data only reproduces the training codes
# when both files happen to contain exactly the same set of labels; otherwise the
# model would silently receive codes with a different meaning.
# `df` has already been label-encoded in place, so the raw training labels are
# re-read from disk (only the three columns needed).
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Gender', 'Vehicle_Damage', 'Vehicle_Age']
train_labels = pd.read_csv(
    '/kaggle/input/health-insurance-cross-sell-prediction/train.csv',
    usecols=categorical_cols,
)
for col in categorical_cols:
    if pd.api.types.is_numeric_dtype(test[col]):
        continue  # already encoded, e.g. this cell was run a second time
    # transform() raises ValueError for a label never seen in training rather than
    # assigning it an arbitrary code.
    test[col] = LabelEncoder().fit(train_labels[col]).transform(test[col])
test.head(1)

In [None]:
# Keep the test-set ids; they become the key column of the submission file.
testIDs = test.loc[:, 'id']
testIDs.head()

In [None]:
# Positive-class probability for every test row, from the random forest.
# Column 1 of predict_proba is taken directly by array slicing rather than a Python
# loop over rows, and the test columns are selected by name in the training feature
# order so the model always sees the features in the layout it was fitted on.
Final_preds = rf.predict_proba(test[X.columns])[:, 1]

In [None]:
# Build the two-column submission table (id, predicted Response score), write it
# to CSV without the index, and preview the first rows.
submission = pd.DataFrame({'id': testIDs}).assign(Response=Final_preds)
submission.to_csv('Health_Insurance_v1.csv', index=False)
submission.head()