In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings  
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

# **Read Dataset**

In [None]:
# Locations of the train / test CSV files inside the Kaggle input directory.
TRAIN_CSV = '../input/health-insurance-cross-sell-prediction/train.csv'
TEST_CSV = '../input/health-insurance-cross-sell-prediction/test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Show the training frame as the cell output.
train_data

In [None]:
# Display the test frame loaded above as the cell output.
test_data

In [None]:
# Number of missing values in each training column.
train_data.isna().sum()

In [None]:
# Text summary of the training frame: size, column names, per-column
# missing-value flags and per-column number of distinct values.
row_count, column_count = train_data.shape
print("Rows     : ", row_count)
print("Columns  : ", column_count)
print("\nFeatures : \n", train_data.columns.tolist())
print("\nMissing values :  ", train_data.isnull().any())
print("\nUnique values :  \n", train_data.nunique())

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

In [None]:
# First rows of the training frame.
train_data.head()

In [None]:
# Last rows of the training frame.
train_data.tail()

In [None]:
# First rows again, transposed so that every column is shown as a row.
train_data.head().T

In [None]:
# Summary statistics of the numeric training columns.
train_data.describe()

# **Check For Missing Values**

In [None]:
# Heatmap of the boolean missing-value mask of the training frame.
train_missing = train_data.isnull()
plt.figure(figsize=(12, 6))
sns.heatmap(train_missing)
plt.show()

In [None]:
# The original cell repeated the training-set heatmap verbatim; the test set
# was never checked for missing values, so it is inspected here instead.
plt.figure(figsize=(12, 6))
plt.title('Missing values in test_data')
sns.heatmap(test_data.isnull())
plt.show()

# **EDA**

In [None]:
# Count of rows per Vehicle_Age category.
# The column is passed by keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x='Vehicle_Age', data=train_data)

In [None]:
# Count of rows per Gender category.
# The column is passed by keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x='Gender', data=train_data)

In [None]:
# Count of rows per Driving_License value.
# The column is passed by keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x='Driving_License', data=train_data)

In [None]:
# Count of rows per Previously_Insured value.
# The column is passed by keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x='Previously_Insured', data=train_data)

In [None]:
# Count of rows per Vehicle_Damage category.
# The column is passed by keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x='Vehicle_Damage', data=train_data)

In [None]:
# Distribution of Age: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; this is the replacement recommended by the
# seaborn migration notes (histplot + kde, with the KDE extended by cut=3).
sns.histplot(x=train_data['Age'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Distribution of Region_Code: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; this is the replacement recommended by the
# seaborn migration notes (histplot + kde, with the KDE extended by cut=3).
sns.histplot(x=train_data['Region_Code'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Density of Annual_Premium, one filled curve per Response value; each curve
# is normalised on its own (common_norm=False).
kde_style = dict(fill=True, common_norm=False, palette="crest", alpha=.5, linewidth=0)
sns.kdeplot(data=train_data, x="Annual_Premium", hue="Response", **kde_style)

In [None]:
# Density of Policy_Sales_Channel, one filled curve per Response value; each
# curve is normalised on its own (common_norm=False).
kde_style = dict(fill=True, common_norm=False, palette="crest", alpha=.5, linewidth=0)
sns.kdeplot(data=train_data, x="Policy_Sales_Channel", hue="Response", **kde_style)

In [None]:
# Density of Vintage, one filled curve per Response value; each curve is
# normalised on its own (common_norm=False).
kde_style = dict(fill=True, common_norm=False, palette="crest", alpha=.5, linewidth=0)
sns.kdeplot(data=train_data, x="Vintage", hue="Response", **kde_style)

In [None]:
# Bar plot of Age per Gender, split by Response.
sns.catplot(data=train_data, kind='bar', x='Gender', y='Age', hue='Response')

In [None]:
# Bar plot of Previously_Insured per Driving_License value, split by Gender.
sns.catplot(data=train_data, kind='bar', x='Driving_License', y='Previously_Insured', hue='Gender')

In [None]:
# Bar plot of Annual_Premium per Vehicle_Age category, split by Response.
sns.catplot(data=train_data, kind='bar', x='Vehicle_Age', y='Annual_Premium', hue='Response')

In [None]:
# Box plot of Vintage per Vehicle_Damage category, split by Response.
sns.catplot(data=train_data, kind='box', x='Vehicle_Damage', y='Vintage', hue='Response')

In [None]:
# Line plot of Annual_Premium against Vintage.
sns.lineplot(x='Vintage', y='Annual_Premium', data=train_data, color='goldenrod')

# **Label Encoding Categorical Data**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string-valued columns as integer codes.
#
# Each encoder is fitted on the training column only and then re-used for the
# test column, so a given category always maps to the same integer in both
# frames. Fitting separately on train and test (as before) can silently assign
# different codes to the same category whenever the two frames do not contain
# exactly the same set of values.
#
# Age is left untouched: it is already numeric, and label-encoding it
# independently per frame would shift its values inconsistently if an age
# present in one frame is absent from the other.
CATEGORICAL_COLUMNS = ['Vehicle_Age', 'Vehicle_Damage', 'Gender']

for column in CATEGORICAL_COLUMNS:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    # Raises ValueError on a category never seen in training instead of
    # quietly producing a mismatched code.
    test_data[column] = le.transform(test_data[column])

In [None]:
# Remove the row identifier from both frames; it carries no predictive signal.
# `drop(..., errors='ignore')` keeps the cell idempotent: re-running it after
# the column is already gone no longer raises KeyError as `del frame['id']` did.
test_data = test_data.drop(columns=['id'], errors='ignore')
train_data = train_data.drop(columns=['id'], errors='ignore')

# ***EDA-Dataset***

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns.
plt.figure(figsize=(10, 10))
plt.title("Correlation Plot")
correlations = train_data.corr()
sns.heatmap(
    correlations,
    cmap="YlGnBu",
    annot=True,
    annot_kws={'size': 10},
    square=True,
    linewidth=5,
)

In [None]:
# Lower-triangle correlation heatmap of the training columns.
fig, ax = plt.subplots(figsize=(18, 12))
corr = train_data.corr()
# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The built-in `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax.text(-1.1, -0.7, 'Correlation between the Features', fontsize=20, fontweight='bold', fontfamily='serif')
sns.heatmap(corr, mask=mask, annot=False, fmt='.2f', linewidth=0.2, cbar=True, cmap='coolwarm');

In [None]:
# Pearson, Kendall and Spearman correlation matrices drawn side by side.
fig, ax = plt.subplots(1, 3, figsize=(17 , 5))

corr = train_data.corr()

# Mask the upper triangle (diagonal included); the same mask is shared by all
# three panels because every correlation matrix has the same shape.
# The built-in `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

for idx, method in enumerate(['pearson', 'kendall', 'spearman']):
    sns.heatmap(train_data.corr(method=method), ax=ax[idx],
            square=True, annot=True, fmt='.1f', center=0, linewidth=2,
            cbar=False, cmap=sns.diverging_palette(240, 10, as_cmap=True),
            mask=mask
           )
    ax[idx].set_title(f'{method.capitalize()} Correlation', loc='left', fontweight='bold')

plt.show()

In [None]:
# Bar chart of the correlation between every feature column and the target.
a = train_data.drop(columns=['Response'])
target_correlation = a.corrwith(train_data['Response'])
target_correlation.plot(kind='bar', figsize=(18,11), color=['salmon'])
plt.title('Correlation b/n target and Independant features')
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(size=15)
plt.show()

# **Scatter Plot**

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of all training columns, with a KDE on the diagonal.
# No `plt.figure()` call is made beforehand: scatter_matrix creates its own
# figure from `figsize`, so a figure opened here would only be rendered as an
# extra empty plot.
scatter_matrix(train_data,figsize =(25,25),alpha=0.9,diagonal="kde",marker="o");

# **Histogram For All**

In [None]:

# One 50-bin histogram per training column.
train_data.hist(bins=50, figsize=(25, 25));

# ***EDA WITH SWEETVIZ LIBRARY***

In [None]:
!pip install sweetviz

In [None]:
   # Build a sweetviz report for the training frame and render it inline in
   # the notebook.
   # NOTE(review): every line of this cell carries a stray 3-space indent;
   # IPython appears to tolerate it, but it is not valid as plain Python.
   import sweetviz as sv
   my_report = sv.analyze(train_data)
   my_report.show_notebook(w="100%", h="full")

# ***Installing and Importing library for LightAutoml***

In [None]:
!pip install -U lightautoml

In [None]:
# LightAutoML imports.
# (The original cell contained this import group twice, with the second copy
# fused onto the `import torch` line; the duplicates are removed and every
# imported name is kept.)
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

import torch
from tqdm.notebook import tqdm
from copy import deepcopy

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Run configuration for the AutoML section.
N_THREADS = 4 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 300 # Time in seconds for automl run

# Seed every random number generator in use so that reruns are repeatable.
# torch was previously left unseeded even though it is imported and configured.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
def acc_score(y_true, y_pred, **kwargs):
    """Accuracy of thresholded scores.

    Scores in ``y_pred`` strictly greater than 0.5 are turned into label 1,
    all others into label 0, and the result is compared with ``y_true`` via
    ``accuracy_score``. Extra keyword arguments are forwarded unchanged.
    """
    predicted_labels = (y_pred > 0.5).astype(int)
    return accuracy_score(y_true, predicted_labels, **kwargs)

def f1_metric(y_true, y_pred, **kwargs):
    """F1 score of thresholded scores.

    Scores in ``y_pred`` strictly greater than 0.5 are turned into label 1,
    all others into label 0, and the result is compared with ``y_true`` via
    ``f1_score``. Extra keyword arguments are forwarded unchanged.
    """
    predicted_labels = (y_pred > 0.5).astype(int)
    return f1_score(y_true, predicted_labels, **kwargs)

# Binary classification task scored with the thresholded F1 metric.
task = Task('binary', metric=f1_metric)

# Column roles passed to AutoML: 'Response' is the target.
roles = dict(target='Response')

In [None]:
%%time 
automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'n_jobs': N_THREADS})
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

In [None]:
%%time
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(acc_score(train_data['Response'].values, oof_pred.data[:, 0])))

# ***Saving Submission***

In [None]:
# Load the sample submission file; its 'Response' column is overwritten with
# the model's predictions in the next cell.
submission = pd.read_csv('../input/health-insurance-cross-sell-prediction/sample_submission.csv')

In [None]:
# Threshold the first prediction column at 0.5 to get 0/1 labels, store them
# in the submission frame and write it to disk.
positive_scores = test_pred.data[:, 0]
predicted_labels = (positive_scores > 0.5).astype(int)
submission['Response'] = predicted_labels
submission.to_csv('lightautoml_utilized_300s_f1_metric.csv', index=False)
submission.head()

Done###############################