In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import featuretools as ft

In [None]:
# ignore warnings from pandas
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Both competition files live in the same Kaggle input directory.
DATA_DIR = '/kaggle/input/jobathon-analytics-vidhya'

train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [None]:
pip install utils

In [None]:
import utils
import featuretools as ft

# Print the installed featuretools release: the EntitySet calls below use
# the pre-1.0 API (entity_from_dataframe, variable_types).
print(f'Feature tool version {ft.__version__}')

In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Print a summary of the training frame (dtypes and non-null counts per column).
train.info()

In [None]:
import featuretools.variable_types as vtypes

# Short aliases for the three featuretools variable types used below.
_CAT, _ORD, _BOOL = vtypes.Categorical, vtypes.Ordinal, vtypes.Boolean

# Explicit featuretools variable type for every column of the training
# frame, listed in the same order as before.
variable_vtypes = dict([
    ('ID', _ORD),
    ('City_Code', _CAT),
    ('Region_Code', _CAT),
    ('Accomodation_Type', _CAT),
    ('Reco_Insurance_Type', _CAT),
    ('Upper_Age', _ORD),
    ('Lower_Age', _ORD),
    ('Is_Spouse', _CAT),
    ('Health Indicator', _CAT),
    ('Holding_Policy_Duration', _ORD),
    ('Holding_Policy_Type', _CAT),
    ('Reco_Policy_Cat', _CAT),
    ('Reco_Policy_Premium', _ORD),
    ('Response', _BOOL),
])

In [None]:
# Start from an empty entity set; entities are attached in the next cells.
es = ft.EntitySet(id='HealthInsurance')


In [None]:
# Register the training frame as the 'HealthInsurance' entity.
#
# The entity is indexed on the real row identifier 'ID'. The previous index
# name ('Health_Policy_Type') is not a column of `train`, so featuretools
# fabricated a new integer column under that name (the warning about it is
# hidden by the global warnings filter) and 'ID' leaked into the feature
# matrix as an Ordinal feature.
entity_variable_types = dict(variable_vtypes)
entity_variable_types['ID'] = vtypes.Index  # the index column must be typed as Index

es = es.entity_from_dataframe(
                              entity_id = 'HealthInsurance',
                              dataframe = train,
                              index = 'ID',
                              variable_types = entity_variable_types)

# Display the entity summary as the cell output.
es['HealthInsurance']

In [None]:
# Build a new entity from the existing one using normalize_entity: 'Reco' is
# keyed by 'Reco_Policy_Cat' and takes the listed columns with it.
# NOTE(review): 'Response' (the label) is among the moved columns, so it
# presumably reappears in the feature matrix as 'Reco.Response' - confirm
# that this is intended and not target leakage.
reco_variables = [
    'City_Code',
    'Region_Code',
    'Accomodation_Type',
    'Reco_Insurance_Type',
    'Upper_Age',
    'Lower_Age',
    'Is_Spouse',
    'Health Indicator',
    'Reco_Policy_Premium',
    'Response',
]

es.normalize_entity(
    base_entity_id='HealthInsurance',
    new_entity_id='Reco',
    index='Reco_Policy_Cat',
    additional_variables=reco_variables,
)



In [None]:
# Draw a diagram of the entity set (its entities and the relationship between them).
es.plot()

In [None]:
# Run Deep Feature Synthesis on the entity set built above, producing one
# feature-matrix row per 'HealthInsurance' instance.
dfs_settings = dict(
    entityset=es,
    target_entity='HealthInsurance',
    agg_primitives=['count', 'percent_true'],
    max_depth=3,
    approximate='6h',
    verbose=True,
)
fm, features = ft.dfs(**dfs_settings)

In [None]:
# Keep every generated column; `cols` is the explicit list of feature names.
cols = list(fm.columns)
fm = fm[cols]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Work on a copy so the feature matrix itself is never modified downstream.
X = fm.copy()

# Read the target without removing it from `train`. Popping the column made
# this cell fail with a KeyError on any re-run and left `train` in a
# different state from the one displayed earlier.
# Assumes the rows of `fm` are in the same order as the rows of `train`,
# because the split below pairs X and label by position - TODO confirm.
label = train['Response'].copy()

In [None]:
# Replace missing holding-policy values with an explicit placeholder string
# per column.
missing_placeholders = (
    ('Holding_Policy_Duration', str(0.0)),
    ('Holding_Policy_Type', '0'),
)
for column, placeholder in missing_placeholders:
    X[column] = X[column].fillna(placeholder)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer codes before model fitting.
ENCODED_COLUMNS = [
    'Holding_Policy_Duration',
    'Reco.City_Code',
    'Reco.Accomodation_Type',
    'Reco.Reco_Insurance_Type',
    'Reco.Health Indicator',
    'Reco.Is_Spouse',
]

# Fit one encoder per column and keep it. Refitting a single shared encoder
# discarded every mapping except the last, so codes could not be inverted or
# applied to new data (e.g. the test set).
label_encoders = {}
for column in ENCODED_COLUMNS:
    label_encoders[column] = LabelEncoder()
    X[column] = label_encoders[column].fit_transform(X[column])

# Backward compatibility: `le` still ends up as the encoder fitted on the
# last column, exactly as before.
le = label_encoders[ENCODED_COLUMNS[-1]]

In [None]:
# Hold out the last 30% of rows for evaluation; shuffle=False keeps the
# original row order, so the split is deterministic.
holdout_split = train_test_split(
    X,
    label,
    test_size=0.3,
    shuffle=False,
)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Fit a 150-tree random forest. The fixed random_state makes the AUC and the
# feature importances below reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators = 150, random_state = 42)
clf.fit(X_train, y_train)

# Class-probability estimates for the hold-out rows (one column per class).
probs = clf.predict_proba(X_test)

In [None]:
# Score the hold-out set using the probability column at index 1.
auc = roc_auc_score(y_test, probs[:, 1])
print(f'AUC score of {auc:.3f}')

# The ROC AUC increased slightly.

In [None]:
# Display the feature names the classifier was trained on.
X_train.columns

In [None]:
# Pair every importance with its column name and rank them, largest first.
feature_imps = sorted(zip(clf.feature_importances_, X.columns), reverse=True)

print('Random Forest Feature Importances:')
# Show the top eight, each scaled relative to the most important feature.
for rank, (importance, column) in enumerate(feature_imps[:8], start=1):
    print('{}: {} [{:.3f}]'.format(rank, column, importance / feature_imps[0][0]))

In [None]:
pip install bokeh

In [None]:
import utils

In [None]:
# Build the evaluation plots from the hold-out labels and predicted probabilities.
# NOTE(review): p1-p3 are only assigned here and never displayed in this
# notebook; presumably they are bokeh figures meant to be shown - confirm.
# NOTE(review): the meaning of the numeric arguments (1000, 300) depends on
# the utils helpers, which are not visible here - confirm.
p1 = utils.plot_roc_auc(y_test, probs)
p2 = utils.plot_f1(y_test, probs, 1000)
p3 = utils.plot_kfirst(y_test, probs, 300)

In [None]:
from bokeh.io import show
from bokeh.layouts import gridplot

In [None]:
# Build plots from the feature matrix (p4-p6) and the encoded model input (p7).
# NOTE(review): these helper names (locations, noshow, ages) look like they
# come from a different dataset's demo; verify they work on these columns.
# NOTE(review): p4-p7 are only assigned, never displayed - confirm whether
# they should be rendered.
p4 = utils.plot_locations(fm)
p5 = utils.plot_noshow_by_loc(fm)
p6 = utils.plot_ages(fm)
p7 = utils.plot_noshow_by_age(X)
