# Kick-off

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Imports

In [None]:
import pandas as pd
print("Pandas version:", pd.__version__)

import numpy as np
print("Numpy version:", np.__version__)

import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator

from scipy import stats
from math import floor

# For outlier detection
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer

# Utility Functions

In [None]:
def null_ratio(df, null_column):
  """Completeness of the other columns on the rows where ``null_column`` is null.

  Takes the rows of ``df`` whose ``null_column`` value is missing, looks at
  every *other* column on those rows and returns the fraction of those cells
  that are populated (1.0 = fully populated, 0.0 = entirely null).

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect.
  null_column : str
      Column whose missing rows are examined. It must be a column of ``df``;
      otherwise ``list.remove`` raises ``ValueError``.

  Returns
  -------
  float
      Populated-cell ratio, or ``nan`` when there is nothing to measure
      (no row has ``null_column`` missing, or ``df`` has no other column).
  """
  columns = list(df.columns)
  columns.remove(null_column)
  null_rows = df.loc[df[null_column].isnull(), columns]
  denominator = len(columns) * len(null_rows)
  if denominator == 0:
    # Nothing to measure: return NaN explicitly instead of evaluating 0/0.
    return float('nan')
  numerator = denominator - null_rows.isnull().sum().sum()
  return numerator/denominator

In [None]:
def overlapping_histogram(data1, data2,column, label1=None, label2=None,  bins=100):
  """Overlay two semi-transparent histograms of ``column`` on one new figure.

  Parameters
  ----------
  data1, data2 : objects indexable by ``column`` (e.g. DataFrames)
      The two data sets to compare.
  column : str
      Column to plot; also used as the figure title.
  label1, label2 : str, optional
      Legend labels for ``data1`` and ``data2``.
  bins : int
      Number of bins passed to each ``hist`` call.
  """
  _, axis = plt.subplots(figsize=(8,6))
  for frame, legend_label in ((data1, label1), (data2, label2)):
    axis.hist(frame[column], alpha=0.5, label=legend_label, bins=bins)
  axis.set_xlabel("Data", size=14)
  axis.set_ylabel("Count", size=14)
  axis.set_title(column)
  axis.legend(loc='upper right')

In [None]:
def plot_by_id(column, data=None):
  """Scatter ``column`` against ``Id`` and mark where its values are missing.

  Draws, on one new 15x5 figure:
    * the column values against ``Id``;
    * a second scatter of the null flag (False/True) against ``Id``, coloured by the flag;
    * a red horizontal line at the column median, labelled with its value.

  Parameters
  ----------
  column : str
      Column to plot; also used as the title.
  data : pandas.DataFrame, optional
      Frame holding ``'Id'`` and ``column``. Defaults to the notebook-level
      ``train_data`` so existing ``plot_by_id(column)`` calls keep working.
  """
  if data is None:
    data = train_data
  # NOTE(review): nothing in this function draws random numbers; the call only
  # resets NumPy's global RNG. It is kept so later cells that rely on that
  # global state behave as before — confirm whether it can be removed.
  np.random.seed(42)
  # The original chained assignment (`fig, ax = plt.fig, axs = ...`) also
  # stored the figure as an attribute on the pyplot module; a plain unpack suffices.
  _, ax = plt.subplots(figsize=(15,5))
  is_missing = data[column].isnull()
  median = data[column].median()
  ax.scatter(data['Id'], data[column])
  ax.scatter(data['Id'], is_missing, c=is_missing)
  ax.axhline(y=median, color = 'r', linestyle = '-')
  ax.set_title(column)
  ax.text(x=-0.25, y=median, s=median, weight='bold')


In [None]:
def show_heatmap(data,figsize=(12,8) , highest_only=False , thresold=0.7, annot=False):
  """Draw a correlation heatmap of the numeric columns of ``data``.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to correlate. Non-numeric columns are ignored.
  figsize : tuple
      Figure size passed to ``plt.figure``.
  highest_only : bool
      When True, only cells whose absolute correlation is at least
      ``thresold`` are kept; the remaining cells become NaN.
  thresold : float
      Absolute-correlation cut-off used when ``highest_only`` is True
      (the parameter name is kept as-is for existing callers).
  annot : bool
      Passed to ``sns.heatmap`` to print the values inside the cells.
  """
  # DataFrame.corr() raises on string columns in pandas >= 2.0, so restrict
  # the input to numeric/bool columns up front.
  correlation_matrix = data.select_dtypes(include=['number', 'bool']).corr()

  plt.figure(figsize=figsize)

  if highest_only:
    # Boolean-mask indexing leaves NaN wherever |corr| < thresold.
    high_corr = correlation_matrix[np.abs(correlation_matrix) >= thresold]
    sns.heatmap(high_corr, annot=annot,cmap="YlGnBu",  linecolor='black', linewidths=0.2)
  else:
    sns.heatmap(correlation_matrix, annot=annot)

In [None]:
def display_skew(data, columns, title='Skewness', figsize=(8,4)):
  """Bar chart of the skewness of ``columns`` with red reference lines at +1 and -1.

  Parameters
  ----------
  data : pandas.DataFrame
      Source frame.
  columns : list of str
      Columns whose skewness is plotted.
  title : str
      Figure title.
  figsize : tuple
      Figure size; its width also decides whether tick labels are rotated.
  """
  skewness = data[columns].skew()

  plt.figure(figsize=figsize)
  plt.bar(skewness.index, skewness.values)
  plt.grid(axis='y')
  plt.axhline(y = 1, color = 'r', linestyle = '-')
  plt.axhline(y = -1, color = 'r', linestyle = '-', label='-1')

  # More bars than the figure width (in inches): rotate the x tick labels.
  too_many_bars = len(columns) > figsize[0]
  if too_many_bars:
    plt.xticks(rotation='vertical')

  plt.title(title)

# Meet and Greet the data
<br>[Kaggle URL](https://www.kaggle.com/c/prudential-life-insurance-assessment/data)

In [None]:
train_data = pd.read_csv('/kaggle/input/prudential-life-insurance-assessment/train.csv.zip')
test_data = pd.read_csv('/kaggle/input/prudential-life-insurance-assessment/test.csv.zip')
submission = pd.read_csv('/kaggle/input/prudential-life-insurance-assessment/sample_submission.csv.zip')

train_data.shape, test_data.shape

# Exploratory Data Analysis

In [None]:
drop_columns=set()
skewed_columns=set()
outliers=set()

## Breaking data for convenience
There are too many columns to inspect at once. Let's divide them into logical segments.

We separate the columns into four parts:

* Basic Info - All columns except product, medical history & medical keywords
* Product Info
* Medical History
* Medical keywords

In [None]:
basic_cols = (train_data.columns[~train_data.columns.str.startswith('Medical') & ~train_data.columns.str.startswith('Product')]).tolist()
product_col = (train_data.columns[train_data.columns.str.startswith('Product')]).tolist()
medical_hist_col = (train_data.columns[train_data.columns.str.startswith('Medical_History')]).tolist()
medical_kw_col = (train_data.columns[train_data.columns.str.startswith('Medical_Keyword')]).tolist()

## Correlation

Columns with missing values - Test Data

In [None]:
basic_train = train_data[basic_cols]
basic_train.info()

In [None]:
show_heatmap(basic_train, highest_only=True, annot=True, figsize=(18,7))

#### BMI
As expected, BMI is highly correlated with Weight. Surprisingly, Height is not, which suggests that BMI may not have been calculated consistently. One possible reason is that a different unit of height was used for different applicants.
Given this, it is worth dropping BMI from the features.

In [None]:
drop_columns.add('BMI')
drop_columns

#### Employment Info

In [None]:
employment_info_cont = ['Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6']
basic_train[employment_info_cont].hist(figsize=(12,6))
plt.suptitle("Employee Info Continous values Histogram");

Employment_Info_1, Employment_Info_4 and Employment_Info_6 are continuous variables and look highly skewed. Let's check the skewness level.

In [None]:
display_skew(train_data,employment_info_cont, 'Employment Skewness')

In [None]:
skewed_columns.update({'Employment_Info_1', 'Employment_Info_4'})
skewed_columns

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
for idx, col in enumerate(employment_info_cont):
  sns.lineplot(ax=ax[idx],y="Response", x=col, data=train_data)
plt.suptitle('Employment Info Continous values to Response');

* Employment info 6: seems to be less correlated with response
* Employment info 1: higher value gives less chance of getting response between 3-5
* Employment info 4: values between 4-8 does not result in response 1-3


In [None]:
employment_info_cat = ['Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5']
basic_train[employment_info_cat].hist(figsize=(12,6))
plt.suptitle("Employee Info Categorical values Histogram");
# sns.catplot(x="Employment_Info_2", col="Response", data=train_data, kind="count", col_wrap=3);

In [None]:
sns.catplot(data=train_data[employment_info_cat])
plt.suptitle('Employment Info Category');

Employment info 2 has a large number of categories. Let's see how the employment info relates to the response.

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))
for idx, col in enumerate(employment_info_cat):
  p = sns.scatterplot(ax=ax[idx],x="Response", y=col, hue=col,data=train_data)
  p.legend_.remove()
plt.suptitle('Employment Info Categorical values to Response');

* Employment Info 2 values between 20 and 25 do not favour responses 2, 3, 4 and 5.
* The other two variables do not have much impact on the response.

#### Insured Info

In [None]:
insured_info_col = (train_data.columns[train_data.columns.str.startswith('InsuredInfo')]).tolist()

All insured info columns seem very skewed; let's check their skewness.

In [None]:
fig, ax = plt.subplots(3, 3, figsize=(21, 12))
for col, an_axes in zip(insured_info_col, ax.flatten()):
    sns.countplot(data=train_data, x=col, ax=an_axes)
plt.delaxes(ax[2][1])
plt.delaxes(ax[2][2])
plt.suptitle('Insured Info Count Plot');

In [None]:
display_skew(train_data,insured_info_col, figsize=(6,4), title='Insured Info Skewness')

Most of the insured info are skewed

In [None]:
fig, ax = plt.subplots(3, 3, figsize=(21, 12))
for col, an_axes in zip(insured_info_col, ax.flatten()):
    sns.boxplot(data=train_data, x=col, y='Response', ax=an_axes)

plt.delaxes(ax[2][1])
plt.delaxes(ax[2][2])
plt.suptitle('Insured Info vs Response');

Insured Info 2, 3 & 7 have similar distributions and also show some relation with the response: lower category values tend to give a higher response.

In [None]:
# Check basic columns has correlation with non basic columns
# Correlate numeric columns only: DataFrame.corr() raises on string columns in pandas >= 2.0.
numeric_corr = train_data.select_dtypes(include=['number', 'bool']).corr()
# Rows = non-basic columns, columns = basic columns.
basic_corr = numeric_corr.loc[numeric_corr.index.difference(basic_cols), basic_cols]
# A non-basic column is reported if it is strongly correlated with ANY basic column.
# (The previous `> 1` count required at least two such basic columns, so a single
# strong correlation went unnoticed.)
set(basic_corr.index[(basic_corr.abs() > 0.7).any(axis=1)])

This shows that none of the basic columns are strongly correlated with the non-basic columns.

### Insurance_History

In [None]:
insured_hist_col = (train_data.columns[train_data.columns.str.startswith('Insurance_History')]).tolist()
insured_hist_col

Interestingly, there is no Insurance_History_6.
<br>Only Insurance_History_5 is continuous, so let's look at it.

There are outliers in Insurance_History_5 , lets try to remove them and see

In [None]:
fig,ax = plt.subplots(ncols=3, figsize=(16,4))
sns.scatterplot(ax=ax[0],data=train_data, x='Insurance_History_5', y='Response')
sns.histplot(ax=ax[1],data=train_data, x='Insurance_History_5', bins=3)
sns.boxplot(ax=ax[2],data=train_data, x='Insurance_History_5')
plt.suptitle('Insurace_History_5');

The data are heavily concentrated near zero. This is definitely a candidate for outlier and skewness removal.

In [None]:
outliers.add('Insurance_History_5')

In [None]:
# Categorical columns of Insured History
insured_history_cat = ['Insurance_History_1',
 'Insurance_History_2',
 'Insurance_History_3',
 'Insurance_History_4',
 'Insurance_History_7',
 'Insurance_History_8',
 'Insurance_History_9']

# display the box plot for categorical insured histry
fig, ax = plt.subplots(2, 4, figsize=(15, 8))
for col, an_axes in zip(insured_history_cat, ax.flatten()):
    sns.boxplot(data=train_data, x=col, y='Response', ax=an_axes)
plt.delaxes(ax[1][3])
plt.suptitle('Insured History Categorical vs Response');

Conclusion
* All of them have skewed single tail
* Insured History 2 & 3 value 2 directly indicates 8 & 1 respectively

### Family History

Only Family History 1 is categorical

In [None]:
sns.countplot(data=train_data, x='Family_Hist_1', hue='Response');

In [None]:
sns.violinplot(data=train_data, x='Family_Hist_1', y='Response');

Family History 1 also has very skewed data. In particular, we have a very low percentage of Family History 1 data for responses 3 & 4. This may be due to a data collection restriction for responses 3 & 4. Prudential definitely needs to look into collecting Family History 1 for responses 3 & 4.

In [None]:
family_hist_cont = ['Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5']

In [None]:
train_data[family_hist_cont].hist(figsize=(8,6));

Wow, this is the first time we have columns that resemble a normal distribution.

In [None]:
sns.boxplot(data=train_data[family_hist_cont]);

There are outliers which need to be taken care of.

In [None]:
outliers.update({'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5'})
outliers

In [None]:
# display the box plot for famil history continous data
fig, ax = plt.subplots(2, 2, figsize=(24, 8))
for col, an_axes in zip(family_hist_cont, ax.flatten()):
    sns.lineplot(data=train_data, x=col, y='Response',ax=an_axes)
    an_axes.grid()

plt.suptitle('Family History continous vs Response')
plt.show()


*   Family History 2 values near zero have a higher chance of responses 1-5
*   The response is unstable above a value of 0.8
*   As we have seen earlier, there are outliers in Family History 2-5 (the columns added to `outliers` above)



### Product info

Only Product_Info_4 is continous

In [None]:
sns.histplot(data=train_data, x='Product_Info_4');

In [None]:
ax = sns.lineplot(data=train_data, x='Product_Info_4', y='Response')
ax.grid(which='minor')
ax.minorticks_on()
plt.show()

* Value near to zero don't have response more than 5
* Values more than 8.15 also doesn't have response less than 6


In [None]:
product_info_cat =['Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7']

In [None]:
fig, ax = plt.subplots(3, 2 , figsize=(15, 12))
for col, an_axes in zip(product_info_cat, ax.flatten()):
    sns.countplot(data=train_data, x=col, hue='Response', ax=an_axes)
    an_axes.tick_params(axis='x', rotation=90)
# plt.xticks(rotation='vertical')
plt.show();

* Again the data are very skewed
* The bar for response 8 is taller in every chart

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode Product_Info_2 as integer codes; the encoder is fitted on the training
# values only and then applied to both data sets.
le = LabelEncoder()
le.fit(train_data['Product_Info_2'])
train_data['Product_Info_2_code'] = le.transform(train_data['Product_Info_2'])
test_data['Product_Info_2_code'] = le.transform(test_data['Product_Info_2'])

# The original column is superseded by its encoded version.
drop_columns.add('Product_Info_2')

# Rebuild the list instead of remove()/append(): `list.remove` raised
# ValueError (and append duplicated the code column) when this cell was re-run.
product_info_cat = [
    col for col in product_info_cat
    if col not in ('Product_Info_2', 'Product_Info_2_code')
] + ['Product_Info_2_code']

In [None]:
product_info_cat

In [None]:
fig, ax = plt.subplots(3, 2 , figsize=(15, 12))
for col, an_axes in zip(product_info_cat, ax.flatten()):
    sns.boxplot(data=train_data, x=col, y='Response', ax=an_axes)
    an_axes.tick_params(axis='x', rotation=90)
plt.show();

In [None]:
outliers.update({'Product_Info_3', 'Product_Info_5', 'Product_Info_2_code'})
outliers

### Medical History

Discrete columns

In [None]:
medical_hist_disc = ['Medical_History_1', 'Medical_History_10', 'Medical_History_15', 'Medical_History_24', 'Medical_History_32']

In [None]:
train_data[medical_hist_disc].hist(figsize=(15,8));

In [None]:
fg = sns.catplot(data=train_data[medical_hist_disc])
fg.axes[0][0].tick_params(axis='x', rotation=90)
plt.show();

* These discrete categories are not suitable for encoding; however, we have to normalize them as the upper range is high.
* This kind of data is very prone to outliers, so let's check the boxplot.

In [None]:
fig, ax = plt.subplots(3, 2 , figsize=(15, 12))
for col, an_axes in zip(medical_hist_disc, ax.flatten()):
    sns.boxplot(data=train_data, y=col, x='Response', ax=an_axes, showmeans=True)
    an_axes.tick_params(axis='x', rotation=90)
plt.delaxes(ax[2][1])
plt.show();

In [None]:
outliers.update({'Medical_History_1', 'Medical_History_10', 'Medical_History_15', 'Medical_History_24', 'Medical_History_32'})

In [None]:
fig, ax = plt.subplots(3, 2 , figsize=(15, 12))
for col, an_axes in zip(medical_hist_disc, ax.flatten()):
    sns.pointplot(data=train_data, y=col, x='Response', ax=an_axes)
plt.delaxes(ax[2][1])
plt.show();

As shown above specially for Medical History 1 outlier removal is important.

# Missing Values

In [None]:
# Per-column null counts for the training data, keeping only columns that have any.
train_null_counts = train_data.isnull().sum()
train_missing = train_null_counts[train_null_counts > 0].sort_values()

fig, ax = plt.subplots(figsize=(15,5))
ax.barh(train_missing.index, train_missing)
# Write each count just to the right of its bar.
for bar_position, null_count in enumerate(train_missing):
    ax.text(null_count + 3, bar_position, str(null_count))
ax.set_title('Missing Values')
plt.show()

In [None]:
# Per-column null counts for the test data, keeping only columns that have any.
test_null_counts = test_data.isnull().sum()
test_missing = test_null_counts[test_null_counts > 0].sort_values()

fig, ax = plt.subplots(figsize=(15,5))
ax.barh(test_missing.index, test_missing)
# Write each count just to the right of its bar.
for bar_position, null_count in enumerate(test_missing):
    ax.text(null_count + 3, bar_position, str(null_count))
ax.set_title('Missing Values')
plt.show()

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
mean_column=set()
zero_column=set()
iterative_column=set()
knn_column=set()

### Employment_Info_1 - Continuous

In [None]:
overlapping_histogram(train_data, test_data, 'Employment_Info_1','train', 'test')

The number of missing values for Employment_Info_1 is low, hence filling them with the mean is unlikely to change the distribution.

In [None]:
mean_column.add('Employment_Info_1')
mean_column

## Employment_Info_4   - Continous    
Missing count: 6779

In [None]:
overlapping_histogram(train_data, test_data, 'Employment_Info_4', 'train', 'test')

In [None]:
# Pairwise correlations of the numeric columns; reused by the cells below.
# train_data still contains the original Product_Info_2 column (label-encoded
# earlier, so presumably strings), and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 — hence the explicit numeric selection.
correlation_matrix = train_data.select_dtypes(include=['number', 'bool']).corr()
correlation_matrix.loc[correlation_matrix['Employment_Info_4'].abs() > 0.30, 'Employment_Info_4']

In [None]:
# NOTE(review): nothing in this cell is stochastic; the seed only resets NumPy's global RNG.
np.random.seed(42)
# Plain unpack: the original chained assignment (`fig, ax = plt.fig, axs = ...`)
# also stored the figure as an attribute on the pyplot module.
fig, ax = plt.subplots(figsize=(15,5))
employment_info_4_missing = train_data['Employment_Info_4'].isnull()
# Values against Id, then the missing flag (False/True) against Id, coloured by the flag.
ax.scatter(train_data['Id'], train_data['Employment_Info_4'])
ax.scatter(train_data['Id'], employment_info_4_missing, c=employment_info_4_missing);

In [None]:
null_ratio(train_data, 'Employment_Info_4')

* Employment_Info_4 does not have much correlation with other columns
* Missing values are spread across the dataset
* The rows with a null Employment_Info_4 have a 95% completeness score for the other columns, hence there is no point in dropping these rows
* The data is very skewed, hence it is not a good candidate for mean imputation, as that may change the distribution of the data
* One possible approach is to impute with 0 and add a missing-indicator column


In [None]:
zero_column.add('Employment_Info_4')
zero_column

## Medical_History_1 - Discrete
Missing counts: 8889

In [None]:
overlapping_histogram(train_data, test_data, 'Medical_History_1', 'train', 'test')

In [None]:
correlation_matrix[np.abs(correlation_matrix['Medical_History_1']) > 0.30]['Medical_History_1']

In [None]:
# NOTE(review): nothing in this cell is stochastic; the seed only resets NumPy's global RNG.
np.random.seed(42)
# Plain unpack: the original chained assignment (`fig, ax = plt.fig, axs = ...`)
# also stored the figure as an attribute on the pyplot module.
fig, ax = plt.subplots(figsize=(15,5))
medical_history_1 = train_data['Medical_History_1']
medical_history_1_missing = medical_history_1.isnull()
medical_history_1_median = medical_history_1.median()
# Values against Id, then the missing flag (False/True) against Id, coloured by the flag.
ax.scatter(train_data['Id'], medical_history_1)
ax.scatter(train_data['Id'], medical_history_1_missing, c=medical_history_1_missing)
# Red line at the median, labelled with its value.
ax.axhline(y=medical_history_1_median, color = 'r', linestyle = '-')
ax.text(x=-0.25, y=medical_history_1_median, s=medical_history_1_median, c='w',weight='bold');


In [None]:
null_ratio(train_data, 'Medical_History_1', )

* Again there is low correlation with other columns
* The data is skewed, hence mean imputation is not advisable
* The missing count is large, hence we can't impute with 0
* Consider this a candidate for the iterative imputer

In [None]:
# NOTE(review): the notes above propose the iterative imputer for this column,
# yet it is registered in knn_column — confirm which imputation strategy is intended.
knn_column.add('Medical_History_1')

## Employment_Info_6 - Continous

In [None]:
overlapping_histogram(train_data, test_data, 'Employment_Info_6', 'train', 'test')

In [None]:
correlation_matrix[np.abs(correlation_matrix['Employment_Info_6']) > 0.30]['Employment_Info_6']

In [None]:
plot_by_id('Employment_Info_6')

Again this is a candidate for iterative imputer

In [None]:
iterative_column.add('Employment_Info_6')

## Family Hist 4 -  Continous

In [None]:
overlapping_histogram(train_data, test_data, 'Family_Hist_4', 'train', 'test')

In [None]:
plot_by_id('Family_Hist_4')

In [None]:
correlation_matrix[np.abs(correlation_matrix['Family_Hist_4']) > 0.30]['Family_Hist_4'].sort_values()

Again, consider this for iterative imputation. Although this would have been an ideal candidate for the KNN imputer, let's see how it performs with the iterative one.

In [None]:
iterative_column.add('Family_Hist_4')
iterative_column

## Insurance_History_5 - Continous

In [None]:
overlapping_histogram(train_data, test_data, 'Insurance_History_5', 'train', 'test')

In [None]:
plot_by_id('Insurance_History_5')

In [None]:
correlation_matrix[np.abs(correlation_matrix['Insurance_History_5']) > 0.30]['Insurance_History_5'].sort_values()


*   There are outliers
*   Not much Correlation with others
*   Median is very close to zero



In [None]:
zero_column.add('Insurance_History_5')
zero_column

## Family_Hist_2 - Continous

In [None]:
overlapping_histogram(train_data, test_data, 'Family_Hist_2', 'train', 'test')

In [None]:
plot_by_id('Family_Hist_2')

In [None]:
correlation_matrix[np.abs(correlation_matrix['Family_Hist_2']) > 0.30]['Family_Hist_2'].sort_values()

In [None]:
iterative_column.add('Family_Hist_2')
iterative_column

## Family_Hist_3 - Continous

In [None]:
overlapping_histogram(train_data, test_data,  'Family_Hist_3', 'train', 'test')

In [None]:
plot_by_id('Family_Hist_3')

In [None]:
correlation_matrix[np.abs(correlation_matrix['Family_Hist_3']) > 0.30]['Family_Hist_3'].sort_values()

* High number of missing values
* Similar distribution as other family history
* Looks like there are outliers

Though median imputation would have been fine, considering the large number of missing values let's go for the iterative approach.

In [None]:
iterative_column.add('Family_Hist_3')
iterative_column

## Family_Hist_5 - Continous

In [None]:
overlapping_histogram(train_data, test_data,  'Family_Hist_5', 'train', 'test')
plot_by_id('Family_Hist_5')
correlation_matrix[np.abs(correlation_matrix['Family_Hist_5']) > 0.30]['Family_Hist_5'].sort_values()


very similar to Family_Hist_3

In [None]:
iterative_column.add('Family_Hist_5')
iterative_column

## Non Zero columns
We don't expect Ins_Age, Ht, Wt, BMI to have zeros, lets find out how many

In [None]:
print("Age 0\t:",(train_data['Ins_Age'] == 0 ).sum())
print("Ht 0\t:",(train_data['Ht'] == 0 ).sum())
print("Wt 0\t:",(train_data['Wt'] == 0 ).sum())
print("BMI 0\t:",(train_data['BMI'] == 0 ).sum())

The count is low enough to be ignored.

## Rest of missing values
* 'Medical_History_10'
* 'Medical_History_15'
* 'Medical_History_24'
* 'Medical_History_32'

More than roughly half of the values in these columns are missing; let's see how much correlation they have with other columns.

In [None]:
large_missing_columns = ['Medical_History_10', 'Medical_History_15', 'Medical_History_24', 'Medical_History_32']

for a_missing_col in large_missing_columns:
  print(a_missing_col,":")
  print(correlation_matrix[(np.abs(correlation_matrix[a_missing_col]) > 0.30) & (np.abs(correlation_matrix[a_missing_col]) < 1)][a_missing_col].sort_values())
  print("")

In [None]:
# How many row have all those four columns as null
# Flag the rows in which every one of the large-missing columns is null, then count them.
missing_rows = train_data[large_missing_columns].isnull().all(axis=1)

missing_rows.sum()

In [None]:
# Plain unpack: the original chained assignment (`fig, ax = plt.fig, axs = ...`)
# also stored the figure as an attribute on the pyplot module.
fig, ax = plt.subplots(figsize=(15,5))
# One scatter series per large-missing column, plotted against Id.
for a_missing_col in large_missing_columns:
  ax.scatter(train_data['Id'], train_data[a_missing_col], label=a_missing_col)
ax.legend(loc='upper left')
plt.show()

Let's drop these columns because
* 90% of rows have all four of these columns empty
* These four columns have good correlation with other columns, hence we will still retain some of their information even after we drop them
 

In [None]:
drop_columns.update(large_missing_columns)
drop_columns

In [None]:
# Additionaly drop the medical keyword column as they are dummy
keyword_col = train_data.columns[train_data.columns.str.startswith('Medical_Keyword')].tolist()
drop_columns.update(keyword_col)
drop_columns.add('Id')
print(drop_columns)

# Split data into training and validation

In [None]:
print("Drop Columns:", drop_columns)

In [None]:
# Build the feature matrix and target without mutating train_data: the former
# in-place drop made this cell raise KeyError when it was run a second time.
X = train_data.drop(columns=list(drop_columns | {'Response'}))
y = train_data['Response']
X.head(5)

In [None]:
X_train, X_val , y_train, y_val = train_test_split(X,y,random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
X_test = test_data.drop(drop_columns, axis=1)

# Feature Selection - One way annova

This is a temporary imputation just to enable feature selection. We will do the actual missing-value treatment later.
* Since one-way ANOVA deals with means, and imputing with the mean does not change a column's mean, the outcome should not be materially affected by the imputation.

In [None]:
# # Dummy imputation just to enable the feature selection
tmp_X_train = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()).fit_transform(X_train)
tmp_X_train_df = pd.DataFrame(tmp_X_train, columns=X_train.columns)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# k='all' keeps every feature; the selector is used here for its ANOVA F-scores.
k_best = SelectKBest(score_func=f_classif, k='all')
k_best.fit(tmp_X_train_df, y_train)

# transform train data
tmp_X_train = k_best.transform(tmp_X_train_df)
# Rebuild the frame from the transformed array (previously the frame was rebuilt
# from itself and the transformed array was discarded), labelling the columns
# with the features the selector actually kept.
tmp_X_train_df = pd.DataFrame(tmp_X_train, columns=X_train.columns[k_best.get_support()])

In [None]:
# An thresold score selected for experiment
# Threshold score selected for the experiment
selected_thresold = 70
print("=>", selected_thresold,( k_best.scores_ >= selected_thresold ).sum())

# See how other thresholds look. The printed label is derived from the value
# actually used, so the two can no longer disagree (the old "=> 75" line
# compared against 70).
for other_thresold in (250, 75, 10):
  print("=>", other_thresold, (k_best.scores_ >= other_thresold).sum())
# Features scoring below 10 (the old line printed "< 10" but counted scores > 10).
print("< 10",( k_best.scores_ < 10 ).sum())

In [None]:
# One-row table of F-scores; -1 lets NumPy infer the feature count instead of
# hard-coding 73, which broke whenever the set of columns changed.
pd.DataFrame(k_best.scores_.reshape(1, -1), columns=tmp_X_train_df.columns)

In [None]:
# plot the scores
plt.figure(figsize=(14,6))
plt.bar(tmp_X_train_df.columns, k_best.scores_)
plt.axhline(y=selected_thresold, color = 'r', linestyle = '-.')
plt.axhline(y=250, color = 'b', linestyle = '-.')
plt.axhline(y=10, color = 'violet', linestyle = '-.')

plt.xticks(rotation='vertical')
plt.title('F Score using Annova')
plt.show()

In [None]:
selected_features = set(list(tmp_X_train_df.columns[k_best.scores_ >= selected_thresold]))
print(f"Number of selected features:{len(selected_features)}")
print(selected_features)

Selecting the features at that score threshold looks sensible because
* The selected columns cover all the groups... 
<br>age, height, weight, employment info, insured info, insured history, family history, medical history and product info
* At least 2 columns are selected from each group

In [None]:
# selected_features is a set. pandas >= 2.0 rejects sets as indexers, and a set
# has no stable order, so build an ordered list that follows X_train's columns.
selected_columns = [col for col in X_train.columns if col in selected_features]
X_train_selected = X_train[selected_columns]
X_val_selected = X_val[selected_columns]
X_test_selected = X_test[selected_columns]

# Outlier detection

In [None]:
outliers = outliers.intersection(selected_features)
outliers

In [None]:
# Work on the rows where Family_Hist_4 is present.
df_for_outlier = X_train.dropna(subset=['Family_Hist_4']).copy()
# Single-column frame handed to the detector; taken before the result columns are added.
family_hist_4 = df_for_outlier[['Family_Hist_4']]

# Fixed random_state so the detector does not depend on NumPy's global RNG state.
isf = IsolationForest(random_state=42)
isf.fit(family_hist_4)

df_for_outlier['scores'] = isf.decision_function(family_hist_4)
df_for_outlier['anomaly'] = isf.predict(family_hist_4)

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(12,6))
sns.countplot(data=df_for_outlier, x='anomaly', ax=ax[0]);
sns.scatterplot(data=df_for_outlier, x='Family_Hist_4', y='scores', hue="anomaly", ax=ax[1]);

In [None]:
# Create an outlier transformer for pipeline
class OutlierTransformer(object):
  """
  Pipeline step that appends an ``Anomaly`` column produced by an outlier detector.

  Earlier pipeline steps output NumPy arrays rather than DataFrames, so the
  incoming data is first rebuilt into a DataFrame labelled with ``df_columns``;
  the detector is then applied to the ``outlier_cols`` subset.

  Parameters
  ----------
  transformer : object
      Outlier predictor exposing ``predict`` (and optionally ``fit``),
      for example an isolation forest.
  outlier_cols : list
      Names of the columns handed to the detector.
  df_columns : list
      Names of all the columns in the incoming data, in order.
  """
  def __init__(self, transformer, outlier_cols, df_columns):
    # The outlier predictor, example isolation forest
    self.transformer = transformer
    # The columns which has outliers, this is list
    self.outlier_cols = outlier_cols
    # All the columns in the incoming dataframe
    self.df_columns = df_columns

  def _to_frame(self, X):
    """Return a copy of ``X`` as a DataFrame labelled with ``df_columns``."""
    return pd.DataFrame(X.copy(), columns=self.df_columns)

  def fit(self,X,y=None):
    """Fit the wrapped detector on the outlier columns of ``X`` and return self.

    Previously this was a no-op, so the detector kept whatever state it was
    given — i.e. it had to be fitted outside the pipeline, on data that had not
    passed through the preceding steps. Fitting here keeps the detector
    consistent with the data ``transform`` will see. Detectors without a
    ``fit`` method are left untouched.
    """
    fit_detector = getattr(self.transformer, 'fit', None)
    if fit_detector is not None:
      fit_detector(self._to_frame(X)[self.outlier_cols])
    return self

  def transform(self,X,y=None):
    """Return ``X`` as a DataFrame with the detector's predictions in ``Anomaly``."""
    X_df = self._to_frame(X)
    # Add the anomaly column
    X_df['Anomaly'] = self.transformer.predict(X_df[self.outlier_cols])
    return X_df

# Preprocessing

In [None]:
# Use only selected columns
zero_column = zero_column.intersection(selected_features)
mean_column = mean_column.intersection(selected_features)
iterative_column = iterative_column.intersection(selected_features)
knn_column = knn_column.intersection(selected_features)

print("Zero Constant Imputation:", zero_column )
print("Mean Imputation:", mean_column)
print("Iterative Imputation:", iterative_column)
print("KNN Imputation:", knn_column)

Well, that puts most of the columns out of scope.

In [None]:
# select columns with no missing values
non_missing_col = selected_features
non_missing_col = non_missing_col - zero_column - mean_column - iterative_column - knn_column - drop_columns
print("Non Missing Columns: Size {} : {} ".format(len(non_missing_col), non_missing_col) )

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
%%time
# With add_indicator=True the imputer appends one missing-indicator column for
# every feature that has missing values at fit time, so the output width
# depends on the data. Derive the column names from the training frame instead
# of assuming exactly one indicator column.
feature_columns = list(X_train_selected.columns)
columns_with_missing = [col for col in feature_columns if X_train_selected[col].isnull().any()]
processed_columns = feature_columns + [f'Missing_Ind_{col}' for col in columns_with_missing]

iter_transformer = IterativeImputer(random_state=42,  add_indicator=True)
preprocessor = Pipeline(steps=[
                           ('normalizer', StandardScaler()),
                           ('imputer', iter_transformer),
                           ('outlier_transformer', OutlierTransformer(isf, ['Family_Hist_4'], processed_columns))
])


# Fit on the training split only; validation and test are transformed with the fitted steps.
X_train_processed = preprocessor.fit_transform(X_train_selected)
X_val_processed = preprocessor.transform(X_val_selected)
X_test_processed = preprocessor.transform(X_test_selected)

In [None]:
X_train_processed.shape, X_val_processed.shape, X_test_processed.shape

# XG Boost

In [None]:
%%time

import xgboost 
model = xgboost.XGBClassifier(random_state=42)
model.fit(X_train_processed, y_train)
print(model.score(X_val_processed, y_val))

In [None]:
y_pred = model.predict(X_test_processed)

In [None]:
sample_df = pd.DataFrame(zip(test_data['Id'], y_pred), columns=['Id', 'Response'])

In [None]:
sample_df.to_csv("sample_submission.csv", index=False)
%ls -lrt