In [1]:
# Import the libraries 
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

In [2]:
# Importing the dataset 

test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
sample = pd.read_csv('sample.csv')

In [3]:
# Shape of dataset 

test.shape, train.shape

((21805, 13), (50882, 14))

In [4]:
# Lets have a first look of train dataset 

train.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,1,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0
1,2,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0
2,3,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1
3,4,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0
4,5,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0


In [5]:
# Data Report function for Null and Unique Values 

def data_report(data):
    Null = pd.Series(data.isnull().sum())
    Unique_Count = pd.Series(data.describe(include='all',datetime_is_numeric=True).loc['unique', :])
    Data_type = pd.Series(data.dtypes)
    info_abt_data = pd.DataFrame(({"Null":Null, "Unique Count": Unique_Count, "Data type": Data_type}))
    return info_abt_data

In [6]:
# Print Data report fn 

data_report(train)

Unnamed: 0,Null,Unique Count,Data type
ID,0,,int64
City_Code,0,36.0,object
Region_Code,0,,int64
Accomodation_Type,0,2.0,object
Reco_Insurance_Type,0,2.0,object
Upper_Age,0,,int64
Lower_Age,0,,int64
Is_Spouse,0,2.0,object
Health Indicator,11691,9.0,object
Holding_Policy_Duration,20251,15.0,object


In [7]:
data_report(test)

Unnamed: 0,Null,Unique Count,Data type
ID,0,,int64
City_Code,0,36.0,object
Region_Code,0,,int64
Accomodation_Type,0,2.0,object
Reco_Insurance_Type,0,2.0,object
Upper_Age,0,,int64
Lower_Age,0,,int64
Is_Spouse,0,2.0,object
Health Indicator,5027,9.0,object
Holding_Policy_Duration,8603,15.0,object


In [8]:
#ratio of null values
train.isnull().sum()/train.shape[0] *100

ID                          0.000000
City_Code                   0.000000
Region_Code                 0.000000
Accomodation_Type           0.000000
Reco_Insurance_Type         0.000000
Upper_Age                   0.000000
Lower_Age                   0.000000
Is_Spouse                   0.000000
Health Indicator           22.976691
Holding_Policy_Duration    39.799929
Holding_Policy_Type        39.799929
Reco_Policy_Cat             0.000000
Reco_Policy_Premium         0.000000
Response                    0.000000
dtype: float64

#### Ratio of null value is 22 and 39 %

In [9]:
#ratio of null values
test.isnull().sum()/test.shape[0] *100

ID                          0.000000
City_Code                   0.000000
Region_Code                 0.000000
Accomodation_Type           0.000000
Reco_Insurance_Type         0.000000
Upper_Age                   0.000000
Lower_Age                   0.000000
Is_Spouse                   0.000000
Health Indicator           23.054345
Holding_Policy_Duration    39.454254
Holding_Policy_Type        39.454254
Reco_Policy_Cat             0.000000
Reco_Policy_Premium         0.000000
dtype: float64

#### Null Values founded 
- Health Indicator	11691
- Holding_Policy_Duration	20251
- Holding_Policy_Type	20251

In [10]:
#categorical features
categorical = train.select_dtypes(include =[np.object])
print("Categorical Features in Train Set:",categorical.shape[1])

#numerical features
numerical= train.select_dtypes(include =[np.float64,np.int64])
print("Numerical Features in Train Set:",numerical.shape[1])

Categorical Features in Train Set: 6
Numerical Features in Train Set: 8


-----------
----------
-----------

In [11]:
# renamed column name to deal with spacing issue further 

train = train.rename(columns= {"Health Indicator":'HealthIndicator'})
test = test.rename(columns= {"Health Indicator":'HealthIndicator'})

In [12]:
# before applying any imputation we have to deal with string values from of missing data columns

In [13]:
train['HealthIndicator']= train['HealthIndicator'].replace({'X1':9, 'X2':8, 'X3':7, 'X4':6, 'X5': 5, 'X6':4, 'X7':3 , 'X8':2, 'X9': 1 })
test['HealthIndicator']= test['HealthIndicator'].replace({'X1':9, 'X2':8, 'X3':7, 'X4':6, 'X5': 5, 'X6':4, 'X7':3 , 'X8':2, 'X9': 1 })

In [14]:
train['Holding_Policy_Duration']= train['Holding_Policy_Duration'].replace({'14+':15.0})
test['Holding_Policy_Duration']= test['Holding_Policy_Duration'].replace({'14+':15.0})

In [15]:
train.Holding_Policy_Duration.value_counts(normalize=True)

1.0     0.146877
15.0    0.141523
2.0     0.139075
3.0     0.117071
4.0     0.090464
5.0     0.077111
6.0     0.061833
7.0     0.053704
8.0     0.042963
9.0     0.036368
10.0    0.026542
11.0    0.017825
12.0    0.016748
13.0    0.016682
14.0    0.015213
Name: Holding_Policy_Duration, dtype: float64

In [16]:
train.head(2)

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,HealthIndicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,1,C3,3213,Rented,Individual,36,36,No,9.0,15.0,3.0,22,11628.0,0
1,2,C5,1117,Owned,Joint,75,22,No,8.0,,,22,30510.0,0


In [17]:
# Looks like we are ready to apply imputer 

In [18]:
train.HealthIndicator.value_counts(normalize=True)

9.0    0.331964
8.0    0.263632
7.0    0.172540
6.0    0.146539
5.0    0.044066
4.0    0.032661
3.0    0.005001
2.0    0.001990
1.0    0.001608
Name: HealthIndicator, dtype: float64

In [19]:
train.Holding_Policy_Duration.value_counts(normalize=True)

1.0     0.146877
15.0    0.141523
2.0     0.139075
3.0     0.117071
4.0     0.090464
5.0     0.077111
6.0     0.061833
7.0     0.053704
8.0     0.042963
9.0     0.036368
10.0    0.026542
11.0    0.017825
12.0    0.016748
13.0    0.016682
14.0    0.015213
Name: Holding_Policy_Duration, dtype: float64

In [20]:
train.Holding_Policy_Type.value_counts(normalize=True)

3.0    0.433515
1.0    0.266821
2.0    0.163397
4.0    0.136267
Name: Holding_Policy_Type, dtype: float64

In [21]:
train = pd.get_dummies(train, columns=["City_Code", "Accomodation_Type",  "Reco_Insurance_Type", "Is_Spouse" ])

In [22]:
train.head(2)

Unnamed: 0,ID,Region_Code,Upper_Age,Lower_Age,HealthIndicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,...,City_Code_C6,City_Code_C7,City_Code_C8,City_Code_C9,Accomodation_Type_Owned,Accomodation_Type_Rented,Reco_Insurance_Type_Individual,Reco_Insurance_Type_Joint,Is_Spouse_No,Is_Spouse_Yes
0,1,3213,36,36,9.0,15.0,3.0,22,11628.0,0,...,0,0,0,0,0,1,1,0,1,0
1,2,1117,75,22,8.0,,,22,30510.0,0,...,0,0,0,0,1,0,0,1,1,0


In [23]:
train['HealthIndicator'].fillna(train['HealthIndicator'].median(), inplace=True)
train['Holding_Policy_Duration'].fillna(train['Holding_Policy_Duration'].median(), inplace=True)
train['Holding_Policy_Type'].fillna(train['Holding_Policy_Type'].median(), inplace=True)

In [24]:
data_report(train)

Unnamed: 0,Null,Unique Count,Data type
ID,0,,int64
Region_Code,0,,int64
Upper_Age,0,,int64
Lower_Age,0,,int64
HealthIndicator,0,,float64
Holding_Policy_Duration,0,16.0,object
Holding_Policy_Type,0,,float64
Reco_Policy_Cat,0,,int64
Reco_Policy_Premium,0,,float64
Response,0,,int64


In [25]:
train.HealthIndicator.value_counts(normalize=True)

8.0    0.432825
9.0    0.255690
7.0    0.132896
6.0    0.112869
5.0    0.033941
4.0    0.025156
3.0    0.003852
2.0    0.001533
1.0    0.001238
Name: HealthIndicator, dtype: float64

In [26]:
train.Holding_Policy_Duration.value_counts(normalize=True)

5.0     0.397999
1.0     0.088420
15.0    0.085197
2.0     0.083723
3.0     0.070477
4.0     0.054459
5.0     0.046421
6.0     0.037223
7.0     0.032330
8.0     0.025864
9.0     0.021894
10.0    0.015978
11.0    0.010731
12.0    0.010082
13.0    0.010043
14.0    0.009158
Name: Holding_Policy_Duration, dtype: float64

In [27]:
train.Holding_Policy_Type.value_counts(normalize=True)

3.0    0.658976
1.0    0.160627
2.0    0.098365
4.0    0.082033
Name: Holding_Policy_Type, dtype: float64

In [28]:
train.head()

Unnamed: 0,ID,Region_Code,Upper_Age,Lower_Age,HealthIndicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,...,City_Code_C6,City_Code_C7,City_Code_C8,City_Code_C9,Accomodation_Type_Owned,Accomodation_Type_Rented,Reco_Insurance_Type_Individual,Reco_Insurance_Type_Joint,Is_Spouse_No,Is_Spouse_Yes
0,1,3213,36,36,9.0,15.0,3.0,22,11628.0,0,...,0,0,0,0,0,1,1,0,1,0
1,2,1117,75,22,8.0,5.0,3.0,22,30510.0,0,...,0,0,0,0,1,0,0,1,1,0
2,3,3732,32,32,8.0,1.0,1.0,19,7450.0,1,...,0,0,0,0,1,0,1,0,1,0
3,4,4378,52,48,9.0,15.0,3.0,19,17780.0,0,...,0,0,0,0,1,0,0,1,1,0
4,5,2190,44,44,8.0,3.0,1.0,16,10404.0,0,...,0,0,1,0,0,1,1,0,1,0


In [29]:
X_train = train.drop(['Response','ID'], axis = 1)
y_train = train['Response'] 

In [30]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

#apply SelectKBest to extract top 5 best features for housing df
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X_train,y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score']  #naming the dataframe columns
print(featureScores.nlargest(5,'Score'))  #print 5best features

                   Features        Score
7       Reco_Policy_Premium  9829.942822
6           Reco_Policy_Cat  1768.726386
0               Region_Code    74.805013
4   Holding_Policy_Duration    35.091145
14            City_Code_C15     4.610118
