# Porto Seguro’s Safe Driver Prediction

In this competition we are tasked with making predictive models that can predict if a given driver will make insurance claim.

# Import library

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import KFold,cross_val_score,train_test_split,StratifiedKFold


from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import time
seed=2123
%matplotlib inline



## Read data set

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [13]:
#Feature statistics
traintest = train.drop(['id','target'],axis=1).append(test.drop(['id'],axis=1))
cols = traintest.columns

# defining column name for feature statistics
# as the name suggested, we are capturing the following statistic from the features:
# nunique: number of unique value
# freq1: most frequent value 
# freq1_val: number of occurance of the most frequent value
# freq2: second most frequent value 
# freq2_val: number of occurance of the second most frequent value
# freq3: 3rd most frequent value, if available
# freq3_val: number of occurance of the thrid most frequent value, if available
# describe stats: the following ones are the stat offer by our best friend .describe methods. 

stat_col = ['nunique','freq1','freq_val1','freq2','freq_val2','freq3','freq_val3'] + traintest[cols[0]].describe().index.tolist()[1:]

stat_col = ['feature']+stat_col

feature_stat = pd.DataFrame(columns=stat_col)
i=0

for col in cols:
    stat_vals=[]
    
    #get stats value
    stat_vals.append(col)
    stat_vals.append(traintest[col].nunique())
    stat_vals.append(traintest[col].value_counts().index[0])
    stat_vals.append(traintest[col].value_counts().iloc[0])
    stat_vals.append(traintest[col].value_counts().index[1])
    stat_vals.append(traintest[col].value_counts().iloc[1])
    
    if len(traintest[col].value_counts())>2:
        stat_vals.append(traintest[col].value_counts().index[2])
        stat_vals.append(traintest[col].value_counts().iloc[2])
    else:
        stat_vals.append(np.nan)
        stat_vals.append(np.nan)
            
    stat_vals+=traintest[col].describe().tolist()[1:]

    feature_stat.loc[i]=stat_vals
    i+=1


In [14]:
feature_stat[feature_stat['feature'].str.contains("cat")].sort_values(by=['nunique'])

Unnamed: 0,feature,nunique,freq1,freq_val1,freq2,freq_val2,freq3,freq_val3,mean,std,min,25%,50%,75%,max
28,ps_car_08_cat,2,1,1238365,0,249663,,,0.832219,0.373672,0.0,1.0,1.0,1.0,1.0
3,ps_ind_04_cat,3,0,866864,1,620936,-1.0,228.0,0.417135,0.493396,-1.0,0.0,0.0,1.0,1.0
22,ps_car_02_cat,3,1,1234979,0,253039,-1.0,10.0,0.829937,0.375706,-1.0,1.0,1.0,1.0,1.0
23,ps_car_03_cat,3,-1,1028142,1,276842,0.0,183044.0,-0.504896,0.788713,-1.0,-1.0,-1.0,0.0,1.0
25,ps_car_05_cat,3,-1,666910,1,431560,0.0,389558.0,-0.158162,0.844506,-1.0,-1.0,0.0,1.0,1.0
27,ps_car_07_cat,3,1,1383070,0,76138,-1.0,28820.0,0.910097,0.347212,-1.0,1.0,1.0,1.0,1.0
30,ps_car_10_cat,3,1,1475460,0,12136,2.0,432.0,0.992135,0.091565,0.0,1.0,1.0,1.0,2.0
1,ps_ind_02_cat,5,1,1079327,2,309747,3.0,70172.0,1.358745,0.663639,-1.0,1.0,1.0,2.0,4.0
29,ps_car_09_cat,6,2,883326,0,486510,1.0,72947.0,1.328302,0.978743,-1.0,0.0,2.0,2.0,4.0
4,ps_ind_05_cat,8,0,1319412,6,51877,4.0,45706.0,0.406955,1.3533,-1.0,0.0,0.0,0.0,6.0


In [33]:
#frequency encoding
def freq_encoding(cols, train_df, test_df):
    result_traindf = pd.DataFrame()
    result_testdf = pd.DataFrame()
    
    for col in cols:
        print(" ",col)
        col_freq = col+'_freq'
        freq = train_df[col].value_counts()
        freq = pd.DataFrame(freq)
        freq.reset_index(inplace=True)
        freq.columns = [[col,col_freq]]
        
        # merge this 'freq' dataframe with train
        temp_train_df = pd.merge(train_df[[col]],freq,how='left',on=col)
        temp_train_df.drop([col],axis=1,inplace=True)
        
        # merge this 'freq' data frame with test data 
        temp_test_df = pd.merge(test_df[[col]],freq, how='left', on=col)
        temp_test_df.drop([col], axis=1, inplace=True)
        
        #if certain level of freq is not observed in test dataset will assign 0
        temp_test_df.fillna(0,inplace=True)
        temp_test_df[col_freq] = temp_test_df[col_freq].astype(np.int32)
        
        if result_traindf.shape[0] ==0:
            result_traindf = temp_train_df
            result_testdf = temp_test_df
        else:
            result_traindf = pd.concat([result_traindf,temp_train_df], axis=1)
            result_testdf = pd.concat([result_testdf,temp_test_df], axis=1)
        
    return result_traindf,result_testdf
    

In [35]:
cat_freq = train.columns[train.columns.str.endswith('_cat')]
#drop 'ps_car_08_cat' having 2 unique value
cat_freq.drop(['ps_car_08_cat'])
cat_freq
train_freq,test_freq = freq_encoding(cat_freq,train,test)

# merge
train = pd.concat([train,train_freq], axis=1)
test = pd.concat([test,test_freq], axis=1)
train.head()

  ps_ind_02_cat
  ps_ind_04_cat
  ps_ind_05_cat
  ps_car_01_cat
  ps_car_02_cat
  ps_car_03_cat
  ps_car_04_cat
  ps_car_05_cat
  ps_car_06_cat
  ps_car_07_cat
  ps_car_08_cat
  ps_car_09_cat
  ps_car_10_cat
  ps_car_11_cat


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_car_02_catfreq,ps_car_03_catfreq,ps_car_04_catfreq,ps_car_05_catfreq,ps_car_06_catfreq,ps_car_07_catfreq,ps_car_08_catfreq,ps_car_09_catfreq,ps_car_10_catfreq,ps_car_11_catfreq
0,7,0,2,2,5,1,0,0,1,0,...,493990,411231,496581,172667,31136,553148,99948,194518,590179,7246
1,9,0,1,1,7,0,0,0,0,1,...,493990,411231,496581,266551,131527,553148,495264,353482,590179,5097
2,13,0,5,4,9,1,0,0,0,1,...,493990,411231,496581,266551,59253,553148,495264,353482,590179,7992
3,16,0,0,1,2,0,0,1,0,0,...,493990,73272,496581,172667,131527,553148,495264,14756,590179,85083
4,17,0,0,2,0,1,0,1,0,0,...,493990,411231,496581,266551,59253,553148,495264,353482,590179,10470


In [66]:
# perform binary encoding for categorical variable
# this function take in a pair of train and test data set, and the feature that need to be encode.
# it returns the two dataset with input feature encoded in binary representation
# this function assumpt that the feature to be encoded is already been encoded in a numeric manner 
# ranging from 0 to n-1 (n = number of levels in the feature). 

def binary_encoding(train_df,test_df,feat):
    #calculate higest numeric value used for numerical encoding
    train_feat_max = train_df[feat].max()
    test_feat_max = test_df[feat].max()
    
    if train_feat_max > test_feat_max:
        feat_max = train_feat_max
    else:
        feat_max = test_feat_max
    
    # use value feat_max+1 to represent missing value
    train_df.loc[train_df[feat]==-1, feat] = feat_max + 1
    test_df.loc[test_df[feat]==-1, feat] = feat_max + 1

    #create union set of all possible value feature
    union_val = np.union1d(train_df[feat].unique(),test_df[feat].unique())
    
    #extract highst value from the feature in decimal format
    max_dec = union_val.max()
    
    #work out how the amount digits require represent max_dev in binary
    max_bin_len = len("{:b}".format(max_dec))
    index = np.arange(len(union_val))
    columns = list([feat])
    
    # create binary encoding dataframe to capture all the levels of feature
    bin_df = pd.DataFrame(index=index,columns=columns)
    bin_df[feat] = union_val
    bin_df.head()
    
    # capture binary represantation for each level feature
    feat_bin = bin_df[feat].apply(lambda x: "{:b}".format(x).zfill(max_bin_len))
    
    #split the binary representation into different bit
    splitted = feat_bin.apply(lambda x: pd.Series(list(x)).astype(np.uint8))
    splitted.columns = [feat + '_bin' + str(x) for x in splitted.columns]
    bin_df = bin_df.join(splitted)
    
    #merge data set
    train_df = pd.merge(train_df, bin_df, how='left', on=[feat])
    test_df = pd.merge(test_df, bin_df, how='left', on=[feat])
    return train_df, test_df
    

In [67]:
binary_encoding(train,test,'ps_ind_02_cat')

(             id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
 0             7       0          2              2          5              1   
 1             9       0          1              1          7              0   
 2            13       0          5              4          9              1   
 3            16       0          0              1          2              0   
 4            17       0          0              2          0              1   
 5            19       0          5              1          4              0   
 6            20       0          2              1          3              1   
 7            22       0          5              1          4              0   
 8            26       0          5              1          3              1   
 9            28       1          1              1          2              0   
 10           34       0          5              2          2              0   
 11           35       0          2     

In [57]:
np.union1d(train['ps_car_03_cat'].unique(),test['ps_car_03_cat'].unique())

array([-1,  0,  1])