In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

%matplotlib inline

# Apply seaborn's default styling; color_codes=True is passed through unchanged.
sns.set(color_codes=True)

# Read the training set from the current working directory.
data = pd.read_csv("train.csv")

# Preview the first ten rows as a rich table.
display(data.head(10))

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0
5,19,0,5,1,4,0,0,0,0,0,...,4,2,0,9,0,1,0,1,1,1
6,20,0,2,1,3,1,0,0,1,0,...,3,0,0,10,0,1,0,0,1,0
7,22,0,5,1,4,0,0,1,0,0,...,7,1,3,6,1,0,1,0,1,0
8,26,0,5,1,3,1,0,0,0,1,...,4,2,1,5,0,1,0,0,0,1
9,28,1,1,1,2,0,0,0,1,0,...,3,5,0,6,0,1,0,0,1,0


In [2]:
# Number of rows (records) in the training set.
n_records = len(data)

print("Total number of records: {}".format(n_records))

Total number of records: 595212


In [3]:
# Number of columns in the raw frame.
# NOTE(review): this counts every column of `data`, including 'id' and 'target',
# which are dropped from the feature set in the next cell.
n_features = len(data.columns)

print("Total features: {}".format(n_features))

Total features: 59


In [4]:
# Separate the label column from the model inputs; 'id' is dropped from the inputs as well.
target_raw = data['target']
features_raw = data.drop(columns=['id', 'target'])

Analysis of missing values (encoded as -1 in this dataset), thanks to Gabriel Preda (https://www.kaggle.com/gpreda/porto-seguro-data-exploration)

In [5]:
# Report, per feature, how many records hold the value -1 (treated here as "missing").
vars_with_missing = []
total_rows = features_raw.shape[0]

for feature in features_raw.columns:
    # Count the rows whose value equals the -1 placeholder.
    missings = (features_raw[feature] == -1).sum()
    if missings == 0:
        continue

    vars_with_missing.append(feature)
    missings_perc = missings / total_rows

    print('Variable {} has {} records ({:.2%}) with missing values'.format(feature, missings, missings_perc))

print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))

Variable ps_ind_02_cat has 216 records (0.04%) with missing values
Variable ps_ind_04_cat has 83 records (0.01%) with missing values
Variable ps_ind_05_cat has 5809 records (0.98%) with missing values
Variable ps_reg_03 has 107772 records (18.11%) with missing values
Variable ps_car_01_cat has 107 records (0.02%) with missing values
Variable ps_car_02_cat has 5 records (0.00%) with missing values
Variable ps_car_03_cat has 411231 records (69.09%) with missing values
Variable ps_car_05_cat has 266551 records (44.78%) with missing values
Variable ps_car_07_cat has 11489 records (1.93%) with missing values
Variable ps_car_09_cat has 569 records (0.10%) with missing values
Variable ps_car_11 has 5 records (0.00%) with missing values
Variable ps_car_12 has 1 records (0.00%) with missing values
Variable ps_car_14 has 42620 records (7.16%) with missing values
In total, there are 13 variables with missing values


Let us remove ps_car_03_cat and ps_car_05_cat, as they have too many missing values (69.09% and 44.78% of the records, respectively).

In [6]:
# Discard the two columns with the largest share of missing values.
features_raw = features_raw.drop(columns=['ps_car_03_cat', 'ps_car_05_cat'])

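Let us now fill some missing values: the -1 placeholders are first converted to NaN, then ps_reg_03, ps_car_12 and ps_car_14 are filled with their mean and ps_car_11 with its mode. Missing values in the categorical features are left as NaN.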

In [7]:
# Turn the -1 placeholder into a real NaN so pandas treats it as missing.
features_raw = features_raw.replace(-1, np.nan)

In [8]:
# Per-column replacement values: mean for three columns, most frequent value for ps_car_11.
# Columns not listed here keep their NaN values.
fill_values = {
    'ps_reg_03': features_raw['ps_reg_03'].mean(),
    'ps_car_11': features_raw['ps_car_11'].mode().iloc[0],
    'ps_car_12': features_raw['ps_car_12'].mean(),
    'ps_car_14': features_raw['ps_car_14'].mean(),
}
features_raw = features_raw.fillna(value=fill_values)

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,2,2.0,5,1.0,0.0,0,1,0,0,0,...,9,1,5,8,0,1,1,0,0,1
1,1,1.0,7,0.0,0.0,0,0,1,0,0,...,3,1,1,9,0,1,1,0,1,0
2,5,4.0,9,1.0,0.0,0,0,1,0,0,...,4,2,7,7,0,1,1,0,1,0
3,0,1.0,2,0.0,0.0,1,0,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,0,2.0,0,1.0,0.0,1,0,0,0,0,...,3,1,1,3,0,0,0,1,1,0
5,5,1.0,4,0.0,0.0,0,0,0,1,0,...,4,2,0,9,0,1,0,1,1,1
6,2,1.0,3,1.0,0.0,0,1,0,0,0,...,3,0,0,10,0,1,0,0,1,0
7,5,1.0,4,0.0,0.0,1,0,0,0,0,...,7,1,3,6,1,0,1,0,1,0
8,5,1.0,3,1.0,0.0,0,0,1,0,0,...,4,2,1,5,0,1,0,0,0,1
9,1,1.0,2,0.0,0.0,0,1,0,0,0,...,3,5,0,6,0,1,0,0,1,0


In [9]:
# uses code from https://www.kaggle.com/bertcarremans/data-preparation-exploration (see references)
# Build one metadata record per column of features_raw: its role ('use'), a semantic
# type derived from the column name / dtype, the storage dtype, and a category derived
# from the column name. The result is a DataFrame indexed by variable name.
metadata_arr = []
for feature in features_raw.columns:
    # Defining the role
    if feature == 'target':
        use = 'target'
    elif feature == 'id':
        use = 'id'
    else:
        use = 'input'

    # Defining the data type
    dtype = features_raw[feature].dtype

    # Defining the type.
    # The name suffix takes precedence over the dtype. The dtype checks use pandas'
    # dtype helpers instead of `dtype == int` / `dtype == float`, which depend on the
    # platform's default integer width and miss other float/int widths.
    if 'bin' in feature or feature == 'target':
        var_type = 'binary'
    elif 'cat' in feature or feature == 'id':
        var_type = 'categorical'
    elif pd.api.types.is_float_dtype(dtype):
        var_type = 'real'
    elif pd.api.types.is_integer_dtype(dtype):
        var_type = 'integer'
    else:
        # Explicit fallback so a column never inherits the previous column's type.
        var_type = 'unknown'

    # Defining the category (substring match on the column name)
    category = 'none'
    if 'ind' in feature:
        category = 'individual'
    elif 'reg' in feature:
        category = 'registration'
    elif 'car' in feature:
        category = 'car'
    elif 'calc' in feature:
        category = 'calculated'

    # Creating a Dict that contains all the metadata for the variable
    feature_dictionary = {
        'varname': feature,
        'use': use,
        'type': var_type,
        'dtype': dtype,
        'category': category
    }
    metadata_arr.append(feature_dictionary)

# 'preserve' is listed so the column exists, but no record sets it, so it is all NaN.
metadata = pd.DataFrame(metadata_arr, columns=['varname', 'use', 'type', 'preserve', 'dtype', 'category'])
metadata.set_index('varname', inplace=True)
metadata

Unnamed: 0_level_0,use,type,preserve,dtype,category
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ps_ind_01,input,integer,,int64,individual
ps_ind_02_cat,input,categorical,,float64,individual
ps_ind_03,input,integer,,int64,individual
ps_ind_04_cat,input,categorical,,float64,individual
ps_ind_05_cat,input,categorical,,float64,individual
ps_ind_06_bin,input,binary,,int64,individual
ps_ind_07_bin,input,binary,,int64,individual
ps_ind_08_bin,input,binary,,int64,individual
ps_ind_09_bin,input,binary,,int64,individual
ps_ind_10_bin,input,binary,,int64,individual


In [10]:
# Names of all features flagged as categorical in the metadata table.
cat_data_index = metadata.loc[metadata['type'] == 'categorical'].index

In [11]:
# Preview the first five rows of the categorical columns only.
features_raw.loc[:, cat_data_index].head(5)

Unnamed: 0,ps_ind_02_cat,ps_ind_04_cat,ps_ind_05_cat,ps_car_01_cat,ps_car_02_cat,ps_car_04_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat
0,2.0,1.0,0.0,10.0,1.0,0,4,1.0,0,0.0,1,12
1,1.0,0.0,0.0,11.0,1.0,0,11,1.0,1,2.0,1,19
2,4.0,1.0,0.0,7.0,1.0,0,14,1.0,1,2.0,1,60
3,1.0,0.0,0.0,7.0,1.0,0,11,1.0,1,3.0,1,104
4,2.0,1.0,0.0,11.0,1.0,0,14,1.0,1,2.0,1,82


In [12]:
# One-hot encode the categorical columns; all other columns are carried over unchanged.
features_encoded = pd.get_dummies(features_raw, columns=list(cat_data_index))

In [13]:
# Preview the first five rows of the encoded frame.
features_encoded.head(n=5)

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,...,ps_car_11_cat_95,ps_car_11_cat_96,ps_car_11_cat_97,ps_car_11_cat_98,ps_car_11_cat_99,ps_car_11_cat_100,ps_car_11_cat_101,ps_car_11_cat_102,ps_car_11_cat_103,ps_car_11_cat_104
0,2,5,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,9,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Names of all features typed as 'integer' or 'real' in the metadata table.
int_real_data_index = metadata.loc[metadata['type'].isin(['integer', 'real'])].index

In [15]:
# Min-max scale every integer/real feature and remember the bounds that were used:
#     scaled = (x - min) / (max - min)
# `scaler` maps each feature name to the {"min", "max"} observed before scaling.
# NOTE(review): the bounds are computed on the full data set, before the train/test
# split performed later in the notebook, so test rows influence the scaling.
scaler = {}
for feature in int_real_data_index:
    max_value = features_encoded[feature].max()
    min_value = features_encoded[feature].min()
    scaler[feature] = {
        "min": min_value,
        "max": max_value
    }

    shifted = features_encoded[feature].sub(min_value)
    value_range = max_value - min_value
    if value_range != 0:
        # Divide by the range, not by the maximum: dividing by max only lands in
        # [0, 1] when the minimum happens to be 0.
        features_encoded[feature] = shifted.div(value_range)
    else:
        # Constant column: every shifted value is 0; avoid a 0/0 division.
        features_encoded[feature] = shifted.astype(float)

# Show an example of a record with scaling applied
display(features_encoded.head(n = 1))

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,...,ps_car_11_cat_95,ps_car_11_cat_96,ps_car_11_cat_97,ps_car_11_cat_98,ps_car_11_cat_99,ps_car_11_cat_100,ps_car_11_cat_101,ps_car_11_cat_102,ps_car_11_cat_103,ps_car_11_cat_104
0,0.285714,0.454545,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on the target so both splits keep
# the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    target_raw,
    test_size=0.2,
    random_state=0,
    stratify=target_raw,
)

print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

Training set has 476169 samples.
Testing set has 119043 samples.


In [17]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

# Gaussian naive Bayes with default settings, fitted on the training split.
classifier = GaussianNB()

# Kept as the last expression so the fitted estimator's repr is shown as cell output.
classifier.fit(X=X_train, y=y_train)

GaussianNB(priors=None)

In [18]:
# Probability of the positive class (label 1) for each row of both splits.
# The columns of predict_proba follow the order of classifier.classes_, so we need the
# *position* of label 1 in classes_, not the label value itself. Using the label as a
# column index only works when label and position happen to coincide.
# flatnonzero returns a one-element array, so the results keep their (n_samples, 1) shape.
index_class_true = np.flatnonzero(classifier.classes_ == 1)
predictions_train = classifier.predict_proba(X_train)[:, index_class_true]
predictions_test = classifier.predict_proba(X_test)[:, index_class_true]

In [19]:
# ROC AUC of the positive-class probabilities on each split.
train_auc = roc_auc_score(y_train, predictions_train)
print("Train score: {0:.2f}".format(train_auc))

test_auc = roc_auc_score(y_test, predictions_test)
print("Test score: {0:.2f}".format(test_auc))

Train score: 0.60
Test score: 0.58
