In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The Porto Seguro dataset contains information about the insurance contracts of several automobiles; the task is to predict the chance that a policyholder initiates a claim in the following year.

## Datasets
This analysis is based on the dataset of Porto Seguro, one of Brazil's largest auto and homeowner insurance companies. Their concern is that inaccuracies in the calculation of insurance premiums increase the cost of insurance for good drivers and reduce the price for bad drivers. This model helps in predicting the chance of an auto insurance claim in the year after the contract is initiated.


In [None]:
# Folder that holds the competition files
path = '../input/porto-seguro-safe-driver-prediction/'

# Read the training and the test set from that folder
train, test = (pd.read_csv(path + name) for name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first 10 rows of the training data
train.head(10)

In [None]:
# Preview the first 10 rows of the test data
test.head(10)

## Dealing with Missing values
We now define a function `missing_values` that summarises the missing values of a dataset: it counts the missing values in each column, calculates their percentage of the total number of rows, and returns a table of the affected columns sorted by that percentage in descending order.

In [None]:
# Creating a function to calculate and view the missing values in the dataset
def missing_values(df):
    """Summarise the missing values of a dataframe, column by column.

    Prints how many columns the dataframe has and how many of them contain
    missing values, then returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose missing entries are encoded as NaN/None.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        with the columns 'Missing Values' (count) and '% of Total Values'
        (share of rows, rounded to one decimal), sorted by percentage in
        descending order.
    """
    # Count the missing values once and reuse the result for the percentage
    miss_val = df.isnull().sum()

    # Guard against a dataframe without rows: 0 / 0 would yield NaN, and a
    # NaN percentage would wrongly flag every column as having missing values
    n_rows = len(df)
    if n_rows:
        miss_val_percent = 100 * miss_val / n_rows
    else:
        miss_val_percent = miss_val.astype(float)

    # Side-by-side table of counts and percentages, named in one step
    miss_val_table = pd.concat(
        [miss_val, miss_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'])

    # Keep only the columns that really have missing values and sort them
    # by percentage of missing values in descending order
    ren_columns = miss_val_table[
        miss_val_table['Missing Values'] > 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Printing the summary information of missing values
    print("The dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(ren_columns.shape[0]) +
          " columns that have missing values.")

    # Returning the dataframe with missing value information
    return ren_columns

We make a copy of the original dataframe in which every -1 (the placeholder used for missing values) is replaced with NaN, so that the missing values can be counted.

In [None]:
# `replace` returns a new dataframe, so the raw `train` frame stays untouched
# while every -1 placeholder becomes NaN in the copy.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train_copy = train.replace(-1, np.nan)

In [None]:
# Summarise the missing values of the training data (after -1 was turned into NaN)
missing_values_df1 = missing_values(train_copy)
# Show up to 10 columns, starting with the highest percentage of missing values
missing_values_df1.head(10)

Dropping 'ps_car_03_cat' from train_copy, as 69.1% of the values in this column are missing. If more than 50% of a column's data is missing, its impact on the model is usually close to zero.

In [None]:
# Remove the mostly-empty column from the training copy
train_copy = train_copy.drop(columns=['ps_car_03_cat'])

In [None]:
# `replace` returns a new dataframe, so the raw `test` frame stays untouched
# while every -1 placeholder becomes NaN in the copy.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
test_copy = test.replace(-1, np.nan)

In [None]:
# Summarise the missing values of the test data (after -1 was turned into NaN)
missing_values_df2 = missing_values(test_copy)
# Show up to 10 columns, starting with the highest percentage of missing values
missing_values_df2.head(10)

Extracting the labels from the 'target' column, then dropping 'id' and 'target' from train_copy so that only the features remain

In [None]:
# Labels: the claim indicator stored in the 'target' column
y = train_copy['target'].values

# Features: everything except the row identifier and the label itself
train_copy = train_copy.drop(columns=['id', 'target'])
x = train_copy.values

Calculating and visualising the number of claims opened within one year of starting the contract

In [None]:
# Number of rows in each target class
target_count = train['target'].value_counts()
n_class0, n_class1 = target_count[0], target_count[1]

print('Class 0:', n_class0)
print('Class 1:', n_class1)
print('Proportion:', round(n_class0 / n_class1, 2), ': 1')

# Bar chart of the class counts
target_count.plot(kind='bar', title='target');

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## LightGBM
LightGBM is very popular for tabular datasets. It is often as accurate as or more accurate than XGBoost, and it provides an optimized, scalable and fast implementation of gradient boosting machines (GBMs).

In [None]:
# Positional indices of the categorical columns (those whose name contains 'cat')
categorical_features = [
    idx for idx, name in enumerate(train_copy.columns) if 'cat' in name
]

# Wrap the training and the validation split in LightGBM data containers
train_data = lightgbm.Dataset(
    data=x_train,
    label=y_train,
    categorical_feature=categorical_features,
)
test_data = lightgbm.Dataset(data=x_test, label=y_test)

Setting the model configuration with the basic parameters

In [None]:
# Basic LightGBM configuration for a binary classifier evaluated with AUC.
# Only 'objective' is given: 'application' is an alias of the same parameter,
# so listing both was redundant. The imbalance switch is passed as a real
# boolean instead of the string 'true'.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,       # the target classes are heavily imbalanced
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,    # fraction of features sampled per tree
    'bagging_fraction': 0.5,    # fraction of rows sampled when bagging
    'bagging_freq': 20,         # re-draw the bagging sample every 20 iterations
    'learning_rate': 0.05,
    'verbose': 0
}

Creating the model

In [None]:
# Train for at most 5000 boosting rounds, stopping once the validation AUC
# has not improved for 100 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `lightgbm.train` was removed in
# LightGBM 4.0, while the callback works on older and newer versions alike.
model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[test_data],
                       num_boost_round=5000,
                       callbacks=[lightgbm.early_stopping(stopping_rounds=100)])

Here we can see that the best score is reached at iteration 129 (`[129] valid_0's auc: 0.630456`), and that the model is not overfitting the data.

But still, we used the LightGBM’s inbuilt ‘feature importance’ function to visually understand the 20 most important features which helped the model lean towards a particular classification.

Feature Importance

In [None]:
# Pair every feature name with its importance in the trained model
# (train_copy holds exactly the feature columns the model was trained on)
feature_imp = pd.DataFrame({'Value': model.feature_importance(), 'Feature': train_copy.columns})

# Keep the 20 most important features for the chart
top_features = feature_imp.sort_values(by="Value", ascending=False).head(20)

plt.figure(figsize=(40, 20))
sns.set(font_scale = 5)  # large fonts to match the large figure (changes seaborn's global settings)
sns.barplot(x="Value", y="Feature", data=top_features)
# A single model is trained here, so the title no longer claims an average over folds
plt.title('LightGBM Feature Importance (top 20)')
plt.tight_layout()
plt.savefig('lgbm_importances-01.png')
plt.show()

From the plot above, we can deduce that only the top 8 features have a substantial impact on the model; the others contribute comparatively little.

We can also view the individual trees of the model

In [None]:
# Draw a tree of the trained model on a 20x20 figure
# (no tree index is passed, so LightGBM's default tree is plotted)
lightgbm_tree = lightgbm.plot_tree(model, figsize=(20, 20))

In [None]:
# Keep the row identifiers for the submission file, then remove them from
# the test copy
ids = test_copy['id'].values
test_copy = test_copy.drop(columns=['id'])

## Prediction

In [None]:
# Applying the model on the test set.
# The test frame still contains 'ps_car_03_cat', which was dropped from the
# training data, so passing all of its columns gives the model one feature
# too many and shifts every later column onto the wrong feature. Select
# exactly the training columns, in training order, so the matrix lines up
# with the model; the shape check then no longer needs to be disabled.
x = test_copy[train_copy.columns].values
y = model.predict(x)

## Submission

In [None]:
# Assemble the submission table (test id next to the model prediction)
# and write it to disk without the dataframe index
submission_columns = {'id': ids, 'target': y}
output = pd.DataFrame(submission_columns)
output.to_csv("submission.csv", index=False)