In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.ensemble as ens
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv("Datasets/train_cleaned_data.csv")
test = pd.read_csv('Datasets/test_cleaned_data.csv')

In [3]:
train.dtypes

Unnamed: 0                int64
id                        int64
Gender                    int64
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age               int64
Vehicle_Damage            int64
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [4]:
## Converting float64 dtypes to int64 so the data can be compatable with the models
train['Policy_Sales_Channel'] = train['Policy_Sales_Channel'].astype('int64')
train['Annual_Premium'] = train['Annual_Premium'].astype('int64')
train['Region_Code'] = train['Region_Code'].astype('int64')
train.dtypes

Unnamed: 0              int64
id                      int64
Gender                  int64
Age                     int64
Driving_License         int64
Region_Code             int64
Previously_Insured      int64
Vehicle_Age             int64
Vehicle_Damage          int64
Annual_Premium          int64
Policy_Sales_Channel    int64
Vintage                 int64
Response                int64
dtype: object

In [5]:
## Extracting responses from the training dataset
responses = train['Response']

## Will be dropping id and responses from the training dataset
train.drop(['Response', 'id', 'Unnamed: 0'], axis=1, inplace=True)

In [6]:
## Splitting up the data using train_test_split with a shuffle added
x_train, y_train, x_test, y_test = train_test_split(train, responses, test_size=0.1, random_state=42)

In [7]:
print("x_train length: {}\nx_test length: {}\ny_train length: {}\ny_test length: {}".format(len(x_train), len(x_test), len(y_train), len(y_test)))

x_train length: 342998
x_test length: 342998
y_train length: 38111
y_test length: 38111


In [38]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=2, verbose=1, n_estimators=400)
clf.fit(x_train, x_test)

      Iter       Train Loss   Remaining Time 
         1           0.7126            2.44m
         2           0.6884            2.25m
         3           0.6688            2.35m
         4           0.6526            2.28m
         5           0.6390            2.24m
         6           0.6273            2.22m
         7           0.6172            2.21m
         8           0.6086            2.18m
         9           0.6012            2.19m
        10           0.5945            2.17m
        20           0.5582            2.10m
        30           0.5457            2.01m
        40           0.5403            1.94m
        50           0.5376            1.88m
        60           0.5356            1.82m
        70           0.5342            1.76m
        80           0.5331            1.72m
        90           0.5322            1.67m
       100           0.5315            1.61m
       200           0.5279            1.05m
       300           0.5260           31.08s
       40

GradientBoostingClassifier(n_estimators=400, random_state=2, verbose=1)

In [39]:
prediction = clf.predict(y_train[:200])
actual = y_test[:200]
print("{}\n{}".format(prediction, np.array(actual)))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0]


In [40]:
from sklearn.metrics import roc_auc_score
predictions = clf.predict(y_train)
print("Score: {}".format(clf.score(y_train, y_test)))
roc_auc_score(predictions, np.array(y_test))

Score: 0.8747343286715121


0.7130274666529549