In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.ensemble as ens
from sklearn.model_selection import train_test_split

In [19]:
train = pd.read_csv("Datasets/train_cleaned_data.csv")
test = pd.read_csv('Datasets/test_cleaned_data.csv')

In [20]:
train.dtypes

Unnamed: 0                int64
id                        int64
Gender                    int64
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age               int64
Vehicle_Damage            int64
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [21]:
## Converting float64 dtypes to int64 so the data can be compatable with the models
train['Policy_Sales_Channel'] = train['Policy_Sales_Channel'].astype('int64')
train['Annual_Premium'] = train['Annual_Premium'].astype('int64')
train['Region_Code'] = train['Region_Code'].astype('int64')
train.dtypes

Unnamed: 0              int64
id                      int64
Gender                  int64
Age                     int64
Driving_License         int64
Region_Code             int64
Previously_Insured      int64
Vehicle_Age             int64
Vehicle_Damage          int64
Annual_Premium          int64
Policy_Sales_Channel    int64
Vintage                 int64
Response                int64
dtype: object

In [22]:
## Extracting responses from the training dataset
responses = train['Response']

## Will be dropping id and responses from the training dataset
train.drop(['Response', 'id', 'Unnamed: 0'], axis=1, inplace=True)

In [23]:
## Splitting up the data using train_test_split with a shuffle added
x_train, y_train, x_test, y_test = train_test_split(train, responses, test_size=0.1, random_state=42)

In [24]:
print("x_train length: {}\nx_test length: {}\ny_train length: {}\ny_test length: {}".format(len(x_train), len(x_test), len(y_train), len(y_test)))

x_train length: 342998
x_test length: 342998
y_train length: 38111
y_test length: 38111


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=2, verbose=1, n_estimators=400)
clf.fit(x_train, x_test)

Iter       Train Loss   Remaining Time 
         1           0.7126            2.31m
         2           0.6884            2.24m
         3           0.6688            2.25m
         4           0.6526            2.21m
         5           0.6390            2.18m
         6           0.6273            2.15m
         7           0.6172            2.13m
         8           0.6086            2.13m
         9           0.6012            2.11m
        10           0.5945            2.10m
        20           0.5582            2.03m
        30           0.5457            1.98m
        40           0.5403            1.92m
        50           0.5376            1.88m
        60           0.5356            1.84m
        70           0.5342            1.78m
        80           0.5331            1.73m
        90           0.5322            1.68m
       100           0.5315            1.63m
       200           0.5279            1.09m
       300           0.5260           32.55s
       400     

GradientBoostingClassifier(n_estimators=400, random_state=2, verbose=1)

In [26]:
prediction = clf.predict(y_train[:200])
actual = y_test[:200]
print("{}\n{}".format(prediction, np.array(actual)))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0]


In [27]:
from sklearn.metrics import roc_auc_score
predictions = clf.predict(y_train)
print("Score: {}".format(clf.score(y_train, y_test)))
roc_auc_score(predictions, np.array(y_test))

Score: 0.8747343286715121


0.7130274666529549

In [28]:
from sklearn.tree import DecisionTreeClassifier #DecisionTreeClassifier reduced accuracy, so wont use
clf_adaboost = AdaBoostClassifier(n_estimators=400, random_state=2)
clf_adaboost.fit(x_train, x_test)

AdaBoostClassifier(n_estimators=400)

In [29]:
predictions = clf_adaboost.predict(y_train)
print("Score: {}".format(clf_adaboost.score(y_train, y_test)))
roc_auc_score(predictions, np.array(y_test))

Score: 0.8744719372359686


0.6754409981372438

In [31]:
from sklearn.ensemble import RandomForestClassifier
cls_randomforest = RandomForestClassifier(n_estimators=400, criterion='gini', random_state=2, verbose=True)
cls_randomforest.fit(x_train, x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  3.3min finished


RandomForestClassifier(n_estimators=400, random_state=2, verbose=True)

In [33]:
predictions = cls_randomforest.predict(y_train)
print("Score: {}".format(cls_randomforest.score(y_train, y_test)))
roc_auc_score(predictions, np.array(y_test))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    4.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Score: 0.8641074755319986
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    3.1s finished


0.6259202635157612

In [37]:
from sklearn.ensemble import BaggingClassifier
cls_bagging = BaggingClassifier(n_estimators=400, n_jobs=4, random_state=2, verbose=1)
cls_bagging.fit(x_train, x_test)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.6min remaining:  2.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.6min finished


BaggingClassifier(n_estimators=400, n_jobs=4, random_state=2, verbose=1)

In [38]:
predictions = cls_bagging.predict(y_train)
print("Score: {}".format(cls_bagging.score(y_train, y_test)))
roc_auc_score(predictions, np.array(y_test))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.1s remaining:    4.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    5.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    5.8s remaining:    5.8s
Score: 0.8619821049040959
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    6.3s finished


0.6209097947203638

In [41]:
from sklearn.ensemble import ExtraTreesClassifier
cls_extra = ExtraTreesClassifier(n_estimators=400, criterion='gini', random_state=2, verbose=True)
cls_extra.fit(x_train, x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  2.3min finished


ExtraTreesClassifier(n_estimators=400, random_state=2, verbose=True)

In [42]:
predictions = cls_extra.predict(y_train)
print("Score: {}".format(cls_extra.score(y_train, y_test)))
roc_auc_score(predictions, np.array(y_test))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   11.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Score: 0.8598042559890845
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    4.3s finished


0.6238235648978123