In [1]:
import pandas as pd
import numpy as np
from numpy.random import RandomState
import matplotlib.pyplot as plt
from tqdm import tqdm

# One seed, reused below for the train/test split and the AdaBoost models,
# so that repeated runs give the same numbers.
seed = 42
rng = RandomState(seed)

# Read the modelling table from the notebook's working directory.
df = pd.read_csv('Model.csv')

# Column 0 is skipped, the last column is the target, and every column in
# between is used as a feature.
y = df.iloc[:, -1]
X = df.iloc[:, 1:-1]
print(X.shape, y.shape)

(12529, 59) (12529,)


In [2]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 20% of the rows as a test set; the split is seeded so it is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)
print('Train set: ', X_train.shape, y_train.shape)
print('Test set: ', X_test.shape, y_test.shape)

Train set:  (10023, 59) (10023,)
Test set:  (2506, 59) (2506,)


In [3]:
def cross_validation_score(estimator):
    """Cross-validate `estimator` on the training split and summarise ROC AUC.

    Runs `cross_val_score` on the notebook-level `X_train` / `y_train` with
    10 folds, ROC AUC scoring and `n_jobs=-1`.

    Parameters
    ----------
    estimator : the (unfitted) model handed to `cross_val_score`.

    Returns
    -------
    (mean, std) : mean and standard deviation of the per-fold scores.
    """
    cv_settings = {'cv': 10, 'n_jobs': -1, 'scoring': 'roc_auc'}
    fold_scores = cross_val_score(estimator, X_train, y_train, **cv_settings)
    return fold_scores.mean(), fold_scores.std()


def plot_cross_validation(title, X, Y, error, xlabel, ylabel='ROC AUC'):
    """Draw cross-validation results as error-bar markers via pyplot.

    Parameters
    ----------
    title : text passed to `plt.title`.
    X : x positions, one per evaluated setting.
    Y : value plotted at each x position.
    error : third positional argument of `plt.errorbar` (the y error bars).
    xlabel : x-axis label.
    ylabel : y-axis label; defaults to 'ROC AUC'.

    Nothing is returned and the figure is neither saved nor shown here;
    callers do that themselves.
    """
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid()

    # Circle markers only, with no line joining the points.
    marker_style = {'linestyle': 'None', 'marker': 'o'}
    plt.errorbar(X, Y, error, **marker_style)

In [4]:
from sklearn.ensemble import AdaBoostClassifier
# Baseline model: AdaBoost with its default base learner and 100 boosting rounds.
adbc = AdaBoostClassifier(
    n_estimators=100,
    random_state=seed,
)
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
adbc.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=100, random_state=42)

In [5]:
# Mean accuracy of the baseline AdaBoost model (`adbc`) on the training split.
adbc.score(X_train, y_train)

0.690910904918687

In [6]:
# Mean accuracy of the baseline AdaBoost model (`adbc`) on the held-out test split.
adbc.score(X_test, y_test)

0.683559457302474

# Hyperparameter Tuning

In [7]:
from sklearn.ensemble import RandomForestClassifier
# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and 1.4 removed the old name, so a hard-coded `base_estimator=` raises a
# TypeError on current releases. Use whichever keyword this install accepts.
base_kwarg = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params(deep=False)
    else 'base_estimator'
)

# AdaBoost that boosts whole random forests instead of the default base learner.
adbc1 = AdaBoostClassifier(
    random_state=seed,
    n_estimators=100,
    learning_rate=0.1,
    **{base_kwarg: RandomForestClassifier(random_state=0)},
)

In [8]:
# Fit the random-forest-based AdaBoost model (`adbc1`) on the training split.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# the fit is evidently slow; every boosting round trains a whole random forest.
adbc1.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Mean accuracy of the random-forest-based AdaBoost model (`adbc1`) on the training split.
adbc1.score(X_train, y_train)

# Why is the training accuracy this high?

In [None]:
# Mean accuracy of the random-forest-based AdaBoost model (`adbc1`) on the held-out test split.
adbc1.score(X_test, y_test)

In [None]:
# Sweep the number of boosting rounds and record cross-validated ROC AUC.
# NOTE(review): this is expensive; every boosting round fits a whole random
# forest, and cross_validation_score repeats each setting over 10 folds.
title = 'Change no_of_estimators'
xlabel = 'No of Estimators'
n_estimators = [100, 300, 500, 700, 1000]
means = []
stddevs = []

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and 1.4 removed the old name, so a hard-coded `base_estimator=` raises a
# TypeError on current releases. Use whichever keyword this install accepts.
base_kwarg = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params(deep=False)
    else 'base_estimator'
)

for n in tqdm(n_estimators):
    clf = AdaBoostClassifier(
        random_state=seed,
        n_estimators=n,
        learning_rate=0.1,
        **{base_kwarg: RandomForestClassifier(random_state=0)},
    )
    mean, std = cross_validation_score(clf)
    means.append(mean)
    stddevs.append(std)

# Save before show() so the file is written from the still-open figure.
plot_cross_validation(title, n_estimators, means, stddevs, xlabel)
plt.savefig('AdaBoost_no_of_estimators.png', bbox_inches='tight')
plt.show()

In [None]:
# Sweep the AdaBoost learning rate (100 boosting rounds each) and record
# cross-validated ROC AUC.
title = 'Change learning rate'
xlabel = 'learning rate'
learning_rate = [0.1, 0.05, 1.0]
means = []
stddevs = []

# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and 1.4 removed the old name, so a hard-coded `base_estimator=` raises a
# TypeError on current releases. Use whichever keyword this install accepts.
base_kwarg = (
    'estimator'
    if 'estimator' in AdaBoostClassifier().get_params(deep=False)
    else 'base_estimator'
)

# The loop variable is named for what it holds (a learning rate, not a count).
for rate in tqdm(learning_rate):
    clf = AdaBoostClassifier(
        random_state=seed,
        n_estimators=100,
        learning_rate=rate,
        **{base_kwarg: RandomForestClassifier(random_state=0)},
    )
    mean, std = cross_validation_score(clf)
    means.append(mean)
    stddevs.append(std)

# Save before show() so the file is written from the still-open figure.
plot_cross_validation(title, learning_rate, means, stddevs, xlabel)
plt.savefig('AdaBoost_learning_rate.png', bbox_inches='tight')
plt.show()