In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the old
# names were later removed, so use whichever spelling this matplotlib install provides.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# None means "never truncate cell contents"; the legacy -1 sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

import warnings
warnings.simplefilter('ignore')



In [2]:
# Labelled training set of the bank-marketing data, read directly from the DPhi datasets
# repository on GitHub (requires network access).
train  = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/bank_marketing_data/training_set_label.csv" )

In [3]:
# Test set of the same bank-marketing data, read from the same GitHub repository
# (requires network access).
test = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/bank_marketing_data/testing_set_label.csv')

In [4]:
# Name of the label column; every other column of `train` is treated as a model input.
TARGET_COL = 'subscribe'
features = [col for col in train.columns if col != TARGET_COL]

# Columns holding string categories.
cat_cols = [
    'job', 'marital', 'education',
    'default', 'housing', 'loan',
    'contact', 'month', 'poutcome',
]

# Whatever is left once the categorical columns are removed is treated as numeric.
num_cols = [col for col in features if col not in cat_cols]

In [5]:
# Helper for exporting predictions as a submission file

def download_preds(preds_test, file_name = 'hacklive_sub.csv'):
    """Write predictions to a single-column CSV file.

    Args:
        preds_test: Predicted labels; stored under the ``prediction`` column and
            cast to ``int`` before writing.
        file_name: Destination path of the CSV, written without the index.
    """
    submission = pd.DataFrame()
    submission['prediction'] = preds_test

    # Cast to int, then save without the row index.
    submission.astype(int).to_csv(file_name, index = False)

In [6]:
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribe
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [7]:
train.shape,test.shape

((45211, 17), (4509, 16))

In [8]:
# Stack train on top of test under a fresh 0..n-1 index so both go through the same preprocessing.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

(49720, 17)

In [9]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribe
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0.0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0.0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0.0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0.0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49715,46,blue-collar,married,secondary,no,668,yes,no,unknown,15,may,1263,2,-1,0,unknown,
49716,40,blue-collar,married,secondary,no,1100,yes,no,unknown,29,may,660,2,-1,0,unknown,
49717,49,blue-collar,married,secondary,no,322,no,no,cellular,14,aug,356,2,-1,0,unknown,
49718,38,blue-collar,married,secondary,no,1205,yes,no,cellular,20,apr,45,4,153,1,failure,


## 2. Encoding categorical columns.

In [10]:
# One-hot encode every column listed in cat_cols; the remaining columns pass through unchanged.
# NOTE: this overwrites df, so re-running the cell on the already-encoded frame will fail
# because the original categorical columns no longer exist.
df = pd.get_dummies(df, columns = cat_cols)

In [11]:
df

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,subscribe,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0.0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49715,46,668,15,1263,2,-1,0,,0,1,...,0,0,1,0,0,0,0,0,0,1
49716,40,1100,29,660,2,-1,0,,0,1,...,0,0,1,0,0,0,0,0,0,1
49717,49,322,14,356,2,-1,0,,0,1,...,0,0,0,0,0,0,0,0,0,1
49718,38,1205,20,45,4,153,1,,0,1,...,0,0,0,0,0,0,1,0,0,0


In [12]:
df.shape

(49720, 52)

## 3. Filling Null Values

In [13]:
# Replace every remaining NaN with the sentinel -999. In the previews above the only NaNs are
# in the 'subscribe' column of the appended test rows — presumably nothing else is missing; TODO confirm.
df = df.fillna(-999)
# Sanity check: total number of nulls left in the frame.
df.isnull().sum().sum()

0

## 4. Split the processed dataset back into train and test datasets.

In [14]:
# The first len(train) rows of the combined frame are the training rows; the rest are the
# test rows, which get a fresh 0-based index.
train_proc = df.iloc[:len(train)]
test_proc = df.iloc[len(train):].reset_index(drop=True)

# Every processed column except the target is a model input.
features = [col for col in train_proc.columns if col != TARGET_COL]

In [15]:
len(features)

51

## 5. Split the train set into train and validation sets.

In [16]:
# Hold out 20% of the labelled rows for validation, stratified on the target column.
trn, val = train_test_split(
    train_proc,
    test_size=0.2,
    random_state=1,
    stratify=train_proc[TARGET_COL],
)

# Model inputs (feature columns) and outputs (TARGET_COL) for each split.
X_trn, y_trn = trn[features], trn[TARGET_COL]
X_val, y_val = val[features], val[TARGET_COL]

# Feature matrix of the test rows that will be predicted.
X_test = test_proc[features]

## 6. Fit a classification Model on train, check its results on validation set.

In [18]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted transformation
# to the training, validation and test features.
scaler = StandardScaler().fit(X_trn)
X_trn, X_val, X_test = (scaler.transform(split) for split in (X_trn, X_val, X_test))

### 6A. Logistic Regression

In [19]:
# Baseline model: logistic regression fitted on the scaled training features.
clf = LogisticRegression(random_state=1).fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)
f1_score(y_val, preds_val)

0.43310516490354695

In [20]:
# Predict on the scaled test features and export the labels as a submission CSV.
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='logistic2.csv')

### 6B. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# oob_score must be the boolean True: the string 'TRUE' only worked by being truthy and is
# rejected by scikit-learn releases that validate parameter types.
# random_state is pinned so the validation score is reproducible between runs.
clf = RandomForestClassifier(n_estimators = 2200, oob_score = True, n_jobs = -1, random_state = 1)

_ = clf.fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)

f1_score(y_val, preds_val)

In [None]:
# Export the random-forest predictions for the test set.
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='random20.csv')

### Hyperparameter Tuning: Random Forest

In [None]:
# Candidate values sampled by the random-forest randomized search.
params = {
    "n_estimators": list(range(100, 501, 100)),
    "max_depth": [5, 10, 15, 30, 50, 100],
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since an earlier start.

    Args:
        start_time: ``None`` (or any falsy value) to start timing; otherwise the
            ``datetime`` returned by an earlier ``timer()`` call.

    Returns:
        The current ``datetime`` when starting; ``None`` when reporting (the
        elapsed time is printed instead).
    """
    # Imported locally so the helper does not depend on which cell happened to
    # import datetime before it is called.
    from datetime import datetime

    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [None]:
classifier=RandomForestClassifier() # untuned random forest; used as the base estimator of the randomized search

In [None]:
# Randomized search over the random-forest grid: 5 sampled settings, cv=5, ranked by ROC AUC.
# random_state fixes which settings get sampled so the search is repeatable.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=1,
)

In [None]:
from datetime import datetime
# Run the randomized search on the training split and report how long it took.
start_time = timer(None) # called without a start time, timer() returns the current datetime
random_search.fit(X_trn, y_trn)
timer(start_time) 

In [None]:
random_search.best_estimator_
random_search.best_params_
random_search.best_score_

In [None]:
# Hand-recorded hyperparameters for the tuned random forest.
optimal_params = {'n_estimators': 200, 'max_depth': 15}

# random_state is pinned so the validation score below is reproducible between runs.
clf = RandomForestClassifier(random_state=1, **optimal_params)
_ = clf.fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)

f1_score(y_val, preds_val)

In [None]:
# Export the tuned random-forest predictions for the test set.
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='random.csv')

### 6C. Decision Tree

In [None]:
# Single decision tree with hand-picked depth, feature-fraction and split-size settings.
clf = DecisionTreeClassifier(
    max_depth=500,
    max_features=0.5,
    min_samples_split=60,
    random_state=1,
).fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)
f1_score(y_val, preds_val)

In [None]:
# Export the decision-tree predictions for the test set.
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='hacklive_decision_tree.csv')

### Hyperparameter Tuning: Decision Tree

In [21]:
# Search space for the decision-tree randomized search.
hyperparam_combs = {
    'max_depth': [30, 50, 100],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [10, 20, 30, 40],
    # Fractions of the feature set. The last entry must be the float 1.0 ("all features"):
    # scikit-learn reads the integer 1 as "exactly one feature per split".
    'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
    'max_leaf_nodes': [8, 16, 32, 64, 128]
}

In [None]:
# Randomized search over the decision-tree search space, scored by F1.
# The tree is seeded so that repeated searches evaluate identical candidates.
clf = RandomizedSearchCV(DecisionTreeClassifier(random_state=1),
                         hyperparam_combs,
                         scoring='f1',
                         random_state=1,
                         n_iter=20)

# The search object was previously built but never fitted, so no tuning actually ran.
_ = clf.fit(X_trn, y_trn)

# Best setting found by the search (shown as the cell output).
clf.best_params_

In [None]:
# Hand-recorded hyperparameters for the tuned decision tree.
optimal_params = dict(
    min_samples_split=40,
    max_leaf_nodes=8,
    max_features=0.4,
    max_depth=30,
    criterion='gini',
)
clf = DecisionTreeClassifier(random_state=1, **optimal_params).fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)
f1_score(y_val, preds_val)

In [None]:
# Export the tuned decision-tree predictions for the test set.
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name = 'hacklive_dt_tuned_random4.csv')

### 6D. LGBM

In [None]:
# LightGBM model: 1000 trees of depth 4 with a learning rate of 0.02.
clfl = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=4,
    reg_alpha=0,
    reg_lambda=1,
    random_state=100,
    n_jobs=-1,
).fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clfl.predict(X_val)
f1_score(y_val, preds_val)

In [None]:
# Export the LightGBM predictions for the test set.
preds_test = clfl.predict(X_test)
download_preds(preds_test, file_name='hacklive_lgbm_tree22.csv')

### 6E. Catboost

In [None]:
# CatBoost model: 3000 trees with a learning rate of 0.01.
clf = CatBoostClassifier(
    n_estimators=3000,
    learning_rate=0.01,
    rsm=0.4,  ## Analogous to colsample_bytree
    random_state=2054,
).fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)
f1_score(y_val, preds_val)

In [None]:
# Export the CatBoost predictions for the test set.
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='catboost1.csv')

### 6F. XGB Classifier

In [None]:
# XGBoost model: 1000 trees of depth 6 with a learning rate of 0.05 and colsample_bytree of 0.5.
clf = XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.05,
    colsample_bytree=0.5,
    random_state=1452,
).fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)
f1_score(y_val, preds_val)

In [None]:
# Export the XGBoost predictions for the test set.
preds_test = clf.predict(X_test)
download_preds(preds_test, file_name='xgboost.csv')

### Hyperparameter Tuning: XGBoost

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the XGBoost randomized search.
# The key must be spelled "learning_rate" (with an underscore): "learning rate" is not a
# valid XGBoost parameter name, so the learning rate was previously never tuned.
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost #Model Import

In [None]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since an earlier start.

    Args:
        start_time: ``None`` (or any falsy value) to start timing; otherwise the
            ``datetime`` returned by an earlier ``timer()`` call.

    Returns:
        The current ``datetime`` when starting; ``None`` when reporting (the
        elapsed time is printed instead).
    """
    # Imported locally so the helper does not depend on which cell happened to
    # import datetime before it is called.
    from datetime import datetime

    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [None]:
classifier=xgboost.XGBClassifier()  # XGBoost classifier built with no arguments; base estimator of the randomized search

In [None]:
# Randomized search over the XGBoost search space: 5 sampled settings, cv=5, ranked by ROC AUC.
# random_state fixes which settings get sampled so the search is repeatable.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=1,
)

In [None]:
from datetime import datetime
# Tune on the training split only. Fitting the search on all of train_proc would let the
# validation rows influence the chosen hyperparameters and then score the model on them.
start_time = timer(None) # timing starts from this point for "start_time"variable
random_search.fit(X_trn, y_trn)
timer(start_time)

In [None]:
random_search.best_estimator_
random_search.best_params_
random_search.best_score_

In [None]:
# Hand-recorded hyperparameters for the tuned XGBoost model.
# The key must be spelled 'learning_rate' (with an underscore): 'learning rate' is not a
# valid XGBoost parameter name, so the tuned value was previously never applied.
optimal_params = {
    'min_child_weight': 1,
    'max_depth': 4,
    'learning_rate': 0.2,
    'gamma': 0.4,
    'colsample_bytree': 0.5,
}

clf = XGBClassifier(**optimal_params)
_ = clf.fit(X_trn, y_trn)

# F1 on the held-out validation split (shown as the cell output).
preds_val = clf.predict(X_val)

f1_score(y_val, preds_val)

In [None]:
# Export the tuned XGBoost predictions for the test set.
preds_test = clf.predict(X_test)

download_preds(preds_test, file_name = 'xgb_parameter_tuning.csv')