Test several models using default parameters.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# read the pre-processed training and validation splits from disk
train = pd.read_csv(filepath_or_buffer='../data/train_data_processed.csv')
valid = pd.read_csv(filepath_or_buffer='../data/valid_data_processed.csv')

In [3]:
# separate the feature columns from the target column 'y' for each split
X_train, y_train = train.drop(labels=['y'], axis='columns'), train['y']
X_valid, y_valid = valid.drop(labels=['y'], axis='columns'), valid['y']

In [4]:
# sanity check: one shape per line, in the order X_train, y_train, X_valid, y_valid
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, sep="\n")

(28824, 62)
(28824,)
(6176, 62)
(6176,)


## feature scaling

In [5]:
# learn the scaling statistics from the training features only
scaler = StandardScaler().fit(X_train)

# apply that same fitted scaler to both splits (train first, then valid)
X_train_std, X_valid_std = map(scaler.transform, (X_train, X_valid))

## baseline model - Logistic Regression

In [6]:
# baseline: logistic regression with library defaults and a fixed seed
lr = LogisticRegression(random_state=2021)

# train on the standardized features; the fitted estimator is the cell's output
lr.fit(X=X_train_std, y=y_train)

LogisticRegression(random_state=2021)

In [7]:
# class-probability predictions for the training and validation splits
y_predict_train, y_predict_valid = map(lr.predict_proba, (X_train_std, X_valid_std))

In [8]:
# ROC-AUC computed from the second column of the predicted probabilities
train_score_lr = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_lr = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

# report both scores, training first
print("Training ROC-AUC score of baseline Logistic Regression model: ", train_score_lr)
print("Validation ROC-AUC score of baseline Logistic Regression model: ", valid_score_lr)

Training ROC-AUC score of baseline Logistic Regression model:  0.799661510473242
Validation ROC-AUC score of baseline Logistic Regression model:  0.7900684605814263


### observations:
Given that the [original paper](https://github.com/rachelkriggs/rocket/blob/main/docs/bank_telemarketing.pdf) on this problem reported a best AUC score of 0.8, this result seems quite good for a simple baseline model with no hyperparameter tuning.

## Random Forest

In [9]:
# random forest with library defaults and a fixed seed
rf = RandomForestClassifier(random_state=2021)

# train on the standardized features; the fitted estimator is the cell's output
rf.fit(X=X_train_std, y=y_train)

RandomForestClassifier(random_state=2021)

In [10]:
# class-probability predictions for the training and validation splits
y_predict_train, y_predict_valid = map(rf.predict_proba, (X_train_std, X_valid_std))

In [11]:
# ROC-AUC computed from the second column of the predicted probabilities
train_score_rf = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_rf = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

# report both scores, training first
print("Training ROC-AUC score of Random Forest model: ", train_score_rf)
print("Validation ROC-AUC score of Random Forest model: ", valid_score_rf)

Training ROC-AUC score of Random Forest model:  0.9998503672391786
Validation ROC-AUC score of Random Forest model:  0.7764130824197656


### observations:
This model is clearly overfitting.

## LightGBM

In [12]:
# LightGBM classifier: binary objective, fixed seed, everything else default
lgbm = lgb.LGBMClassifier(objective='binary', random_state=2021)

# train on the standardized features; the fitted estimator is the cell's output
lgbm.fit(X=X_train_std, y=y_train)

LGBMClassifier(objective='binary', random_state=2021)

In [13]:
# class-probability predictions for the training and validation splits
y_predict_train, y_predict_valid = map(lgbm.predict_proba, (X_train_std, X_valid_std))

In [14]:
# ROC-AUC computed from the second column of the predicted probabilities
train_score_lgb = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_lgb = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

# report both scores, training first
print("Training ROC-AUC score of LightGBM model: ", train_score_lgb)
print("Validation ROC-AUC score of LightGBM model: ", valid_score_lgb)

Training ROC-AUC score of LightGBM model:  0.8926429127437405
Validation ROC-AUC score of LightGBM model:  0.8038074765989878


## XGBoost

In [15]:
# XGBoost classifier: fixed seed, verbosity set to 0, everything else default
xgbc = xgb.XGBClassifier(verbosity=0, random_state=2021)

# train on the standardized features; the fitted estimator is the cell's output
xgbc.fit(X=X_train_std, y=y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              random_state=2021, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=0)

In [16]:
# class-probability predictions for the training and validation splits
y_predict_train, y_predict_valid = map(xgbc.predict_proba, (X_train_std, X_valid_std))

In [17]:
# ROC-AUC computed from the second column of the predicted probabilities
train_score_xgb = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_xgb = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

# report both scores, training first
print("Training ROC-AUC score of XGBoost model: ", train_score_xgb)
print("Validation ROC-AUC score of XGBoost model: ", valid_score_xgb)

Training ROC-AUC score of XGBoost model:  0.914232576542531
Validation ROC-AUC score of XGBoost model:  0.784485438284512


## Support Vector Machine

### RBF Kernel SVM (see the [scikit-learn documentation](https://scikit-learn.org/stable/modules/svm.html#classification))

In [18]:
# SVC with default settings; probability=True so predict_proba is available
svcm = SVC(probability=True, random_state=2021)

# train on the standardized features; the fitted estimator is the cell's output
svcm.fit(X=X_train_std, y=y_train)

SVC(probability=True, random_state=2021)

In [19]:
# class-probability predictions for the training and validation splits
y_predict_train, y_predict_valid = map(svcm.predict_proba, (X_train_std, X_valid_std))

In [20]:
# ROC-AUC computed from the second column of the predicted probabilities
train_score_svc = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_svc = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

# report both scores, training first
print("Training ROC-AUC score of Support Vector Machine model: ", train_score_svc)
print("Validation ROC-AUC score of Support Vector Machine model: ", valid_score_svc)

Training ROC-AUC score of Support Vector Machine model:  0.9070801942564497
Validation ROC-AUC score of Support Vector Machine model:  0.7153376455031626


### observations:
- The above model was quite slow to run, has a poor score compared to some of the others tested thus far, and is overfitting a fair bit.

### Linear SVM (see the [scikit-learn documentation](https://scikit-learn.org/stable/modules/svm.html#classification))

In [21]:
# SGDClassifier with default settings and a fixed seed, used here as the linear SVM
lsvc = SGDClassifier(random_state=2021)

# wrap it in a calibrator so that probability estimates can be produced
clsvc = CalibratedClassifierCV(lsvc)

# train the calibrated model on the standardized features; the fitted wrapper is the cell's output
clsvc.fit(X=X_train_std, y=y_train)

CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=2021))

In [22]:
# class-probability predictions for the training and validation splits
y_predict_train, y_predict_valid = map(clsvc.predict_proba, (X_train_std, X_valid_std))

In [23]:
# ROC-AUC computed from the second column of the predicted probabilities
train_score_lsvc = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_lsvc = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

# report both scores, training first
print("Training ROC-AUC score of Linear SVM model: ", train_score_lsvc)
print("Validation ROC-AUC score of Linear SVM model: ", valid_score_lsvc)

Training ROC-AUC score of Linear SVM model:  0.7759044034878305
Validation ROC-AUC score of Linear SVM model:  0.7603673812361988


## k-Nearest Neighbor

In [24]:
# k-nearest-neighbour classifier with all defaults
knn = KNeighborsClassifier()

# train on the standardized features; the fitted estimator is the cell's output
knn.fit(X=X_train_std, y=y_train)

KNeighborsClassifier()

In [25]:
# class-probability predictions for the training and validation splits
y_predict_train, y_predict_valid = map(knn.predict_proba, (X_train_std, X_valid_std))

In [26]:
# ROC-AUC computed from the second column of the predicted probabilities
train_score_knn = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_knn = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

# report both scores, training first
print("Training ROC-AUC score of k-Nearest Neighbor model: ", train_score_knn)
print("Validation ROC-AUC score of k-Nearest Neighbor model: ", valid_score_knn)

Training ROC-AUC score of k-Nearest Neighbor model:  0.9257271384997878
Validation ROC-AUC score of k-Nearest Neighbor model:  0.7269578358379093


## summary
of initial models tested with default parameters

In [27]:
# collect every model's train/validation ROC-AUC into one column-oriented mapping;
# the three lists are aligned by position (same model order in each)
summary = {
    'model': [
        'Logistic Regression',
        'Random Forest',
        'LightGBM',
        'XGBoost',
        'RBF Kernel SVM',
        'Linear SVM',
        'k-Nearest Neighbor',
    ],
    'Train ROC-AUC score': [
        train_score_lr,
        train_score_rf,
        train_score_lgb,
        train_score_xgb,
        train_score_svc,
        train_score_lsvc,
        train_score_knn,
    ],
    'Validation ROC-AUC score': [
        valid_score_lr,
        valid_score_rf,
        valid_score_lgb,
        valid_score_xgb,
        valid_score_svc,
        valid_score_lsvc,
        valid_score_knn,
    ],
}

# render the comparison table as the cell's output
pd.DataFrame(data=summary)

Unnamed: 0,model,Train ROC-AUC score,Validation ROC-AUC score
0,Logistic Regression,0.799662,0.790068
1,Random Forest,0.99985,0.776413
2,LightGBM,0.892643,0.803807
3,XGBoost,0.914233,0.784485
4,RBF Kernel SVM,0.90708,0.715338
5,Linear SVM,0.775904,0.760367
6,k-Nearest Neighbor,0.925727,0.726958


### observations:
From this initial run of models, the candidates for further tuning are: 

- Logistic Regression
- LightGBM
- XGBoost
- Random Forest