# Environment Setup

In [None]:
# Installs xgboost; comment out the line below if it is already installed on your system
!pip install xgboost
# Installs scikit-optimize (imported below as `skopt`, used for Bayesian search);
# comment out the line below if it is already installed on your system
!pip install scikit-optimize

In [None]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from xgboost.sklearn import XGBClassifier

# Data Loading

In [None]:
# Read the training and test CSV files into pandas DataFrames
data, data_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
# Display the (rows, columns) shape of the training set
data.shape

In [None]:
# Count the missing values in each column, then plot how many columns share
# each missing-value count
data.isna().sum(axis=0).hist()

In [None]:
# Histogram of the values taken by the `match` column
data.loc[:, 'match'].hist()

In [None]:
# Labels: the `match` column of the training data
y = data['match']
# Features: everything except the target (`match`) and the row identifier (`id`)
x = data.drop(columns=['match', 'id'])
# Numeric features are the float64-typed columns
features_numeric = x.select_dtypes(include=['float64']).columns.tolist()
# Categorical features are the object-typed columns
features_categorical = x.select_dtypes(include=['object']).columns.tolist()

# Preprocessing Pipeline Setup

In [None]:
# Preprocessing building blocks used as the first step of every model pipeline

# Numeric columns:
#   1. fill missing values with the column median (the default strategy here)
#   2. standardize by removing the mean and scaling to unit variance
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns:
#   1. fill missing values with the constant string 'missing' (the default
#      strategy here)
#   2. one-hot encode, ignoring categories that were not seen during fit
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its matching transformer
preprocessor = ColumnTransformer([
    ('num', transformer_numeric, features_numeric),
    ('cat', transformer_categorical, features_categorical),
])

# Model Parameters and Pipelines

## XGBoost
### Parameters:
Didn't know defaults for this, so based choices off of Random Forest Classifier.


*    **N_estimators**: used default 100 from RFC, let the range cover lower than the default and greater to see if either direction was better
*    **max_depth**: RFC default 'None', wanted to see if limiting this impacted the model, maybe preventing overfitting
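*    **learning_rate**: Random Forest has no learning rate to borrow a default from (XGBoost's own default is 0.3), so the search was centered on 0.0001; the learning rate is always talked about as something to tune in papers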

In [None]:
# XGBoost pipeline: shared preprocessing followed by an XGBoost classifier
# trained with the binary logistic objective and a fixed seed
XGB_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb_classifier', XGBClassifier(
            objective='binary:logistic', seed=1))
    ]
)

# Parameter grid for grid and random search
# NOTE: max_depth uses the None object (not the string 'None'); None leaves
# the depth at XGBoost's own default.
# NOTE: the XGBoost parameter is called `learning_rate`; the previous key
# `learning` does not correspond to any XGBoost hyperparameter.
XGB_param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['constant', 'most_frequent'],
    'xgb_classifier__n_estimators': [50, 100, 500],
    'xgb_classifier__max_depth': [None, 50, 100],
    'xgb_classifier__learning_rate': [0.001, 0.0001, 0.00001]
}

# Parameter grid for Bayes search centered on defaults
XGB_bayes_grid1 = {
    'preprocessor__num__imputer__strategy': Categorical(['mean', 'median']),
    'preprocessor__cat__imputer__strategy': Categorical(['constant', 'most_frequent']),
    'xgb_classifier__n_estimators': Integer(50, 500),
    'xgb_classifier__max_depth': Integer(50, 100),
    'xgb_classifier__learning_rate': Real(1e-5, 1e-3, prior='log-uniform')
}

# Parameter grid that focuses on results from default grid
XGB_bayes_grid2 = {
    'preprocessor__num__imputer__strategy': Categorical(['mean']),
    'preprocessor__cat__imputer__strategy': Categorical(['constant']),
    'xgb_classifier__n_estimators': Integer(70, 100),
    'xgb_classifier__max_depth': Integer(65, 85),
    'xgb_classifier__learning_rate': Real(1e-6, 1e-5, prior='log-uniform')
}

# Set model as XGB
# These names are read by the search and submission cells; every model cell
# reassigns them, so the model cell run most recently is the one searched.
model_type = 'XGB'
pipeline = XGB_pipeline
param_grid = XGB_param_grid
# Use the first Bayes grid (a bare `XGB_bayes_grid` is never defined);
# switch to XGB_bayes_grid2 for the narrowed follow-up search.
bayes_grid = XGB_bayes_grid1

## Support Vector Machine
### Parameters:
*    **C**: default 1.0, regularization parameter
*    **kernel**: default rbf, wanted to try different kernels on the problem
*    **max_iter**: default -1, seeing if early stopping made a difference on result. Got convergence warnings so removed this argument. 
*    **degree**: default 3, added in second Bayes model since polynomial kernel was chosen as best option

In [None]:
# Support Vector Machine: pipeline and hyperparameter search spaces

# Shared preprocessing followed by an SVC with probability estimates enabled
# and balanced class weights
SVM_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svm_classifier', SVC(probability=True, class_weight='balanced')),
])

# Discrete candidates for grid and random search.
# max_iter is intentionally not searched (candidates were -1, 500, 1000):
# capping iterations produced convergence warnings.
SVM_param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    preprocessor__cat__imputer__strategy=['constant', 'most_frequent'],
    svm_classifier__C=[0.5, 1.0, 1.5],
    svm_classifier__kernel=['linear', 'poly', 'rbf'],
)

# Bayes search space centered on the defaults (max_iter omitted here too)
SVM_bayes_grid = dict(
    # Impute numeric columns with the mean or the median
    preprocessor__num__imputer__strategy=Categorical(['mean', 'median']),
    # Impute categorical columns with the constant "missing" value or the
    # most frequent value
    preprocessor__cat__imputer__strategy=Categorical(['constant', 'most_frequent']),
    svm_classifier__C=Real(0.5, 1.5, prior='log-uniform'),
    svm_classifier__kernel=Categorical(['linear', 'poly', 'rbf']),
)

# Narrowed Bayes search space based on the results of the default space;
# adds the polynomial degree since only the 'poly' kernel is kept
SVM_bayes_grid2 = dict(
    preprocessor__num__imputer__strategy=Categorical(['mean', 'median']),
    preprocessor__cat__imputer__strategy=Categorical(['most_frequent']),
    svm_classifier__C=Real(1.0, 3.0, prior='log-uniform'),
    svm_classifier__kernel=Categorical(['poly']),
    svm_classifier__degree=Integer(2, 4),
)

# Select SVM as the model used by the search and submission cells
model_type = 'SVM'
pipeline = SVM_pipeline
param_grid = SVM_param_grid
bayes_grid = SVM_bayes_grid

## Multi-Layer Perceptron
### Parameters:
*    **hidden_layer_sizes**: had this for grid search but couldn't figure out how to use it in Bayes so eliminated it
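*    **alpha**: default 0.0001, L2 regularization strength (not the learning rate, which `MLPClassifier` exposes as `learning_rate_init`); originally chosen because tuning the learning rate is always mentioned in papers and in class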
*    **max_iter**: default 200, seeing if stopping earlier or running longer is better

In [None]:
# Multi-Layer Perceptron: pipeline and hyperparameter search spaces

# Shared preprocessing followed by an MLP classifier with default settings
MLP_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('mlp_classifier', MLPClassifier()),
])

# Discrete candidates for grid and random search
MLP_param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    preprocessor__cat__imputer__strategy=['constant', 'most_frequent'],
    mlp_classifier__hidden_layer_sizes=[(100,)],
    mlp_classifier__alpha=[0.001, 0.0001, 0.00001],
    mlp_classifier__max_iter=[100, 200, 400],
)

# Bayes search space centered on the defaults
# (hidden_layer_sizes is not part of the Bayes spaces)
MLP_bayes_grid = dict(
    preprocessor__num__imputer__strategy=Categorical(['mean', 'median']),
    preprocessor__cat__imputer__strategy=Categorical(['constant', 'most_frequent']),
    mlp_classifier__alpha=Real(1e-5, 1e-3, prior='log-uniform'),
    mlp_classifier__max_iter=Integer(100, 400),
)

# Narrowed Bayes search space based on the results of the default space
MLP_bayes_grid2 = dict(
    preprocessor__num__imputer__strategy=Categorical(['mean']),
    preprocessor__cat__imputer__strategy=Categorical(['most_frequent']),
    mlp_classifier__alpha=Real(1e-5, 1e-4, prior='log-uniform'),
    mlp_classifier__max_iter=Integer(320, 380),
)

# Select MLP as the model used by the search and submission cells
model_type = 'MLP'
pipeline = MLP_pipeline
param_grid = MLP_param_grid
bayes_grid = MLP_bayes_grid

## Random Forest Classifier
### Parameters:
*     **n_estimators**: default 100, seeing if more or less trees in the forest makes a difference
*     **criterion**: default gini, only two options here so adding it wouldn't grow grid too much

In [None]:
# Random Forest Classifier model
# Pipeline: shared preprocessing followed by a default RandomForestClassifier
RFC_pipeline = Pipeline(
    steps=[
           ('preprocessor', preprocessor),
           ('rfc_classifier', RandomForestClassifier())
    ]
)

# Parameter grid for grid and random search
# NOTE: max_depth must be the None object (unlimited depth, the estimator's
# default) rather than the string 'None', which is not a valid max_depth value.
RFC_param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['constant', 'most_frequent'],
    'rfc_classifier__n_estimators': [50, 100, 500],
    'rfc_classifier__max_depth': [None, 50, 100],
    'rfc_classifier__criterion': ['gini', 'entropy']
}

# Parameter grid for Bayes search that centers on defaults
RFC_bayes_grid = {
    'preprocessor__num__imputer__strategy': Categorical(['mean', 'median']),
    'preprocessor__cat__imputer__strategy': Categorical(['constant', 'most_frequent']),
    'rfc_classifier__n_estimators': Integer(50, 500),
    'rfc_classifier__criterion': Categorical(['gini', 'entropy'])
}

# Parameter grid that focuses on results from default grid
RFC_bayes_grid2 = {
    'preprocessor__num__imputer__strategy': Categorical(['mean']),
    'preprocessor__cat__imputer__strategy': Categorical(['most_frequent']),
    'rfc_classifier__n_estimators': Integer(380, 440),
    'rfc_classifier__criterion': Categorical(['entropy'])
}

# Set model as RFC
# These names are read by the search and submission cells below.
model_type = 'RFC'
pipeline = RFC_pipeline
param_grid = RFC_param_grid
bayes_grid = RFC_bayes_grid

## Logistic Regression
### Parameters:
*     **C**: default 1.0, regularization term, included to compare to SVM
*     **max_iter**: default 100, as with SVM and MLP seeing if stopping earlier or later makes any difference

In [None]:
# Logistic Regression: pipeline and hyperparameter search spaces

# Shared preprocessing followed by a logistic regression with default settings
LG_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lg_classifier', LogisticRegression()),
])

# Discrete candidates for grid and random search
LG_param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    preprocessor__cat__imputer__strategy=['constant', 'most_frequent'],
    lg_classifier__C=[0.5, 1.0, 1.5],
    lg_classifier__max_iter=[50, 100, 200],
)

# Bayes search space centered on the defaults
LG_bayes_grid = dict(
    preprocessor__num__imputer__strategy=Categorical(['mean', 'median']),
    preprocessor__cat__imputer__strategy=Categorical(['constant', 'most_frequent']),
    lg_classifier__C=Real(0.5, 1.5, prior='log-uniform'),
    lg_classifier__max_iter=Integer(100, 500),
)

# Same space with a much larger iteration budget: the first space did not
# allow enough iterations (it produced convergence warnings)
LG_bayes_grid2 = dict(
    preprocessor__num__imputer__strategy=Categorical(['mean', 'median']),
    preprocessor__cat__imputer__strategy=Categorical(['constant', 'most_frequent']),
    lg_classifier__C=Real(0.5, 1.5, prior='log-uniform'),
    lg_classifier__max_iter=Integer(1000, 5000),
)

# Narrowed Bayes search space based on the results of LG_bayes_grid2
LG_bayes_grid3 = dict(
    preprocessor__num__imputer__strategy=Categorical(['mean']),
    preprocessor__cat__imputer__strategy=Categorical(['constant']),
    lg_classifier__C=Real(0.4, 0.6, prior='log-uniform'),
    lg_classifier__max_iter=Integer(1160, 1220),
)

# Select LG as the model used by the search and submission cells
model_type = 'LG'
pipeline = LG_pipeline
param_grid = LG_param_grid
bayes_grid = LG_bayes_grid

# Parameter Search


In [None]:
# Grid Search
# Evaluates every hyperparameter combination in param_grid for the selected
# pipeline.
#   estimator -> the pipeline (preprocessing + classifier) chosen above
#   scoring   -> area under the receiver operating characteristic curve
#   cv        -> number of cross-validation folds
#   n_jobs    -> number of jobs to run in parallel
#   verbose   -> level of progress messages to print
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=2,
    verbose=3,
)

# Fit every hyperparameter combination on the training data
grid_search.fit(x, y)

print(f'best score {grid_search.best_score_}')
print(f'best parameters {grid_search.best_params_}')

In [None]:
# Random Search
# Evaluates a random sample of hyperparameter combinations drawn from
# param_grid for the selected pipeline.
#   estimator -> the pipeline (preprocessing + classifier) chosen above
#   n_iter    -> number of random parameter sets to try
#   scoring   -> area under the receiver operating characteristic curve
#   cv        -> number of cross-validation folds
#   n_jobs    -> number of jobs to run in parallel
#   verbose   -> level of progress messages to print
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    n_jobs=2,
    verbose=3,
)

# Fit the randomly chosen hyperparameter sets on the training data
random_search.fit(x, y)

print(f'best score {random_search.best_score_}')
print(f'best parameters {random_search.best_params_}')

In [None]:
# Bayes Search
# Estimator is classifier specified in pipeline
# bayes_grid is the skopt search space selected in the model cell
# n_iter is number of parameter settings to sample
# cv is number of folds to use in cross-validation
# verbose indicates level of messages to print
# scoring uses area under the receiver operating characteristic curve, the
# same metric as the grid and random searches, so the best scores of the
# three searches are directly comparable
bayes_search = BayesSearchCV(
    pipeline, bayes_grid, n_iter=10, cv=5, verbose=3,
    scoring='roc_auc')

# Fits n_iter samples from parameter settings to the training data
bayes_search.fit(x, y)

print('best score {}'.format(bayes_search.best_score_))
# Label fixed: this line reports the best parameters, not a score
print('best parameters {}'.format(bayes_search.best_params_))

# Output Setup

In [None]:
# NOTE: Run this cell only once. It removes `id` from data_test in place, so
# the data loading cell must be re-run before this cell can be run again.
# Start the submission frame with the test ids; pop() returns the `id` column
# and deletes it from data_test in the same step
submission = pd.DataFrame({'id': data_test.pop('id')})

In [None]:
# Write the grid-search submission CSV, named after the selected model
filename = f'{model_type}_grid_submission.csv'
# Column 1 of predict_proba is the predicted probability of the second class
submission['match'] = grid_search.predict_proba(data_test)[:, 1]
submission.to_csv(filename, index=False)

In [None]:
# Write the random-search submission CSV, named after the selected model
filename = f'{model_type}_random_submission.csv'
# Column 1 of predict_proba is the predicted probability of the second class
submission['match'] = random_search.predict_proba(data_test)[:, 1]
submission.to_csv(filename, index=False)

In [None]:
# Bayes search fit submission csv
# `submission_bayes` is not defined anywhere else, so build it here from the
# ids stored in `submission`; the copy keeps the grid/random `match` column
# in `submission` untouched
submission_bayes = submission[['id']].copy()
# Column 1 of predict_proba is the predicted probability of the second class
submission_bayes['match'] = bayes_search.predict_proba(data_test)[:, 1]
filename = model_type + '_bayes_submission2.csv'
submission_bayes.to_csv(filename, index=False)