# Notebook for optimizing different classifiers and ensembling them

Here is how the algorithm works for training:
- Transform the dataset by deleting and adding features
- We create 10,000 (adjustable) sets of undersampled data
- For each undersampled set, there will be a blend of classifiers:
    - TensorFlow
    - XGBoost
    - RandomForest
    - SVM
    - ...
- For each classifier for each undersampled set, we optimize the hyperparameters
- After optimization, we train a small ensemble of the tuned classifiers for each undersampled set
- Every model is saved

For testing:
- Transform the dataset
- Ensemble classification on the whole dataset (no undersampling)

Generating and ensembling predictions:
- Each model generates the likelihood that the user will convert
- Ensembling works by taking a weighted mean of all the votes, where each model's vote is weighted by that model's accuracy.

In [1]:
# Imports => add any which are necessary

# Standard imports
import numpy as np
import pandas as pd
import scipy as sc
import sklearn
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Preprocessing modules (Shouldn't be needed by now)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split

# Gridsearching and Parameter Optimization
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline



In [None]:
# Locations of the saved NumPy arrays (relative paths, resolved against the
# current working directory). Only the train and validation splits have labels.
X_train_file, X_valid_file = "X_train.npy", "X_valid.npy"
y_train_file, y_valid_file = "y_train.npy", "y_valid.npy"
X_test_file = "X_test.npy"

# Read every split into memory: features first, then labels, then the test features.
X_train, X_valid = np.load(X_train_file), np.load(X_valid_file)
y_train, y_valid = np.load(y_train_file), np.load(y_valid_file)
X_test = np.load(X_test_file)

In [2]:
# Candidate hyperparameter values, keyed as "<pipeline step>__<parameter>".
# Every key uses the "clf" step name, so the pipeline's estimator step must be named "clf".
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = {'clf__penalty': ['l1', 'l2'],
              'clf__C': param_range,
              "clf__fit_intercept": [True, False],
              "clf__kernel": ["rbf", "sigmoid", "poly", "linear"],
              "clf__gamma": param_range}

def get_params(*args):
    """
    Returns a list of a dictionary of parameter options

    Each name in ``args`` is prefixed with "clf__" and looked up in the
    module-level ``param_grid``. The candidate values are copied into the
    result, so editing the returned grid never alters ``param_grid`` or
    ``param_range`` (which is shared by the 'C' and 'gamma' entries).

    Usage:
        get_params('penalty', 'C', 'kernel')

    Returns:
        [{
            'clf__penalty': ...,
            'clf__C': ...,
            'clf__kernel': ...,
        }]

        Calling it with no arguments returns ``[{}]``.

    Raises:
        KeyError: if a requested name has no "clf__<name>" entry in ``param_grid``.
    """
    selected = {}
    for arg in args:
        key = "clf__" + arg
        if key not in param_grid:
            # Same exception type as a plain dict lookup, but say what is available.
            raise KeyError("unknown parameter %r; available keys: %s"
                           % (arg, sorted(param_grid)))
        # Copy so the caller cannot mutate the shared module-level lists.
        selected[key] = list(param_grid[key])

    return [selected]

In [None]:
# Parameters for the gridsearch
cv = 5 # Number of cross-validation folds
n_jobs = -1 # -1 lets the search use all available processors
# NOTE(review): newer scikit-learn releases name this scorer "neg_log_loss" —
# confirm against the installed version before changing it.
scoring = "log_loss"

# Set up the pipeline of your classifier.
# The step must be named "clf" to match the "clf__" prefix produced by get_params,
# and the estimator must actually expose every parameter being searched.
# 'penalty' and 'C' belong to LogisticRegression; RandomForestClassifier has
# neither, so searching them on a random forest fails with an invalid-parameter error.
pipe = Pipeline([("clf", LogisticRegression())])

# Get the params (must be valid for the estimator chosen above)
params = get_params("penalty", "C")

# The GridSearch
gs_pipe = GridSearchCV(pipe, params, scoring=scoring, cv=cv, verbose=1, n_jobs=n_jobs)