<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Get-Data" data-toc-modified-id="Get-Data-1">Get Data</a></span></li><li><span><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-2">Data Preprocessing</a></span></li><li><span><a href="#Model-Performance" data-toc-modified-id="Model-Performance-3">Model Performance</a></span></li><li><span><a href="#Grid-Search" data-toc-modified-id="Grid-Search-4">Grid Search</a></span><ul class="toc-item"><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-4.1">Logistic Regression</a></span></li><li><span><a href="#Support-Vector-Classifier" data-toc-modified-id="Support-Vector-Classifier-4.2">Support Vector Classifier</a></span></li></ul></li><li><span><a href="#Validation-Curve" data-toc-modified-id="Validation-Curve-5">Validation Curve</a></span><ul class="toc-item"><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-5.1">Logistic Regression</a></span></li><li><span><a href="#Support-Vector-Classifier" data-toc-modified-id="Support-Vector-Classifier-5.2">Support Vector Classifier</a></span></li></ul></li></ul></div>

**Note**: This notebook is **not a report**. Its sole purpose is to build the pickle files, so explanatory text is deliberately omitted to keep it lightweight and quick to run.

In [1]:
import time

import numpy as np
import pandas as pd
import pickle
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedShuffleSplit,
    cross_validate,
    validation_curve,
)
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    MinMaxScaler,
    RobustScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Get Data

In [2]:
# Relative path to the local speed-dating CSV; this is the file the notebook loads.
# NOTE(review): the BANK_ prefix does not match the speed-dating data — looks like a leftover name.
BANK_DATA_FILE_PATH = '../../datasets/speed_dating.csv'
# Remote copy of the dataset on GitHub; no cell in this notebook references it.
BANK_DATA_URL = 'https://raw.githubusercontent.com/polarBearYap/AI_Assignment/master/Datasets/speeddating.csv'


def fetch_data_from_website(path):
    """Read the CSV located at ``path`` and return it as a DataFrame.

    Despite the function name, ``path`` is passed straight to
    ``pandas.read_csv``, so a local file path works as well as a URL.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file. ``low_memory=False`` is used so column dtypes are
        inferred from the whole file rather than chunk by chunk.
    """
    frame = pd.read_csv(path, low_memory=False)
    return frame

In [3]:
# Load the raw dataset from the local CSV path defined above.
dating = fetch_data_from_website(BANK_DATA_FILE_PATH)

# Data Preprocessing

In [4]:
# Columns whose lower-cased name starts with 'd_'.
# NOTE(review): presumably these are pre-binned copies of other columns — confirm.
preproccessed_features = [
    column for column in dating.columns
    if column.lower().startswith('d_')
]

# Columns excluded from modelling as irrelevant.
irrelevant_features = [
    'has_null', 'wave',
    'expected_happy_with_sd_people', 'expected_num_interested_in_me',
    'expected_num_matches',
    'field', 'decision',
]

# Self-reported interest ratings.
self_interest_feature = [
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art',
    'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater',
    'movies', 'concerts', 'music', 'shopping', 'yoga',
]

# Columns describing the partner (suffix/prefix '_o'). The spellings
# 'sinsere_o' and 'ambitous_o' are the dataset's own column names.
partner_features = [
    'age_o', 'race_o',
    'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
    'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests',
    'attractive_o', 'sinsere_o', 'intelligence_o',
    'funny_o', 'ambitous_o', 'shared_interests_o',
]

In [5]:
# Remove all four column groups in one call. `columns=` already selects the
# column axis, so no separate `axis` argument is needed.
new_dating = dating.drop(
    columns=(
        preproccessed_features
        + irrelevant_features
        + self_interest_feature
        + partner_features
    )
)

In [6]:
class DataPreprocessTransformer(BaseEstimator, TransformerMixin):
    """Clean a raw frame and return a fully numeric DataFrame.

    ``fit`` sorts every column into "numerical" or "categorical" and records
    which columns hold values that do not look like numbers (for example
    ``'?'``). ``transform`` then replaces ``'?'`` with NaN, casts the repaired
    columns to float, strips stray surrounding quotes from categorical
    values, drops rows with a missing categorical value, imputes and scales
    the numerical columns and one-hot encodes the categorical ones.

    Parameters
    ----------
    y_feature : str
        Name of the target column. It is passed through unscaled and placed
        last in the transformed frame.

    Attributes
    ----------
    features_with_wrong_data_type : list
        Object-typed columns that contain number-like values.
    numerical_features : list
        Numerical columns; after ``transform`` the list excludes
        ``y_feature`` and includes the repaired columns.
    categorical_features : list
        Object-typed columns without number-like values.
    features_with_invalid_value : list
        Columns in which at least one non-numeric-looking value was found.
    one_hot_features : list
        Names of the one-hot columns produced by the last ``transform``.

    Notes
    -----
    NOTE(review): the imputer and both scalers are created and fitted inside
    ``transform`` on whatever frame is passed in, so no statistics are
    carried over between calls. Confirm this is acceptable before applying
    the transformer to held-out data.
    """

    # Raw strings: the previous plain-string literals relied on invalid
    # escape sequences, which newer Python versions warn about.
    _INT_PATTERN = r'^\d+$'
    _FLOAT_PATTERN = r'^-?\d+\.\d+$|^\d+$'
    _MISSING_PATTERN = r'^\?$'
    _QUOTED_PATTERN = r"^'.+'$"

    def __init__(self, y_feature):
        self.y_feature = y_feature
        self._reset_state()

    def _reset_state(self):
        """Clear everything learned by a previous ``fit``/``transform``."""
        self.features_with_wrong_data_type = []
        self.numerical_features = []
        self.categorical_features = []
        self.features_with_invalid_value = []
        self.one_hot_features = []

    def getNumericalFeatures(self):
        """Return the list of numerical feature names."""
        return self.numerical_features

    def getCategoricalFeatures(self):
        """Return the list of categorical feature names."""
        return self.categorical_features

    def detect_int_value(self, data):
        """Return True if any value of ``data`` is a string of digits only."""
        return np.any(data.astype(str).str.contains(self._INT_PATTERN, regex=True))

    def detect_float_value(self, data):
        """Return True if any value of ``data`` looks like a float or unsigned int."""
        return np.any(data.astype(str).str.contains(self._FLOAT_PATTERN, regex=True))

    def _join_non_matching(self, data, pattern):
        """Return the distinct values of ``data`` not matching ``pattern``.

        Values are joined with ', ' in ``value_counts`` order. They are
        converted with ``str`` first, so numeric values (e.g. a negative int)
        no longer make ``join`` raise TypeError. NaN is skipped because
        ``value_counts`` drops it by default.
        """
        mismatched = data[~data.astype(str).str.contains(pattern, regex=True)]
        return ', '.join(str(value) for value in mismatched.value_counts().index.to_list())

    def get_invalid_int_value(self, data):
        """Return the values of ``data`` that are not digit-only, joined with ', '."""
        return self._join_non_matching(data, self._INT_PATTERN)

    def get_invalid_float_value(self, data):
        """Return the values of ``data`` that are not float/unsigned-int like, joined with ', '."""
        return self._join_non_matching(data, self._FLOAT_PATTERN)

    def drop_rows_with_unknow_values(self, data, feature):
        """Return ``data`` without the rows where ``feature`` is null."""
        return data[~data[feature].isna()]

    def find_invalid_values(self, data):
        """Classify the columns of ``data`` and collect non-numeric values.

        Populates ``categorical_features``, ``numerical_features``,
        ``features_with_wrong_data_type`` and ``features_with_invalid_value``.
        Columns whose dtype is none of object/int64/float64 are ignored.
        ``data`` itself is not modified.

        Returns
        -------
        list of str
            One entry per distinct ', '-joined group of invalid values.
        """
        invalid_values = set()
        for feature in data.columns.values:
            column = data[feature]

            if column.dtype == 'object':
                if not self.detect_float_value(column):
                    self.categorical_features.append(feature)
                    continue
                self.features_with_wrong_data_type.append(feature)
                try:
                    column = column.astype('float64')
                except (ValueError, TypeError):
                    # Deliberate: the column keeps its object dtype and its
                    # offending values are reported as invalid just below.
                    pass

            if column.dtype == 'int64':
                invalid_value = self.get_invalid_int_value(column)
                is_numeric = True
            elif column.dtype == 'float64':
                invalid_value = self.get_invalid_float_value(column)
                is_numeric = True
            elif column.dtype == 'object':
                # Number-like object column that could not be cast yet.
                invalid_value = self.get_invalid_float_value(column)
                is_numeric = False
            else:
                continue

            if invalid_value != '':
                invalid_values.add(invalid_value)
                if feature not in self.features_with_invalid_value:
                    self.features_with_invalid_value.append(feature)
            if is_numeric:
                self.numerical_features.append(feature)
        return list(invalid_values)

    def fit(self, data, y=None):
        """Learn the column classification from ``data`` and return ``self``.

        State from an earlier fit is discarded first, so refitting does not
        accumulate duplicate feature names. Prints the invalid values found.
        """
        self._reset_state()

        # Detect any numerical features with 'object' data type and with invalid values
        print(f'Invalid values found: {self.find_invalid_values(data)}')

        return self

    def transform(self, data, y=None):
        """Return the cleaned, scaled and one-hot encoded version of ``data``.

        The input frame is left untouched. The result has the numerical
        columns first, then the one-hot columns, then ``y_feature``; its index
        is a fresh RangeIndex because rows may have been dropped. Calling
        ``transform`` repeatedly is safe.

        Raises
        ------
        ValueError
            If ``y_feature`` was classified as categorical during ``fit``.
        """
        if self.y_feature in self.categorical_features:
            raise ValueError(
                f'Target feature {self.y_feature!r} was detected as categorical; '
                'it must be numerical.')

        # Replace '?' value with NaN (returns a new frame)
        data = data.replace(self._MISSING_PATTERN, np.nan, regex=True)

        # Cast every number-like object column to 'float64'. Using the full
        # "wrong data type" list means transform no longer depends on fit
        # having converted the very same frame in place.
        repaired = dict.fromkeys(
            self.features_with_wrong_data_type + self.features_with_invalid_value)
        for feature in repaired:
            data[feature] = data[feature].astype('float64', errors='raise')

        # Numerical inputs = detected numerical columns + repaired columns,
        # without the target. Rebuilt (not appended to) so repeated calls
        # give the same list.
        self.numerical_features = [
            feature
            for feature in dict.fromkeys(
                self.numerical_features + self.features_with_invalid_value)
            if feature != self.y_feature
        ]

        # Remove unwanted quotes: change values like ''Example'' to 'Example'.
        # The column is reassigned instead of using a chained inplace replace,
        # which does not update the frame under pandas copy-on-write.
        for feature in self.categorical_features:
            for value in data[feature].value_counts().index:
                if re.search(self._QUOTED_PATTERN, value.replace(' ', '')):
                    data[feature] = data[feature].replace(value, value[1:-1])

        # Drop any rows with null values for categorical attribute
        for feature in self.categorical_features:
            data = self.drop_rows_with_unknow_values(data, feature)

        categorical = list(self.categorical_features)

        def one_hot_encode(frame):
            return pd.get_dummies(frame, columns=categorical, prefix=categorical)

        # Take the one-hot names from get_dummies itself so every label lines
        # up with its column. Names built from value_counts() follow frequency
        # order, which need not match the order get_dummies emits.
        if categorical:
            self.one_hot_features = list(one_hot_encode(data[categorical]).columns)
        else:
            self.one_hot_features = []

        # Fill missing numerical values with the median, then scale
        numerical_pipeline = make_pipeline(
            SimpleImputer(strategy='median'),
            RobustScaler(),
            MinMaxScaler()
        )

        categorical_pipeline = make_pipeline(FunctionTransformer(one_hot_encode))

        # remainder='passthrough' is expected to carry only the target column
        column_transformer = make_column_transformer(
            (numerical_pipeline, self.numerical_features),
            (categorical_pipeline, categorical), remainder='passthrough')

        return pd.DataFrame(
            column_transformer.fit_transform(data),
            columns=self.numerical_features + self.one_hot_features + [self.y_feature])

In [7]:
def split_data(X, y, n_splits=1, test_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into train and test parts with stratified sampling.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : pandas.Series or array-like
        Target values used for stratification.
    n_splits : int, default 1
        Forwarded to ``StratifiedShuffleSplit``; only the first generated
        split is used.
    test_size : float, default 0.2
        Fraction of samples placed in the test part.
    random_state : int, default 42
        Seed for the shuffling.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=n_splits, test_size=test_size, random_state=random_state)
    train_index, test_index = next(splitter.split(X, y))

    def take(values, positions):
        # The splitter yields positions, not labels. Plain `series[positions]`
        # is label-based and picks wrong rows (or raises KeyError) when the
        # index is not 0..n-1, so pandas objects are indexed with .iloc.
        if hasattr(values, 'iloc'):
            return values.iloc[positions]
        return values[positions]

    return (take(X, train_index), take(X, test_index),
            take(y, train_index), take(y, test_index))

In [8]:
# Name of the target column.
Y_FEATURE = 'match'

# Fit and apply the custom preprocessing on a copy of the reduced frame.
data_preprocess_pipeline = DataPreprocessTransformer(Y_FEATURE)
dating_prepared = data_preprocess_pipeline.fit_transform(new_dating.copy())

# Keep 10% of the rows as a stratified test set.
X_train, X_test, y_train, y_test = split_data(
    dating_prepared.drop(columns=Y_FEATURE),
    dating_prepared[Y_FEATURE],
    test_size=0.1,
)

Invalid values found: ['?']


In [9]:
# Seed passed as random_state to the estimators created in the cells below.
RANDOM_SEED = 42

# Model Performance

In [10]:
RANDOM_SEED = 42

# Short identifiers, used as keys of `classifiers`.
short_names = [
    'log_reg',
    'decision_tree',
    'rand_forest',
    'extra_tree',
    'ada_boost_cf',
    'k_neighbors',
    'c_naive_bayes',
    'quadratic_dis_analysis',
    'support_vector',
]

# Display names, in the same order as `short_names`.
names = [
    'Logistic Regression',
    'Decision tree Classifier',
    'Random forest Classifier',
    'Extra Tree Classifier',
    'AdaBoost Classifier',
    'K Neighbors Classifier',
    'Complement Naive Bayes',
    'Quadratic Discriminant Analysis',
    'C-Support Vector Classifier',
]

# Unfitted estimators, in the same order as the two name lists.
# NOTE(review): penalty='none' is the string spelling; check that the
# installed scikit-learn version still accepts it.
functions = [
    LogisticRegression(
        penalty='none', random_state=RANDOM_SEED, max_iter=1000),
    DecisionTreeClassifier(random_state=RANDOM_SEED),
    RandomForestClassifier(random_state=RANDOM_SEED),
    ExtraTreesClassifier(random_state=RANDOM_SEED),
    AdaBoostClassifier(random_state=RANDOM_SEED),
    KNeighborsClassifier(),
    ComplementNB(),
    QuadraticDiscriminantAnalysis(),
    SVC(random_state=RANDOM_SEED, probability=True),
]

classifiers_idx = {}
classifiers = {}

# Register every classifier twice: by position and by short name
for idx, (s_name, name, func) in enumerate(zip(short_names, names, functions)):
    classifiers_idx[idx] = {'name': name, 'func': func}
    classifiers[s_name] = {'name': name, 'func': func}

In [11]:
def _classifier_entries(classifiers):
    """Return the classifier entries of ``classifiers`` as a list.

    A mapping keyed 0..n-1 is read by position (the original behaviour); any
    other mapping is read in insertion order; a plain sequence is used as is.
    """
    if isinstance(classifiers, dict):
        positions = range(len(classifiers))
        if all(idx in classifiers for idx in positions):
            return [classifiers[idx] for idx in positions]
        return list(classifiers.values())
    return list(classifiers)


def get_models_performace(classifiers, X, y):
    """Cross-validate every classifier and tabulate the mean scores.

    Parameters
    ----------
    classifiers : dict or sequence
        Entries of the form ``{'name': str, 'func': estimator}``. Accepts a
        dict keyed 0..n-1, a dict keyed by anything else (e.g. short names),
        or a list of entries.
    X, y
        Training features and target, forwarded to ``cross_validate``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train-fold and test-fold results. Each has one row per classifier
        with its name, the elapsed seconds of the whole 5-fold run, and the
        mean precision, recall, F1 and ROC AUC.
    """
    columns = ('classifier_name', 'duration',
               'precision', 'recall', 'f1_score', 'auc_score')
    train_results = {column: [] for column in columns}
    test_results = {column: [] for column in columns}

    for entry in _classifier_entries(classifiers):
        cf_name = entry['name']
        print(f'{cf_name} has started...')
        # perf_counter is monotonic, so system clock adjustments cannot
        # distort the measured interval.
        start = time.perf_counter()
        cv_scores = cross_validate(entry['func'], X, y,
                                   scoring=['f1', 'roc_auc', 'precision', 'recall'], cv=5,
                                   return_train_score=True, n_jobs=-1)
        duration = time.perf_counter() - start
        print(f'{cf_name} ended in {duration} seconds.\n')
        updateRecord(train_results, cv_scores, 'train', cf_name, duration)
        updateRecord(test_results, cv_scores, 'test', cf_name, duration)
    return pd.DataFrame(train_results), pd.DataFrame(test_results)


def updateRecord(df, scores, key_name, classifier_name, duration):
    """Append one classifier's averaged scores to the result columns in ``df``.

    Parameters
    ----------
    df : dict of list
        Result columns, modified in place. Must hold the keys
        'classifier_name', 'duration', 'f1_score', 'auc_score', 'precision'
        and 'recall'.
    scores : dict
        Per-fold scores keyed '<key_name>_<metric>'.
    key_name : str
        Prefix selecting the fold kind, e.g. 'train' or 'test'.
    classifier_name : str
        Value appended to the 'classifier_name' column.
    duration : float
        Value appended to the 'duration' column.
    """
    df['classifier_name'].append(classifier_name)
    df['duration'].append(duration)

    # (result column, metric suffix in `scores`), in the order they are filled
    metric_columns = (
        ('f1_score', 'f1'),
        ('auc_score', 'roc_auc'),
        ('precision', 'precision'),
        ('recall', 'recall'),
    )
    for column, metric in metric_columns:
        fold_scores = scores[f'{key_name}_{metric}']
        df[column].append(np.mean(fold_scores))

In [12]:
# Cross-validate every baseline model on the training split only.
# `classifiers_idx` is the variant of the registry keyed by position.
train_results, test_results = get_models_performace(
    classifiers_idx, X_train, y_train)

Logistic Regression has started...
Logistic Regression ended in 4.533708572387695 seconds.

Decision tree Classifier has started...
Decision tree Classifier ended in 2.7222795486450195 seconds.

Random forest Classifier has started...
Random forest Classifier ended in 2.8382511138916016 seconds.

Extra Tree Classifier has started...
Extra Tree Classifier ended in 2.446058988571167 seconds.

AdaBoost Classifier has started...
AdaBoost Classifier ended in 1.755713701248169 seconds.

K Neighbors Classifier has started...
K Neighbors Classifier ended in 7.593762397766113 seconds.

Complement Naive Bayes has started...
Complement Naive Bayes ended in 0.39484477043151855 seconds.

Quadratic Discriminant Analysis has started...
Quadratic Discriminant Analysis ended in 0.398914098739624 seconds.

C-Support Vector Classifier has started...
C-Support Vector Classifier ended in 14.477275609970093 seconds.



In [13]:
# Both tables go into one file as two consecutive pickle records, so a reader
# has to call pickle.load twice: train results first, then test results.
with open('../../pickles/model_performance.pickle', 'wb') as file:
    pickle.dump(train_results, file)
    pickle.dump(test_results, file)

# Grid Search

## Logistic Regression

In [14]:
# Elastic-net logistic regression (saga solver) used as the base estimator
# for the grid search; l1_ratio and C given here are overridden by the grid.
log_reg = LogisticRegression(penalty='elasticnet', random_state=RANDOM_SEED,
                             solver='saga', max_iter=3000, l1_ratio=0.5, C=1.0)

In [15]:
# Search space: 5 evenly spaced l1_ratio values from 0 to 1, crossed with
# 8 values of C (5 x 8 = 40 candidates).
log_reg_param_grid = {
    'l1_ratio': np.linspace(0, 1, 5),
    'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
}

In [16]:
# Exhaustive search scored by ROC AUC with 3-fold cross-validation on all cores.
log_grid_search = GridSearchCV(log_reg, param_grid=log_reg_param_grid,
                               scoring='roc_auc', return_train_score=True, verbose=1, cv=3, n_jobs=-1)
log_grid_search.fit(X_train, y_train)
# Last expression of the cell: displays the best parameter combination.
log_grid_search.best_params_

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.4min finished


{'C': 3, 'l1_ratio': 1.0}

In [17]:
# Persist the whole fitted GridSearchCV object.
with open('../../pickles/log_grid_search.pickle', 'wb') as file:
    pickle.dump(log_grid_search, file)

## Support Vector Classifier

In [18]:
# C-Support Vector Classifier used as the base estimator for the grid search;
# only the seed and probability=True are set here, C and gamma come from the grid.
svm_model = SVC(random_state=RANDOM_SEED, probability=True)

In [19]:
# Search space for the SVC: 6 values of C crossed with 5 values of gamma
# (6 x 5 = 30 candidates), each spaced by a factor of 10.
svm_param_grid = {'C': [0.1, 1, 10, 100, 1000, 10000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

In [20]:
# Exhaustive search scored by ROC AUC with 3-fold cross-validation on all cores.
svm_grid_search = GridSearchCV(svm_model, svm_param_grid, cv=3,
                               scoring='roc_auc', return_train_score=True, verbose=10, n_jobs=-1)
svm_grid_search.fit(X_train, y_train)
# Last expression of the cell: displays the best parameter combination.
svm_grid_search.best_params_

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  85 out of  90 | elapsed:  2.5min remaining:    8.8s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  2.6min finished


{'C': 100, 'gamma': 0.1}

In [21]:
# Persist the whole fitted GridSearchCV object.
with open('../../pickles/svm_grid_search.pickle', 'wb') as file:
    pickle.dump(svm_grid_search, file)

# Validation Curve

## Logistic Regression

In [22]:
# l1_ratio is fixed at 1.0, the value the grid search output above reports as best;
# only C is varied here.
log_reg = LogisticRegression(penalty='elasticnet', random_state=RANDOM_SEED,
                             solver='saga', l1_ratio=1.0, max_iter=3000)
# Reuse the C values of the grid search as the validation-curve range.
C_param = log_reg_param_grid['C']

# ROC AUC per C value and per fold (3-fold CV), for train and validation folds.
train_score, test_score = validation_curve(log_reg, X_train, y_train, param_name='C',
                                           param_range=C_param, cv=3, scoring='roc_auc',
                                           verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   30.7s finished


In [23]:
# Two consecutive pickle records in one file: train scores first, then test
# scores. A reader has to call pickle.load twice.
with open('../../pickles/log_val_C.pickle', 'wb') as file:
    pickle.dump(train_score, file)
    pickle.dump(test_score, file)

## Support Vector Classifier

In [24]:
# gamma is fixed at 0.1, the value the grid search output above reports as
# best; only C is varied here. The shared RANDOM_SEED replaces the literal 42
# so that every estimator in the notebook follows the same seed.
svm_model = SVC(random_state=RANDOM_SEED, probability=True, gamma=0.1)
# Reuse the C values of the grid search as the validation-curve range.
C_param = svm_param_grid['C']

# ROC AUC per C value and per fold (3-fold CV), for train and validation folds.
train_score, test_score = validation_curve(svm_model, X_train, y_train, param_name='C',
                                           param_range=C_param, cv=3, scoring='roc_auc',
                                           verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   45.5s finished


In [25]:
# Two consecutive pickle records in one file: train scores first, then test scores.
# NOTE(review): the file is named tree_val_C, but the scores written here come
# from the SVC validation curve over C. The name is left unchanged because
# other code may load this exact path.
with open('../../pickles/tree_val_C.pickle', 'wb') as file:
    pickle.dump(train_score, file)
    pickle.dump(test_score, file)

In [26]:
# C is fixed at 100, the value the grid search output above reports as best;
# only gamma is varied here. The shared RANDOM_SEED replaces the literal 42
# so that every estimator in the notebook follows the same seed.
svm_model = SVC(random_state=RANDOM_SEED, probability=True, C=100)
# Reuse the gamma values of the grid search as the validation-curve range.
gamma_param = svm_param_grid['gamma']

# ROC AUC per gamma value and per fold (3-fold CV), for train and validation folds.
train_score, test_score = validation_curve(svm_model, X_train, y_train, param_name='gamma',
                                           param_range=gamma_param, cv=3, scoring='roc_auc',
                                           verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   22.9s finished


In [27]:
# Two consecutive pickle records in one file: train scores first, then test scores.
# NOTE(review): the file is named tree_val_gamma, but the scores written here
# come from the SVC validation curve over gamma. The name is left unchanged
# because other code may load this exact path.
with open('../../pickles/tree_val_gamma.pickle', 'wb') as file:
    pickle.dump(train_score, file)
    pickle.dump(test_score, file)