# 10 Classifier Showdown
---
**Inspiration:** https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn

**Main Idea:** test many classifiers at once and see how well they perform. Basically, we are throwing models at the data and seeing which model sticks better.

## Data Preprocessing
---
Read in data and specify which features need to be encoded and scaled.

In [1]:
import pandas as pd

# Columns in Data With Categorical Values- Must LabelEncode them
categorical_cols = ['hitpoint', 'outside.sideline', 
                    'outside.baseline', 'same.side', 
                    'previous.hitpoint', 
                    'server.is.impact.player', 'outcome', 
                    'gender']

# Columns in the Data That Should Be Scaled
scaled_data = ['serve', 'rally', 'speed', 'net.clearance', 
               'distance.from.sideline', 'depth', 
               'player.distance.travelled', 
               'player.impact.depth', 
               'player.impact.distance.from.center', 
               'player.depth', 
               'player.distance.from.center', 
               'previous.speed', 'previous.net.clearance', 
               'previous.distance.from.sideline', 
               'previous.depth', 'opponent.depth', 
               'opponent.distance.from.center', 
               'previous.time.to.net']


train_data = pd.read_csv('tennis_data/mens_train_file.csv')
train_data.head()

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,id,train,outcome,gender
0,4,1,B,35.515042,-0.021725,3.474766,6.797621,False,False,1.46757,...,12.5628,2.0724,True,F,0.445318,False,5718,1,UE,mens
1,4,2,B,33.38264,1.114202,2.540801,2.608708,False,True,2.311931,...,12.3544,5.1124,False,B,0.432434,False,371,1,FE,mens
2,23,1,B,22.31669,-0.254046,3.533166,9.435749,False,False,3.903728,...,13.862,1.6564,False,F,0.397538,True,7948,1,FE,mens
3,9,1,F,36.837309,0.766694,0.586885,3.34218,True,False,0.583745,...,14.2596,0.1606,True,B,0.671984,True,9312,1,UE,mens
4,4,1,B,35.544208,0.116162,0.918725,5.499119,False,False,2.333456,...,11.3658,1.1082,False,F,0.340411,False,7667,1,W,mens


## Encode Data
---
Define function to encode data using sklearn's LabelEncoder

In [None]:
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
#test
# Encode Categorical Data
def encode_data(data):
    d = defaultdict(LabelEncoder)
    data[categorical_cols] = data[categorical_cols].apply(lambda x: d[x.name].fit_transform(x))
    return data

encode_data(train_data).head()

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,id,train,outcome,gender
0,4,1,0,35.515042,-0.021725,3.474766,6.797621,0,0,1.46757,...,12.5628,2.0724,1,1,0.445318,0,5718,1,1,0
1,4,2,0,33.38264,1.114202,2.540801,2.608708,0,1,2.311931,...,12.3544,5.1124,0,0,0.432434,0,371,1,0,0
2,23,1,0,22.31669,-0.254046,3.533166,9.435749,0,0,3.903728,...,13.862,1.6564,0,1,0.397538,1,7948,1,0,0
3,9,1,1,36.837309,0.766694,0.586885,3.34218,1,0,0.583745,...,14.2596,0.1606,1,0,0.671984,1,9312,1,1,0
4,4,1,0,35.544208,0.116162,0.918725,5.499119,0,0,2.333456,...,11.3658,1.1082,0,1,0.340411,0,7667,1,2,0


## Split Into Training and Validation
---
Split the data into training and validation data.

In [3]:
from sklearn.model_selection import train_test_split
import numpy as np

train_data.drop('id', 1, inplace=True)   
train_data.drop('gender', 1, inplace=True)
train_data.drop('train', 1, inplace=True)

# Split into training and validation sets
train_mens, val_mens = train_test_split(train_data, 
                                        shuffle = True,
                                        test_size=0.2,
                                        random_state=42
                                        )


# Split data into input and outputs
X_train = train_mens.loc[:, train_mens.columns != 'outcome']
y_train = train_mens['outcome']
X_val = val_mens.loc[:, val_mens.columns != 'outcome']
y_val = val_mens['outcome']

  train_data.drop('id', 1, inplace=True)
  train_data.drop('gender', 1, inplace=True)
  train_data.drop('train', 1, inplace=True)


## Set up classifiers
---
We define a list of classifiers and we just iterate through each classifier and output its accuracy/log loss.

We just use the default hyperparameters for each model.

In [4]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(),
    SVC(probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_val)
    acc = accuracy_score(y_val, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_val)
    ll = log_loss(y_val, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 66.9000%
Log Loss: 3.2011618418277954
SVC
****Results****
Accuracy: 67.3000%
Log Loss: 0.7107593166666752
NuSVC
****Results****
Accuracy: 76.3000%
Log Loss: 0.5803509632844609
DecisionTreeClassifier
****Results****
Accuracy: 80.1000%
Log Loss: 6.873216502587227
RandomForestClassifier
****Results****
Accuracy: 85.0000%
Log Loss: 0.4227366810193996
AdaBoostClassifier
****Results****
Accuracy: 81.2000%
Log Loss: 1.0417907250518212
GradientBoostingClassifier
****Results****
Accuracy: 87.7000%
Log Loss: 0.33940857994396467
GaussianNB
****Results****
Accuracy: 73.3000%
Log Loss: 1.2996934391800417
