In [1]:
import pandas as pd

# Columns holding categorical values; these must be label-encoded
# before they can be passed to a model.
categorical_cols = [
    'hitpoint',
    'outside.sideline',
    'outside.baseline',
    'same.side',
    'previous.hitpoint',
    'server.is.impact.player',
    'outcome',
    'gender',
]

# Columns in the data that should be scaled.
scaled_data = [
    'serve',
    'rally',
    'speed',
    'net.clearance',
    'distance.from.sideline',
    'depth',
    'player.distance.travelled',
    'player.impact.depth',
    'player.impact.distance.from.center',
    'player.depth',
    'player.distance.from.center',
    'previous.speed',
    'previous.net.clearance',
    'previous.distance.from.sideline',
    'previous.depth',
    'opponent.depth',
    'opponent.distance.from.center',
    'previous.time.to.net',
]


# Read the training and test CSV files from the local tennis_data/ directory.
train_data = pd.read_csv(filepath_or_buffer='tennis_data/train-1542197608821.csv')
raw_mw_test = pd.read_csv(filepath_or_buffer='tennis_data/test-1542197608821.csv')

# Preview the first five training rows as the cell output.
train_data.head(n=5)

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,previous.depth,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,outcome,gender,ID
0,4,1,B,35.515042,-0.021725,3.474766,6.797621,False,False,1.46757,...,0.705435,12.5628,2.0724,True,F,0.445318,False,UE,mens,8644
1,4,2,B,33.38264,1.114202,2.540801,2.608708,False,True,2.311931,...,3.8566,12.3544,5.1124,False,B,0.432434,False,FE,mens,1182
2,23,1,B,22.31669,-0.254046,3.533166,9.435749,False,False,3.903728,...,2.908892,13.862,1.6564,False,F,0.397538,True,FE,mens,9042
3,9,1,F,36.837309,0.766694,0.586885,3.34218,True,False,0.583745,...,0.557554,14.2596,0.1606,True,B,0.671984,True,UE,mens,1222
4,4,1,B,35.544208,0.116162,0.918725,5.499119,False,False,2.333456,...,3.945317,11.3658,1.1082,False,F,0.340411,False,W,mens,4085



Encode the categorical columns (`categorical_cols`) as integer labels.

In [2]:
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

# Encode Categorical Data
def encode_data(data, encoders=None):
    """Label-encode every column named in ``categorical_cols``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``categorical_cols``. The
        frame is modified in place: the integer codes overwrite the
        original values.
    encoders : dict, optional
        Mapping of column name -> ``LabelEncoder``. A column that already
        has an entry is encoded with that fitted encoder (``transform``),
        so a second frame can reuse the mapping learned on the first one.
        A column without an entry gets a new encoder fitted on ``data``,
        which is stored in the dict so the caller can reuse it or call
        ``inverse_transform`` later. When omitted, fresh encoders are
        fitted for every column and discarded.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    KeyError
        If any column in ``categorical_cols`` is missing from ``data``.
    ValueError
        Raised by ``LabelEncoder.transform`` when a reused encoder meets a
        label it was not fitted on.
    """
    if encoders is None:
        encoders = {}

    # Fail before touching the frame so a missing column never leaves it
    # half-encoded.
    missing = [col for col in categorical_cols if col not in data.columns]
    if missing:
        raise KeyError("categorical columns missing from data: {}".format(missing))

    for col in categorical_cols:
        if col in encoders:
            data[col] = encoders[col].transform(data[col])
        else:
            encoders[col] = LabelEncoder()
            data[col] = encoders[col].fit_transform(data[col])
    return data

# encode_data writes the integer codes back into train_data itself and
# returns that same frame. The return value is only used here for the
# preview; the later cells that read train_data rely on the in-place change.
encode_data(train_data).head()
# Encoding of the test frame is currently disabled:
# encode_data(raw_mw_test).head()

Unnamed: 0,rally,serve,hitpoint,speed,net.clearance,distance.from.sideline,depth,outside.sideline,outside.baseline,player.distance.travelled,...,previous.depth,opponent.depth,opponent.distance.from.center,same.side,previous.hitpoint,previous.time.to.net,server.is.impact.player,outcome,gender,ID
0,4,1,0,35.515042,-0.021725,3.474766,6.797621,0,0,1.46757,...,0.705435,12.5628,2.0724,1,1,0.445318,0,1,0,8644
1,4,2,0,33.38264,1.114202,2.540801,2.608708,0,1,2.311931,...,3.8566,12.3544,5.1124,0,0,0.432434,0,0,0,1182
2,23,1,0,22.31669,-0.254046,3.533166,9.435749,0,0,3.903728,...,2.908892,13.862,1.6564,0,1,0.397538,1,0,0,9042
3,9,1,1,36.837309,0.766694,0.586885,3.34218,1,0,0.583745,...,0.557554,14.2596,0.1606,1,0,0.671984,1,1,0,1222
4,4,1,0,35.544208,0.116162,0.918725,5.499119,0,0,2.333456,...,3.945317,11.3658,1.1082,0,1,0.340411,0,2,0,4085


# Split the data into training and validation data.

In [3]:
from sklearn.model_selection import train_test_split
import numpy as np

# Remove the columns that are not used as model inputs.
# `columns=` replaces the positional axis argument (`drop('ID', 1)`), which
# pandas 2.0 no longer accepts. Rebinding with errors='ignore' instead of
# inplace=True lets this cell be re-run without raising KeyError once the
# columns are already gone.
train_data = train_data.drop(columns=['ID', 'gender'], errors='ignore')

# NOTE(review): the 'Unnamed: 0' column shown in the preview is still present
# and therefore ends up in X_train / X_val. It looks like a saved row index
# rather than a real feature -- confirm, and drop it here if so.

# Split into training and validation sets
train_mw, val_mw = train_test_split(
    train_data,
    shuffle=True,
    test_size=0.2,
    random_state=42,
)

# Split data into inputs (every column except 'outcome') and outputs
X_train = train_mw.drop(columns='outcome')
y_train = train_mw['outcome']
X_val = val_mw.drop(columns='outcome')
y_val = val_mw['outcome']

We define a list of classifiers, fit each one on the training split, and report its accuracy and log loss on the validation split.

We just use the default hyperparameters for each model.

In [4]:

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Candidate models, all with default hyperparameters. The stochastic ones are
# seeded (same seed as the train/validation split) so a Restart & Run All
# reproduces the reported numbers.
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42)
]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
# Rows are collected in a plain list and turned into a DataFrame once after
# the loop; DataFrame.append was removed in pandas 2.0 and was quadratic.
log_rows = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # Hard class predictions on the validation split -> accuracy.
    val_predictions = clf.predict(X_val)
    acc = accuracy_score(y_val, val_predictions)
    print("Accuracy: {:.4%}".format(acc))

    # Class probabilities on the validation split -> log loss. Passing the
    # fitted class order keeps the probability columns aligned with the
    # labels even if a class is absent from y_val.
    val_probabilities = clf.predict_proba(X_val)
    ll = log_loss(y_val, val_probabilities, labels=clf.classes_)
    print("Log Loss: {}".format(ll))

    # Record the measured log loss (the original stored a constant 6 here).
    log_rows.append([name, acc * 100, ll])

# One row per classifier: name, accuracy in percent, log loss.
log = pd.DataFrame(log_rows, columns=log_cols)

print("="*30)

  from numpy.core.umath_tests import inner1d


KNeighborsClassifier
****Results****
Accuracy: 67.6452%
Log Loss: 2.86061128394377
DecisionTreeClassifier
****Results****
Accuracy: 79.2005%
Log Loss: 7.183892904125708
RandomForestClassifier
****Results****
Accuracy: 83.3229%
Log Loss: 0.9342875247423644
AdaBoostClassifier
****Results****
Accuracy: 82.0737%
Log Loss: 1.0514796992078692
GradientBoostingClassifier
****Results****
Accuracy: 85.6340%
Log Loss: 0.351072324522599
