## TOC:
* [Model to predict match results](#first-bullet)
* [Model to predict # of games](#second-bullet)
* [test results](#third-bullet)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler, LabelEncoder

from sklearn.impute import SimpleImputer

from sklearn_pandas import DataFrameMapper

from sklearn.metrics import accuracy_score,r2_score, roc_auc_score

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import load_model



In [4]:
data=pd.read_csv('data/final_2.csv' ,encoding = "ISO-8859-1")

In [5]:
data.head()

Unnamed: 0.1,uuid,Unnamed: 0,ATP,B365L,B365W,Best of,Comment,Court,Date,L1,...,one_clay_vv,one_grass_vv,one_hard_vv,one_three_vv,one_five_vv,pts_diff,player_one_pts,player_two_pts,player_one_sets,player_two_sets
0,000569ca-bcdb-4266-98a0-b409cb8932ff,383,11,5.0,1.16,3,Completed,Indoor,2016-02-09,3.0,...,0.0,0.0,1.0,1.0,0.0,301.0,414.0,113.0,2.0,0.0
1,00056f39-ba9b-4807-ac20-cbbcf9e2c304,1887,45,1.36,3.0,3,Retired,Outdoor,2010-08-04,5.0,...,,,,,,-610.0,1385.0,775.0,0.0,1.0
2,000a7190-03aa-44b3-b631-e85d00f265ea,1537,39,3.0,1.36,5,Completed,Outdoor,2015-06-29,3.0,...,0.0,1.0,1.0,1.0,1.0,347.0,895.0,548.0,3.0,2.0
3,000c3ad6-ac2d-4ca8-abe9-853099f8a568,597,18,19.0,1.02,3,Completed,Outdoor,2012-02-28,3.0,...,1.0,0.0,1.0,1.0,1.0,6663.0,7150.0,487.0,2.0,1.0
4,0014c43e-cdee-41e7-a5f8-60a15a6f9cc2,2510,63,9.0,1.07,3,Completed,Indoor,2016-10-25,7.0,...,0.0,0.0,1.0,1.0,0.0,5311.0,5820.0,509.0,2.0,1.0


In [6]:
data.groupby('Comment')['Comment'].count()/len(data)

Comment
Awarded         0.000035
Completed       0.961710
Disqualified    0.000035
Retired         0.031937
Sched           0.000069
Walkover        0.006215
Name: Comment, dtype: float64

In [7]:
data=data[data.Comment=='Completed']

In [8]:
data.columns

Index(['uuid', 'Unnamed: 0', 'ATP', 'B365L', 'B365W', 'Best of', 'Comment',
       'Court', 'Date', 'L1', 'L2', 'L3', 'L4', 'L5', 'LPts', 'LRank',
       'Location', 'Loser', 'Lsets', 'Round', 'Series', 'Surface',
       'Tournament', 'W1', 'W2', 'W3', 'W4', 'W5', 'WPts', 'WRank', 'Winner',
       'Wsets', 'rank_diff', 'outcome', 'player_one', 'player_two',
       'player_one_rank', 'player_two_rank', 'uuid.1', 'one_name', 'one_date',
       'one_cutoff_date', 'one_win_rate_year', 'one_games_played_year',
       'one_clay_year', 'one_grass_year', 'one_hard_year', 'one_three_year',
       'one_five_year', 'two_name', 'two_date', 'two_cutoff_date',
       'two_win_rate_year', 'two_games_played_year', 'two_clay_year',
       'two_grass_year', 'two_hard_year', 'two_three_year', 'two_five_year',
       'major', 'total_games', 'player_one_total_games',
       'player_two_total_games', 'total_sets', 'player_two_name_vv',
       'two_cutoff_date_vv', 'two_win_rate_vv', 'two_games_played_vv',
 

In [None]:
print("% of games with only top 100 players: ",len(data[(data.WRank<=100) & (data.LRank<=100)])/len(data))

In [None]:
# % of times the higher ranked player one
data.groupby('outcome')['outcome'].count()/len(data)

In [None]:
# % of times the higher ranked player one in 5set games
data[data['Best of']==5].groupby('outcome')['outcome'].count()/len(data[data['Best of']==5])

In [None]:
# % of times the higher ranked player one in 5set games
data[data['Best of']==3].groupby('total_sets')['total_sets'].count()

In [None]:
data[(data['Best of']==3) & (data.total_sets==1.0)]

In [9]:
data['rank_dif']=data['player_two_rank']-data['player_one_rank']
data['pts_dif']=data['player_two_pts']-data['player_one_pts']
data['win_rate_diff']=data['two_win_rate_year']-data['one_win_rate_year']
data['hard_diff']=data['two_hard_year']-data['one_hard_year']
data['five_diff']=data['two_five_year']-data['one_five_year']
data['three_diff']=data['two_three_year']-data['one_three_year']

data['hard_vv']=data['two_hard_vv']-data['one_hard_vv']
data['vv']=data['two_win_rate_vv']-data['one_win_rate_vv']




## Model to predict match results <a class="anchor" id="first-bullet"></a>

In [None]:
t.columns

In [None]:
data.shape

In [10]:
t=data


In [11]:

feature_columns=['player_one_rank','player_two_rank','Surface', 'Best of','one_win_rate_year',
       'one_games_played_year', 'one_clay_year', 'one_grass_year',
       'one_hard_year', 'one_three_year', 'one_five_year', 'two_win_rate_year',
       'two_games_played_year', 'two_clay_year', 'two_grass_year',
       'two_hard_year', 'two_three_year', 'two_five_year', 'major','one_win_rate_vv', 'one_games_played_vv', 'one_clay_vv', 'one_grass_vv',
       'one_hard_vv', 'one_three_vv', 'one_five_vv','two_win_rate_vv',
       'two_games_played_vv', 'two_clay_vv', 'two_grass_vv', 'two_hard_vv',
       'two_three_vv', 'two_five_vv', 'rank_dif', 'win_rate_diff', 'hard_diff',
       'five_diff', 'three_diff', 'hard_vv', 'vv' ,'pts_diff', 'outcome']
data=data[feature_columns].dropna()

In [12]:
target='outcome'
y=data[target]
X=data.drop('outcome',axis=1)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
mapper = DataFrameMapper([
    (['player_one_rank'], [StandardScaler(),SimpleImputer()]),
    (['player_two_rank'], [StandardScaler(),SimpleImputer()]),
    (['Surface'], [SimpleImputer(strategy='constant', fill_value='most_frequent'),LabelBinarizer()]),
    ('Best of', LabelEncoder()),
    (['one_win_rate_year'],StandardScaler()),
    (['one_games_played_year'], StandardScaler()),
    (['one_clay_year'], StandardScaler()),
    (['one_grass_year'],StandardScaler()),
    (['one_hard_year'], StandardScaler()),
    (['one_three_year'], StandardScaler()),
    (['one_five_year'], StandardScaler()),
    (['two_win_rate_year'],StandardScaler()),
    (['two_games_played_year'], StandardScaler()),
    (['two_clay_year'], StandardScaler()),
    (['two_grass_year'],StandardScaler()),
    (['two_hard_year'], StandardScaler()),
    (['two_three_year'], StandardScaler()),
    (['two_five_year'], StandardScaler()),
    ('major', LabelEncoder()),
    (['one_win_rate_vv'],StandardScaler()), 
    (['one_games_played_vv'],StandardScaler()),  
    (['one_clay_vv'],StandardScaler()),  
    (['one_grass_vv'],StandardScaler()), 
    (['one_hard_vv'],StandardScaler()),  
    (['one_three_vv'],StandardScaler()),  
    (['one_five_vv'],StandardScaler()),
    (['two_win_rate_vv'],StandardScaler()), 
    (['two_games_played_vv'],StandardScaler()),  
    (['two_clay_vv'],StandardScaler()),  
    (['two_grass_vv'],StandardScaler()), 
    (['two_hard_vv'],StandardScaler()),  
    (['two_three_vv'],StandardScaler()),  
    (['two_five_vv'],StandardScaler()),    
    (['rank_dif'],StandardScaler()),  
    (['pts_diff'],[StandardScaler(),SimpleImputer()]),  
    (['win_rate_diff'],StandardScaler()),  
    (['hard_diff'],StandardScaler()),
    (['five_diff'],StandardScaler()), 
    (['three_diff'],StandardScaler()),
    (['hard_vv'],StandardScaler()),
    (['vv'],StandardScaler())
    
], df_out=
    True)


In [17]:
%%time
Z_train=mapper.fit_transform(X_train)
Z_test=mapper.transform(X_test)

CPU times: user 245 ms, sys: 25.8 ms, total: 271 ms
Wall time: 272 ms


In [18]:
model=LogisticRegression(C=20, max_iter=1000, n_jobs=1)

In [19]:
model.fit(Z_train,y_train)

LogisticRegression(C=20, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
model.score(Z_train,y_train)

0.9967629482071713

In [21]:
y_pred=model.predict(Z_test)

In [22]:
accuracy_score(y_test,y_pred),roc_auc_score(y_test,y_pred)

(0.9938259310894244, 0.9927517672923577)

In [None]:
Z_train.columns[model.coef_.argsort()]

In [None]:
m = Sequential()
m.add(Dense(units=64, activation='relu', input_shape=(Z_train.shape[1],)))
m.add(Dropout(0.2))
m.add(Dense(units=32, activation='relu'))
m.add(Dropout(0.2))
m.add(Dense(units=16, activation='relu'))
m.add(Dropout(0.2))
m.add(Dense(units=5, activation='relu'))
m.add(Dropout(0.2))
m.add(Dense(units=1, activation='sigmoid'))

m.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
mc=ModelCheckpoint('data/best_model.h5', monitor='val_accuracy', mode='max', verbose=2, save_best_only=True)
early_stop = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=15)

In [None]:
hist = m.fit( Z_train,
                    y_train, 
                    batch_size=128,
                    validation_data=(Z_test, y_test),
                    epochs=175,
                    callbacks=[mc],
                    verbose=1)

In [None]:
plt.figure(figsize=(8,6))
plt.plot(hist.history['val_accuracy'], label='Val Accuracy')
plt.plot(hist.history['accuracy'], label='Train Accuracy')
plt.legend();

## Test Results <a class="anchor" id="third-bullet"></a>

In [23]:
data=t

In [75]:
player_one_name='Cilic M.'
player_two_name='Federer R.'
Surface='Hard'
Major=1
best_of=5
vals=[Surface,Major,best_of]
cols_to_get_one=['player_one_rank','one_win_rate_year',
       'one_games_played_year', 'one_clay_year', 'one_grass_year',
       'one_hard_year', 'one_three_year', 'one_five_year','one_hard_vv','one_win_rate_vv','one_games_played_vv'
                ,'one_three_vv','one_five_vv','player_one_pts','one_clay_vv','one_grass_vv']

cols_to_get_two=['player_two_rank','two_win_rate_year',
       'two_games_played_year', 'two_clay_year', 'two_grass_year',
       'two_hard_year', 'two_three_year', 'two_five_year','two_hard_vv','two_win_rate_vv',
                 'two_games_played_vv','two_three_vv','two_five_vv','player_two_pts','two_clay_vv',
                 'two_grass_vv']

one=list(data[data.player_one==player_one_name][cols_to_get_one].head(1).values[0])
two=list(data[data.player_two==player_two_name][cols_to_get_two].head(1).values[0])

temp=vals+one+two
temp=pd.DataFrame(temp).T
temp.columns=['Surface','major', 'Best of','player_one_rank','one_win_rate_year',
       'one_games_played_year', 'one_clay_year', 'one_grass_year',
       'one_hard_year', 'one_three_year', 'one_five_year', 'one_hard_vv','one_win_rate_vv','one_games_played_vv','one_three_vv',
              'one_five_vv','player_one_pts','one_clay_vv','one_grass_vv'
        ,'player_two_rank','two_win_rate_year',
       'two_games_played_year', 'two_clay_year', 'two_grass_year',
       'two_hard_year', 'two_three_year', 'two_five_year','two_hard_vv','two_win_rate_vv','two_games_played_vv','two_three_vv',
              'two_five_vv','player_two_pts','two_clay_vv','two_grass_vv'
             ]
# 

In [91]:
player_one_name='Cilic M.'
cols_to_get_one=['Date','player_one','player_one_rank','one_win_rate_year',
       'one_games_played_year', 'one_clay_year', 'one_grass_year',
       'one_hard_year', 'one_three_year', 'one_five_year','one_hard_vv','one_win_rate_vv','one_games_played_vv'
                ,'one_three_vv','one_five_vv','player_one_pts','one_clay_vv','one_grass_vv']
data[data.player_one==player_one_name][cols_to_get_one].sort_values('Date', ascending=False).head(5)

Unnamed: 0,Date,player_one,player_one_rank,one_win_rate_year,one_games_played_year,one_clay_year,one_grass_year,one_hard_year,one_three_year,one_five_year,one_hard_vv,one_win_rate_vv,one_games_played_vv,one_three_vv,one_five_vv,player_one_pts,one_clay_vv,one_grass_vv
18396,2019-10-28,Cilic M.,24.0,0.680851,47.0,0.5,0.5,0.484848,0.472222,0.545455,1.0,1.0,1.0,1.0,0.0,1500.0,0.0,0.0
18303,2019-10-19,Cilic M.,25.0,0.673469,49.0,0.5,0.5,0.485714,0.473684,0.545455,1.0,1.0,2.0,1.0,0.0,1455.0,0.0,0.0
14742,2019-10-18,Cilic M.,25.0,0.673469,49.0,0.5,0.5,0.485714,0.473684,0.545455,1.0,1.0,1.0,1.0,0.0,1455.0,0.0,0.0
5146,2019-10-17,Cilic M.,25.0,0.673469,49.0,0.5,0.5,0.485714,0.473684,0.545455,1.0,1.0,1.0,1.0,0.0,1455.0,0.0,0.0
19385,2019-10-07,Cilic M.,25.0,0.66,50.0,0.5,0.5,0.472222,0.461538,0.545455,1.0,1.0,1.0,1.0,0.0,1455.0,0.0,0.0


In [76]:
temp['pts_diff']=temp['player_two_pts']-temp['player_one_pts']
temp['rank_dif']=temp['player_two_rank']-temp['player_one_rank']
temp['win_rate_diff']=temp['two_win_rate_year']-temp['one_win_rate_year']
temp['hard_diff']=temp['two_hard_year']-temp['one_hard_year']
temp['five_diff']=temp['two_five_year']-temp['one_five_year']
temp['three_diff']=temp['two_three_year']-temp['one_three_year']

temp['hard_vv']=temp['two_hard_vv']-temp['one_hard_vv']
temp['vv']=temp['two_win_rate_vv']-temp['one_win_rate_vv']








In [77]:
temp.columns

Index(['Surface', 'major', 'Best of', 'player_one_rank', 'one_win_rate_year',
       'one_games_played_year', 'one_clay_year', 'one_grass_year',
       'one_hard_year', 'one_three_year', 'one_five_year', 'one_hard_vv',
       'one_win_rate_vv', 'one_games_played_vv', 'one_three_vv', 'one_five_vv',
       'player_one_pts', 'one_clay_vv', 'one_grass_vv', 'player_two_rank',
       'two_win_rate_year', 'two_games_played_year', 'two_clay_year',
       'two_grass_year', 'two_hard_year', 'two_three_year', 'two_five_year',
       'two_hard_vv', 'two_win_rate_vv', 'two_games_played_vv', 'two_three_vv',
       'two_five_vv', 'player_two_pts', 'two_clay_vv', 'two_grass_vv',
       'pts_diff', 'rank_dif', 'win_rate_diff', 'hard_diff', 'five_diff',
       'three_diff', 'hard_vv', 'vv'],
      dtype='object')

In [78]:
temp[['player_one_rank', 'player_two_rank']]

Unnamed: 0,player_one_rank,player_two_rank
0,3,3


In [79]:
Z_temp=mapper.transform(temp)

In [80]:
model.predict_proba(Z_temp)

array([[0., 1.]])

In [64]:
data=pd.read_csv('data/final_2.csv' ,encoding = "ISO-8859-1")

In [None]:
data.columns

In [None]:
plt.hist(data.total_games, bins=5);

In [None]:

def f(x):
    d = {}
    d['mean'] = x.mean()
    d['std'] = x.std()  
    d['0.25'] = np.quantile(x,0.25)
    d['0.75'] = np.quantile(x,0.75)
    d['count']=len(x)
    d['count%']=len(x)/len(data)

    return pd.Series(d)

In [None]:
data.groupby('Best of')['total_games'].apply(f)

In [None]:
t=data[np.abs(data.rank_diff)<50]
sns.jointplot(t.rank_diff, t.total_games, kind='reg')

In [None]:
sns.jointplot(np.abs(data.rank_diff), data.total_games)