In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from numpy import where
from pandas.plotting import scatter_matrix
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode,  plot
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score , cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Read the match statistics CSV straight from GitHub into `df`,
# then list its columns, dtypes and non-null counts.
url = 'https://raw.githubusercontent.com/sartasos/ML-prediction-of-tennis-match-result/main/stats.csv'
df = pd.read_csv(filepath_or_buffer=url)
df.info()

In [None]:
# Keep only information that can be known before a match is played.

# Result of the match itself.
match_outcome_columns = ["score", "minutes"]

# Per-match serve / break-point statistics of the loser (l_*) and the winner (w_*).
in_match_stat_columns = [
    "l_1stIn", "l_1stWon", "l_2ndWon", "l_ace", "l_svpt", "l_SvGms", "l_bpFaced", "l_df", "l_bpSaved",
    "w_1stIn", "w_1stWon", "w_2ndWon", "w_ace", "w_svpt", "w_SvGms", "w_bpFaced", "w_df", "w_bpSaved",
]

# Names, identifiers, entries, seeds and ranks that are not used as features.
unused_columns = [
    "tourney_name", "tourney_id", "match_num",
    "winner_name", "loser_name",
    "winner_id", "loser_id",
    "winner_entry", "loser_entry",
    "winner_seed", "loser_seed",
    "winner_rank", "loser_rank",
]

df = df.drop(columns=match_outcome_columns + in_match_stat_columns + unused_columns)
df.info()

In [None]:
# Report, for every column, how many values are missing (NaN) and what
# percentage of all rows that represents.
# The report columns are labelled "Null values ..." because the numbers come
# from `isnull()`; they are not counts of zeros.
# (`zero_percent` / `zero_values_df` keep their original variable names.)
null_count = df.isnull().sum()          # computed once and reused below
zero_percent = null_count * 100 / len(df)
zero_values_df = pd.DataFrame({"Feature Name": df.columns,
                               "Null values count": null_count,
                               "Null values percent": zero_percent})
print(zero_values_df.reset_index(drop=True))

# there are not many columns with null values, so no action is taken here

In [None]:
# Rank points and surface are treated as essential features: discard every
# match that lacks any of them and renumber the remaining rows from 0.
required_columns = ["winner_rank_points", "loser_rank_points", "surface"]
df = df.dropna(subset=required_columns).reset_index(drop=True)
df.info()

In [None]:
# Split "tourney_date" into two integer columns: the first four characters of
# its text form become the year, the next two the month.
tourney_date_text = df["tourney_date"].astype(str)
df["tourney-year"] = tourney_date_text.str[:4].astype(int)
df["tourney-month"] = tourney_date_text.str[4:6].astype(int)

# The original date column is no longer needed.
df = df.drop(columns=["tourney_date"])
df.info()

In [None]:
# Re-frame every match as two neutral players ("first" and "second") with their
# personal information (age, hand, height, country, rank points) next to the
# general match / tourney information.
# Every match is stored twice, once per orientation:
#   * label = 1 -> the second player won (original rows: loser is "first")
#   * label = 0 -> the first player won  (mirrored copy: winner is "first")
# NOTE(review): since both orientations of a match are kept, a random
# train/test split or an ungrouped KFold can place one orientation in the
# training data and its mirror in the evaluation data. Consider splitting by
# match if leakage-free scores are required.

df = df.rename(columns={"loser_age": "first_age", "loser_hand": "first_hand", "loser_ht": "first_ht", "loser_ioc": "first_ioc",
                        "loser_rank_points": "first_rank_points",
                        "winner_age": "second_age",  "winner_hand": "second_hand", "winner_ht": "second_ht",
                        "winner_ioc": "second_ioc", "winner_rank_points": "second_rank_points"
                        })

# Mirrored copy: exchange every first_* column with its second_* counterpart.
# `rename` applies the whole mapping at once, so this is a true swap; selecting
# `df.columns` afterwards restores the original column order.
player_fields = ['age', 'hand', 'ht', 'ioc', 'rank_points']
swap_sides = {}
for field in player_fields:
    swap_sides['first_' + field] = 'second_' + field
    swap_sides['second_' + field] = 'first_' + field
copy_2_df = df.rename(columns=swap_sides)[list(df.columns)].copy()

copy_2_df['label'] = np.zeros(copy_2_df.shape[0])  # if 1st player wins then label = 0
df['label'] = np.ones(df.shape[0])                 # if 2nd player wins then label = 1

df = pd.concat([df, copy_2_df])

# Shuffle the rows; the fixed seed makes the order (and everything computed
# from it further down) reproducible across kernel restarts.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
df.info()

In [None]:
# Both height columns contain some nulls: replace them with the mean of the
# respective column.
for height_column in ('second_ht', 'first_ht'):
    column_mean = df[height_column].mean()
    df[height_column] = df[height_column].fillna(column_mean)

# Any row that still has a missing value in another column is discarded.
df = df.dropna()

In [None]:
# Show the pairwise correlations of the numeric columns as an annotated heatmap.
# At this point `df` still holds text columns (hand, country, surface, ...).
# They are excluded explicitly because `DataFrame.corr()` raises on non-numeric
# data in pandas >= 2.0 (older versions dropped such columns silently).
correlations = df.select_dtypes(include="number").corr()

plt.figure(figsize=(20,15))
sns.heatmap(correlations, annot=True)   # reuse the matrix instead of recomputing it
plt.show()

In [None]:
# One-hot encode the categorical columns (first category of each is dropped).
# Passing `columns=` lets pandas prefix every dummy with its source column
# (e.g. "first_hand_R", "second_hand_R"). Encoding each column separately
# without a prefix produced duplicate column names, because both players share
# the same hand and country codes.
# The encoded source columns are removed and the dummies are appended after the
# remaining columns, in the order listed here.
categorical_columns = ['second_hand', 'first_hand', 'second_ioc', 'first_ioc',
                       'surface', 'tourney_level', 'round']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.info()

In [None]:
# Check how the labels are distributed in the dataset.
# `value_counts()` orders its result by frequency, not by label, so the counts
# are looked up per class explicitly instead of being unpacked positionally.
label_counts = df['label'].value_counts().sort_index()
label_counts.plot(kind='bar', title='count (target)')

class_0_count = int((df['label'] == 0).sum())
class_1_count = int((df['label'] == 1).sum())
print('class 0:', class_0_count)
print('class 1:', class_1_count)

In [None]:
# HISTOGRAMS
# One distribution plot per numeric player feature, drawn in this order.
distribution_columns = [
    'second_rank_points',
    'first_rank_points',
    'second_ht',
    'first_ht',
    'second_age',
    'first_age',
]

for column_name in distribution_columns:
    hist_data = [df[column_name]]
    group_labels = [column_name]   # name of the dataset
    fig = ff.create_distplot(hist_data, group_labels)
    fig.show()

In [None]:
# Train/test split used to fit and evaluate the models, followed by feature
# standardization.

y = df['label'].values
# Cast to float so every estimator gets a homogeneous numeric matrix: `.values`
# on a frame that mixes boolean dummy columns with float columns yields an
# object array.
X = df.drop(['label'], axis='columns').values.astype(float)

# 80% train / 20% test; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The scaler is fitted on the training part only and then applied to both
# parts, so no test-set statistics enter the scaling.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
################## Naive Bayes ####################################
# Fit Gaussian Naive Bayes on the scaled training data and evaluate it on the
# held-out test data.

gaussian_classifier = GaussianNB()
gaussian_classifier.fit(X_train, y_train)

y_pred = gaussian_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
acc = accuracy_score(y_test, y_pred)
# Print the computed value (the original printed the `accuracy_score` function).
print("Accuracy Score:\n", acc)
print("Training set score: {:.3f}".format(gaussian_classifier.score(X_train,y_train)))
print("Test set score: {:.3f}".format(gaussian_classifier.score(X_test,y_test)))

report = classification_report(y_test,y_pred)
print(report)

# Fitted model attributes. Only the last expression of a cell is displayed, so
# the others are printed explicitly.
print("class_count_:", gaussian_classifier.class_count_)
print("classes_:", gaussian_classifier.classes_)
print("epsilon_:", gaussian_classifier.epsilon_)
gaussian_classifier.theta_

In [None]:
# Naive Bayes evaluated with 10-fold cross-validation.

# Metrics to compute; `cross_validate` reports each one as "test_<key>".
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}

# Consecutive folds without reshuffling.
kfold = KFold(n_splits=10)

# A single multi-metric run replaces three separate `cross_val_score` calls.
# Those calls used scoring='f1_score', which is not a registered scorer name
# (the built-in name is 'f1') and makes scikit-learn raise.
cv_results = cross_validate(gaussian_classifier, X, y, cv=kfold, scoring=scoring)
result1 = cv_results['test_f1_score']
result2 = cv_results['test_recall']
result3 = cv_results['test_precision']
print("Mean F1 Score = %.2f%% - SD F1 Score = %.2f%%" % (result1.mean()*100, result1.std()*100 ))
print("Mean Recall Score = %.2f%% - SD Recall = %.2f%%" % (result2.mean()*100, result2.std()*100 ))
print("Mean Precision Score = %.2f%% - SD Precision = %.2f%%" % (result3.mean()*100, result3.std()*100 ))

In [None]:
################## SVM ######################################
# Fit a linear-kernel support vector classifier on the scaled training data and
# evaluate it on the held-out test data.

svm_classifier = SVC(kernel = 'linear', random_state = 0)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
acc = accuracy_score(y_test, y_pred)
# Print the computed value (the original printed the `accuracy_score` function).
print("Accuracy Score:\n", acc)
print("Training set score: {:.3f}".format(svm_classifier.score(X_train,y_train)))
print("Test set score: {:.3f}".format(svm_classifier.score(X_test,y_test)))

report = classification_report(y_test,y_pred)
print(report)

In [None]:
# SVM evaluated with 10-fold cross-validation.

# Metrics to compute; `cross_validate` reports each one as "test_<key>".
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}

# Consecutive folds without reshuffling.
kfold = KFold(n_splits=10)

# `X` is the unscaled feature matrix, while the SVM above was trained on
# standardized data. Wrapping the classifier in a pipeline standardizes each
# fold using its own training part only. `cross_validate` clones the pipeline
# for every fold, so the already fitted `svm_classifier` is left untouched.
svm_cv_model = make_pipeline(StandardScaler(), svm_classifier)

# A single multi-metric run replaces three separate `cross_val_score` calls.
# Those calls used scoring='f1_score', which is not a registered scorer name
# (the built-in name is 'f1') and makes scikit-learn raise.
cv_results = cross_validate(svm_cv_model, X, y, cv=kfold, scoring=scoring)
result1 = cv_results['test_f1_score']
result2 = cv_results['test_recall']
result3 = cv_results['test_precision']
print("Mean F1 Score = %.2f%% - SD F1 Score = %.2f%%" % (result1.mean()*100, result1.std()*100 ))
print("Mean Recall Score = %.2f%% - SD Recall = %.2f%%" % (result2.mean()*100, result2.std()*100 ))
print("Mean Precision Score = %.2f%% - SD Precision = %.2f%%" % (result3.mean()*100, result3.std()*100 ))

In [None]:
################## Decision Tree #####################################
# Fit an entropy-based decision tree on the scaled training data and evaluate
# it on the held-out test data.

# random_state is fixed, as for the SVM and Random Forest models, so that
# re-running the cell produces the same tree.
dtree_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dtree_classifier.fit(X_train, y_train)

y_pred = dtree_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
acc = accuracy_score(y_test, y_pred)
# Print the computed value (the original printed the `accuracy_score` function).
print("Accuracy Score:\n", acc)
# Score the decision tree itself; the original reported the SVM's scores here.
print("Training set score: {:.3f}".format(dtree_classifier.score(X_train,y_train)))
print("Test set score: {:.3f}".format(dtree_classifier.score(X_test,y_test)))

report = classification_report(y_test,y_pred)
print(report)

In [None]:
# Decision Tree evaluated with 10-fold cross-validation.
# Relies on the `scoring` dict of scorers defined in the earlier k-fold cells.

# Consecutive folds without reshuffling.
kfold = KFold(n_splits=10)

# A single multi-metric run replaces three separate `cross_val_score` calls.
# Those calls used scoring='f1_score', which is not a registered scorer name
# (the built-in name is 'f1') and makes scikit-learn raise.
cv_results = cross_validate(dtree_classifier, X, y, cv=kfold, scoring=scoring)
result1 = cv_results['test_f1_score']
result2 = cv_results['test_recall']
result3 = cv_results['test_precision']
print("Mean F1 Score = %.2f%% - SD F1 Score = %.2f%%" % (result1.mean()*100, result1.std()*100 ))
print("Mean Recall Score = %.2f%% - SD Recall = %.2f%%" % (result2.mean()*100, result2.std()*100 ))
print("Mean Precision Score = %.2f%% - SD Precision = %.2f%%" % (result3.mean()*100, result3.std()*100 ))

In [None]:
################## Random Forest ######################################
# Fit a 10-tree, entropy-based random forest on the scaled training data and
# evaluate it on the held-out test data.

rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
acc = accuracy_score(y_test, y_pred)
# Print the computed value (the original printed the `accuracy_score` function).
print("Accuracy Score:\n", acc)
print("Training set score: {:.3f}".format(rf_classifier.score(X_train,y_train)))
print("Test set score: {:.3f}".format(rf_classifier.score(X_test,y_test)))

# Per-class precision / recall / F1 of the random forest
report = classification_report(y_test,y_pred)
print(report)

In [None]:
# Random Forest evaluated with 10-fold cross-validation.
# Relies on the `scoring` dict of scorers defined in the earlier k-fold cells.

# Consecutive folds without reshuffling.
kfold = KFold(n_splits=10)

# A single multi-metric run replaces three separate `cross_val_score` calls.
# Those calls used scoring='f1_score', which is not a registered scorer name
# (the built-in name is 'f1') and makes scikit-learn raise.
cv_results = cross_validate(rf_classifier, X, y, cv=kfold, scoring=scoring)
result1 = cv_results['test_f1_score']
result2 = cv_results['test_recall']
result3 = cv_results['test_precision']
print("Mean F1 Score = %.2f%% - SD F1 Score = %.2f%%" % (result1.mean()*100, result1.std()*100 ))
print("Mean Recall Score = %.2f%% - SD Recall = %.2f%%" % (result2.mean()*100, result2.std()*100 ))
print("Mean Precision Score = %.2f%% - SD Precision = %.2f%%" % (result3.mean()*100, result3.std()*100 ))

In [None]:
################## XG Boost ######################################################
# Fit an XGBoost classifier with its default settings on the scaled training
# data and evaluate it on the held-out test data.

xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

y_pred = xgb_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
acc = accuracy_score(y_test, y_pred)
# Print the computed value (the original printed the `accuracy_score` function).
print("Accuracy Score:\n", acc)
print("Training set score: {:.3f}".format(xgb_classifier.score(X_train,y_train)))
print("Test set score: {:.3f}".format(xgb_classifier.score(X_test,y_test)))

report = classification_report(y_test,y_pred)
print(report)

In [None]:
# XGBoost evaluated with 10-fold cross-validation.
# Relies on the `scoring` dict of scorers defined in the earlier k-fold cells.

# Consecutive folds without reshuffling.
kfold = KFold(n_splits=10)

# A single multi-metric run replaces three separate `cross_val_score` calls.
# Those calls used scoring='f1_score', which is not a registered scorer name
# (the built-in name is 'f1') and makes scikit-learn raise.
cv_results = cross_validate(xgb_classifier, X, y, cv=kfold, scoring=scoring)
result1 = cv_results['test_f1_score']
result2 = cv_results['test_recall']
result3 = cv_results['test_precision']
print("Mean F1 Score = %.2f%% - SD F1 Score = %.2f%%" % (result1.mean()*100, result1.std()*100 ))
print("Mean Recall Score = %.2f%% - SD Recall = %.2f%%" % (result2.mean()*100, result2.std()*100 ))
print("Mean Precision Score = %.2f%% - SD Precision = %.2f%%" % (result3.mean()*100, result3.std()*100 ))

In [None]:
################## K-Nearest Neighbors ###########################################
# Fit a 5-nearest-neighbours classifier (Minkowski metric with p = 2, i.e.
# Euclidean distance) on the scaled training data and evaluate it on the
# held-out test data.

knn_classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn_classifier.fit(X_train, y_train)

# Predict with the KNN model; the original called an undefined `classifier`.
y_pred = knn_classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
acc = accuracy_score(y_test, y_pred)
# Print the computed value (the original printed the `accuracy_score` function).
print("Accuracy Score:\n", acc)
print("Training set score: {:.3f}".format(knn_classifier.score(X_train,y_train)))
print("Test set score: {:.3f}".format(knn_classifier.score(X_test,y_test)))

report = classification_report(y_test,y_pred)
print(report)

In [None]:
# KNN evaluated with 10-fold cross-validation.
# Relies on the `scoring` dict of scorers defined in the earlier k-fold cells.

# Consecutive folds without reshuffling.
kfold = KFold(n_splits=10)

# `X` is the unscaled feature matrix, while the KNN model above was trained on
# standardized data and is distance based. Wrapping it in a pipeline
# standardizes each fold using its own training part only. `cross_validate`
# clones the pipeline for every fold, so the already fitted `knn_classifier`
# is left untouched.
knn_cv_model = make_pipeline(StandardScaler(), knn_classifier)

# A single multi-metric run replaces three separate `cross_val_score` calls.
# Those calls used scoring='f1_score', which is not a registered scorer name
# (the built-in name is 'f1') and makes scikit-learn raise.
cv_results = cross_validate(knn_cv_model, X, y, cv=kfold, scoring=scoring)
result1 = cv_results['test_f1_score']
result2 = cv_results['test_recall']
result3 = cv_results['test_precision']
print("Mean F1 Score = %.2f%% - SD F1 Score = %.2f%%" % (result1.mean()*100, result1.std()*100 ))
print("Mean Recall Score = %.2f%% - SD Recall = %.2f%%" % (result2.mean()*100, result2.std()*100 ))
print("Mean Precision Score = %.2f%% - SD Precision = %.2f%%" % (result3.mean()*100, result3.std()*100 ))