# Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
sns.set_style("darkgrid")

from IPython.display import Image
from pydotplus import graph_from_dot_data
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from statsmodels.stats.outliers_influence import variance_inflation_factor

from xgboost import XGBClassifier

import joblib

# Notebook display limits: show up to 50 columns and 500 rows of a DataFrame.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
# Default matplotlib figure size, given as (width, height).
plt.rcParams['figure.figsize'] = (30,30)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation warnings - consider narrowing the filter.
warnings.filterwarnings("ignore")

# User Test Function

Run the following two cells to test the functions created:

In [None]:
from user_test import user_test

In [None]:
# Invoke the test routine imported from the local `user_test` module
# (its behaviour is defined in that module, not in this notebook).
user_test()

# Summary of Project 

## Data Collection & Transformation

## Clustering

## Classification

## Case Study

We will conduct the case study in two parts: the first part is the pre-match prediction, and the second part is the post-match analysis.

__Djokovic beats Federer in Wimbledon 2019 Final over 5 sets in 4 hours and 57 minutes__

__Final Score: Djokovic 7-6(5) 1-6 7-6(4) 4-6 13-12(3) Federer__

--------------------------------------------------------------------------------------------------------------

The pre-match prediction model uses only 2 metrics: 
* __second_serves_return_points_won_%__
* __break_point_convert_%__

For this case study, we will use the following pre-match statistics available to us:
* __Tournament Average across first 6 matches before final__
* __YTD (2019) Player Statistics__
* __Previous H2H Accumulated Statistics prior to the match__

--------------------------------------------------------------------------------------------------------------

The match analysis model uses 6 metrics:
* __first_serves_points_won_%__
* __second_serves_points_won_%__
* __break_point_save_%__
* __first_serves_return_points_won_%__
* __break_point_convert_%__
* __break_points_return_total__

For this case study, we will use the following match statistics available to us:
* __Full Match Statistics__
* __Individual Sets Statistics__
* __Cumulative Win Probability over 5 sets__

### Extract Trained Models 

In [None]:
# Load the two trained, serialized models (paths are relative to the
# notebook's working directory).
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these
# files must come from a trusted source.
prediction = joblib.load('prediction.pkl')  # for pre-match prediction

analysis = joblib.load('analysis.pkl')      # for match analysis

### Data Preparation

In [None]:
# Load main dataset on matches
# (relative path, resolved against the notebook's working directory).
df_match = pd.read_csv('index_csv/clean_df_v3.0.csv')

In [None]:
# Flag the rows belonging to the case-study match (Wimbledon 2019 final).
final_rows = df_match['match_id'].eq('2019-540-MS001-10-1-d643-f324')

# Case-study rows, kept with their original row labels.
df_wim_2019_final = df_match.loc[final_rows]

# Every other match stays in df_match, re-labelled 0..n-1, so the final
# cannot leak into statistics computed from df_match further down.
df_match = df_match.loc[~final_rows].reset_index(drop=True)

In [None]:
# Feature columns consumed by each model.

# Pre-match prediction model: two return-game percentages.
pre_match = [
    'second_serves_return_points_won_%',
    'break_point_convert_%',
]

# Match-analysis model: six serve / return metrics.
match_analysis = [
    'first_serves_points_won_%',
    'second_serves_points_won_%',
    'break_point_save_%',
    'first_serves_return_points_won_%',
    'break_point_convert_%',
    'break_points_return_total',
]

#### Tournament Performance

Here we will extract the 6 matches that each player, Novak Djokovic and Roger Federer, played leading up to the final. From these 6 matches we will calculate the 2 features required by the model, which account for each player's form going into the final.

In [None]:
# Rows for each finalist at Wimbledon 2019. The final itself was removed
# from df_match earlier, so only the matches before it remain.

# Narrow step by step: season first, then tournament.
df_2019 = df_match.loc[df_match['year'].eq(2019)]
df_wim_2019 = df_2019.loc[df_2019['tourney_slug'].eq('wimbledon')]

# One frame per player, selected by player slug.
df_djokovic, df_federer = (
    df_wim_2019.loc[df_wim_2019['slug'].eq(player_slug)]
    for player_slug in ('d643', 'f324')
)

In [None]:
# Raw counts needed to rebuild the two pre-match percentages
# (a numerator and a denominator for each ratio).
raw_stats = [
    'second_serve_return_won',
    'second_serve_return_total',
    'break_points_converted',
    'break_points_return_total',
]

# Keep only those four columns for each player.
df_djokovic, df_federer = df_djokovic[raw_stats], df_federer[raw_stats]

In [None]:
# Tournament-level ratios for each player, computed from summed raw counts
# (total won / total played) rather than by averaging per-match percentages.
#
# Columns are addressed by label. The previous `df.sum()[0]` form used an
# integer key as a position on a label-indexed Series, which pandas has
# deprecated; that warning was hidden by the global warnings filter.

# Share of second-serve return points won.
djokovic_1 = (df_djokovic['second_serve_return_won'].sum()
              / df_djokovic['second_serve_return_total'].sum())
federer_1 = (df_federer['second_serve_return_won'].sum()
             / df_federer['second_serve_return_total'].sum())

# Share of break-point chances converted.
djokovic_2 = (df_djokovic['break_points_converted'].sum()
              / df_djokovic['break_points_return_total'].sum())
federer_2 = (df_federer['break_points_converted'].sum()
             / df_federer['break_points_return_total'].sum())

# One list per feature, ordered [Djokovic, Federer].
feature_1 = [djokovic_1, federer_1]
feature_2 = [djokovic_2, federer_2]

In [None]:
# Tournament-form feature table: one row per entry of feature_1 / feature_2,
# with columns laid out in the order given by pre_match.
tourney_average = pd.DataFrame(
    {
        'second_serves_return_points_won_%': feature_1,
        'break_point_convert_%': feature_2,
    },
    columns=pre_match,
)

#### YTD Player Statistics (01 Jan 2019 - 13 July 2019)

YTD Player Statistics from __1/1/2019 - 13/07/2019__

Wimbledon Final Date: __14/07/2019__

Extracted from: 
- Djokovic: https://www.ultimatetennisstatistics.com/playerProfile?playerId=4920
- Federer: https://www.ultimatetennisstatistics.com/playerProfile?playerId=3819

In [None]:
# Year-to-date figures entered by hand from the player profile pages.
# NOTE(review): presumably ordered [Djokovic, Federer] like the other
# feature tables - confirm against the source pages.
feature_1 = [0.551, 0.503]   # second-serve return points won
feature_2 = [0.50, 0.396]    # break points converted

# Feature table with columns laid out in the order given by pre_match.
ytd_average = pd.DataFrame(
    {
        'second_serves_return_points_won_%': feature_1,
        'break_point_convert_%': feature_2,
    },
    columns=pre_match,
)

#### Previous H2H Accumulated Statistics

This section accounts for how both players performed against each other in their head-to-head matches prior to this final. The resulting dataframe captures the match dynamic to be expected in this match.

In [None]:
# All rows for each player across the whole dataset, re-labelled 0..n-1.
df_djokovic = df_match.loc[df_match['slug'] == 'd643'].reset_index(drop=True)
df_federer = df_match.loc[df_match['slug'] == 'f324'].reset_index(drop=True)

In [None]:
# Keep only the head-to-head meetings: a row survives when its match_id
# contains the opponent's slug. A vectorised string test replaces the
# row-by-row drop loop, so it does not depend on the index being 0..n-1;
# na=False treats a missing match_id as "not a head-to-head match".

# The only other match they played in 2019, excluded from the H2H sample.
# NOTE(review): presumably excluded because it was played after the final -
# confirm.
other_2019_meeting = '2019-605-MS016-1-10-f324-d643'

# Filter only djokovic-federer match
df_djokovic = df_djokovic[
    df_djokovic['match_id'].str.contains('f324', regex=False, na=False)
    & (df_djokovic['match_id'] != other_2019_meeting)
]

# Filter only federer-djokovic match
df_federer = df_federer[
    df_federer['match_id'].str.contains('d643', regex=False, na=False)
    & (df_federer['match_id'] != other_2019_meeting)
]

In [None]:
# Raw counts needed to rebuild the two pre-match percentages
# (a numerator and a denominator for each ratio).
raw_stats = [
    'second_serve_return_won',
    'second_serve_return_total',
    'break_points_converted',
    'break_points_return_total',
]

# Keep only those four columns of the head-to-head rows for each player.
df_djokovic, df_federer = df_djokovic[raw_stats], df_federer[raw_stats]

In [None]:
# Head-to-head ratios for each player, computed from summed raw counts
# (total won / total played) rather than by averaging per-match percentages.
#
# Columns are addressed by label. The previous `df.sum()[0]` form used an
# integer key as a position on a label-indexed Series, which pandas has
# deprecated; that warning was hidden by the global warnings filter.

# Share of second-serve return points won.
djokovic_1 = (df_djokovic['second_serve_return_won'].sum()
              / df_djokovic['second_serve_return_total'].sum())
federer_1 = (df_federer['second_serve_return_won'].sum()
             / df_federer['second_serve_return_total'].sum())

# Share of break-point chances converted.
djokovic_2 = (df_djokovic['break_points_converted'].sum()
              / df_djokovic['break_points_return_total'].sum())
federer_2 = (df_federer['break_points_converted'].sum()
             / df_federer['break_points_return_total'].sum())

# One list per feature, ordered [Djokovic, Federer].
feature_1 = [djokovic_1, federer_1]
feature_2 = [djokovic_2, federer_2]

In [None]:
# Head-to-head feature table: one row per entry of feature_1 / feature_2,
# with columns laid out in the order given by pre_match.
h2h_average = pd.DataFrame(
    {
        'second_serves_return_points_won_%': feature_1,
        'break_point_convert_%': feature_2,
    },
    columns=pre_match,
)

#### Full Match Statistics

This dataframe consists of 2 rows of data, one per player, describing each player's full-match statistics. Player 0 is Novak Djokovic and Player 1 is Roger Federer.

In [None]:
# Select the match-analysis feature columns from the rows of the final
# that were set aside earlier.
df_full_match = df_wim_2019_final[match_analysis]

#### Individual Set Statistics

This dataset aims to capture the performance of both players by individual set during the match. Each set is represented as a row.

In [None]:
# Per-set statistics entered by hand: one row per set (sets 1-5), one
# column per match-analysis feature.

df_djokovic_individual_stats = pd.DataFrame(columns = match_analysis)

df_djokovic_individual_stats['first_serves_points_won_%']        = [0.88, 0.27, 0.78, 0.80, 0.74]
df_djokovic_individual_stats['second_serves_points_won_%']       = [0.52, 0.33, 0.69, 0.30, 0.43]
df_djokovic_individual_stats['break_point_save_%']               = [1.00, 0.25, 1.00, 0.00, 0.60]
df_djokovic_individual_stats['first_serves_return_points_won_%'] = [0.25, 0.00, 0.13, 0.24, 0.27]
df_djokovic_individual_stats['break_point_convert_%']            = [0.00, 0.00, 0.00, 0.50, 0.33]
df_djokovic_individual_stats['break_points_return_total']        = [0.00, 0.00, 0.00, 2.00, 6.00]

df_federer_individual_stats = pd.DataFrame(columns = match_analysis)

df_federer_individual_stats['first_serves_points_won_%']        = [0.75, 1.00, 0.88, 0.76, 0.73]
df_federer_individual_stats['second_serves_points_won_%']       = [0.54, 0.33, 0.67, 0.54, 0.50]
df_federer_individual_stats['break_point_save_%']               = [0.00, 0.00, 0.00, 0.50, 0.67]
df_federer_individual_stats['first_serves_return_points_won_%'] = [0.12, 0.73, 0.22, 0.20, 0.26]
df_federer_individual_stats['break_point_convert_%']            = [0.00, 0.75, 0.00, 1.00, 0.40]
# This column previously had no values assigned, so it stayed all-NaN.
# The counts below are reconstructed from figures already in this notebook:
# Federer's conversion rates 0.75 / 1.00 / 0.40 correspond to 3/4, 2/2 and
# 2/5, and his cumulative conversion rates of 0.6 and 0.5 after sets 2 and 3
# (3/5 and 3/6) leave one chance each in sets 1 and 3, matching Djokovic's
# break_point_save_% of 1.00 in those sets.
# TODO(review): confirm against the official match statistics.
df_federer_individual_stats['break_points_return_total']        = [1.00, 4.00, 1.00, 2.00, 5.00]

#### Cumulative Over Set Statistics

This section aims to capture the cumulative performance as the match progresses set by set; each row describes the cumulative statistics at the conclusion of each set.

In [None]:
# Cumulative statistics entered by hand: row k holds the running figures
# at the end of set k+1, one column per match-analysis feature.

df_djokovic_cumulative_stats = pd.DataFrame(columns = match_analysis)

df_djokovic_cumulative_stats['first_serves_points_won_%']        = [0.88, 0.694, 0.73, 0.7435, 0.743]
df_djokovic_cumulative_stats['second_serves_points_won_%']       = [0.52, 0.466, 0.535, 0.4905, 0.47]
df_djokovic_cumulative_stats['break_point_save_%']               = [1.00, 0.4, 0.5, 0.375, 0.46]
df_djokovic_cumulative_stats['first_serves_return_points_won_%'] = [0.25, 0.189, 0.164, 0.179, 0.21]
df_djokovic_cumulative_stats['break_point_convert_%']            = [0.00, 0.00, 0.00, 0.50, 0.375]
df_djokovic_cumulative_stats['break_points_return_total']        = [0.00, 0.00, 0.00, 2.00, 8.00]

df_federer_cumulative_stats = pd.DataFrame(columns = match_analysis)

df_federer_cumulative_stats['first_serves_points_won_%']        = [0.75, 0.811, 0.836, 0.821, 0.79]
df_federer_cumulative_stats['second_serves_points_won_%']       = [0.54, 0.455, 0.516, 0.523, 0.51]
df_federer_cumulative_stats['break_point_save_%']               = [0.00, 0.00, 0.00, 0.50, 0.625]
df_federer_cumulative_stats['first_serves_return_points_won_%'] = [0.12, 0.306, 0.27, 0.256, 0.26]
df_federer_cumulative_stats['break_point_convert_%']            = [0.00, 0.6, 0.5, 0.625, 0.54]
# This column previously had no values assigned, so it stayed all-NaN.
# The running totals below are reconstructed from the cumulative conversion
# rates on the line above (0.6 = 3/5, 0.5 = 3/6, 0.625 = 5/8, 0.54 = 7/13)
# and agree with Djokovic's cumulative break_point_save_% (0.4 = 2/5,
# 0.5 = 3/6, 0.375 = 3/8, 0.46 = 6/13).
# TODO(review): confirm against the official match statistics.
df_federer_cumulative_stats['break_points_return_total']        = [1.00, 5.00, 6.00, 8.00, 13.00]