### Load Packages & Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# TidyTuesday (2023-04-04) soccer results file, read straight from GitHub
DATA_URL = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-04-04/soccer21-22.csv'
dat = pd.read_csv(DATA_URL)

# Preview the first five rows
dat.head(5)

### EDA

In [None]:
## Count the missing values in each column
dat.isna().sum()

In [None]:
## Bar chart of how often each Full Time Result (FTR) value occurs
sns.countplot(data = dat, x = 'FTR')

In [None]:
# Same information as the count plot, as exact numbers per FTR value
dat['FTR'].value_counts()

### Get Key Model Features and Create a Plot

In [None]:
# List every available column before choosing the model features
dat.columns

In [None]:
# Outcome column (FTR) followed by the 14 columns used as model features
model_columns = [
    'FTR',
    'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
    'HY', 'AY', 'HR', 'AR',
]
dat_model = dat[model_columns]

dat_model.head()

In [None]:
# Confirm the modelling frame holds only the selected columns
dat_model.columns

In [None]:
# Reshape to long format: FTR stays as the identifier, every feature column
# is stacked into a ('variable', 'value') pair so the features can be faceted.
feature_columns = [
    'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
    'HY', 'AY', 'HR', 'AR',
]
dat_model_long = dat_model.melt(id_vars = 'FTR', value_vars = feature_columns)

dat_model_long.head()

In [None]:
# Distribution of all feature values pooled together, split by match result
sns.boxplot(data = dat_model_long, x = 'FTR', y = 'value')

In [None]:
# One boxplot panel per feature, each with its own axes (features are on different scales).
# FacetGrid.map calls the plotting function on each facet's subset separately, so a
# categorical plot needs an explicit `order`; otherwise each facet may lay out the FTR
# categories differently and the panels are not comparable.
ftr_order = sorted(dat_model_long['FTR'].dropna().unique())

f = sns.FacetGrid(dat_model_long, col = "variable", col_wrap = 4, sharey = False, sharex = False)
# No `hue` is mapped, so there is nothing for a legend to show; the grid is returned for display.
f.map(sns.boxplot, 'FTR', 'value', order = ftr_order)

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the predictors from the outcome (FTR)
features = dat_model.drop('FTR', axis=1)
outcome = dat_model['FTR']

# Hold out 25% of the matches for testing. Stratifying on the outcome keeps the
# class proportions the same in both partitions; the fixed random_state makes the
# split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    outcome,
    test_size=0.25,
    stratify=outcome,
    random_state=30
)

In [None]:
# Peek at the training predictors (FTR has been removed)
X_train.head()

In [None]:
# Share of each outcome class in the training set; the split was stratified,
# so these should closely match the test-set shares
y_train.value_counts() / len(y_train)

In [None]:
# Share of each outcome class in the test set, for comparison with the training set
y_test.value_counts() / len(y_test)

### Set Up Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base estimator for the grid search. The seed is fixed (same value as the CV
# splitter) so that cross-validated scores are reproducible between runs.
rf_model = RandomForestClassifier(random_state = 30)

### Perform Hyperparameter Tuning on Cross Validation Folds

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Parameter grid to explore: five evenly spaced candidates per hyperparameter,
# truncated to integers
max_features_grid = np.linspace(start = 1, stop = 14, num = 5).astype(int)
n_estimators_grid = np.linspace(start = 400, stop = 900, num = 5).astype(int)

forest_params = [
    {
        'max_features': list(max_features_grid),
        'n_estimators': list(n_estimators_grid),
    }
]

In [None]:
# Create the cross-validation structure: 5 stratified folds, a single repeat,
# with a fixed seed so the fold assignment is reproducible
cv_structure = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 1, random_state=30)

# Set up the grid search using one-vs-rest ROC AUC ('roc_auc_ovr') as the
# performance metric (not accuracy); n_jobs = -1 uses all available cores
rf_cv = GridSearchCV(estimator = rf_model,
                     param_grid = forest_params,
                     scoring = 'roc_auc_ovr',
                     n_jobs = -1,
                     cv = cv_structure,
                     verbose = 2)

### Train the model using Cross Validation

In [None]:
# Run the grid search on the training data only; every hyperparameter
# combination is scored across the CV folds (progress is printed, verbose = 2)
rf_cv.fit(X = X_train, y = y_train)

### Best Parameters

In [None]:
# Hyperparameter combination with the best mean cross-validated score
rf_cv.best_params_

In [None]:
# Best model: the estimator configured with the winning hyperparameters
rf_cv.best_estimator_ 

In [None]:
# Cross-validated scoring, shown as a table instead of the raw cv_results_ dict:
# one row per hyperparameter combination, best-ranked first
(
    pd.DataFrame(rf_cv.cv_results_)
    .sort_values('rank_test_score')
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
)

In [None]:
# Store the best parameters.
# NOTE: the variable names do not match their contents —
#   `mtry`       holds the number of trees (n_estimators)
#   `n_features` holds the number of features tried per split (max_features)
# The refit cell below passes them to the matching keyword arguments, so the
# model is configured correctly; only the names are misleading.
mtry = rf_cv.best_params_['n_estimators']
n_features = rf_cv.best_params_['max_features']

In [None]:
# Display the stored values as (n_estimators, max_features)
mtry, n_features

### Refit the model using the optimal hyperparameters

In [None]:
# Refit a forest on the full training set with the tuned hyperparameters.
# `mtry` holds n_estimators and `n_features` holds max_features (see where they
# are stored). The fixed random_state makes the fitted model, and therefore the
# importances and test metrics below, reproducible.
rfc_optimized = RandomForestClassifier(
    n_estimators = mtry,
    max_features = n_features,
    random_state = 30
)
rfc_optimized.fit(X = X_train, y = y_train)

In [None]:
# Display the refitted estimator and its settings
rfc_optimized

### Variable Importance Plot

#### Method-chained version

In [None]:
# Pair each importance score with its feature name, then plot from most to
# least important
importance_by_feature = pd.Series(
    rfc_optimized.feature_importances_,
    index=rfc_optimized.feature_names_in_
)
ranked_importance = importance_by_feature.sort_values(ascending=False)
ranked_importance.plot(kind='bar', title='Feature Importance')

#### original version:

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Raw importance scores from the fitted forest, one per feature column
importances = rfc_optimized.feature_importances_
importances

In [None]:
## Positions of the importances ordered from largest to smallest
ascending_positions = np.argsort(importances)
indices = np.flip(ascending_positions)
indices

In [None]:
## Feature names reordered to line up with the sorted importances
names = list(X_test.columns[indices])
names

In [None]:
# one bar position per feature column
bar_positions = range(X_test.shape[1])

# bars, tallest (most important) first
plt.bar(bar_positions, importances[indices])

# title
plt.title("Feature Importance")

# label each bar with its feature name, rotated so the labels do not overlap
plt.xticks(bar_positions, names, rotation = 90)

### Get Predictions on Test Set

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Predicted FTR class for every row of the held-out test set
rfc_test_pred = rfc_optimized.predict(X_test)

In [None]:
# Confusion matrix on the test set; per scikit-learn's convention,
# rows are the true classes and columns the predicted classes
confusion_matrix(y_test, rfc_test_pred)

In [None]:
# Per-class metrics on the test set; classification_report returns a
# preformatted string, so print() is used to render its line breaks
print(classification_report(y_test, rfc_test_pred))

### Save the model

In [None]:
from pickle import dump

In [None]:
# Serialize the fitted forest to disk. The context manager guarantees the file
# is flushed and closed even if pickling raises.
with open('rfc_optimized.pkl', 'wb') as model_file:
    dump(rfc_optimized, model_file)

### Load the model

In [None]:
from pickle import load

In [None]:
# Restore the saved forest. The context manager closes the file handle once
# loading is done.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source (here, the file written by this notebook).
with open('rfc_optimized.pkl', 'rb') as model_file:
    model = load(model_file)

In [None]:
# Display the reloaded estimator to confirm it matches the one that was saved
model

In [None]:
# Get two observations (index labels 180 and 242) to demonstrate prediction.
# NOTE(review): these rows come from the same frame that was split into
# train/test, so they may have been seen during training — they illustrate the
# prediction workflow, not out-of-sample performance.
new_dat = dat_model.loc[[180, 242]]
new_dat

In [None]:
## Get the features by themselves to make predictions.
## Selecting the exact columns the model was fitted on (in training order) keeps
## this cell re-runnable: dropping only 'FTR' would also pass along 'pred_ftr'
## once the prediction column has been added to new_dat.
new_feats = new_dat[list(model.feature_names_in_)]
new_feats

In [None]:
## Add predictions to the new data.
## .assign returns a new frame rather than writing into a slice of dat_model,
## which avoids pandas' SettingWithCopyWarning and is safe to re-run.
new_dat = new_dat.assign(pred_ftr = model.predict(new_feats))
new_dat