In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# import data: read the Bundesliga results CSV from the Kaggle input folder

csv_path = '../input/bundesliga-1-results-and-metrics-2011-to-2021/Bundesliga_1_Seasons_11_12_to_20_21.csv'
all_data = pd.read_csv(csv_path)

all_data.shape

In [None]:
# work on a copy so that the raw frame in `all_data` stays untouched
train_data = all_data.copy()

In [None]:
# preview the first rows of the working copy
train_data.head()

In [None]:
# column names, dtypes and non-null counts of the working copy
train_data.info()

In [None]:
# define target: name of the column the classifier is trained to predict

target = 'FullTimeResult'

In [None]:
# drop the match date and the team names, which are not used as features

train_data.drop(columns=['Date', 'HomeTeam', 'AwayTeam'], inplace=True)

In [None]:
# collect the names of all columns stored with dtype 'object'

object_cols = [
    column
    for column, dtype in train_data.dtypes.items()
    if dtype == 'object'
]

print("Categorical variables:", object_cols, sep="\n")

In [None]:
# label encoding: home win -> 1, draw -> 0, away win -> -1
#
# The encoded column is assigned back to the frame. Calling
# `.replace(..., inplace=True)` on a column selection works on an intermediate
# object and is not guaranteed to update `train_data` (it has no effect when
# pandas Copy-on-Write is active).
# NOTE: `map` turns every value other than 'H', 'D' or 'A' into NaN, so this
# cell has to run on the raw result codes (i.e. once per fresh `train_data`).

result_codes = {'H': 1, 'D': 0, 'A': -1}

for result_col in ('FullTimeResult', 'HalfTimeResult'):
    train_data[result_col] = train_data[result_col].map(result_codes)

In [None]:
# drop post game stats: remove every column listed below in a single call

post_game_cols = [
    'FullTimeHomeGoals', 'FullTimeAwayGoals',
    'HalfTimeHomeGoals', 'HalfTimeAwayGoals', 'HalfTimeResult',
    'HomeShots', 'AwayShots',
    'HomeShotsOnGoal', 'AwayShotsOnGoal',
    'HomeFouls', 'AwayFouls',
    'HomeCorners', 'AwayCorners',
    'HomeYellowCards', 'AwayYellowCards',
    'HomeRedCards', 'AwayRedCards',
    'HomePoints', 'AwayPoints',
]
train_data.drop(columns=post_game_cols, inplace=True)

In [None]:
# re-inspect the remaining columns, dtypes and non-null counts
train_data.info()

In [None]:
# search for missing values in data frame: null count and null share per column,
# each sorted from largest to smallest

null_counts = train_data.isnull().sum()
entry_counts = train_data.isnull().count()

Total = null_counts.sort_values(ascending=False)
percent = (null_counts / entry_counts).sort_values(ascending=False)
missing_data = pd.concat({'Total': Total, 'Percent': percent}, axis=1)

missing_data.head(10)

In [None]:
# drop features with over 10% missing data

sparse_cols = missing_data[missing_data['Percent'] > 0.10].index
train_data.drop(columns=sparse_cols, inplace=True)

# largest number of missing values left in any single column
print(train_data.isnull().sum().max())

In [None]:
# check how many missing values remain per column (largest counts first)

total = train_data.isnull().sum().sort_values(ascending=False)

total.head(5)

In [None]:
# spearman correlation between all numeric attributes (the target included)
#
# Only numeric/boolean columns are handed to `corr`: pandas >= 2.0 raises on
# non-numeric columns instead of silently leaving them out.

numeric_data = train_data.select_dtypes(include=['number', 'bool'])
train_corr_matrix = numeric_data.corr(method='spearman')

# heatmap of the absolute correlations, drawn on the axes created here
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(train_corr_matrix.abs(), vmax=.8, square=True, cmap='rocket_r', ax=ax)
ax.set_title('Absolute Spearman correlation of all numeric attributes');

In [None]:
# preselect features whose absolute correlation with the target exceeds 0.15

target_corr = train_corr_matrix[target]
prefeatures = target_corr[target_corr.abs() > 0.15].index.tolist()

# the target correlates perfectly with itself, so take it out of the list
prefeatures.remove(target)

print(prefeatures)

In [None]:
# spearman correlation among the preselected features

features_corr = train_data.loc[:, prefeatures]
features_corr_matrix = features_corr.corr(method='spearman')

# heatmap of the absolute pairwise correlations
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(features_corr_matrix.abs(), vmax=.8, square=True, cmap='rocket_r');

In [None]:
# identify strongly intercorrelating features: one row per feature pair, sorted
# ascending; pairs with a correlation of exactly 1 or inside [-0.9, 0.9] are discarded

pair_corr = features_corr_matrix.unstack().sort_values(kind='quicksort')
so = pair_corr.to_frame(name='correlation')

is_perfect = so['correlation'] == 1
is_weak = so['correlation'].between(-0.9, 0.9)
so = so[~(is_perfect | is_weak)]
so.shape

In [None]:
# first 10 of the remaining feature pairs (sorted ascending by correlation)
so.head(10)

In [None]:
# set final feature selection
#
# 'GoalDiffDiff' is excluded from the preselected features.
# NOTE(review): presumably chosen from the strongly correlated pairs listed
# above — confirm.
#
# `features` is built as a new list so that `prefeatures` is left unchanged,
# and the drop tolerates an already-removed column, which makes the cell safe
# to run more than once.

collinear_feature = 'GoalDiffDiff'

train_data = train_data.drop(columns=[collinear_feature], errors='ignore')
features = [name for name in prefeatures if name != collinear_feature]

print(features)

In [None]:
# train test split: 75 % training rows, 25 % validation rows
# NOTE: features and labels are read from `all_data`, not from `train_data`.

seed_value = 42
X, y = all_data[features], all_data[target]
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=seed_value,
)

In [None]:
# Randomized hyperparameter tuning 1/3: candidate values for the search
#
# Only values that RandomForestClassifier accepts are listed:
# - max_features: 'auto' was an alias of 'sqrt' for classifiers and is no
#   longer accepted by current scikit-learn releases; 'log2' is offered instead.
# - min_samples_split: an integer value has to be at least 2, so the former
#   candidate 1 (which made every fit using it fail) is left out.

n_estimators = [120, 300, 500, 800, 1200]
max_features = ['sqrt', 'log2', None]
max_depth = [5, 8, 15, 25, 30, None]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

# parameter name -> list of candidate values
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

print(random_grid)

In [None]:
# RFC: fit a random forest with fixed hyperparameters on the training split

params = dict(
    n_estimators=300,
    min_samples_split=100,
    min_samples_leaf=10,
    max_features='sqrt',
    max_depth=8,
)

rf_classifier = RandomForestClassifier(random_state=seed_value, **params)
rf_classifier.fit(train_X, train_y)

# predictions for the validation split and for the complete data set
# (the latter also covers the rows the model was fitted on)
rf_classifier_pred = rf_classifier.predict(val_X)
rf_classifier_pred_all = rf_classifier.predict(X)

In [None]:
# Randomized hyperparameter tuning 2/3
# Disabled randomized search over `random_grid`; uncomment the lines below to run it.
# NOTE(review): presumably commented out because of its runtime
# (n_iter = 100 candidates with cv = 3) — confirm.

#rf = rf_classifier
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
#rf_random.fit(train_X, train_y)
#rf_random_pred = rf_random.predict(val_X)

In [None]:
# Randomized hyperparameter tuning 3/3
# Displays the best parameter combination of the search; disabled together with it.

#rf_random.best_params_

In [None]:
# k-fold cv: 10 stratified, shuffled folds over the complete data set

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed_value)
results = cross_val_score(rf_classifier, X, y, cv=kfold)

# mean and standard deviation of the per-fold scores
cv_mean, cv_sigma = results.mean(), results.std()
print("Accuracy:", cv_mean, "Sigma:", cv_sigma)

In [None]:
# confusion matrix on the held-out validation split
#
# The matrix is built from `val_y` and `rf_classifier_pred` only. Scoring the
# predictions for the complete data set would include the rows the model was
# fitted on and therefore overstate its quality.
# Rows are the true classes, columns the predicted classes, both ordered as in
# `rf_classifier.classes_`.

cm = confusion_matrix(val_y, rf_classifier_pred, labels=rf_classifier.classes_)

print(rf_classifier.classes_)
print(cm)
print("Validation accuracy:", accuracy_score(val_y, rf_classifier_pred))

In [None]:
# generate output file: the input data plus one 'Prediction' column, written as CSV

prediction_data = pd.DataFrame(rf_classifier_pred_all, columns=['Prediction'])
output = all_data.join(prediction_data)
output.to_csv('prediction.csv', index=False)

# shapes of the joined frame, the input frame and the prediction frame
output.shape, all_data.shape, prediction_data.shape

In [None]:
# last 10 rows of the exported frame, including the 'Prediction' column
output.tail(10)