Required Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, classification_report, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

Read Data

In [2]:
data = pd.read_csv("data/FIFA_World_Cup_Tidy.csv")

Change the continent column into dummy variables

In [3]:
data_continent = pd.get_dummies(data, columns = ['continent'])

Select columns to be used for prediction

In [4]:
data_predict = data_continent[['players_used', 'age', 'matches_played', 'goals', 'yellow_cards', 'red_cards', 'world_cup_winner', 'continent_Africa', 'continent_Americas', 'continent_Asia', 'continent_Europe', 'continent_Oceania']]

Training splits

In [5]:
X = data_predict.drop(columns=['world_cup_winner'])
y = data_predict[['world_cup_winner']]

In [6]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [8]:
y = y.values.ravel()
y_train = y_train.values.ravel()
y_valid = y_valid.values.ravel()

# Decision Tree Classifier

Find the most optimal amount of hyperparameters

In [9]:
dct_parameter = {'max_depth': [5, 6, 7, 8, 9, 10],
                 'max_features': [5, 6, 7, 8, 9, 10],
                 'max_leaf_nodes': [5, 6, 7, 8, 9, 10]}
dct_cv = ShuffleSplit(n_splits = 10, test_size = 0.70, random_state=0)
dct_scorer = make_scorer(accuracy_score)
dct = GridSearchCV(DecisionTreeClassifier(random_state=42), dct_parameter, cv=dct_cv)
dct.fit(X, y)

In [10]:
dct.best_estimator_

Predict based on the given best estimators

In [11]:
model_dct = DecisionTreeClassifier(max_depth=5, max_features=5, max_leaf_nodes=5, random_state=42, class_weight='balanced')
model_dct.fit(X_train, y_train)

In [12]:
y_pred = model_dct.predict(X_valid)
y_prob = model_dct.predict_proba(X_valid)[:, 1]

In [13]:
print("Classification Report:\n", classification_report(y_valid, y_pred), "\n")
print("Accuracy score:", accuracy_score(y_valid, y_pred))
print("Cohen’s Kappa score", cohen_kappa_score(y_valid, y_pred))
print("ROC-AUC score:", roc_auc_score(y_valid, y_prob))

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.88      0.93       108
           1       0.32      0.75      0.44         8

    accuracy                           0.87       116
   macro avg       0.65      0.81      0.69       116
weighted avg       0.93      0.87      0.89       116
 

Accuracy score: 0.8706896551724138
Cohen’s Kappa score 0.3847241867043848
ROC-AUC score: 0.818287037037037


# Random Forest Classifier

Find the most optimal amount of hyperparameters

In [14]:
rfc_parameter = {'n_estimators': [10, 15, 20, 25],
                 'max_depth': [5, 6, 7, 8, 9, 10],
                 'max_features': [5, 6, 7, 8, 9, 10],
                 'max_leaf_nodes': [10, 11, 12, 13, 14, 15]}
rfc_cv = ShuffleSplit(n_splits = 10, test_size = 0.30, random_state=0)
rfc_scorer = make_scorer(accuracy_score)
rfc = GridSearchCV(RandomForestClassifier(random_state=42), rfc_parameter, cv=rfc_cv, scoring=rfc_scorer)
rfc.fit(X, y)

In [15]:
rfc.best_estimator_

Predict based on the given best estimators

In [16]:
model_rfc = RandomForestClassifier(n_estimators=10, max_depth=5, max_features=8, max_leaf_nodes=11, random_state=42, class_weight='balanced')
model_rfc.fit(X_train, y_train)
y_pred = model_rfc.predict(X_valid)

In [17]:
y_pred = model_rfc.predict(X_valid)
y_prob = model_rfc.predict_proba(X_valid)[:, 1]

In [18]:
print("Classification Report:\n", classification_report(y_valid, y_pred), "\n")
print("Accuracy score:", accuracy_score(y_valid, y_pred))
print("Cohen’s Kappa score", cohen_kappa_score(y_valid, y_pred))
print("ROC-AUC score:", roc_auc_score(y_valid, y_prob))

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       108
           1       0.22      0.25      0.24         8

    accuracy                           0.89       116
   macro avg       0.58      0.59      0.59       116
weighted avg       0.89      0.89      0.89       116
 

Accuracy score: 0.8879310344827587
Cohen’s Kappa score 0.1750547045951859
ROC-AUC score: 0.8703703703703705


# Neural Networks

In [19]:
model_nn = MLPClassifier(solver='lbfgs', hidden_layer_sizes=((80, 50)), random_state=42, activation='logistic', max_iter=500)
model_nn.fit(X_train, y_train)

In [20]:
y_pred = model_nn.predict(X_valid)
y_prob = model_nn.predict_proba(X_valid)[:, 1]

In [None]:
print("Classification Report:\n", classification_report(y_valid, y_pred), "\n")
print("Accuracy score:", accuracy_score(y_valid, y_pred))
print("Cohen’s Kappa score", cohen_kappa_score(y_valid, y_pred))
print("ROC-AUC score:", roc_auc_score(y_valid, y_prob))

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94       108
           1       0.17      0.12      0.14         8

    accuracy                           0.90       116
   macro avg       0.55      0.54      0.54       116
weighted avg       0.88      0.90      0.89       116
 

Accuracy score: 0.896551724137931
Cohen’s Kappa score 0.08900523560209428
ROC-AUC score: 0.8252314814814814
