In [1]:
import random

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, log_loss

from src.dataset import get_dataset
from src.models.elog import ELOgPredictor
from src.models.frequency_random import FrequencyRandomMatchPredictor
from src.models.uniform_random import UniformRandomMatchPredictor

In [2]:
# Number of times the whole model/fold evaluation is repeated.
ITERATIONS = 1

# One seed shared by both random number generators used in this notebook.
SEED = 5438
random.seed(SEED)
np.random.seed(SEED)

In [3]:
def determine_target(row):
    """Encode the outcome of a match as an integer class label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            "home_score" and "away_score" entries.

    Returns:
        0 when the home score is greater than the away score, 1 when both
        scores are equal, and 2 in every other case.
    """
    home_goals = row["home_score"]
    away_goals = row["away_score"]

    if home_goals > away_goals:
        return 0
    if home_goals == away_goals:
        return 1
    return 2

In [4]:
# Load the match data through the project's dataset helper. The frame's "fold"
# column is what the cells below use to build the train/test splits.
dataset = get_dataset()

  mls_df = pd.read_csv("data/mls_matches.csv")


In [5]:
# Show the loaded frame as the cell output to check its columns and row count.
dataset

Unnamed: 0,date,home_team,home_score,away_score,away_team,neutral,fold
0,1971-08-07,Santa Cruz,1,4,Corinthians,False,brazil
1,1971-08-07,Internacional,0,0,Fluminense,False,brazil
2,1971-08-07,Palmeiras,1,0,Portuguesa-SP,False,brazil
3,1971-08-07,Ceará,0,0,Vasco da Gama,False,brazil
4,1971-08-07,Coritiba,0,2,Cruzeiro,False,brazil
...,...,...,...,...,...,...,...
143000,2017-07-06,Grenada,0,2,Barbados,False,international
143001,2016-08-25,Barawa,0,5,Tamil Eelam,False,international
143002,2016-08-26,Barawa,2,3,Chagos Islands,False,international
143003,2016-08-28,Chagos Islands,1,5,Tamil Eelam,True,international


In [6]:
# Predictor classes compared in the evaluation loop; each one is instantiated
# afresh for every fold.
model_classes = [
    FrequencyRandomMatchPredictor,
    UniformRandomMatchPredictor,
    ELOgPredictor,
]

# Folds held out for testing in this run. A wider list kept for reference:
# ["brazil", "libertadores", "mls", "europe", "international"]
folds_names = ["brazil", "international"]

In [7]:
# Leave-one-fold-out split: for each fold name, the rows tagged with that fold
# form the test set and all remaining rows form the training set.
folds_train = []
folds_test = []
for name in folds_names:
    in_fold = dataset["fold"] == name
    folds_train.append(dataset[~in_fold])
    folds_test.append(dataset[in_fold])

In [8]:
# Empty long-format table: one row per (metric, model, fold, iteration) value.
result_columns = ["metric", "model", "fold", "iteration", "value"]
results = pd.DataFrame(columns=result_columns)

In [9]:
# Evaluate every model class on every held-out fold, ITERATIONS times, and
# store each metric as one row of the long-format `results` table.

# The ground-truth labels depend only on the test fold, so derive them once per
# fold instead of once per (iteration, model, fold) combination.
# NOTE(review): this assumes the predictors do not modify the score columns of
# the frames they receive - confirm against the model implementations.
fold_targets = [
    fold_test.apply(determine_target, axis=1).to_numpy() for fold_test in folds_test
]

# Collect plain records and build the frame once at the end. Appending with
# `results.loc[len(results)] = ...` is slow and makes the cell non-idempotent:
# re-running it would append duplicate rows to the existing table.
records = []
for iteration in range(ITERATIONS):
    for model_class in model_classes:
        for fold_train, fold_test, fold_test_name, target in zip(
            folds_train, folds_test, folds_names, fold_targets
        ):
            # A fresh, untrained model for every (iteration, model, fold).
            model = model_class()
            model.fit(fold_train)
            pred = model.predict_proba(fold_test)
            # Hard prediction: index of the class with the highest probability.
            max_pred = np.argmax(pred, axis=1)
            report = classification_report(
                target,
                max_pred,
                target_names=["win", "draw", "loss"],
                output_dict=True,
                # Same numbers as the default ("warn" also reports 0), but
                # without a warning for every class that is never predicted.
                zero_division=0,
            )
            metrics = {
                "accuracy": report["accuracy"],
                "log_loss": log_loss(target, pred, labels=[0, 1, 2]),
                "win_precision": report["win"]["precision"],
                "win_recall": report["win"]["recall"],
                "draw_precision": report["draw"]["precision"],
                "draw_recall": report["draw"]["recall"],
                "loss_precision": report["loss"]["precision"],
                "loss_recall": report["loss"]["recall"],
                "weighted_precision": report["weighted avg"]["precision"],
                "weighted_recall": report["weighted avg"]["recall"],
            }
            records.extend(
                {
                    "metric": key,
                    "model": model_class.__name__,
                    "fold": fold_test_name,
                    "iteration": iteration + 1,  # 1-based for readability
                    "value": value,
                }
                for key, value in metrics.items()
            )

# Rebuild the table from scratch so that re-running this cell is idempotent.
results = pd.DataFrame(
    records, columns=["metric", "model", "fold", "iteration", "value"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_neutral_df["target"] = home_neutral_df.apply(determine_target, axis=1)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  

In [10]:
# Show the long-format results: one row per (metric, model, fold, iteration).
results

Unnamed: 0,metric,model,fold,iteration,value
0,accuracy,FrequencyRandomMatchPredictor,brazil,1,0.481029
1,log_loss,FrequencyRandomMatchPredictor,brazil,1,1.057446
2,win_precision,FrequencyRandomMatchPredictor,brazil,1,0.481029
3,win_recall,FrequencyRandomMatchPredictor,brazil,1,1.0
4,draw_precision,FrequencyRandomMatchPredictor,brazil,1,0.0
5,draw_recall,FrequencyRandomMatchPredictor,brazil,1,0.0
6,loss_precision,FrequencyRandomMatchPredictor,brazil,1,0.0
7,loss_recall,FrequencyRandomMatchPredictor,brazil,1,0.0
8,weighted_precision,FrequencyRandomMatchPredictor,brazil,1,0.231389
9,weighted_recall,FrequencyRandomMatchPredictor,brazil,1,0.481029


In [13]:
# Average over iterations within each (metric, model, fold) first, then over
# folds, so every fold carries the same weight in the final figure.
fold_means = (
    results
    .groupby(["metric", "model", "fold"])["value"]
    .mean()
    .reset_index()
)
model_means = (
    fold_means
    .groupby(["metric", "model"])["value"]
    .mean()
    .reset_index()
)

# Wide view for reading: one row per model, one column per metric.
model_means.pivot(index="model", columns="metric", values="value")

metric,accuracy,draw_precision,draw_recall,log_loss,loss_precision,loss_recall,weighted_precision,weighted_recall,win_precision,win_recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ELOgPredictor,0.536295,0.0,0.0,0.970716,0.451488,0.456333,0.394152,0.536295,0.565045,0.854041
FrequencyRandomMatchPredictor,0.485622,0.0,0.0,1.055025,0.0,0.0,0.23585,0.485622,0.485622,1.0
UniformRandomMatchPredictor,0.485622,0.0,0.0,1.098612,0.0,0.0,0.23585,0.485622,0.485622,1.0
