In [1]:
import pandas as pd
import numpy as np
import warnings
import os
import matplotlib.pyplot as plt

In [39]:
# Load the matches table from a CSV given by relative path, then preview the
# first five rows as the cell output to sanity-check the load.
matches_csv = "laliga_matches.csv"
df = pd.read_csv(matches_csv)
df.head(5)

Unnamed: 0,match_id,team1_name,team1_average_distance,team1_recovery_passes,team1_throw_in_passes,team1_free_kick_passes,team1_corner_passes,team1_goal_kick_passes,team1_interception_passes,team1_kick_off_passes,...,team2_avg_x_position_non_gk,team2_avg_links,team2_avg_pass_distance,team2_avg_backline_distance,team2_avg_front_three_distance,team2_avg_time_between_passes,team2_avg_passes_per_possession,team2_betti_0,team2_betti_1,match_outcome
0,69225,Barcelona,0.548764,0.099315,0.041096,0.032534,0.018836,0.008562,0.003425,0.001712,...,0.458763,7.071429,0.245051,0.420846,0.689028,3.686673,2.719298,1,54,team1_wins
1,69212,Barcelona,0.571302,0.045531,0.030354,0.028668,0.008432,0.003373,0.008432,0.003373,...,0.470242,9.928571,0.213755,0.420846,0.689028,3.35635,4.086538,1,67,team1_wins
2,69235,Málaga,0.627899,0.221477,0.107383,0.080537,0.003356,0.033557,0.013423,0.016779,...,0.520929,7.642857,0.218094,0.420846,0.689028,3.734753,2.647887,1,57,team2_wins
3,69232,Barcelona,0.533748,0.04321,0.029321,0.024691,0.009259,0.006173,0.003086,0.001543,...,0.50155,7.428571,0.236023,0.420846,0.689028,3.846556,3.346939,1,52,team1_wins
4,69216,Barcelona,0.59545,0.071082,0.029079,0.048465,0.008078,0.009693,0.001616,0.001616,...,0.483673,7.785714,0.26344,0.420846,0.689028,3.651864,2.962617,1,61,team1_wins


In [40]:
# Columns removed from the frame before it is used as the feature matrix,
# grouped by the reason they are excluded.
_target_columns = ["match_outcome"]  # the label that is predicted
_identifier_columns = ["match_id", "team1_name", "team2_name"]
# presumably player identifiers rather than numeric features — TODO confirm
_player_columns = ["team1_most_advanced_player", "team2_most_advanced_player"]

drop_columns = _target_columns + _identifier_columns + _player_columns

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

X = df.drop(columns=drop_columns)  # Features
y = df["match_outcome"]  # Target variable

# Encode the string outcomes as integer class ids; label_encoder.classes_
# keeps the class names for the later reports.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets (80/20, fixed seed).
# NOTE(review): the split is not stratified; consider stratify=y_encoded
# given the uneven class supports — doing so would change the numbers
# recorded in the outputs below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Keep copies of the original (unscaled) data for error analysis
X_train_original = X_train.copy()
X_test_original = X_test.copy()

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define Logistic Regression with L1 regularization.
# The liblinear solver uses a random number generator while fitting, so
# random_state is pinned to make the fitted model repeatable across runs.
logistic_model = LogisticRegression(
    C=4.5, penalty="l1", solver="liblinear", random_state=42
)
logistic_model.fit(X_train_scaled, y_train)

# Predict on training and testing sets
y_pred_train = logistic_model.predict(X_train_scaled)
y_pred_test = logistic_model.predict(X_test_scaled)

# Print accuracy scores
print(f"Train Accuracy: {accuracy_score(y_train, y_pred_train)}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")

# Create DataFrames for error analysis: unscaled features plus the true and
# predicted labels. The label arrays are NumPy arrays, so they are attached
# by position, which is the row order of the corresponding split.
# .assign returns new frames and leaves X_*_original untouched.
df_train = X_train_original.assign(
    true_label=y_train, predicted_label=y_pred_train
)
df_test = X_test_original.assign(
    true_label=y_test, predicted_label=y_pred_test
)

# Print classification report and confusion matrix
print("Classification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))

# Display the first few rows of the DataFrames
print("\nTraining DataFrame:")
print(df_train.head())
print("\nTesting DataFrame:")
print(df_test.head())

# Save DataFrames to CSV for further analysis (optional)
# df_train.to_csv('train_data_with_predictions.csv', index=False)
# df_test.to_csv('test_data_with_predictions.csv', index=False)


Train Accuracy: 0.9193083573487032
Test Accuracy: 0.867816091954023
Classification Report (Test):
              precision    recall  f1-score   support

           0       0.74      0.50      0.60        34
           1       0.91      0.99      0.95        87
           2       0.84      0.91      0.87        53

    accuracy                           0.87       174
   macro avg       0.83      0.80      0.81       174
weighted avg       0.86      0.87      0.86       174

Confusion Matrix (Test):
[[17  8  9]
 [ 1 86  0]
 [ 5  0 48]]

Training DataFrame:
     team1_average_distance  team1_recovery_passes  team1_throw_in_passes  \
761                0.687933               0.073394               0.024465   
662                0.686997               0.107527               0.107527   
457                0.559213               0.100000               0.056897   
522                0.688931               0.058917               0.022293   
742                0.576937               0.065606   

In [52]:
# Index labels of the training rows whose predicted label differs from the
# true label.
is_misclassified = df_train["true_label"].ne(df_train["predicted_label"])
error_rows = df_train.loc[is_misclassified].index

In [59]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Assuming df_train and df_test are already created
# df_train and df_test contain original features, true labels, and predicted labels

# Function to calculate per-class accuracy
def per_class_accuracy(y_true, y_pred, labels=None):
    """Return, for each class, the fraction of its true samples predicted correctly.

    The value for a class is the diagonal entry of the confusion matrix
    divided by the sum of that class's row (its number of true samples).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    labels : array-like, optional
        Explicit list of class labels, forwarded to ``confusion_matrix``.
        Pass it when the result is indexed by a fixed class list, so that a
        class missing from both inputs still gets an entry. With the default
        ``None`` the labels found in ``y_true``/``y_pred`` are used, as before.

    Returns
    -------
    numpy.ndarray
        One float per class, in confusion-matrix label order. A class with
        no true samples has an undefined accuracy and is reported as NaN.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    correct = cm.diagonal()
    support = cm.sum(axis=1)
    # Divide only where the class has true samples; elsewhere keep the NaN
    # pre-filled in `out` instead of triggering a 0/0 RuntimeWarning.
    per_class_acc = np.divide(
        correct,
        support,
        out=np.full(support.shape, np.nan, dtype=float),
        where=support > 0,
    )
    return per_class_acc

# Class names in encoder order, used to label the reports below.
class_names = label_encoder.classes_

# ---- Training split ----
print("Training Data Analysis:")
y_train_true, y_train_pred = df_train["true_label"], df_train["predicted_label"]

train_classification_report = classification_report(
    y_train_true, y_train_pred, target_names=class_names
)
train_confusion_matrix = confusion_matrix(y_train_true, y_train_pred)
train_per_class_acc = per_class_accuracy(y_train_true, y_train_pred)

print("Classification Report (Train):")
print(train_classification_report)
print("Confusion Matrix (Train):")
print(train_confusion_matrix)
print("Per-Class Accuracy (Train):")
for position, label in enumerate(class_names):
    print(f"{label}: {train_per_class_acc[position]:.4f}")

# ---- Testing split ----
print("\nTesting Data Analysis:")
y_test_true, y_test_pred = df_test["true_label"], df_test["predicted_label"]

test_classification_report = classification_report(
    y_test_true, y_test_pred, target_names=class_names
)
test_confusion_matrix = confusion_matrix(y_test_true, y_test_pred)
test_per_class_acc = per_class_accuracy(y_test_true, y_test_pred)

print("Classification Report (Test):")
print(test_classification_report)
print("Confusion Matrix (Test):")
print(test_confusion_matrix)
print("Per-Class Accuracy (Test):")
for position, label in enumerate(class_names):
    print(f"{label}: {test_per_class_acc[position]:.4f}")

# ---- Misclassified rows: true label differs from predicted label ----
train_is_wrong = df_train["true_label"].ne(df_train["predicted_label"])
test_is_wrong = df_test["true_label"].ne(df_test["predicted_label"])
df_train_misclassified = df_train[train_is_wrong]
df_test_misclassified = df_test[test_is_wrong]

print("\nMisclassified Samples in Training Data:")
print(df_train_misclassified.head())

print("\nMisclassified Samples in Testing Data:")
print(df_test_misclassified.head())


Training Data Analysis:
Classification Report (Train):
              precision    recall  f1-score   support

        draw       0.89      0.69      0.78       140
  team1_wins       0.93      0.98      0.95       333
  team2_wins       0.92      0.97      0.95       221

    accuracy                           0.92       694
   macro avg       0.91      0.88      0.89       694
weighted avg       0.92      0.92      0.92       694

Confusion Matrix (Train):
[[ 97  25  18]
 [  7 326   0]
 [  5   1 215]]
Per-Class Accuracy (Train):
draw: 0.6929
team1_wins: 0.9790
team2_wins: 0.9729

Testing Data Analysis:
Classification Report (Test):
              precision    recall  f1-score   support

        draw       0.74      0.50      0.60        34
  team1_wins       0.91      0.99      0.95        87
  team2_wins       0.84      0.91      0.87        53

    accuracy                           0.87       174
   macro avg       0.83      0.80      0.81       174
weighted avg       0.86      0.87

In [61]:
# Show df_train (the unscaled training features with the true_label and
# predicted_label columns added) as this cell's output.
df_train

Unnamed: 0,team1_average_distance,team1_recovery_passes,team1_throw_in_passes,team1_free_kick_passes,team1_corner_passes,team1_goal_kick_passes,team1_interception_passes,team1_kick_off_passes,team1_through_ball_percentage,team1_pass_completion_rate,...,team2_avg_links,team2_avg_pass_distance,team2_avg_backline_distance,team2_avg_front_three_distance,team2_avg_time_between_passes,team2_avg_passes_per_possession,team2_betti_0,team2_betti_1,true_label,predicted_label
761,0.687933,0.073394,0.024465,0.022936,0.003058,0.015291,0.006116,0.001529,1.223242,84.250765,...,8.214286,0.243489,0.420846,0.689028,3.700231,4.500000,1,60,1,1
662,0.686997,0.107527,0.107527,0.037634,0.005376,0.086022,0.005376,0.005376,1.612903,54.838710,...,9.307692,0.206373,0.420846,0.689028,3.732547,7.666667,1,57,1,1
457,0.559213,0.100000,0.056897,0.031034,0.013793,0.003448,0.006897,0.006897,0.000000,76.034483,...,7.928571,0.230521,0.420846,0.689028,4.442975,3.299065,1,56,2,1
522,0.688931,0.058917,0.022293,0.023885,0.009554,0.009554,0.011146,0.001592,1.114650,85.350318,...,8.857143,0.232554,0.420846,0.689028,4.243085,4.238095,1,64,1,1
742,0.576937,0.065606,0.031809,0.025845,0.021869,0.005964,0.013917,0.007952,0.000000,81.908549,...,9.214286,0.200794,0.420846,0.689028,3.922012,5.184874,1,61,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.592320,0.066547,0.050360,0.019784,0.007194,0.014388,0.005396,0.003597,2.877698,82.553957,...,8.214286,0.215615,0.420846,0.689028,3.586084,4.099010,1,54,1,1
270,0.573518,0.092050,0.026499,0.036262,0.012552,0.011158,0.015342,0.004184,0.139470,81.589958,...,7.285714,0.219707,0.420846,0.689028,3.282487,3.490909,1,55,2,2
860,0.645646,0.136364,0.043062,0.031100,0.009569,0.021531,0.000000,0.004785,1.196172,74.162679,...,8.538462,0.213053,0.420846,0.689028,3.584072,5.575000,1,53,0,0
435,0.611799,0.103873,0.049296,0.024648,0.012324,0.017606,0.017606,0.007042,0.176056,77.112676,...,6.428571,0.271201,0.420846,0.689028,4.306006,2.508772,1,45,0,1


In [67]:
# Count the misclassified training rows by their true class.
misclassified_true_labels = df_train.loc[error_rows, "true_label"]
misclassified_true_labels.value_counts()

true_label
0    43
1     7
2     6
Name: count, dtype: int64

In [70]:
import numpy as np
import pandas as pd
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Relies on X, y_encoded and label_encoder defined in the earlier training
# cell, and on train_test_split / StandardScaler imported there.
# Split the data into training and testing sets (same seed and test_size as
# the unweighted run).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Keep copies of the original (unscaled) data for error analysis
X_train_original = X_train.copy()
X_test_original = X_test.copy()

# Standardize the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Calculate "balanced" class weights from the training labels.
train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=train_classes,
    y=y_train,
)
# Key each weight by the class label it was computed for, not by its array
# position, so the mapping stays correct even if labels are not 0..n-1.
class_weights_dict = {
    label: weight for label, weight in zip(train_classes.tolist(), class_weights)
}

# Define Logistic Regression with L1 regularization and class weights.
# The liblinear solver uses a random number generator while fitting, so
# random_state is pinned to make the fitted model repeatable across runs.
logistic_model = LogisticRegression(
    C=4.5,
    penalty="l1",
    solver="liblinear",
    class_weight=class_weights_dict,
    random_state=42,
)
logistic_model.fit(X_train_scaled, y_train)

# Predict on training and testing sets
y_pred_train = logistic_model.predict(X_train_scaled)
y_pred_test = logistic_model.predict(X_test_scaled)

# Print accuracy scores
print(f"Train Accuracy: {accuracy_score(y_train, y_pred_train)}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")

# Create DataFrames for error analysis: unscaled features plus the true and
# predicted labels, attached by position (the label arrays are NumPy arrays).
# NOTE: this rebinds df_train / df_test to the class-weighted model's
# predictions; error_rows computed in the earlier cell still refers to the
# unweighted model's mistakes.
df_train = X_train_original.assign(
    true_label=y_train, predicted_label=y_pred_train
)
df_test = X_test_original.assign(
    true_label=y_test, predicted_label=y_pred_test
)

# Function to calculate per-class accuracy
def per_class_accuracy(y_true, y_pred, labels=None):
    """Return, for each class, the fraction of its true samples predicted correctly.

    The value for a class is the diagonal entry of the confusion matrix
    divided by the sum of that class's row (its number of true samples).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    labels : array-like, optional
        Explicit list of class labels, forwarded to ``confusion_matrix``.
        Pass it when the result is indexed by a fixed class list, so that a
        class missing from both inputs still gets an entry. With the default
        ``None`` the labels found in ``y_true``/``y_pred`` are used, as before.

    Returns
    -------
    numpy.ndarray
        One float per class, in confusion-matrix label order. A class with
        no true samples has an undefined accuracy and is reported as NaN.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    correct = cm.diagonal()
    support = cm.sum(axis=1)
    # Divide only where the class has true samples; elsewhere keep the NaN
    # pre-filled in `out` instead of triggering a 0/0 RuntimeWarning.
    per_class_acc = np.divide(
        correct,
        support,
        out=np.full(support.shape, np.nan, dtype=float),
        where=support > 0,
    )
    return per_class_acc

# Class names in encoder order, used to label the reports below.
class_names = label_encoder.classes_

# ---- Training split ----
print("Training Data Analysis:")
train_classification_report = classification_report(
    y_train, y_pred_train, target_names=class_names
)
train_confusion_matrix = confusion_matrix(y_train, y_pred_train)
train_per_class_acc = per_class_accuracy(y_train, y_pred_train)

print("Classification Report (Train):")
print(train_classification_report)
print("Confusion Matrix (Train):")
print(train_confusion_matrix)
print("Per-Class Accuracy (Train):")
for position, label in enumerate(class_names):
    print(f"{label}: {train_per_class_acc[position]:.4f}")

# ---- Testing split ----
print("\nTesting Data Analysis:")
test_classification_report = classification_report(
    y_test, y_pred_test, target_names=class_names
)
test_confusion_matrix = confusion_matrix(y_test, y_pred_test)
test_per_class_acc = per_class_accuracy(y_test, y_pred_test)

print("Classification Report (Test):")
print(test_classification_report)
print("Confusion Matrix (Test):")
print(test_confusion_matrix)
print("Per-Class Accuracy (Test):")
for position, label in enumerate(class_names):
    print(f"{label}: {test_per_class_acc[position]:.4f}")

# ---- Misclassified rows: true label differs from predicted label ----
train_is_wrong = df_train["true_label"].ne(df_train["predicted_label"])
test_is_wrong = df_test["true_label"].ne(df_test["predicted_label"])
df_train_misclassified = df_train[train_is_wrong]
df_test_misclassified = df_test[test_is_wrong]

print("\nMisclassified Samples in Training Data:")
print(df_train_misclassified.head())

print("\nMisclassified Samples in Testing Data:")
print(df_test_misclassified.head())


Train Accuracy: 0.920749279538905
Test Accuracy: 0.8735632183908046
Training Data Analysis:
Classification Report (Train):
              precision    recall  f1-score   support

        draw       0.82      0.77      0.80       140
  team1_wins       0.95      0.96      0.95       333
  team2_wins       0.93      0.96      0.95       221

    accuracy                           0.92       694
   macro avg       0.90      0.90      0.90       694
weighted avg       0.92      0.92      0.92       694

Confusion Matrix (Train):
[[108  17  15]
 [ 14 319   0]
 [  9   0 212]]
Per-Class Accuracy (Train):
draw: 0.7714
team1_wins: 0.9580
team2_wins: 0.9593

Testing Data Analysis:
Classification Report (Test):
              precision    recall  f1-score   support

        draw       0.71      0.59      0.65        34
  team1_wins       0.94      0.97      0.95        87
  team2_wins       0.84      0.91      0.87        53

    accuracy                           0.87       174
   macro avg       