In [61]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from scipy.sparse import csr_matrix
from sklearn.metrics import classification_report

In [2]:
class FootballNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FootballNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.relu(self.fc1(x))
        out = self.relu(self.fc2(out))
        out = self.fc3(out)
        return out

In [116]:
class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [119]:
df = pd.read_csv("epl-training-with-weather.csv")
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HY,AY,HR,AR,temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),rain_sum (mm),snowfall_sum (cm),wind_speed_10m_max (km/h)
0,2000-08-19,Charlton,Man City,4.0,0.0,H,2.0,0.0,H,Rob Harris,...,1.0,2.0,0.0,0.0,20.5,13.7,16.9,2.6,0.0,16.0
1,2000-08-19,Chelsea,West Ham,4.0,2.0,H,1.0,0.0,H,Graham Barber,...,1.0,2.0,0.0,0.0,20.5,13.7,16.9,2.6,0.0,16.0
2,2000-08-19,Coventry,Middlesbrough,1.0,3.0,A,1.0,1.0,D,Barry Knight,...,5.0,3.0,1.0,0.0,18.0,11.4,14.4,0.9,0.0,17.7
3,2000-08-19,Derby,Southampton,2.0,2.0,D,1.0,2.0,A,Andy D'Urso,...,1.0,1.0,0.0,0.0,17.7,9.3,13.7,2.8,0.0,18.6
4,2000-08-19,Leeds,Everton,2.0,0.0,H,2.0,0.0,H,Dermot Gallagher,...,1.0,3.0,0.0,0.0,18.0,10.1,13.6,4.0,0.0,20.8


In [100]:
indices_to_drop = df[df['FTR'] == 'H'].index[:1500]
df.drop(indices_to_drop, inplace=True)

value_counts = df['FTR'].value_counts()
print(value_counts)

A    2584
H    2563
D    2193
Name: FTR, dtype: int64


In [120]:
# turn date into numerical values
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.drop('Date', axis=1, inplace=True)

In [121]:
# remove referee for now
df.drop('Referee', axis=1, inplace=True)

In [122]:
X = df.drop('FTR', axis=1)  # Features

In [123]:
categorical_features = ['HomeTeam', 'AwayTeam', 'HTR']
numerical_features = X.columns.difference(categorical_features)

# Define ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [124]:
X_processed = preprocessor.fit_transform(X)
if isinstance(X_processed, csr_matrix):
    X_processed = X_processed.toarray()

In [125]:
ohe = preprocessor.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(input_features=categorical_features)

# Combine all feature names
all_feature_names = numerical_features.tolist() + list(ohe_feature_names)

# Create a DataFrame from the processed data
X_processed_df = pd.DataFrame(X_processed, columns=all_feature_names)

In [126]:
label_enc = LabelEncoder()
df['FTR'] = label_enc.fit_transform(df['FTR'])
y = df['FTR']

In [127]:
x_train, x_test, y_train, y_test = train_test_split(X_processed_df, y, test_size=0.2, random_state=42)

In [128]:
x_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

x_test_tensor = torch.tensor(x_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [129]:
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)

In [130]:
batch_size = 64

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [131]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleNet(input_size=x_train.shape[1], hidden_size=128, num_classes=3).to(device)

In [132]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    for inputs, targets in train_loader:
        targets = targets.type(torch.LongTensor)
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for inputs,targets in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            y_pred.extend(predicted.cpu().numpy())
            y_true.extend(targets.cpu().numpy())


    report = classification_report(y_true, y_pred)
    print(report)

Epoch 1/5
              precision    recall  f1-score   support

         0.0       0.78      0.91      0.84       516
         1.0       0.77      0.40      0.53       460
         2.0       0.81      0.95      0.88       792

    accuracy                           0.80      1768
   macro avg       0.79      0.76      0.75      1768
weighted avg       0.79      0.80      0.78      1768

Epoch 2/5
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97       516
         1.0       0.99      0.89      0.94       460
         2.0       0.97      1.00      0.98       792

    accuracy                           0.97      1768
   macro avg       0.97      0.96      0.97      1768
weighted avg       0.97      0.97      0.97      1768

Epoch 3/5
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       516
         1.0       1.00      1.00      1.00       460
         2.0       1.00      1.00      1.00  