In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Step 1: Load and preprocess the data
file_path = '/content/drive/MyDrive/Uwa FYP/Dataset/match-status-temp.csv'  # Adjust the path if needed
data = pd.read_csv(file_path)

# Corrected Date Parsing Code
# Allow pandas to infer the correct date format
data['date'] = pd.to_datetime(data['date'], errors='coerce', dayfirst=False)

# Check for any parsing errors (NaT values)
print("Number of missing dates:", data['date'].isna().sum())

# Optionally drop rows with invalid dates
data = data.dropna(subset=['date'])

# Create a new column `outcome` based on match result (win/loss/draw)
def match_outcome(row):
    if row['home_score'] > row['away_score']:
        return 'home_win'
    elif row['home_score'] < row['away_score']:
        return 'away_win'
    else:
        return 'draw'

data['outcome'] = data.apply(match_outcome, axis=1)

# Feature engineering: Extract `year`, `month`, and `day` from the date
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

# Prepare categorical encoders for teams and outcome
team_label_encoder = LabelEncoder()
outcome_label_encoder = LabelEncoder()

# Fit the encoders on all possible values in the dataset
team_label_encoder.fit(pd.concat([data['home_team'], data['away_team']]))

# Encode the categorical columns
data['home_team_encoded'] = team_label_encoder.transform(data['home_team'])
data['away_team_encoded'] = team_label_encoder.transform(data['away_team'])
data['outcome_encoded'] = outcome_label_encoder.fit_transform(data['outcome'])

# Normalize continuous columns such as temperature
scaler = MinMaxScaler()
data['temperature_scaled'] = scaler.fit_transform(data[['temperature']])

# Prepare the feature columns for the LSTM model
features = data[['home_team_encoded', 'away_team_encoded', 'year', 'month', 'day', 'temperature_scaled']]
target = data['outcome_encoded']

# Convert features to numpy array and reshape to match LSTM input requirements
X = np.array(features).reshape((features.shape[0], 1, features.shape[1]))

# One-hot encode the target
y = to_categorical(target)

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Build the LSTM model with additional layers and dropout
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))  # Dropout to prevent overfitting
model.add(LSTM(32, activation='relu'))
model.add(Dense(3, activation='softmax'))  # 3 classes: home_win, away_win, draw

# Compile the model with a lower learning rate for better training
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



Number of missing dates: 47272


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outcome'] = data.apply(match_outcome, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month'] = data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [14]:
# Step 4: Train the model with more epochs
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Step 5: Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")



Epoch 1/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 161ms/step - accuracy: 0.5880 - loss: 3.9179 - val_accuracy: 0.3077 - val_loss: 2.2972
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4763 - loss: 2.7343 - val_accuracy: 0.4231 - val_loss: 2.1690
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5036 - loss: 2.2863 - val_accuracy: 0.4231 - val_loss: 1.7232
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4995 - loss: 1.8308 - val_accuracy: 0.3077 - val_loss: 1.7665
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6197 - loss: 1.4119 - val_accuracy: 0.3077 - val_loss: 1.5823
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.5230 - loss: 1.8530 - val_accuracy: 0.4231 - val_loss: 1.2356
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━

In [17]:
# Save the model to an .h5 file
model.save('/content/drive/MyDrive/Uwa FYP/Models/football_prediction_model.h5')
print("Model saved to football_prediction_model.h5")

# Save the encoders and scaler
import joblib
joblib.dump(team_label_encoder, '/content/drive/MyDrive/Uwa FYP/Models/team_label_encoder.pkl')
joblib.dump(outcome_label_encoder, '/content/drive/MyDrive/Uwa FYP/Models/outcome_label_encoder.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/Uwa FYP/Models/scaler.pkl')



Model saved to football_prediction_model.h5


['/content/drive/MyDrive/Uwa FYP/Models/scaler.pkl']

In [5]:
# Step 6: Predict the outcome of a new match
def predict_outcome(home_team, away_team, year, month, day, temperature):
    # Encode and scale the input data
    home_team_encoded = team_label_encoder.transform([home_team])[0]
    away_team_encoded = team_label_encoder.transform([away_team])[0]
    temperature_scaled = scaler.transform([[temperature]])[0][0]

    # Prepare the input for the model
    input_data = np.array([[home_team_encoded, away_team_encoded, year, month, day, temperature_scaled]])
    input_data = input_data.reshape((1, 1, 6))

    # Make the prediction
    prediction = model.predict(input_data)
    outcome_index = np.argmax(prediction)

    # Map the prediction back to the original outcome labels
    outcome_label = outcome_label_encoder.inverse_transform([outcome_index])
    return outcome_label[0]




In [15]:
# Example usage:
home_team = 'England'
away_team = 'Scotland'
year = 2024
month = 11  # November
day = 25  # 25th day of the month
temperature = 15

predicted_outcome = predict_outcome(home_team, away_team, year, month, day, temperature)
print(f"The predicted outcome is: {predicted_outcome}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
The predicted outcome is: home_win
