In [2]:
#This code imports the pandas and matplotlib libraries, reads a CSV file containing Premier League match data into a DataFrame named df.

import pandas as pd
import matplotlib.pyplot as plt

# Path to the match-level CSV, relative to the notebook's working directory.
data = 'data/premier-league-matches.csv'

# Load every match into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=data)

FileNotFoundError: [Errno 2] No such file or directory: 'data/premier-league-matches.csv'

In [None]:
#This code processes match outcomes and calculates goal differences.

# Parse match dates so later cells can sort chronologically.
df['Date'] = pd.to_datetime(df['Date'])

# One 0/1 indicator column per full-time result code found in FTR.
for flag_column, result_code in (('HomeWin', 'H'), ('AwayWin', 'A'), ('Draw', 'D')):
    df[flag_column] = df['FTR'].eq(result_code).astype(int)

# Positive when the home side outscored the away side.
df['GoalDifference'] = df['HomeGoals'].sub(df['AwayGoals'])

In [None]:
#This code calculates team statistics, points, and rankings for each season based on match outcomes and goal data.

# Final league table for every season, keyed by Season_End_Year.
# Each table is stored so it can be inspected afterwards; without this the
# loop rebinds `ranking` on every pass and only the last season survives.
season_tables = {}

for season in df['Season_End_Year'].unique():
    season_data = df[df['Season_End_Year'] == season]

    # Results from the home side's perspective: 'H' is a win, 'A' is a loss.
    home_stats = season_data.groupby('Home').agg(
        Wins=('FTR', lambda x: (x == 'H').sum()),
        Draws=('FTR', lambda x: (x == 'D').sum()),
        Losses=('FTR', lambda x: (x == 'A').sum()),
        Goals_Scored=('HomeGoals', 'sum'),
        Goals_Conceded=('AwayGoals', 'sum')
    ).reset_index().rename(columns={'Home': 'Team'})

    # Mirror image for the away side: 'A' is a win, 'H' is a loss.
    away_stats = season_data.groupby('Away').agg(
        Wins=('FTR', lambda x: (x == 'A').sum()),
        Draws=('FTR', lambda x: (x == 'D').sum()),
        Losses=('FTR', lambda x: (x == 'H').sum()),
        Goals_Scored=('AwayGoals', 'sum'),
        Goals_Conceded=('HomeGoals', 'sum')
    ).reset_index().rename(columns={'Away': 'Team'})

    # A team has one row in each per-side table; summing merges them.
    combined_stats = pd.concat([home_stats, away_stats]).groupby('Team').sum().reset_index()

    combined_stats['Goal_Difference'] = combined_stats['Goals_Scored'] - combined_stats['Goals_Conceded']
    combined_stats['Points'] = combined_stats['Wins'] * 3 + combined_stats['Draws']

    # Order by points, then goal difference, then goals scored (all descending).
    ranking = combined_stats.sort_values(by=['Points', 'Goal_Difference', 'Goals_Scored'], ascending=False)
    ranking['Rank'] = range(1, len(ranking) + 1)
    ranking['Season_End_Year'] = season

    season_tables[season] = ranking

In [None]:
#This code calculates team rankings and adds previous season ranks to the dataset.

# Maps Season_End_Year -> {team: finishing rank in that season}.
rankings_dict = {}

for season in df['Season_End_Year'].unique():
    season_data = df[df['Season_End_Year'] == season]

    # Results from the home side's perspective: 'H' is a win, 'A' is a loss.
    home_stats = season_data.groupby('Home').agg(
        Wins=('FTR', lambda x: (x == 'H').sum()),
        Draws=('FTR', lambda x: (x == 'D').sum()),
        Losses=('FTR', lambda x: (x == 'A').sum()),
        Goals_Scored=('HomeGoals', 'sum'),
        Goals_Conceded=('AwayGoals', 'sum')
    ).reset_index().rename(columns={'Home': 'Team'})

    # Mirror image for the away side: 'A' is a win, 'H' is a loss.
    away_stats = season_data.groupby('Away').agg(
        Wins=('FTR', lambda x: (x == 'A').sum()),
        Draws=('FTR', lambda x: (x == 'D').sum()),
        Losses=('FTR', lambda x: (x == 'H').sum()),
        Goals_Scored=('AwayGoals', 'sum'),
        Goals_Conceded=('HomeGoals', 'sum')
    ).reset_index().rename(columns={'Away': 'Team'})

    # A team has one row in each per-side table; summing merges them.
    combined_stats = pd.concat([home_stats, away_stats]).groupby('Team').sum().reset_index()
    combined_stats['Goal_Difference'] = combined_stats['Goals_Scored'] - combined_stats['Goals_Conceded']
    combined_stats['Points'] = combined_stats['Wins'] * 3 + combined_stats['Draws']

    # Order by points, then goal difference, then goals scored (all descending).
    ranking = combined_stats.sort_values(by=['Points', 'Goal_Difference', 'Goals_Scored'], ascending=False)
    ranking['Rank'] = range(1, len(ranking) + 1)

    # Store the real finishing positions. The ranks of teams absent from the
    # previous season must NOT be overwritten here: doing so flattened the
    # whole first season to a single rank and erased the true finish of every
    # newly promoted club. Teams with no previous-season entry are already
    # handled at lookup time by get_last_season_rank's default.
    rankings_dict[season] = ranking.set_index('Team')['Rank'].to_dict()

def get_last_season_rank(team, season):
    """Look up `team`'s rank in the season before `season`.

    Reads the module-level `rankings_dict`. Returns None when no table is
    stored for `season - 1`. When a table exists but does not list `team`,
    returns one more than the largest rank in that table.
    """
    previous_table = rankings_dict.get(season - 1)
    if previous_table is None:
        return None
    # Computed up front (as the lookup default) for teams missing from the table.
    unranked_default = max(previous_table.values()) + 1
    return previous_table.get(team, unranked_default)

# Attach the previous-season rank of each side to every match row.
for side in ('Home', 'Away'):
    df[f'{side}_Last_Season_Rank'] = df.apply(
        lambda row, side=side: get_last_season_rank(row[side], row['Season_End_Year']),
        axis=1,
    )

In [None]:
#This code calculates average goals scored and conceded per team per season.

# Maps Season_End_Year -> {team: {'Avg_Goals_Scored': ..., 'Avg_Goals_Conceded': ...}}.
team_season_stats = {}

# (grouping column, goals-for column, goals-against column) for each side.
_SIDE_COLUMNS = (
    ('Home', 'HomeGoals', 'AwayGoals'),
    ('Away', 'AwayGoals', 'HomeGoals'),
)

for season in df['Season_End_Year'].unique():
    fixtures = df[df['Season_End_Year'] == season]

    per_side_totals = []
    for side, goals_for, goals_against in _SIDE_COLUMNS:
        totals = fixtures.groupby(side).agg(
            Goals_Scored=(goals_for, 'sum'),
            Goals_Conceded=(goals_against, 'sum'),
            Matches=(goals_for, 'count'),
        )
        per_side_totals.append(totals.reset_index().rename(columns={side: 'Team'}))

    # Sum the home and away rows of each team into season totals.
    combined_stats = pd.concat(per_side_totals).groupby('Team').sum().reset_index()

    matches_played = combined_stats['Matches']
    combined_stats['Avg_Goals_Scored'] = combined_stats['Goals_Scored'] / matches_played
    combined_stats['Avg_Goals_Conceded'] = combined_stats['Goals_Conceded'] / matches_played

    averages = combined_stats.set_index('Team')[['Avg_Goals_Scored', 'Avg_Goals_Conceded']]
    team_season_stats[season] = averages.to_dict('index')

In [None]:
#This function retrieves a team's average stat from the previous season.

import numpy as np

def get_prev_season_avg(team, season, stat):
    """Return `team`'s `stat` average from the season before `season`.

    Reads the module-level `team_season_stats`. Yields NaN when the previous
    season is not stored, when `team` has no entry in it, or when the entry
    has no value for `stat`.
    """
    season_table = team_season_stats.get(season - 1)
    if season_table is None:
        return np.nan
    return season_table.get(team, {}).get(stat, np.nan)

In [None]:
#This code adds columns for each team's average goals scored and conceded in the previous season for both home and away teams.

# One column per (side, stat) pair, named '<side>_<stat>_Prev_Season'.
for side in ('Home', 'Away'):
    for stat in ('Avg_Goals_Scored', 'Avg_Goals_Conceded'):
        df[f'{side}_{stat}_Prev_Season'] = df.apply(
            lambda row, side=side, stat=stat: get_prev_season_avg(
                row[side], row['Season_End_Year'], stat
            ),
            axis=1,
        )

In [None]:
#This code calculates league-wide average goals scored and conceded for each season.

# Maps season -> mean of the per-team averages stored in team_season_stats.
league_avg = {}

for season, season_teams in team_season_stats.items():
    per_team = list(season_teams.values())
    league_avg[season] = {
        stat: np.mean([team_stats[stat] for team_stats in per_team])
        for stat in ('Avg_Goals_Scored', 'Avg_Goals_Conceded')
    }

In [None]:
#This code fills missing values in average goals columns with league-wide averages from the previous season.

# (column to patch, league_avg key that supplies the replacement value)
columns_to_fill = [
    ('Home_Avg_Goals_Scored_Prev_Season', 'Avg_Goals_Scored'),
    ('Home_Avg_Goals_Conceded_Prev_Season', 'Avg_Goals_Conceded'),
    ('Away_Avg_Goals_Scored_Prev_Season', 'Avg_Goals_Scored'),
    ('Away_Avg_Goals_Conceded_Prev_Season', 'Avg_Goals_Conceded')
]


def _value_or_league_average(row, column, stat):
    """Keep the row's own value; otherwise use the previous season's league average."""
    current = row[column]
    if not pd.isna(current):
        return current
    # Yields None when the previous season has no league average stored.
    return league_avg.get(row['Season_End_Year'] - 1, {}).get(stat)


for col_name, stat_name in columns_to_fill:
    df[col_name] = df.apply(_value_or_league_average, axis=1, args=(col_name, stat_name))

In [None]:
#This code calculates cumulative points for home and away teams throughout each season, updating match-by-match based on chronological order.

# Points earned in each match: 3 for a win, 1 for a draw.
df['HomePoints'] = df['HomeWin'].mul(3).add(df['Draw'].mul(1))
df['AwayPoints'] = df['AwayWin'].mul(3).add(df['Draw'].mul(1))

df['Home_CumulativePoints'] = 0
df['Away_CumulativePoints'] = 0

for season in df['Season_End_Year'].unique():
    ordered = df[df['Season_End_Year'] == season].sort_values(by='Date')

    # Every team starts the season on zero points.
    running_totals = dict.fromkeys(pd.concat([ordered['Home'], ordered['Away']]).unique(), 0)

    fixtures = zip(
        ordered.index,
        ordered['Home'],
        ordered['Away'],
        ordered['HomePoints'],
        ordered['AwayPoints'],
    )
    for idx, home_team, away_team, home_pts, away_pts in fixtures:
        # Record the totals as they stood BEFORE this match ...
        df.loc[idx, 'Home_CumulativePoints'] = running_totals[home_team]
        df.loc[idx, 'Away_CumulativePoints'] = running_totals[away_team]

        # ... then credit the points this match produced.
        running_totals[home_team] += home_pts
        running_totals[away_team] += away_pts

In [None]:
# Columns the rolling-form computation below reads.
required_columns = {'Date', 'Home', 'Away', 'FTR'}
missing_cols = required_columns.difference(df.columns)
if missing_cols:
    raise ValueError(f"Input DataFrame is missing required columns: {missing_cols}")

date_is_datetime = pd.api.types.is_datetime64_any_dtype(df['Date'])
if not date_is_datetime:
    raise TypeError("The 'Date' column must be of a datetime type.")

# Chronological order for everything that follows.
df = df.sort_values('Date')

def assign_form_score(row):
    """Score a match from its 'FTR' code.

    Returns 3 when the result is 'H' or 'A', 1 when it is 'D', and 0 for
    any other value.
    """
    outcome = row['FTR']
    if outcome == 'D':
        return 1
    return 3 if outcome in ('H', 'A') else 0

# Per-match score from assign_form_score: 3 for 'H' or 'A', 1 for 'D', otherwise 0.
# The value does not depend on which side won.
df['Match_Score'] = df.apply(assign_form_score, axis=1)

def calculate_rolling_average_for_team(team_df, window=5):
    """Add per-match points and a pre-match rolling form average for one team.

    Parameters
    ----------
    team_df : DataFrame with 'Date', 'Home', 'Away', 'FTR' and a 'Team'
        column naming the team whose perspective is taken on each row.
    window : number of matches in the rolling window.

    Returns
    -------
    A date-sorted copy of `team_df` with two extra columns:
    'Team_Point' (3 win / 1 draw / 0 otherwise, from the team's side) and
    'Team_Rolling_Avg' (rolling mean of 'Team_Point', shifted by one row so
    each match only sees earlier matches; the first row is 0).
    """
    ordered = team_df.sort_values(by='Date')

    drew = ordered['FTR'].eq('D').astype(int) * 1
    points_as_host = ordered['FTR'].eq('H').astype(int) * 3 + drew
    points_as_guest = ordered['FTR'].eq('A').astype(int) * 3 + drew

    # Start from zero, then fill in the points for whichever side the team was on.
    ordered['Team_Point'] = 0
    ordered.loc[ordered['Home'] == ordered['Team'], 'Team_Point'] = points_as_host
    ordered.loc[ordered['Away'] == ordered['Team'], 'Team_Point'] = points_as_guest

    windowed_mean = ordered['Team_Point'].rolling(window=window, min_periods=1).mean()
    # Shift so the current match's own result is excluded from its form value.
    ordered['Team_Rolling_Avg'] = windowed_mean.shift(1, fill_value=0)

    return ordered

# Every team that appears on either side of any match.
all_teams = pd.unique(df[['Home', 'Away']].values.ravel())

# Collect one frame per team and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on every
# iteration and starts from an empty frame.
team_frames = []
for team in all_teams:
    team_matches = df[(df['Home'] == team) | (df['Away'] == team)].copy()
    team_matches['Team'] = team
    team_rolled = calculate_rolling_average_for_team(team_matches)
    team_frames.append(team_rolled)

# pd.concat raises on an empty list, so fall back to an empty frame.
rolling_df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

# Per-team, per-date form values to join back onto the match rows.
form_lookup = rolling_df[['Date', 'Team', 'Team_Rolling_Avg']]

# Join once for the home side, then once for the away side.
for side in ('Home', 'Away'):
    df = (
        df.merge(
            form_lookup,
            left_on=['Date', side],
            right_on=['Date', 'Team'],
            how='left',
        )
        .rename(columns={'Team_Rolling_Avg': f'{side}_Rolling_Avg'})
        .drop(columns=['Team'])
    )

print(df[['Date', 'Home', 'Away', 'Home_Rolling_Avg', 'Away_Rolling_Avg']].head())

In [None]:
# Matches from the season ending in 2023.
teams_2023 = df.loc[df['Season_End_Year'].eq(2023)]

# Distinct teams appearing on either side in that season.
unique_teams_2023 = pd.concat([teams_2023['Home'], teams_2023['Away']]).unique()
print(unique_teams_2023)

# Distinct home teams across the whole dataset.
unique_values = pd.unique(df['Home'])
print(unique_values)

In [None]:
#Data shaping for training
# Teams for which a per-team view of the matches is built, in output order.
teams = ['Coventry City', 'Southampton', 'Everton', 'Ipswich Town',
         'Chelsea', 'Crystal Palace', 'Sheffield Utd', 'Leeds United',
         'Arsenal', "Nott'ham Forest", 'Manchester City', 'Blackburn',
         'Wimbledon', 'QPR', 'Sheffield Weds', 'Manchester Utd', 'Norwich City',
         'Tottenham', 'Oldham Athletic', 'Aston Villa', 'Liverpool', 'Middlesbrough',
         'West Ham', 'Newcastle Utd', 'Swindon Town', 'Leicester City', 'Bolton',
         'Derby County', 'Sunderland', 'Barnsley', 'Charlton Ath', 'Watford',
         'Bradford City', 'Fulham', 'West Brom', 'Birmingham City', 'Portsmouth',
         'Wolves', 'Wigan Athletic', 'Reading', 'Hull City', 'Stoke City',
         'Burnley', 'Blackpool', 'Swansea City', 'Cardiff City',
         'Bournemouth', 'Brighton', 'Huddersfield', 'Brentford'
]

team_dfs = []

for team in teams:
    plays_home = df['Home'].eq(team)
    plays_away = df['Away'].eq(team)
    involved = plays_home | plays_away

    # The target team wins when it is at home with 'H' or away with 'A'.
    won = (plays_home & df['FTR'].eq('H')) | (plays_away & df['FTR'].eq('A'))

    team_df = df.loc[involved].assign(
        Target_Team=team,
        MatchWin=np.where(won[involved], 1, 0),
    )
    team_dfs.append(team_df)

# Each match appears once per listed team that played in it.
combined_df = pd.concat(team_dfs, ignore_index=True)


from sklearn.preprocessing import LabelEncoder

# One encoder per team-name column, kept for later reference.
label_encoders = {}
for col in ('Home', 'Away', 'Target_Team'):
    encoder = LabelEncoder()
    combined_df[col] = encoder.fit_transform(combined_df[col])
    label_encoders[col] = encoder

# Model inputs, in the column order of the feature matrix.
features = [
    'Home', 'Away', 'Target_Team',
    'Home_Last_Season_Rank', 'Away_Last_Season_Rank',
    'Home_Avg_Goals_Scored_Prev_Season', 'Home_Avg_Goals_Conceded_Prev_Season',
    'Away_Avg_Goals_Scored_Prev_Season', 'Away_Avg_Goals_Conceded_Prev_Season',
    'Home_CumulativePoints', 'Away_CumulativePoints',
    'Home_Rolling_Avg', 'Away_Rolling_Avg',
]
target = 'MatchWin'

# Drop seasons ending in 1993 or earlier.
filtered_data = combined_df[combined_df['Season_End_Year'] > 1993]

# Hold out the season ending in 2023 for testing; train on everything before it.
season_end = filtered_data['Season_End_Year']
train_data = filtered_data[season_end < 2023]
test_data = filtered_data[season_end == 2023]

X_train, y_train = train_data[features].values, train_data[target].values
X_test, y_test = test_data[features].values, test_data[target].values

In [None]:
# Shapes of the feature matrix and label vector for each split.
for heading, split, X_part, y_part in (
    ("Training Data Shape:", "train", X_train, y_train),
    ("Test Data Shape:", "test", X_test, y_test),
):
    print(heading)
    print(f"Features (X_{split}): {X_part.shape}")
    print(f"Labels (y_{split}): {y_part.shape}")

# Row counts of the frames the splits were taken from.
for frame_name, frame in (
    ("combined_df", combined_df),
    ("train_data", train_data),
    ("test_data", test_data),
):
    print(f"Number of rows in {frame_name}: {frame.shape[0]}")

print("Columns in combined_df:", combined_df.columns, sep="\n")

In [3]:
# Class balance of the target: absolute counts, then percentages.
match_win = combined_df['MatchWin']

print("Value counts of MatchWin:", match_win.value_counts(), sep="\n")

print(
    "\nPercentage distribution of MatchWin:",
    match_win.value_counts(normalize=True) * 100,
    sep="\n",
)

Value counts of MatchWin:


NameError: name 'combined_df' is not defined

In [None]:
#This code computes a Pearson correlation matrix over the selected numeric columns of combined_df (all teams, not a single club) and visualizes it as a heatmap.

import seaborn as sns
import matplotlib.pyplot as plt

def perform_correlation_analysis(combined_df, selected_columns):
    """Return the Pearson correlation matrix of the numeric selected columns.

    Parameters
    ----------
    combined_df : DataFrame to analyse.
    selected_columns : column names to include; non-numeric ones are dropped
        before correlating.

    Raises
    ------
    ValueError
        If any name in `selected_columns` is not a column of `combined_df`
        (the first such name, in the given order, is reported).
    """
    not_found = object()
    first_missing = next(
        (col for col in selected_columns if col not in combined_df.columns),
        not_found,
    )
    if first_missing is not not_found:
        raise ValueError(f"Column '{first_missing}' is not in the DataFrame.")

    numeric_only = combined_df[selected_columns].select_dtypes(include=['number'])
    return numeric_only.corr(method='pearson')

# Columns whose pairwise correlations are shown in the heatmap.
selected_columns = [
    'HomeGoals', 'AwayGoals', 'HomeWin', 'AwayWin', 'Draw', 'GoalDifference',
    'Home_Last_Season_Rank', 'Away_Last_Season_Rank',
    'Home_Avg_Goals_Scored_Prev_Season', 'Home_Avg_Goals_Conceded_Prev_Season',
    'Away_Avg_Goals_Scored_Prev_Season', 'Away_Avg_Goals_Conceded_Prev_Season',
    'Home_CumulativePoints', 'Away_CumulativePoints',
    'Home_Rolling_Avg', 'Away_Rolling_Avg',
]
correlation_matrix = perform_correlation_analysis(combined_df, selected_columns)

# Annotated heatmap with two-decimal labels.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix for Numeric Features')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# Hyper-parameter values explored for the random forest.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2]
}

print("Running GridSearch for Random Forest with params:", rf_param_grid)
rf_model = RandomForestClassifier(random_state=42)

# 3-fold cross-validated search scored on accuracy, using all CPU cores.
rf_grid_search = GridSearchCV(
    rf_model,
    rf_param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)

best_rf_model = rf_grid_search.best_estimator_

# Best five parameter combinations by cross-validated rank.
rf_results_df = pd.DataFrame(rf_grid_search.cv_results_)
rf_summary = rf_results_df[['params', 'mean_test_score', 'rank_test_score']]
print("\n=== Random Forest Grid Search Results (Top 5) ===")
print(rf_summary.sort_values('rank_test_score').head(5))

print("\n=== Best Random Forest Params ===")
print(rf_grid_search.best_params_)
print("Best RF Score:", rf_grid_search.best_score_)

# Hyper-parameter values explored for XGBoost.
xgb_param_grid = {
    'learning_rate': [0.1, 0.05],
    'max_depth': [3, 6],
    'n_estimators': [100, 200, 300]
}

print("\nRunning GridSearch for XGBoost with params:", xgb_param_grid)
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# 3-fold cross-validated search scored on accuracy, using all CPU cores.
xgb_grid_search = GridSearchCV(
    xgb_model,
    xgb_param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
xgb_grid_search.fit(X_train, y_train)

best_xgb_model = xgb_grid_search.best_estimator_

# Best five parameter combinations by cross-validated rank.
xgb_results_df = pd.DataFrame(xgb_grid_search.cv_results_)
xgb_summary = xgb_results_df[['params', 'mean_test_score', 'rank_test_score']]
print("\n=== XGBoost Grid Search Results (Top 5) ===")
print(xgb_summary.sort_values('rank_test_score').head(5))

print("\n=== Best XGBoost Params ===")
print(xgb_grid_search.best_params_)
print("Best XGB Score:", xgb_grid_search.best_score_)

# Soft-voting ensemble of the two tuned models, fitted on the training split.
ensemble_model = VotingClassifier(
    estimators=[('rf', best_rf_model), ('xgb', best_xgb_model)],
    voting='soft'
)
ensemble_model.fit(X_train, y_train)

models = {
    'Random Forest (Optimized)': best_rf_model,
    'XGBoost (Optimized)': best_xgb_model,
    'Ensemble (RF+XGB)': ensemble_model
}

# Predict on the test split once per model and reuse the result for both the
# summary table and the detailed reports, instead of predicting twice.
test_predictions = {
    model_name: model.predict(X_test) for model_name, model in models.items()
}

results = []
for model_name, y_pred in test_predictions.items():
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Metrics for the positive class (MatchWin == 1), keyed as '1' in the report.
    results.append({
        'Model': model_name,
        'Accuracy': acc,
        'Precision (Class 1)': report['1']['precision'],
        'Recall (Class 1)': report['1']['recall'],
        'F1-score (Class 1)': report['1']['f1-score']
    })

results_df = pd.DataFrame(results)

print("\n=== Final Model Performance Comparison ===")
print(results_df)

# Full per-class report for each model.
for model_name, y_pred in test_predictions.items():
    print(f"\n--- {model_name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Bar chart comparing test accuracy across the three models.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    results_df['Model'],
    results_df['Accuracy'],
    color=['skyblue', 'orange', 'green'],
)
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
plt.show()

In [None]:
import os
# Set before the TensorFlow imports in the next cell.
# NOTE(review): "-1" presumably hides all CUDA devices so training runs on CPU — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [4]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, classification_report

def create_sequences(X, y, seq_length=5):
    """Turn row-aligned features and labels into sliding windows.

    For every start position `s` in `range(len(X) - seq_length)`, the window
    is `X[s:s + seq_length]` and its label is `y[s + seq_length]`, i.e. the
    label of the row immediately after the window.

    Returns
    -------
    (windows, labels) as NumPy arrays.
    """
    starts = range(len(X) - seq_length)
    windows = [X[start:start + seq_length] for start in starts]
    labels = [y[start + seq_length] for start in starts]
    return np.array(windows), np.array(labels)

sequence_length = 5

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _sequences_per_team(X_scaled, y, team_ids, seq_length):
    """Build sliding windows separately for each team and stack the results.

    The rows come from per-team frames concatenated one after another, so
    windowing the whole array at once lets windows near each boundary mix the
    matches of two different teams. Windowing within each team avoids that.
    Teams with too few rows to form a window contribute nothing.
    """
    X_parts, y_parts = [], []
    for team_id in pd.unique(team_ids):
        rows = np.flatnonzero(team_ids == team_id)
        X_team, y_team = create_sequences(X_scaled[rows], y[rows], seq_length)
        if len(X_team):
            X_parts.append(X_team)
            y_parts.append(y_team)
    if not X_parts:
        # Keep a 3-D shape so downstream code can still read the feature count.
        return np.empty((0, seq_length, X_scaled.shape[1])), np.empty((0,), dtype=y.dtype)
    return np.concatenate(X_parts), np.concatenate(y_parts)


# Target_Team identifies which team's perspective each row of the split holds.
X_train_seq, y_train_seq = _sequences_per_team(
    X_train_scaled, y_train, train_data['Target_Team'].to_numpy(), sequence_length
)
X_test_seq, y_test_seq = _sequences_per_team(
    X_test_scaled, y_test, test_data['Target_Team'].to_numpy(), sequence_length
)

# Training hyper-parameters.
hidden_units = 32
learning_rate = 0.001
epochs = 100
batch_size = 16
dropout_rate = 0.3

# (time steps, features per step)
input_shape = (sequence_length, X_train_seq.shape[2])

# Two stacked SimpleRNN layers, each followed by dropout and batch
# normalisation, ending in a single sigmoid unit for the binary target.
model = Sequential()
for layer in (
    Input(shape=input_shape),
    SimpleRNN(hidden_units, activation='relu', return_sequences=True),
    Dropout(dropout_rate),
    BatchNormalization(),
    SimpleRNN(hidden_units, activation='relu'),
    Dropout(dropout_rate),
    BatchNormalization(),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)

optimizer = Adam(learning_rate=learning_rate, clipnorm=1.0)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 10 epochs and keep the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

fit_settings = dict(
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train_seq, y_train_seq, **fit_settings)

loss, accuracy = model.evaluate(X_test_seq, y_test_seq, verbose=0)

# Threshold the predicted probabilities at 0.5 to get class labels.
y_pred_probs = model.predict(X_test_seq)
y_pred = (y_pred_probs > 0.5).astype(int)

print(f"\nTest Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")
print("Classification Report:", classification_report(y_test_seq, y_pred), sep="\n")

# Side-by-side training curves: accuracy on the left, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

for ax, metric, label in (
    (accuracy_ax, 'accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'Loss'),
):
    ax.plot(history.history[metric], label=f'Train {label}')
    ax.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
    ax.set_title(f'{label} over Epochs')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(label)
    ax.legend()

plt.tight_layout()
plt.show()

NameError: name 'X_train' is not defined