In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from datetime import datetime
import seaborn as sns


from sklearn.preprocessing import ( LabelEncoder, StandardScaler )


from sklearn.metrics import (
    accuracy_score, classification_report, make_scorer, mean_absolute_error,
    mean_squared_error, confusion_matrix, ConfusionMatrixDisplay,
    roc_auc_score, roc_curve, f1_score
)


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV


In [None]:
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier




In [None]:

# Show every column whenever a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Read the raw match file and work on a copy, so `prem_data` keeps the
# untouched load.
file_path = "PREM_LEAGUE_DATA_2009-2019.csv"
prem_data = pd.read_csv(file_path)
data = prem_data.copy()

# Print a preview of the first rows for verification.
print("Original Data:", data.head(), sep="\n")

# Last expression of the cell: show the column index of the loaded frame.
data.columns




In [None]:
# Build one season label per row. This assumes the rows are ordered as ten
# consecutive blocks of 380 matches, the first block being season '09-10'.
season_start, season_end = 9, 10
season_list = []

for _ in range(10):
    # Left-pad the starting year with '0' and keep the last five characters,
    # so a single-digit start year still yields a 'YY-YY' label.
    season_label = ('0' + str(season_start) + '-' + str(season_end))[-5:]
    season_list.extend([season_label] * 380)
    season_start += 1
    season_end += 1

# Trim the list, or pad it with empty strings, so its length equals the
# number of rows in the DataFrame.
row_count = len(data)
missing = row_count - len(season_list)
if missing < 0:
    season_list = season_list[:row_count]
elif missing > 0:
    season_list.extend([''] * missing)

# Place the new 'Season' column immediately after the 'Date' column.
season_position = data.columns.get_loc('Date') + 1
data.insert(season_position, 'Season', season_list)

data

In [None]:
# Parse the 'Date' column and keep only the calendar date of each match.
# NOTE(review): parsing relies on pandas' default format inference; if the CSV
# stores dates day-first (e.g. dd/mm/yy), confirm they are parsed as intended.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates.dt.date




In [None]:
# 'Div' is always E0, so it carries no information.
data = data.drop(columns='Div')

# 'Referee' is not relevant for the analysis.
data = data.drop(columns='Referee')



In [None]:
# Readable column names, assigned positionally: the order below must match the
# current column order of `data` exactly (22 columns).
renamed_columns = [
    'Date', 'Season',
    'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals', 'FT_Result',
    'HT_Home_Team_Goals', 'HT_Away_Team_Goals', 'HT_Result',
    'Home_Team_Shots', 'Away_Team_Shots',
    'Home_Team_Shots_On_Target', 'Away_Team_Shots_On_Target',
    'Home_Team_Fouls', 'Away_Team_Fouls',
    'Home_Team_Corners', 'Away_Team_Corners',
    'Home_Team_Yellow_Cards', 'Away_Team_Yellow_Cards',
    'Home_Team_Red_Cards', 'Away_Team_Red_Cards',
]
data.columns = renamed_columns


In [None]:
# Collect every distinct team name that appears in the 'Home_Team' column.
home_teams = data['Home_Team']
team_list = home_teams.unique()

print(team_list)

In [None]:
# Goals scored by each team over its previous (up to) five matches,
# recorded before the current match is taken into account.
data['Home_Team_Goals_Scored_Last_5'] = 0
data['Away_Team_Goals_Scored_Last_5'] = 0

# One rolling window per team holding its five most recent goal tallies.
goals_scored_last_5_dict = {club: deque(maxlen=5) for club in team_list}

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    host_window = goals_scored_last_5_dict[host]
    visitor_window = goals_scored_last_5_dict[visitor]

    # Store the pre-match totals first ...
    data.at[match_idx, 'Home_Team_Goals_Scored_Last_5'] = sum(host_window)
    data.at[match_idx, 'Away_Team_Goals_Scored_Last_5'] = sum(visitor_window)

    # ... then push this match's goals into each team's window.
    host_window.append(host_goals)
    visitor_window.append(visitor_goals)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_scored_last_5_dict = {club: deque(maxlen=5) for club in team_list}






In [None]:
# Running total of goals each team has scored before the current match.
data['Home_Team_Cumulative_Season_Goals'] = 0
data['Away_Team_Cumulative_Season_Goals'] = 0

goals_scored_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    # Record the totals as they stood before kick-off.
    data.at[match_idx, 'Home_Team_Cumulative_Season_Goals'] = goals_scored_season_dict[host]
    data.at[match_idx, 'Away_Team_Cumulative_Season_Goals'] = goals_scored_season_dict[visitor]

    # Fold this match's goals into the running totals.
    goals_scored_season_dict[host] += host_goals
    goals_scored_season_dict[visitor] += visitor_goals

    # Zero the totals after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_scored_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'Home_Team_Goals_Scored_Last_5', 'Away_Team_Goals_Scored_Last_5',
    'Home_Team_Cumulative_Season_Goals', 'Away_Team_Cumulative_Season_Goals',
]].head(11)


In [None]:
# Average goals scored per match over each team's previous (up to) five
# matches; stays 0.0 when the team has no earlier match in the window.
data['Home_Team_Goals_Per_Game_Last_5'] = 0.0
data['Away_Team_Goals_Per_Game_Last_5'] = 0.0

goals_scored_last_5_dict = {club: deque(maxlen=5) for club in team_list}

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    host_window = goals_scored_last_5_dict[host]
    visitor_window = goals_scored_last_5_dict[visitor]

    # Only write an average when the window holds at least one match.
    if host_window:
        host_average = sum(host_window) / len(host_window)
        data.at[match_idx, 'Home_Team_Goals_Per_Game_Last_5'] = round(host_average, 3)
    if visitor_window:
        visitor_average = sum(visitor_window) / len(visitor_window)
        data.at[match_idx, 'Away_Team_Goals_Per_Game_Last_5'] = round(visitor_average, 3)

    # Push this match's goals into each team's window.
    host_window.append(host_goals)
    visitor_window.append(visitor_goals)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_scored_last_5_dict = {club: deque(maxlen=5) for club in team_list}

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'Home_Team_Goals_Per_Game_Last_5', 'Away_Team_Goals_Per_Game_Last_5',
]].head(11)

In [None]:
# Goals scored per match so far, measured before the current match;
# stays 0.0 until the team has at least one earlier match counted.
data['Home_Team_Season_Goals_Per_Game'] = 0.0
data['Away_Team_Season_Goals_Per_Game'] = 0.0

# Running goal totals and match counts per team.
goals_scored_season_dict = dict.fromkeys(team_list, 0)
games_played_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    host_played = games_played_season_dict[host]
    visitor_played = games_played_season_dict[visitor]

    if host_played > 0:
        data.at[match_idx, 'Home_Team_Season_Goals_Per_Game'] = round(
            goals_scored_season_dict[host] / host_played, 3
        )
    if visitor_played > 0:
        data.at[match_idx, 'Away_Team_Season_Goals_Per_Game'] = round(
            goals_scored_season_dict[visitor] / visitor_played, 3
        )

    # Fold this match into the running totals and counts.
    goals_scored_season_dict[host] += host_goals
    goals_scored_season_dict[visitor] += visitor_goals
    games_played_season_dict[host] += 1
    games_played_season_dict[visitor] += 1

    # Zero everything after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_scored_season_dict = dict.fromkeys(team_list, 0)
        games_played_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'Home_Team_Season_Goals_Per_Game', 'Away_Team_Season_Goals_Per_Game',
]].head(11)

In [None]:

# Goals conceded by each team over its previous (up to) five matches,
# recorded before the current match is taken into account.
data['Home_Team_Goals_Conceded_Last_5'] = 0
data['Away_Team_Goals_Conceded_Last_5'] = 0

goals_conceded_last_5_dict = {club: deque(maxlen=5) for club in team_list}

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    host_window = goals_conceded_last_5_dict[host]
    visitor_window = goals_conceded_last_5_dict[visitor]

    data.at[match_idx, 'Home_Team_Goals_Conceded_Last_5'] = sum(host_window)
    data.at[match_idx, 'Away_Team_Goals_Conceded_Last_5'] = sum(visitor_window)

    # A team concedes whatever its opponent scored.
    host_window.append(visitor_goals)
    visitor_window.append(host_goals)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_conceded_last_5_dict = {club: deque(maxlen=5) for club in team_list}

data

In [None]:
# Running total of goals each team has conceded before the current match.
data['Home_Team_Cumulative_Season_Goals_Conceded'] = 0
data['Away_Team_Cumulative_Season_Goals_Conceded'] = 0

goals_conceded_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    # Record the totals as they stood before kick-off.
    data.at[match_idx, 'Home_Team_Cumulative_Season_Goals_Conceded'] = goals_conceded_season_dict[host]
    data.at[match_idx, 'Away_Team_Cumulative_Season_Goals_Conceded'] = goals_conceded_season_dict[visitor]

    # A team concedes whatever its opponent scored.
    goals_conceded_season_dict[host] += visitor_goals
    goals_conceded_season_dict[visitor] += host_goals

    # Zero the totals after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_conceded_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'Home_Team_Cumulative_Season_Goals_Conceded',
    'Away_Team_Cumulative_Season_Goals_Conceded',
]].head(11)

In [None]:
# Average goals conceded per match over each team's previous (up to) five
# matches; stays 0.0 when the team has no earlier match in the window.
data['Home_Team_Goals_Conceded_Per_Game_Last_5'] = 0.0
data['Away_Team_Goals_Conceded_Per_Game_Last_5'] = 0.0

goals_conceded_last_5_dict = {club: deque(maxlen=5) for club in team_list}

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    host_window = goals_conceded_last_5_dict[host]
    visitor_window = goals_conceded_last_5_dict[visitor]

    # Only write an average when the window holds at least one match.
    if host_window:
        host_average = sum(host_window) / len(host_window)
        data.at[match_idx, 'Home_Team_Goals_Conceded_Per_Game_Last_5'] = round(host_average, 3)
    if visitor_window:
        visitor_average = sum(visitor_window) / len(visitor_window)
        data.at[match_idx, 'Away_Team_Goals_Conceded_Per_Game_Last_5'] = round(visitor_average, 3)

    # A team concedes whatever its opponent scored.
    host_window.append(visitor_goals)
    visitor_window.append(host_goals)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_conceded_last_5_dict = {club: deque(maxlen=5) for club in team_list}


In [None]:
# Goals conceded per match so far, measured before the current match;
# stays 0.0 until the team has at least one earlier match counted.
data['Home_Team_Season_Goals_Conceded_Per_Game'] = 0.0
data['Away_Team_Season_Goals_Conceded_Per_Game'] = 0.0

goals_conceded_season_dict = dict.fromkeys(team_list, 0)
games_played_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['FT_Home_Team_Goals'],
    data['FT_Away_Team_Goals'],
)
for match_idx, host, visitor, host_goals, visitor_goals in match_columns:
    host_played = games_played_season_dict[host]
    visitor_played = games_played_season_dict[visitor]

    if host_played > 0:
        data.at[match_idx, 'Home_Team_Season_Goals_Conceded_Per_Game'] = round(
            goals_conceded_season_dict[host] / host_played, 3
        )
    if visitor_played > 0:
        data.at[match_idx, 'Away_Team_Season_Goals_Conceded_Per_Game'] = round(
            goals_conceded_season_dict[visitor] / visitor_played, 3
        )

    # A team concedes whatever its opponent scored.
    goals_conceded_season_dict[host] += visitor_goals
    goals_conceded_season_dict[visitor] += host_goals
    games_played_season_dict[host] += 1
    games_played_season_dict[visitor] += 1

    # Zero everything after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        goals_conceded_season_dict = dict.fromkeys(team_list, 0)
        games_played_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'Home_Team_Season_Goals_Conceded_Per_Game',
    'Away_Team_Season_Goals_Conceded_Per_Game',
]].head(11)

In [None]:
# Goal difference over each team's previous (up to) five matches: goals scored
# minus goals conceded, taken from the existing *_Last_5 feature columns.
# Computed column-wise; the per-row loop added nothing beyond this subtraction.
data['Home_Team_Goal_Difference_Last_5'] = (
    data['Home_Team_Goals_Scored_Last_5'] - data['Home_Team_Goals_Conceded_Last_5']
)
data['Away_Team_Goal_Difference_Last_5'] = (
    data['Away_Team_Goals_Scored_Last_5'] - data['Away_Team_Goals_Conceded_Last_5']
)

# Spot check: the first 11 rows involving Liverpool.
liverpool_mask = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
data.loc[liverpool_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'Home_Team_Goal_Difference_Last_5', 'Away_Team_Goal_Difference_Last_5',
]].head(11)

In [None]:
# Season goal difference before each match: cumulative goals scored minus
# cumulative goals conceded, taken from the existing cumulative columns.
# Computed column-wise; the per-row loop added nothing beyond this subtraction.
data['Home_Team_Season_Goal_Difference'] = (
    data['Home_Team_Cumulative_Season_Goals']
    - data['Home_Team_Cumulative_Season_Goals_Conceded']
)
data['Away_Team_Season_Goal_Difference'] = (
    data['Away_Team_Cumulative_Season_Goals']
    - data['Away_Team_Cumulative_Season_Goals_Conceded']
)

# Spot check: the first 11 rows involving Liverpool.
liverpool_mask = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
data.loc[liverpool_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'Home_Team_Season_Goal_Difference', 'Away_Team_Season_Goal_Difference',
]].head(11)

In [None]:
# Total shots taken by each team over its previous (up to) five matches,
# recorded before the current match is taken into account.
data['Home_Team_Shots_Last_5'] = 0
data['Away_Team_Shots_Last_5'] = 0

shots_last_5_dict = {club: deque(maxlen=5) for club in team_list}

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Shots'],
    data['Away_Team_Shots'],
)
for match_idx, host, visitor, host_shots, visitor_shots in match_columns:
    host_window = shots_last_5_dict[host]
    visitor_window = shots_last_5_dict[visitor]

    data.at[match_idx, 'Home_Team_Shots_Last_5'] = sum(host_window)
    data.at[match_idx, 'Away_Team_Shots_Last_5'] = sum(visitor_window)

    # Push this match's shot counts into each team's window.
    host_window.append(host_shots)
    visitor_window.append(visitor_shots)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        shots_last_5_dict = {club: deque(maxlen=5) for club in team_list}

        

In [None]:
# Running total of shots each team has taken before the current match.
data['Home_Team_Cumulative_Season_Shots'] = 0
data['Away_Team_Cumulative_Season_Shots'] = 0

shots_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Shots'],
    data['Away_Team_Shots'],
)
for match_idx, host, visitor, host_shots, visitor_shots in match_columns:
    # Record the totals as they stood before kick-off.
    data.at[match_idx, 'Home_Team_Cumulative_Season_Shots'] = shots_season_dict[host]
    data.at[match_idx, 'Away_Team_Cumulative_Season_Shots'] = shots_season_dict[visitor]

    shots_season_dict[host] += host_shots
    shots_season_dict[visitor] += visitor_shots

    # Zero the totals after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        shots_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa, with per-match shots
# next to the cumulative shot totals.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'Home_Team_Shots', 'Away_Team_Shots',
    'Home_Team_Cumulative_Season_Shots', 'Away_Team_Cumulative_Season_Shots',
]].head(11)

In [None]:
# Shots per match so far, measured before the current match;
# stays 0.0 until the team has at least one earlier match counted.
data['Home_Team_Season_Shots_Per_Game'] = 0.0
data['Away_Team_Season_Shots_Per_Game'] = 0.0

shots_season_dict = dict.fromkeys(team_list, 0)
games_played_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Shots'],
    data['Away_Team_Shots'],
)
for match_idx, host, visitor, host_shots, visitor_shots in match_columns:
    host_played = games_played_season_dict[host]
    visitor_played = games_played_season_dict[visitor]

    if host_played > 0:
        data.at[match_idx, 'Home_Team_Season_Shots_Per_Game'] = round(
            shots_season_dict[host] / host_played, 3
        )
    if visitor_played > 0:
        data.at[match_idx, 'Away_Team_Season_Shots_Per_Game'] = round(
            shots_season_dict[visitor] / visitor_played, 3
        )

    # Fold this match into the running totals and counts.
    shots_season_dict[host] += host_shots
    shots_season_dict[visitor] += visitor_shots
    games_played_season_dict[host] += 1
    games_played_season_dict[visitor] += 1

    # Zero everything after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        shots_season_dict = dict.fromkeys(team_list, 0)
        games_played_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa, with per-match shots
# next to the season shots-per-game values.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'Home_Team_Shots', 'Away_Team_Shots',
    'Home_Team_Season_Shots_Per_Game', 'Away_Team_Season_Shots_Per_Game',
]].head(11)



In [None]:
# Average shots per match over each team's previous (up to) five matches;
# stays 0.0 when the team has no earlier match in the window.
data['Home_Team_Shots_Per_Game_Last_5'] = 0.0
data['Away_Team_Shots_Per_Game_Last_5'] = 0.0

shots_last_5_dict = {club: deque(maxlen=5) for club in team_list}

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Shots'],
    data['Away_Team_Shots'],
)
for match_idx, host, visitor, host_shots, visitor_shots in match_columns:
    host_window = shots_last_5_dict[host]
    visitor_window = shots_last_5_dict[visitor]

    # Only write an average when the window holds at least one match.
    if host_window:
        host_average = sum(host_window) / len(host_window)
        data.at[match_idx, 'Home_Team_Shots_Per_Game_Last_5'] = round(host_average, 3)
    if visitor_window:
        visitor_average = sum(visitor_window) / len(visitor_window)
        data.at[match_idx, 'Away_Team_Shots_Per_Game_Last_5'] = round(visitor_average, 3)

    host_window.append(host_shots)
    visitor_window.append(visitor_shots)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        shots_last_5_dict = {club: deque(maxlen=5) for club in team_list}

        

In [None]:
# Shots on target by each team over its previous (up to) five matches,
# recorded before the current match is taken into account.
#
# The initialised column names must be the ones the loop writes to
# ('Home_Team_...' / 'Away_Team_...'). Initialising differently named columns
# would leave unused all-zero columns behind and let `.at` create the real
# ones implicitly on first assignment.
data['Home_Team_Shots_On_Target_Last_5'] = 0
data['Away_Team_Shots_On_Target_Last_5'] = 0

# One rolling window per team holding its five most recent shots-on-target counts.
shots_on_target_last_5_dict = {team: deque(maxlen=5) for team in team_list}

for index, row in data.iterrows():
    home_team = row['Home_Team']
    away_team = row['Away_Team']

    home_shots_on_target = row['Home_Team_Shots_On_Target']
    away_shots_on_target = row['Away_Team_Shots_On_Target']

    # Store the pre-match totals first ...
    data.at[index, 'Home_Team_Shots_On_Target_Last_5'] = sum(shots_on_target_last_5_dict[home_team])
    data.at[index, 'Away_Team_Shots_On_Target_Last_5'] = sum(shots_on_target_last_5_dict[away_team])

    # ... then push this match's counts into each team's window.
    shots_on_target_last_5_dict[home_team].append(home_shots_on_target)
    shots_on_target_last_5_dict[away_team].append(away_shots_on_target)

    # Start with empty windows after every block of 380 rows.
    if (index + 1) % 380 == 0:
        shots_on_target_last_5_dict = {team: deque(maxlen=5) for team in team_list}

        



In [None]:
# Running total of shots on target each team has had before the current match.
data['Home_Team_Cumulative_Season_Shots_On_Target'] = 0
data['Away_Team_Cumulative_Season_Shots_On_Target'] = 0

shots_on_target_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Shots_On_Target'],
    data['Away_Team_Shots_On_Target'],
)
for match_idx, host, visitor, host_on_target, visitor_on_target in match_columns:
    # Record the totals as they stood before kick-off.
    data.at[match_idx, 'Home_Team_Cumulative_Season_Shots_On_Target'] = shots_on_target_season_dict[host]
    data.at[match_idx, 'Away_Team_Cumulative_Season_Shots_On_Target'] = shots_on_target_season_dict[visitor]

    shots_on_target_season_dict[host] += host_on_target
    shots_on_target_season_dict[visitor] += visitor_on_target

    # Zero the totals after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        shots_on_target_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'Home_Team_Shots_On_Target', 'Away_Team_Shots_On_Target',
    'Home_Team_Cumulative_Season_Shots_On_Target',
    'Away_Team_Cumulative_Season_Shots_On_Target',
]].head(11)

In [None]:
# Average shots on target over each team's previous (up to) five matches.
# The value is written for every row (0.0 when the window is empty) and is
# not rounded.
data['Home_Team_Avg_Shots_On_Target_Last_5'] = 0.0
data['Away_Team_Avg_Shots_On_Target_Last_5'] = 0.0

avg_shots_on_target_last_5_dict = {club: deque(maxlen=5) for club in team_list}

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Shots_On_Target'],
    data['Away_Team_Shots_On_Target'],
)
for match_idx, host, visitor, host_on_target, visitor_on_target in match_columns:
    host_window = avg_shots_on_target_last_5_dict[host]
    visitor_window = avg_shots_on_target_last_5_dict[visitor]

    if host_window:
        host_average = sum(host_window) / len(host_window)
    else:
        host_average = 0.0
    if visitor_window:
        visitor_average = sum(visitor_window) / len(visitor_window)
    else:
        visitor_average = 0.0

    data.at[match_idx, 'Home_Team_Avg_Shots_On_Target_Last_5'] = host_average
    data.at[match_idx, 'Away_Team_Avg_Shots_On_Target_Last_5'] = visitor_average

    host_window.append(host_on_target)
    visitor_window.append(visitor_on_target)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        avg_shots_on_target_last_5_dict = {club: deque(maxlen=5) for club in team_list}

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'Home_Team_Shots_On_Target', 'Away_Team_Shots_On_Target',
    'Home_Team_Avg_Shots_On_Target_Last_5', 'Away_Team_Avg_Shots_On_Target_Last_5',
]].head(11)

In [None]:
# Shots on target per match so far, measured before the current match;
# stays 0.0 until the team has at least one earlier match counted.
data['Home_Team_Season_Avg_Shots_On_Target'] = 0.0
data['Away_Team_Season_Avg_Shots_On_Target'] = 0.0

shots_on_target_season_dict = dict.fromkeys(team_list, 0)
games_played_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Shots_On_Target'],
    data['Away_Team_Shots_On_Target'],
)
for match_idx, host, visitor, host_on_target, visitor_on_target in match_columns:
    host_played = games_played_season_dict[host]
    visitor_played = games_played_season_dict[visitor]

    if host_played > 0:
        data.at[match_idx, 'Home_Team_Season_Avg_Shots_On_Target'] = round(
            shots_on_target_season_dict[host] / host_played, 3
        )
    if visitor_played > 0:
        data.at[match_idx, 'Away_Team_Season_Avg_Shots_On_Target'] = round(
            shots_on_target_season_dict[visitor] / visitor_played, 3
        )

    # Fold this match into the running totals and counts.
    shots_on_target_season_dict[host] += host_on_target
    shots_on_target_season_dict[visitor] += visitor_on_target
    games_played_season_dict[host] += 1
    games_played_season_dict[visitor] += 1

    # Zero everything after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        shots_on_target_season_dict = dict.fromkeys(team_list, 0)
        games_played_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'Home_Team_Shots_On_Target', 'Away_Team_Shots_On_Target',
    'Home_Team_Season_Avg_Shots_On_Target', 'Away_Team_Season_Avg_Shots_On_Target',
]].head(11)

In [None]:
# Points each team earned over its previous (up to) five matches. The window
# is rolling: it is updated after every match, not once per five matches.
data['Home_Team_Points_Last_5'] = 0
data['Away_Team_Points_Last_5'] = 0

points_last_5_dict = {club: deque(maxlen=5) for club in team_list}

# (home points, away points) per full-time result; any other result value is
# scored as one point each.
points_by_result = {'H': (3, 0), 'A': (0, 3)}

match_columns = zip(data.index, data['Home_Team'], data['Away_Team'], data['FT_Result'])
for match_idx, host, visitor, result in match_columns:
    host_window = points_last_5_dict[host]
    visitor_window = points_last_5_dict[visitor]

    # Store the pre-match totals first ...
    data.at[match_idx, 'Home_Team_Points_Last_5'] = sum(host_window)
    data.at[match_idx, 'Away_Team_Points_Last_5'] = sum(visitor_window)

    # ... then push the points from this match into each window.
    host_points, visitor_points = points_by_result.get(result, (1, 1))
    host_window.append(host_points)
    visitor_window.append(visitor_points)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        points_last_5_dict = {club: deque(maxlen=5) for club in team_list}

# Spot check: the first 10 rows involving Liverpool.
liverpool_mask = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
data[liverpool_mask][[
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals', 'FT_Result',
    'Home_Team_Points_Last_5', 'Away_Team_Points_Last_5',
]].head(10)

In [None]:
# Running total of points each team has earned before the current match.
data['Home_Team_Cumulative_Season_Points'] = 0
data['Away_Team_Cumulative_Season_Points'] = 0

points_season_dict = dict.fromkeys(team_list, 0)

# (home points, away points) per full-time result; any other result value is
# scored as one point each.
points_by_result = {'H': (3, 0), 'A': (0, 3)}

match_columns = zip(data.index, data['Home_Team'], data['Away_Team'], data['FT_Result'])
for match_idx, host, visitor, result in match_columns:
    # Record the totals as they stood before kick-off.
    data.at[match_idx, 'Home_Team_Cumulative_Season_Points'] = points_season_dict[host]
    data.at[match_idx, 'Away_Team_Cumulative_Season_Points'] = points_season_dict[visitor]

    # Add the points earned in this match.
    host_points, visitor_points = points_by_result.get(result, (1, 1))
    points_season_dict[host] += host_points
    points_season_dict[visitor] += visitor_points

    # Zero the totals after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        points_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals', 'FT_Result',
    'Home_Team_Cumulative_Season_Points', 'Away_Team_Cumulative_Season_Points',
]].head(11)



In [None]:
# Average points per match over each team's previous (up to) five matches;
# stays 0.0 when the team has no earlier match in the window.
data['Home_Team_Points_Per_Game_Last_5'] = 0.0
data['Away_Team_Points_Per_Game_Last_5'] = 0.0

# One rolling window per team holding the points from its five most recent matches.
points_last_5_dict = {club: deque(maxlen=5) for club in team_list}

# (home points, away points) per full-time result; any other result value is
# scored as one point each.
points_by_result = {'H': (3, 0), 'A': (0, 3)}

match_columns = zip(data.index, data['Home_Team'], data['Away_Team'], data['FT_Result'])
for match_idx, host, visitor, result in match_columns:
    host_window = points_last_5_dict[host]
    visitor_window = points_last_5_dict[visitor]

    # Only write an average when the window holds at least one match.
    if host_window:
        host_average = sum(host_window) / len(host_window)
        data.at[match_idx, 'Home_Team_Points_Per_Game_Last_5'] = round(host_average, 3)
    if visitor_window:
        visitor_average = sum(visitor_window) / len(visitor_window)
        data.at[match_idx, 'Away_Team_Points_Per_Game_Last_5'] = round(visitor_average, 3)

    # Push the points from this match into each window.
    host_points, visitor_points = points_by_result.get(result, (1, 1))
    host_window.append(host_points)
    visitor_window.append(visitor_points)

    # Start with empty windows after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        points_last_5_dict = {club: deque(maxlen=5) for club in team_list}

# Spot check: the first 11 rows involving Aston Villa.
villa_mask = (data['Home_Team'] == 'Aston Villa') | (data['Away_Team'] == 'Aston Villa')
data.loc[villa_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals', 'FT_Result',
    'Home_Team_Points_Per_Game_Last_5', 'Away_Team_Points_Per_Game_Last_5',
]].head(11)


In [None]:
# Points per match so far, measured before the current match;
# stays 0.0 until the team has at least one earlier match counted.
data['Home_Team_Season_Points_Per_Game'] = 0.0
data['Away_Team_Season_Points_Per_Game'] = 0.0

points_season_dict = dict.fromkeys(team_list, 0)
games_played_season_dict = dict.fromkeys(team_list, 0)

# (home points, away points) per full-time result; any other result value is
# scored as one point each.
points_by_result = {'H': (3, 0), 'A': (0, 3)}

match_columns = zip(data.index, data['Home_Team'], data['Away_Team'], data['FT_Result'])
for match_idx, host, visitor, result in match_columns:
    host_played = games_played_season_dict[host]
    visitor_played = games_played_season_dict[visitor]

    if host_played > 0:
        data.at[match_idx, 'Home_Team_Season_Points_Per_Game'] = round(
            points_season_dict[host] / host_played, 3
        )
    if visitor_played > 0:
        data.at[match_idx, 'Away_Team_Season_Points_Per_Game'] = round(
            points_season_dict[visitor] / visitor_played, 3
        )

    # Add the points earned in this match, then count the match.
    host_points, visitor_points = points_by_result.get(result, (1, 1))
    points_season_dict[host] += host_points
    points_season_dict[visitor] += visitor_points

    games_played_season_dict[host] += 1
    games_played_season_dict[visitor] += 1

    # Zero everything after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        points_season_dict = dict.fromkeys(team_list, 0)
        games_played_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Liverpool.
liverpool_mask = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
data.loc[liverpool_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals', 'FT_Result',
    'Home_Team_Season_Points_Per_Game', 'Away_Team_Season_Points_Per_Game',
]].head(11)




In [None]:
# Running total of fouls each team has committed before the current match.
data['Home_Team_Cumulative_Season_Fouls'] = 0
data['Away_Team_Cumulative_Season_Fouls'] = 0

fouls_season_dict = dict.fromkeys(team_list, 0)

match_columns = zip(
    data.index,
    data['Home_Team'],
    data['Away_Team'],
    data['Home_Team_Fouls'],
    data['Away_Team_Fouls'],
)
for match_idx, host, visitor, host_fouls, visitor_fouls in match_columns:
    # Record the totals as they stood before kick-off.
    data.at[match_idx, 'Home_Team_Cumulative_Season_Fouls'] = fouls_season_dict[host]
    data.at[match_idx, 'Away_Team_Cumulative_Season_Fouls'] = fouls_season_dict[visitor]

    fouls_season_dict[host] += host_fouls
    fouls_season_dict[visitor] += visitor_fouls

    # Zero the totals after every block of 380 rows.
    if (match_idx + 1) % 380 == 0:
        fouls_season_dict = dict.fromkeys(team_list, 0)

# Spot check: the first 11 rows involving Liverpool.
liverpool_mask = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
data.loc[liverpool_mask, [
    'Date', 'Home_Team', 'Away_Team',
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals', 'FT_Result',
    'Home_Team_Cumulative_Season_Fouls', 'Away_Team_Cumulative_Season_Fouls',
]].head(11)

In [None]:
# Fouls committed over each team's previous five matches (window restarts every season)

# (team column, per-match stat column, engineered feature column) for each side
recent_foul_columns = (
    ('Home_Team', 'Home_Team_Fouls', 'Home_Team_Fouls_Last_5'),
    ('Away_Team', 'Away_Team_Fouls', 'Away_Team_Fouls_Last_5'),
)
for _, _, feature_col in recent_foul_columns:
    data[feature_col] = 0

# A bounded deque per team silently discards the oldest match once five are stored
fouls_last_5_dict = {team: deque(maxlen=5) for team in team_list}

for match_idx, match in data.iterrows():
    # Sum of the window as it stood before this match
    for team_col, _, feature_col in recent_foul_columns:
        data.at[match_idx, feature_col] = sum(fouls_last_5_dict[match[team_col]])

    for team_col, stat_col, _ in recent_foul_columns:
        fouls_last_5_dict[match[team_col]].append(match[stat_col])

    # Each run of 380 rows is treated as one season: empty every window afterwards
    if (match_idx + 1) % 380 == 0:
        fouls_last_5_dict = {team: deque(maxlen=5) for team in team_list}

# Spot-check: first 11 matches involving Liverpool
liverpool_matches = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
recent_foul_check_columns = ['Date','Home_Team','Away_Team','FT_Home_Team_Goals','FT_Away_Team_Goals','FT_Result','Home_Team_Fouls_Last_5','Away_Team_Fouls_Last_5']
data[liverpool_matches].head(11)[recent_foul_check_columns]

In [None]:
# Cumulative corners won by each team before kick-off, within the current season

# (team column, per-match stat column, engineered feature column) for each side
season_corner_columns = (
    ('Home_Team', 'Home_Team_Corners', 'Home_Team_Cumulative_Season_Corners'),
    ('Away_Team', 'Away_Team_Corners', 'Away_Team_Cumulative_Season_Corners'),
)
for _, _, feature_col in season_corner_columns:
    data[feature_col] = 0

corners_season_dict = dict.fromkeys(team_list, 0)

for match_idx, match in data.iterrows():
    # Pre-match totals go into the frame before this match's corners are added
    for team_col, _, feature_col in season_corner_columns:
        data.at[match_idx, feature_col] = corners_season_dict[match[team_col]]

    for team_col, stat_col, _ in season_corner_columns:
        corners_season_dict[match[team_col]] += match[stat_col]

    # Each run of 380 rows is treated as one season: restart the tallies afterwards
    if (match_idx + 1) % 380 == 0:
        corners_season_dict = dict.fromkeys(team_list, 0)

# Spot-check: first 11 matches involving Liverpool
liverpool_matches = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
corner_check_columns = ['Date','Home_Team','Away_Team','Home_Team_Corners', 'Away_Team_Corners','Home_Team_Cumulative_Season_Corners','Away_Team_Cumulative_Season_Corners']
data[liverpool_matches].head(11)[corner_check_columns]
                                                        

In [None]:
# Corners won over each team's previous five matches (window restarts every season)

# (team column, per-match stat column, engineered feature column) for each side
recent_corner_columns = (
    ('Home_Team', 'Home_Team_Corners', 'Home_Team_Corners_Last_5'),
    ('Away_Team', 'Away_Team_Corners', 'Away_Team_Corners_Last_5'),
)
for _, _, feature_col in recent_corner_columns:
    data[feature_col] = 0

# A bounded deque per team silently discards the oldest match once five are stored
corners_last_5_dict = {team: deque(maxlen=5) for team in team_list}

for match_idx, match in data.iterrows():
    # Sum of the window as it stood before this match
    for team_col, _, feature_col in recent_corner_columns:
        data.at[match_idx, feature_col] = sum(corners_last_5_dict[match[team_col]])

    for team_col, stat_col, _ in recent_corner_columns:
        corners_last_5_dict[match[team_col]].append(match[stat_col])

    # Each run of 380 rows is treated as one season: empty every window afterwards
    if (match_idx + 1) % 380 == 0:
        corners_last_5_dict = {team: deque(maxlen=5) for team in team_list}

        

In [None]:
# Cumulative yellow cards received by each team before kick-off, within the current season

# (team column, per-match stat column, engineered feature column) for each side
season_yellow_columns = (
    ('Home_Team', 'Home_Team_Yellow_Cards', 'Home_Team_Cumulative_Season_Yellow_Cards'),
    ('Away_Team', 'Away_Team_Yellow_Cards', 'Away_Team_Cumulative_Season_Yellow_Cards'),
)
for _, _, feature_col in season_yellow_columns:
    data[feature_col] = 0

yellow_cards_season_dict = dict.fromkeys(team_list, 0)

for match_idx, match in data.iterrows():
    # Pre-match totals go into the frame before this match's cards are added
    for team_col, _, feature_col in season_yellow_columns:
        data.at[match_idx, feature_col] = yellow_cards_season_dict[match[team_col]]

    for team_col, stat_col, _ in season_yellow_columns:
        yellow_cards_season_dict[match[team_col]] += match[stat_col]

    # Each run of 380 rows is treated as one season: restart the tallies afterwards
    if (match_idx + 1) % 380 == 0:
        yellow_cards_season_dict = dict.fromkeys(team_list, 0)

# Spot-check: first 10 matches involving Liverpool
liverpool_matches = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
yellow_check_columns = [
    'Home_Team', 'Away_Team', 'Home_Team_Yellow_Cards', 'Away_Team_Yellow_Cards',
    'Home_Team_Cumulative_Season_Yellow_Cards', 'Away_Team_Cumulative_Season_Yellow_Cards',
]
data.loc[liverpool_matches, yellow_check_columns].head(10)


In [None]:
# Yellow cards received over each team's previous five matches (window restarts every season)

# (team column, per-match stat column, engineered feature column) for each side
recent_yellow_columns = (
    ('Home_Team', 'Home_Team_Yellow_Cards', 'Home_Team_Yellow_Cards_Last_5'),
    ('Away_Team', 'Away_Team_Yellow_Cards', 'Away_Team_Yellow_Cards_Last_5'),
)
for _, _, feature_col in recent_yellow_columns:
    data[feature_col] = 0

# A bounded deque per team silently discards the oldest match once five are stored
yellow_cards_last_5_dict = {team: deque(maxlen=5) for team in team_list}

for match_idx, match in data.iterrows():
    # Sum of the window as it stood before this match
    for team_col, _, feature_col in recent_yellow_columns:
        data.at[match_idx, feature_col] = sum(yellow_cards_last_5_dict[match[team_col]])

    for team_col, stat_col, _ in recent_yellow_columns:
        yellow_cards_last_5_dict[match[team_col]].append(match[stat_col])

    # Each run of 380 rows is treated as one season: empty every window afterwards
    if (match_idx + 1) % 380 == 0:
        yellow_cards_last_5_dict = {team: deque(maxlen=5) for team in team_list}

In [None]:
# Cumulative red cards received by each team before kick-off, within the current season

# (team column, per-match stat column, engineered feature column) for each side
season_red_columns = (
    ('Home_Team', 'Home_Team_Red_Cards', 'Home_Team_Cumulative_Season_Red_Cards'),
    ('Away_Team', 'Away_Team_Red_Cards', 'Away_Team_Cumulative_Season_Red_Cards'),
)
for _, _, feature_col in season_red_columns:
    data[feature_col] = 0

red_cards_season_dict = dict.fromkeys(team_list, 0)

for match_idx, match in data.iterrows():
    # Pre-match totals go into the frame before this match's cards are added
    for team_col, _, feature_col in season_red_columns:
        data.at[match_idx, feature_col] = red_cards_season_dict[match[team_col]]

    for team_col, stat_col, _ in season_red_columns:
        red_cards_season_dict[match[team_col]] += match[stat_col]

    # Each run of 380 rows is treated as one season: restart the tallies afterwards
    if (match_idx + 1) % 380 == 0:
        red_cards_season_dict = dict.fromkeys(team_list, 0)

# Spot-check: first 11 matches involving Liverpool
liverpool_matches = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
red_check_columns = [
    'Home_Team', 'Away_Team', 'Home_Team_Red_Cards', 'Away_Team_Red_Cards',
    'Home_Team_Cumulative_Season_Red_Cards', 'Away_Team_Cumulative_Season_Red_Cards',
]
data.loc[liverpool_matches, red_check_columns].head(11)

In [None]:
# Red cards received over each team's previous five matches (window restarts every season)

# (team column, per-match stat column, engineered feature column) for each side
recent_red_columns = (
    ('Home_Team', 'Home_Team_Red_Cards', 'Home_Team_Red_Cards_Last_5'),
    ('Away_Team', 'Away_Team_Red_Cards', 'Away_Team_Red_Cards_Last_5'),
)
for _, _, feature_col in recent_red_columns:
    data[feature_col] = 0

# A bounded deque per team silently discards the oldest match once five are stored
red_cards_last_5_dict = {team: deque(maxlen=5) for team in team_list}

for match_idx, match in data.iterrows():
    # Sum of the window as it stood before this match
    for team_col, _, feature_col in recent_red_columns:
        data.at[match_idx, feature_col] = sum(red_cards_last_5_dict[match[team_col]])

    for team_col, stat_col, _ in recent_red_columns:
        red_cards_last_5_dict[match[team_col]].append(match[stat_col])

    # Each run of 380 rows is treated as one season: empty every window afterwards
    if (match_idx + 1) % 380 == 0:
        red_cards_last_5_dict = {team: deque(maxlen=5) for team in team_list}

        

In [None]:
# Season shot efficiency: goals scored per shot on target, using only earlier matches of the season

# (team column, goals column, shots-on-target column, engineered feature column) for each side
season_efficiency_columns = (
    ('Home_Team', 'FT_Home_Team_Goals', 'Home_Team_Shots_On_Target', 'Home_Team_Season_Shot_Efficiency'),
    ('Away_Team', 'FT_Away_Team_Goals', 'Away_Team_Shots_On_Target', 'Away_Team_Season_Shot_Efficiency'),
)
for _, _, _, feature_col in season_efficiency_columns:
    data[feature_col] = 0.0

# Running season totals per team
goals_season_dict = dict.fromkeys(team_list, 0)
shots_on_target_season_dict = dict.fromkeys(team_list, 0)

for match_idx, match in data.iterrows():
    # The ratio is only written once the team has a shot on target; otherwise the 0.0 default stays
    for team_col, _, _, feature_col in season_efficiency_columns:
        shots_so_far = shots_on_target_season_dict[match[team_col]]
        if shots_so_far > 0:
            data.at[match_idx, feature_col] = round(
                goals_season_dict[match[team_col]] / shots_so_far, 3
            )

    # Fold this match into the running totals only after the features were written
    for team_col, goals_col, _, _ in season_efficiency_columns:
        goals_season_dict[match[team_col]] += match[goals_col]
    for team_col, _, shots_col, _ in season_efficiency_columns:
        shots_on_target_season_dict[match[team_col]] += match[shots_col]

    # Each run of 380 rows is treated as one season: restart both tallies afterwards
    if (match_idx + 1) % 380 == 0:
        goals_season_dict = dict.fromkeys(team_list, 0)
        shots_on_target_season_dict = dict.fromkeys(team_list, 0)

# Spot-check: first 10 matches involving Liverpool
liverpool_matches = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
efficiency_check_columns = [
    'Home_Team', 'Away_Team', 'FT_Home_Team_Goals', 'FT_Away_Team_Goals', 'Home_Team_Shots_On_Target', 'Away_Team_Shots_On_Target',
    'Home_Team_Season_Shot_Efficiency', 'Away_Team_Season_Shot_Efficiency',
]
data.loc[liverpool_matches, efficiency_check_columns].head(10)



In [None]:
# Recent shot efficiency: goals per shot on target over each team's previous five matches

# (team column, goals column, shots-on-target column, engineered feature column) for each side
recent_efficiency_columns = (
    ('Home_Team', 'FT_Home_Team_Goals', 'Home_Team_Shots_On_Target', 'Home_Team_Shot_Efficiency_Last_5'),
    ('Away_Team', 'FT_Away_Team_Goals', 'Away_Team_Shots_On_Target', 'Away_Team_Shot_Efficiency_Last_5'),
)
for _, _, _, feature_col in recent_efficiency_columns:
    data[feature_col] = 0.0

# Rolling five-match windows of goals and shots on target per team
goals_last_5_dict = {team: deque(maxlen=5) for team in team_list}
shots_on_target_last_5_dict = {team: deque(maxlen=5) for team in team_list}

for match_idx, match in data.iterrows():
    for team_col, _, _, feature_col in recent_efficiency_columns:
        side_team = match[team_col]
        recent_shots = sum(shots_on_target_last_5_dict[side_team])
        # No shots on target in the window: store 0.0 rather than dividing by zero
        if recent_shots > 0:
            recent_efficiency = round(sum(goals_last_5_dict[side_team]) / recent_shots, 3)
        else:
            recent_efficiency = 0.0
        data.at[match_idx, feature_col] = recent_efficiency

    # Push this match into the windows only after the features were written
    for team_col, goals_col, _, _ in recent_efficiency_columns:
        goals_last_5_dict[match[team_col]].append(match[goals_col])
    for team_col, _, shots_col, _ in recent_efficiency_columns:
        shots_on_target_last_5_dict[match[team_col]].append(match[shots_col])

    # Each run of 380 rows is treated as one season: empty every window afterwards
    if (match_idx + 1) % 380 == 0:
        goals_last_5_dict = {team: deque(maxlen=5) for team in team_list}
        shots_on_target_last_5_dict = {team: deque(maxlen=5) for team in team_list}

In [None]:
# Interaction features: one side's attack multiplied by the opponent's defensive record.

# Measures a team's seasonal scoring ability against the opponent's seasonal defensive ability
data['Home_Attack_vs_Away_Defense_Season'] = data['Home_Team_Season_Goals_Per_Game'] * data['Away_Team_Season_Goals_Conceded_Per_Game']
data['Away_Attack_vs_Home_Defense_Season'] = data['Away_Team_Season_Goals_Per_Game'] * data['Home_Team_Season_Goals_Conceded_Per_Game']


# Measures a team's recent scoring ability against the opponent's recent defensive ability
# NOTE: these two names end in 'Last5' (no underscore), unlike the other '..._Last_5' features,
# so any selection by the substring 'Last_5' will not pick them up.
data['Home_Attack_vs_Away_Defense_Last5'] = data['Home_Team_Goals_Per_Game_Last_5'] * data['Away_Team_Goals_Conceded_Per_Game_Last_5']
data['Away_Attack_vs_Home_Defense_Last5'] = data['Away_Team_Goals_Per_Game_Last_5'] * data['Home_Team_Goals_Conceded_Per_Game_Last_5']




In [None]:
# Rest days: days elapsed since each side's previous match in the same season
from datetime import datetime

# NOTE: 'Days_Since_Last_Match' is only initialised here; the loop below fills the
# Home/Away variants and never writes to it.
for rest_col in ('Days_Since_Last_Match', 'Days_Since_Last_Match_Away', 'Days_Since_Last_Match_Home'):
    data[rest_col] = 0

# Date of each team's most recent match; None until the team has played in the current season
last_match_date_dict = dict.fromkeys(team_list)

# (team column, engineered feature column) for each side
rest_day_feature_columns = (
    ('Home_Team', 'Days_Since_Last_Match_Home'),
    ('Away_Team', 'Days_Since_Last_Match_Away'),
)

for match_idx, match in data.iterrows():
    match_date = match['Date']

    # A team with no earlier match this season keeps the default of 0
    for team_col, feature_col in rest_day_feature_columns:
        previous_date = last_match_date_dict[match[team_col]]
        if previous_date is not None:
            data.at[match_idx, feature_col] = (match_date - previous_date).days

    # Today's fixture becomes the reference point for both sides
    for team_col, _ in rest_day_feature_columns:
        last_match_date_dict[match[team_col]] = match_date

    # Each run of 380 rows is treated as one season: forget all dates afterwards
    if (match_idx + 1) % 380 == 0:
        last_match_date_dict = dict.fromkeys(team_list)

# Spot-check: first 10 matches involving Liverpool
liverpool_matches = (data['Home_Team'] == 'Liverpool') | (data['Away_Team'] == 'Liverpool')
rest_check_columns = ['Date', 'Home_Team', 'Away_Team', 'Days_Since_Last_Match_Home', 'Days_Since_Last_Match_Away']
data.loc[liverpool_matches, rest_check_columns].head(10)





In [None]:
# Keep only pre-match information: the per-match raw statistics listed here are removed
raw_match_columns = [
    'FT_Home_Team_Goals', 'FT_Away_Team_Goals',
    'HT_Home_Team_Goals', 'HT_Away_Team_Goals', 'HT_Result',
    'Home_Team_Shots', 'Away_Team_Shots',
    'Home_Team_Shots_On_Target', 'Away_Team_Shots_On_Target',
    'Home_Team_Fouls', 'Away_Team_Fouls',
    'Home_Team_Corners', 'Away_Team_Corners',
    'Home_Team_Yellow_Cards', 'Away_Team_Yellow_Cards',
    'Home_Team_Red_Cards', 'Away_Team_Red_Cards',
]
model_data = data.drop(columns=raw_match_columns)






In [None]:
# Build two feature sets from model_data: season-to-date features and recent-form
# (previous five matches) features.

# Every column whose name contains 'Season' (this also picks up the 'Season' label itself)
seasonal_columns = [col for col in model_data.columns if 'Season' in col]

# The recent-form interaction features are suffixed 'Last5' (no underscore), so both
# spellings are matched; filtering on 'Last_5' alone silently left them out.
last_5_columns = [col for col in model_data.columns if 'Last_5' in col or 'Last5' in col]

# Identifier / target columns needed by both feature sets
shared_columns = ['Home_Team', 'Away_Team', 'Date', 'FT_Result']

# Use the rest-day columns that are actually computed. The plain 'Days_Since_Last_Match'
# column is only ever initialised to 0, so it carries no information for the models.
rest_day_columns = ['Days_Since_Last_Match_Home', 'Days_Since_Last_Match_Away']

seasonal_columns = seasonal_columns + shared_columns + rest_day_columns
last_5_columns = last_5_columns + shared_columns + ['Season'] + rest_day_columns

# .copy() gives independent frames, so later column assignments neither touch model_data
# nor raise SettingWithCopyWarning
seasonal_data = model_data[seasonal_columns].copy()
last_5_data = model_data[last_5_columns].copy()

last_5_data.columns



In [None]:
# Rows from seasons before 17-18 (season labels compare correctly as strings)
seasonal_data.loc[seasonal_data['Season'] < '17-18']

In [None]:
# Encode the target, one-hot encode the teams, then split chronologically by season.

# First season held out for testing. Season labels are zero-padded strings
# ('09-10', ..., '18-19'), so plain string comparison orders them chronologically.
# The same cut-off is used for both feature sets so their scores are comparable;
# previously the last-5 test set started at '17-18', leaving season 16-17 in neither split.
TEST_START_SEASON = '16-17'

# Encode the target variable (assign() returns new frames, avoiding chained-assignment warnings)
le = LabelEncoder()
seasonal_data = seasonal_data.assign(FT_Result=le.fit_transform(seasonal_data['FT_Result']))
last_5_data = last_5_data.assign(FT_Result=le.fit_transform(last_5_data['FT_Result']))

# Convert categorical columns to numerical using one-hot encoding
seasonal_data_dummies = pd.get_dummies(seasonal_data, columns=['Home_Team', 'Away_Team'])
last_5_data_dummies = pd.get_dummies(last_5_data, columns=['Home_Team', 'Away_Team'])

# Time-based split: train on seasons before TEST_START_SEASON, test on that season and later
seasonal_season = seasonal_data_dummies['Season'].astype(str)
train_data = seasonal_data_dummies[seasonal_season < TEST_START_SEASON]
test_data = seasonal_data_dummies[seasonal_season >= TEST_START_SEASON]

last_5_season = last_5_data_dummies['Season'].astype(str)
train_data_last_5 = last_5_data_dummies[last_5_season < TEST_START_SEASON]
test_data_last_5 = last_5_data_dummies[last_5_season >= TEST_START_SEASON]

# Columns that are the target or pure identifiers, not model inputs
non_feature_columns = ['FT_Result', 'Date', 'Season']

# Separate features and target variable for training and testing sets
X_train = train_data.drop(non_feature_columns, axis=1)
y_train = train_data['FT_Result']

X_test = test_data.drop(non_feature_columns, axis=1)
y_test = test_data['FT_Result']

# Separate features and target variable for training and testing sets for last 5 games
X_train_last_5 = train_data_last_5.drop(non_feature_columns, axis=1)
y_train_last_5 = train_data_last_5['FT_Result']

X_test_last_5 = test_data_last_5.drop(non_feature_columns, axis=1)
y_test_last_5 = test_data_last_5['FT_Result']







## Initial Implementation

In [None]:

# Baseline Random Forest classifiers (no tuning) on both feature sets.
# random_state is fixed so the reported scores are reproducible across runs.

# Train a Random Forest Classifier on the seasonal data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred)
print(f"Accuracy of Random Forest on seasonal data: {accuracy_rf:.2f}")

# Calculate classification report (classification_report is imported in the first cell)
print(classification_report(y_test, y_pred))


# Train a Random Forest Classifier on the last 5 games data
rf_model_last_5 = RandomForestClassifier(random_state=42)
rf_model_last_5.fit(X_train_last_5, y_train_last_5)

# Make predictions on the test set
y_pred_last_5 = rf_model_last_5.predict(X_test_last_5)

# Calculate accuracy
accuracy_rf_last_5 = accuracy_score(y_test_last_5, y_pred_last_5)
print(f"Accuracy of Random Forest on last 5 games data: {accuracy_rf_last_5:.2f}")

# Calculate classification report
print(classification_report(y_test_last_5, y_pred_last_5))







In [None]:
# Baseline XGBoost classifiers with default hyper-parameters on both feature sets

# --- Seasonal features ---
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy of XGBoost on seasonal data: {accuracy_xgb:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_xgb))


# --- Last-5-games features ---
xgb_model_last_5 = xgb.XGBClassifier()
xgb_model_last_5.fit(X_train_last_5, y_train_last_5)
y_pred_xgb_last_5 = xgb_model_last_5.predict(X_test_last_5)

accuracy_xgb_last_5 = accuracy_score(y_test_last_5, y_pred_xgb_last_5)
print(f"Accuracy of XGBoost on last 5 games data: {accuracy_xgb_last_5:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test_last_5, y_pred_xgb_last_5))


In [None]:
# Baseline support-vector classifiers with default settings, on the unscaled features

# --- Seasonal features ---
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy of SVM on seasonal data: {accuracy_svm:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_svm))


# --- Last-5-games features ---
svm_model_last_5 = SVC()
svm_model_last_5.fit(X_train_last_5, y_train_last_5)
y_pred_svm_last_5 = svm_model_last_5.predict(X_test_last_5)

accuracy_svm_last_5 = accuracy_score(y_test_last_5, y_pred_svm_last_5)
print(f"Accuracy of SVM on last 5 games data: {accuracy_svm_last_5:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test_last_5, y_pred_svm_last_5))





In [None]:
# Support-vector classifier on standardised features, for both feature sets.

# Initialise SVM model
svm_model_scaled = SVC()

# Scale the data: statistics are learned from the training rows only, then applied to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The same scaler object is refitted here; the seasonal arrays above are already computed
X_train_last_5_scaled = scaler.fit_transform(X_train_last_5)
X_test_last_5_scaled = scaler.transform(X_test_last_5)

# Train the model on the scaled seasonal data
svm_model_scaled.fit(X_train_scaled, y_train)
# Make predictions on the test set
y_pred_svm = svm_model_scaled.predict(X_test_scaled)
# Calculate accuracy
accuracy_svm_scaled = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy of SVM_scaled on seasonal data: {accuracy_svm_scaled:.2f}")
# Calculate classification report
print(classification_report(y_test, y_pred_svm))


# Train the model on the scaled last 5 games data (this refits svm_model_scaled)
svm_model_scaled.fit(X_train_last_5_scaled, y_train_last_5)
# Make predictions on the test set with the model that was just fitted on the scaled
# last-5 data. The previous code called svm_model here, i.e. the unscaled seasonal model.
y_pred_svm_last_5 = svm_model_scaled.predict(X_test_last_5_scaled)
# Calculate accuracy
accuracy_svm_scaled_last_5 = accuracy_score(y_test_last_5, y_pred_svm_last_5)
print(f"Accuracy of SVM_scaled on last 5 games data: {accuracy_svm_scaled_last_5:.2f}")
# Calculate classification report
print(classification_report(y_test_last_5, y_pred_svm_last_5))








In [None]:
# Baseline logistic-regression classifiers with default settings on both feature sets

# --- Seasonal features ---
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)
y_pred_log_reg = log_reg_model.predict(X_test)

accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f"Accuracy of Logistic Regression on seasonal data: {accuracy_log_reg:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_log_reg))

# --- Last-5-games features ---
log_reg_model_last_5 = LogisticRegression()
log_reg_model_last_5.fit(X_train_last_5, y_train_last_5)
y_pred_log_reg_last_5 = log_reg_model_last_5.predict(X_test_last_5)

accuracy_log_reg_last_5 = accuracy_score(y_test_last_5, y_pred_log_reg_last_5)
print(f"Accuracy of Logistic Regression on last 5 games data: {accuracy_log_reg_last_5:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test_last_5, y_pred_log_reg_last_5))

In [None]:
# Baseline k-nearest-neighbours classifiers with default settings on both feature sets

# --- Seasonal features ---
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy of KNN on seasonal data: {accuracy_knn:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_knn))

# --- Last-5-games features ---
knn_model_last_5 = KNeighborsClassifier()
knn_model_last_5.fit(X_train_last_5, y_train_last_5)
y_pred_knn_last_5 = knn_model_last_5.predict(X_test_last_5)

accuracy_knn_last_5 = accuracy_score(y_test_last_5, y_pred_knn_last_5)
print(f"Accuracy of KNN on last 5 games data: {accuracy_knn_last_5:.2f}")
# Per-class precision / recall / F1
print(classification_report(y_test_last_5, y_pred_knn_last_5))



In [None]:
from matplotlib.font_manager import FontProperties

# Baseline accuracies per model, in the same order as model_names
model_names = ['Random Forest', 'XGBoost', 'SVM', 'Logistic Regression', 'KNN']
accuracies = [accuracy_rf, accuracy_xgb,  accuracy_svm, accuracy_log_reg, accuracy_knn]
accuracies_last_5 = [accuracy_rf_last_5, accuracy_xgb_last_5, accuracy_svm_last_5, accuracy_log_reg_last_5, accuracy_knn_last_5]

# One pair of side-by-side bars per model
bar_width = 0.25
x = np.arange(len(model_names))

plt.style.use('default')
fig, ax = plt.subplots(figsize=(12, 6))

# Left bar of each pair: seasonal features; right bar: last-5-games features
ax.bar(x, accuracies, width=bar_width, label='Seasonal Data', color='b', alpha=0.7)
ax.bar(x + bar_width, accuracies_last_5, width=bar_width, label='Last 5 Games Data', color='r', alpha=0.7)

ax.set_xlabel('Models', fontsize=14, color='black')
ax.set_ylabel('Accuracy', fontsize=14, color='black')
ax.set_title('Model Accuracies for Seasonal and Last 5 Games Data', fontsize=16)

# Centre each tick label between the two bars of its pair
ax.set_xticks(x + bar_width / 2)
ax.set_xticklabels(model_names, fontsize=12)

ax.legend(fontsize=12)
fig.tight_layout()
plt.show()







In [None]:
# Standardise the features for the tuning cells that follow.
# NOTE: this rebinds X_train / X_test (and the last-5 pair) to the scaled arrays.

scaler = StandardScaler()

# Seasonal features: fit on the training rows, apply the same transform to the test rows
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# Last-5-games features: the same scaler object is refitted on the last-5 training rows
X_train_last_5, X_test_last_5 = scaler.fit_transform(X_train_last_5), scaler.transform(X_test_last_5)





In [None]:
# Random Forest hyper-parameter search on the seasonal features:
# an exhaustive grid search and a 10-candidate randomized search, run side by side.

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

from joblib import Parallel, delayed


def _run_grid():
    """Fit and return a GridSearchCV over param_grid on the seasonal training data."""
    grid_search = GridSearchCV(
        RandomForestClassifier(),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=1,  # each search is single-process; the two searches run concurrently instead
        verbose=2,
        error_score='raise',
    )
    grid_search.fit(X_train, y_train)
    return grid_search


def _run_random():
    """Fit and return a RandomizedSearchCV (10 candidates) over param_grid on the seasonal training data."""
    randomized_search = RandomizedSearchCV(
        RandomForestClassifier(),
        param_distributions=param_grid,
        n_iter=10,
        scoring='accuracy',
        cv=5,
        n_jobs=1,
        verbose=2,
        random_state=42,
        error_score='raise',
    )
    randomized_search.fit(X_train, y_train)
    return randomized_search


# Run the two searches concurrently; results come back in the order the tasks were given
search_tasks = (_run_grid, _run_random)
grid_search_rf, random_search_rf = Parallel(n_jobs=-1)(
    delayed(task)() for task in search_tasks
)

# Grid search: best configuration, CV score and hold-out performance
print("GridSearch best params:", grid_search_rf.best_params_)
print("GridSearch best CV score:", grid_search_rf.best_score_)
y_pred = grid_search_rf.predict(X_test)
print("Test accuracy (GridSearch):", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Randomized search: best configuration, CV score and hold-out performance
print("RandomSearch best params:", random_search_rf.best_params_)
print("RandomSearch best CV score:", random_search_rf.best_score_)
y_pred_r = random_search_rf.predict(X_test)
print("Test accuracy (RandomSearch):", accuracy_score(y_test, y_pred_r))
print(classification_report(y_test, y_pred_r))









GridSearch best params: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
GridSearch best CV score: 0.4947368421052632
Test accuracy (GridSearch): 0.5649122807017544
              precision    recall  f1-score   support

           0       0.58      0.54      0.56       345
           1       0.20      0.02      0.04       254
           2       0.57      0.84      0.68       541

    accuracy                           0.56      1140
   macro avg       0.45      0.47      0.42      1140
weighted avg       0.49      0.56      0.50      1140

RandomSearch best params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}
RandomSearch best CV score: 0.48458646616541345
Test accuracy (RandomSearch): 0.5578947368421052
              precision    recall  f1-score   support

           0       0.58      0.52      0.55       345
           1       0.08      0.01      0.01       254
           2       0.56      0.84      0.67       541

    accuracy                           0.56      1140
   macro avg       0.41      0.46      0.41      1140
weighted avg       0.46      0.56      0.49      1140

Not running this cell again: the search on the seasonal data took 33 minutes.

In [None]:


# Exhaustive Random Forest grid search on the last-5-games features
rf_model = RandomForestClassifier()

grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train_last_5, y_train_last_5)

# Hold-out performance of the refitted best estimator
y_pred_rf = grid_search_rf.predict(X_test_last_5)
accuracy_rf_last_5 = accuracy_score(y_test_last_5, y_pred_rf)
print(f"Accuracy of Random Forest on last 5 games data: {accuracy_rf_last_5}")
print(classification_report(y_test_last_5, y_pred_rf))

# Winning configuration and its mean cross-validation accuracy
print("Best parameters for Random Forest on last 5 games data:", grid_search_rf.best_params_)
print("Best cross-validation score for Random Forest on last 5 games data:", grid_search_rf.best_score_)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Accuracy of Random Forest on last 5 games data: 0.52
              precision    recall  f1-score   support

           0       0.50      0.48      0.49       236
           1       0.06      0.01      0.01       170
           2       0.54      0.79      0.65       354

    accuracy                           0.52       760
   macro avg       0.37      0.43      0.38       760
weighted avg       0.42      0.52      0.46       760

Best parameters for Random Forest on last 5 games data: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validation score for Random Forest on last 5 games data: 0.5003759398496241
Accuracy of best Random Forest on last 5 games data: 0.52
              precision    recall  f1-score   support

           0       0.50      0.48      0.49       236
           1       0.06      0.01      0.01       170
           2       0.54      0.79      0.65       354

    accuracy                           0.52       760
   macro avg       0.37      0.43      0.38       760
weighted avg       0.42      0.52      0.46       760

In [None]:



# Randomized Random Forest search (10 candidates) on the last-5-games features
rf_model_random = RandomForestClassifier()

# Use the estimator created in this cell. The search previously received rf_model, a
# variable left over from another cell, so this cell depended on hidden notebook state
# and rf_model_random was never used.
random_search_rf = RandomizedSearchCV(estimator=rf_model_random, param_distributions=param_grid, n_iter=10, cv=5, n_jobs=1, verbose=2, scoring='accuracy', random_state=42)

random_search_rf.fit(X_train_last_5, y_train_last_5)

# Hold-out performance of the refitted best estimator
y_pred_rf_random = random_search_rf.predict(X_test_last_5)

accuracy_rf_random_last_5 = accuracy_score(y_test_last_5, y_pred_rf_random)

print(f"Accuracy of Random Forest (RandomizedSearchCV) on last 5 games data: {accuracy_rf_random_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_rf_random))
print("Best parameters for Random Forest (RandomizedSearchCV) on last 5 games data:", random_search_rf.best_params_)
print("Best cross-validation score for Random Forest (RandomizedSearchCV) on last 5 games data:", random_search_rf.best_score_)


Accuracy of Random Forest (RandomizedSearchCV) on last 5 games data: 0.53
              precision    recall  f1-score   support

           0       0.51      0.46      0.48       236
           1       0.10      0.01      0.01       170
           2       0.54      0.82      0.65       354

    accuracy                           0.53       760
   macro avg       0.39      0.43      0.38       760
weighted avg       0.43      0.53      0.46       760

Best parameters for Random Forest (RandomizedSearchCV) on last 5 games data: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}
Best cross-validation score for Random Forest (RandomizedSearchCV) on last 5 games data: 0.49624060150375937

In [None]:


# Exhaustive XGBoost grid search on the seasonal features

# Candidate values explored for each XGBoost hyper-parameter
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_model = xgb.XGBClassifier()
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_xgb.fit(X_train, y_train)

# Winning configuration and its mean cross-validation accuracy
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_

# Hold-out performance of the refitted best estimator
y_pred_xgb = grid_search_xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"Accuracy of XGBoost after tuning: {accuracy_xgb:.2f}")
print(classification_report(y_test, y_pred_xgb))

print("Best parameters for XGBoost:", best_params_xgb)
print("Best cross-validation score for XGBoost:", best_score_xgb)





Fitting 5 folds for each of 108 candidates, totalling 540 fits
Accuracy of XGBoost after tuning: 0.57
              precision    recall  f1-score   support

           0       0.63      0.45      0.53       345
           1       0.31      0.02      0.03       254
           2       0.56      0.90      0.69       541

    accuracy                           0.57      1140
   macro avg       0.50      0.46      0.42      1140
weighted avg       0.52      0.57      0.49      1140

Best parameters for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
Best cross-validation score for XGBoost: 0.5056390977443609

In [None]:
# Hyperparameter tuning for the XGBoost model on last 5 games data using gridsearch

# Same search space as the other XGBoost tuning cells (param_grid_xgb),
# exhaustive 5-fold search scored on accuracy.
xgb_model_last_5 = xgb.XGBClassifier()
grid_search_xgb_last_5 = GridSearchCV(
    estimator=xgb_model_last_5,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_xgb_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_xgb_last_5, best_score_xgb_last_5 = (
    grid_search_xgb_last_5.best_params_,
    grid_search_xgb_last_5.best_score_,
)

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_xgb_last_5 = grid_search_xgb_last_5.predict(X_test_last_5)
accuracy_xgb_last_5 = accuracy_score(y_test_last_5, y_pred_xgb_last_5)

print(f"Accuracy of XGBoost after tuning on last 5 games data: {accuracy_xgb_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_xgb_last_5))
print("Best parameters for XGBoost on last 5 games data:", best_params_xgb_last_5)
print("Best cross-validation score for XGBoost on last 5 games data:", best_score_xgb_last_5)



Fitting 5 folds for each of 108 candidates, totalling 540 fits
Accuracy of XGBoost after tuning on last 5 games data: 0.53
              precision    recall  f1-score   support

           0       0.51      0.56      0.53       236
           1       0.15      0.01      0.02       170
           2       0.55      0.76      0.64       354

    accuracy                           0.53       760
   macro avg       0.40      0.44      0.40       760
weighted avg       0.45      0.53      0.47       760

Best parameters for XGBoost on last 5 games data: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best cross-validation score for XGBoost on last 5 games data: 0.5003759398496241

In [None]:
# Hyperparameter tuning for the XGBoost model on seasonal data using randomsearch

# Randomized search: evaluates n_iter=10 configurations sampled from
# param_grid_xgb with 5-fold cross-validation, scored on accuracy.
# random_state is pinned (same seed as the SVM/KNN randomized searches) so the
# sampled candidates, and therefore the reported best parameters, are the same
# on every Restart & Run All.
xgb_model_random = xgb.XGBClassifier()
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model_random,
    param_distributions=param_grid_xgb,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_xgb.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params_xgb_random = random_search_xgb.best_params_
best_score_xgb_random = random_search_xgb.best_score_

# Score the refitted best estimator on the held-out test split.
y_pred_xgb_random = random_search_xgb.predict(X_test)
accuracy_xgb_random = accuracy_score(y_test, y_pred_xgb_random)

print(f"Accuracy of XGBoost (RandomizedSearchCV) on seasonal data: {accuracy_xgb_random:.2f}")
print(classification_report(y_test, y_pred_xgb_random))
print("Best parameters for XGBoost (RandomizedSearchCV) on seasonal data:", best_params_xgb_random)
print("Best cross-validation score for XGBoost (RandomizedSearchCV) on seasonal data:", best_score_xgb_random)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy of XGBoost (RandomizedSearchCV) on seasonal data: 0.57
              precision    recall  f1-score   support

           0       0.58      0.52      0.55       345
           1       0.38      0.01      0.02       254
           2       0.56      0.86      0.68       541

    accuracy                           0.57      1140
   macro avg       0.51      0.46      0.42      1140
weighted avg       0.53      0.57      0.49      1140

Best parameters for XGBoost (RandomizedSearchCV) on seasonal data: {'n_estimators': 200, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 3}
Best cross-validation score for XGBoost (RandomizedSearchCV) on seasonal data: 0.5011278195488722

In [None]:
# Hyperparameter tuning for the XGBoost model on last 5 games data using randomsearch

# Randomized search: evaluates n_iter=10 configurations sampled from
# param_grid_xgb with 5-fold cross-validation, scored on accuracy.
# random_state is pinned (same seed as the SVM/KNN randomized searches) so the
# sampled candidates are identical on every run.
xgb_model_random_last_5 = xgb.XGBClassifier()
random_search_xgb_last_5 = RandomizedSearchCV(
    estimator=xgb_model_random_last_5,
    param_distributions=param_grid_xgb,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_xgb_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_xgb_random_last_5 = random_search_xgb_last_5.best_params_
best_score_xgb_random_last_5 = random_search_xgb_last_5.best_score_

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_xgb_random_last_5 = random_search_xgb_last_5.predict(X_test_last_5)
accuracy_xgb_random_last_5 = accuracy_score(y_test_last_5, y_pred_xgb_random_last_5)

print(f"Accuracy of XGBoost (RandomizedSearchCV) after tuning on last 5 games data: {accuracy_xgb_random_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_xgb_random_last_5))
print("Best parameters for XGBoost (RandomizedSearchCV) on last 5 games data:", best_params_xgb_random_last_5)
print("Best cross-validation score for XGBoost (RandomizedSearchCV) on last 5 games data:", best_score_xgb_random_last_5)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy of XGBoost (RandomizedSearchCV) after tuning on last 5 games data: 0.51
              precision    recall  f1-score   support

           0       0.48      0.50      0.49       236
           1       0.07      0.01      0.01       170
           2       0.53      0.75      0.62       354

    accuracy                           0.51       760
   macro avg       0.36      0.42      0.38       760
weighted avg       0.41      0.51      0.45       760

Best parameters for XGBoost (RandomizedSearchCV) on last 5 games data: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
Best cross-validation score for XGBoost (RandomizedSearchCV) on last 5 games data: 0.4981203007518797

In [None]:
# Hyperparameter tuning for svm seasonal data using gridsearch

# Search space shared by every SVM tuning cell: 3 * 3 * 2 * 3 = 54 combinations.
# NOTE(review): per the scikit-learn SVC docs, `degree` only applies to the
# 'poly' kernel and `gamma` is not used by 'linear', so some of these
# combinations fit equivalent models — consider a per-kernel grid.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
    degree=[2, 3, 4],
)

# Exhaustive 5-fold search scored on accuracy, using every available core.
svm_model = SVC()
grid_search_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_svm.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params_svm, best_score_svm = grid_search_svm.best_params_, grid_search_svm.best_score_

# Score the refitted best estimator on the held-out test split.
y_pred_svm = grid_search_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print(f"Accuracy of SVM after tuning: {accuracy_svm:.2f}")
print(classification_report(y_test, y_pred_svm))
print("Best parameters for SVC (GridSearchCV) on seasonal data:", best_params_svm)
print("Best cross-validation score for SVC (GridSearch) on seasonal data:", best_score_svm)



Fitting 5 folds for each of 54 candidates, totalling 270 fits
Accuracy of SVC (GridSearchCV) after tuning: 0.56
              precision    recall  f1-score   support

           0       0.53      0.54      0.53       236
           1       0.37      0.01      0.11       170
           2       0.53      0.75      0.62       354

    accuracy                           0.56       760
   macro avg       0.50      0.46      0.43       760
weighted avg       0.55      0.56      0.50       760

Best parameters for SVC (GridSearchCV) on seasonal data: {'C': 100, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 1}
Best cross-validation score for SVC (GridSearch) on seasonal data: 0.5111278195488722

In [None]:
# Hyperparameter tuning for svm last 5 games data using gridsearch

# Exhaustive 5-fold search over param_grid_svm, scored on accuracy.
svm_model_last_5 = SVC()
grid_search_svm_last_5 = GridSearchCV(
    estimator=svm_model_last_5,
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=5,
    n_jobs=8,
    verbose=2,
)
grid_search_svm_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_svm_last_5 = grid_search_svm_last_5.best_params_
best_score_svm_last_5 = grid_search_svm_last_5.best_score_

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_svm_last_5 = grid_search_svm_last_5.predict(X_test_last_5)
accuracy_svm_last_5 = accuracy_score(y_test_last_5, y_pred_svm_last_5)

print(f"Accuracy of SVM after tuning on last 5 games data: {accuracy_svm_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_svm_last_5))

# Labels name the search that actually ran here (GridSearchCV), mirroring the
# wording of the seasonal-data SVM grid-search cell.
print("Best parameters for SVC (GridSearchCV) on last 5 games data:", best_params_svm_last_5)
print("Best cross-validation score for SVC (GridSearch) on last 5 games data:", best_score_svm_last_5)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Accuracy of SVC (RandomizedSearchCV) after tuning on last 5 games data: 0.54
              precision    recall  f1-score   support

           0       0.50      0.54      0.53       236
           1       0.37      0.01      0.11       170
           2       0.53      0.70      0.62       354

    accuracy                           0.56       760
   macro avg       0.50      0.46      0.43       760
weighted avg       0.55      0.56      0.50       760

Best parameters for SVC (RandomizedSearchCV) on last 5 data: {'C': 100, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 1}
Best cross-validation score for SVC (RandomizedSearch) on last 5 data: 0.490213548195273093

In [None]:

# Hyperparameter tuning for svm seasonal data using randomsearch

# Randomized search: n_iter=10 configurations sampled from param_grid_svm,
# 5-fold cross-validation scored on accuracy, run in a single process with a
# fixed seed.
svm_model_random = SVC()
random_search_svm = RandomizedSearchCV(
    estimator=svm_model_random,
    param_distributions=param_grid_svm,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
    verbose=2,
    random_state=42,
)
random_search_svm.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params_svm_random, best_score_svm_random = (
    random_search_svm.best_params_,
    random_search_svm.best_score_,
)

# Score the refitted best estimator on the held-out test split.
y_pred_svm_random = random_search_svm.predict(X_test)
accuracy_svm_random = accuracy_score(y_test, y_pred_svm_random)

print(f"Accuracy of SVM (RandomizedSearchCV) after tuning: {accuracy_svm_random:.2f}")
print(classification_report(y_test, y_pred_svm_random))
print("Best parameters for SVM (RandomizedSearchCV):", best_params_svm_random)
print("Best cross-validation score for SVM (RandomizedSearchCV):", best_score_svm_random)




Fitting 5 folds for each of 54 candidates, totalling 270 fits
Accuracy of SVC (RandomizedSearchCV) after tuning: 0.54
              precision    recall  f1-score   support

           0       0.52      0.54      0.51       236
           1       0.21      0.03      0.11       170
           2       0.49      0.68      0.60       354

    accuracy                           0.56       760
   macro avg       0.50      0.46      0.47       760
weighted avg       0.50      0.52      0.51       760

Best parameters for SVC (RandomizedSearchCV) on seasonal data: {'C': 100, 'kernel': 'poly', 'gamma': 'auto', 'degree': 1}
Best cross-validation score for SVC (RandomizedSearchCV) on seasonal data: 0.50142129328428722

In [None]:

# Hyperparameter tuning for svm last 5 games data using randomsearch

# Randomized search: n_iter=10 configurations sampled from param_grid_svm,
# 5-fold cross-validation scored on accuracy, run in a single process with a
# fixed seed.
svm_model_random_last_5 = SVC()
random_search_svm_last_5 = RandomizedSearchCV(
    estimator=svm_model_random_last_5,
    param_distributions=param_grid_svm,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
    verbose=2,
    random_state=42,
)
random_search_svm_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_svm_random_last_5, best_score_svm_random_last_5 = (
    random_search_svm_last_5.best_params_,
    random_search_svm_last_5.best_score_,
)

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_svm_random_last_5 = random_search_svm_last_5.predict(X_test_last_5)
accuracy_svm_random_last_5 = accuracy_score(y_test_last_5, y_pred_svm_random_last_5)

print(f"Accuracy of SVM (RandomizedSearchCV) after tuning on last 5 games data: {accuracy_svm_random_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_svm_random_last_5))
print("Best parameters for SVM (RandomizedSearchCV) on last 5 games data:", best_params_svm_random_last_5)
print("Best cross-validation score for SVM (RandomizedSearchCV) on last 5 games data:", best_score_svm_random_last_5)



Fitting 5 folds for each of 54 candidates, totalling 270 fits
Accuracy of SVC (RandomizedSearchCV) after tuning on last 5 games data: 0.53
              precision    recall  f1-score   support

           0       0.52      0.54      0.51       236
           1       0.21      0.03      0.11       170
           2       0.49      0.68      0.60       354

    accuracy                           0.56       760
   macro avg       0.50      0.46      0.47       760
weighted avg       0.50      0.52      0.51       760

Best parameters for SVC (RandomizedSearchCV) on seasonal data: {'C': 100, 'kernel': 'poly', 'gamma': 'auto', 'degree': 1}
Best cross-validation score for SVC (RandomizedSearchCV) on seasonal data: 0.142129823748121

In [None]:
# Hyperparameter tuning for logistic regression seasonal data using gridsearch

# Search space shared by every Logistic Regression tuning cell:
# 4 * 2 * 2 * 3 = 48 candidate configurations.
param_grid_log_reg = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    max_iter=[100, 200, 300],
)

# Exhaustive 5-fold search scored on accuracy, using every available core.
log_reg_model = LogisticRegression()
grid_search_log_reg = GridSearchCV(
    estimator=log_reg_model,
    param_grid=param_grid_log_reg,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_log_reg.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params_log_reg, best_score_log_reg = (
    grid_search_log_reg.best_params_,
    grid_search_log_reg.best_score_,
)

# Score the refitted best estimator on the held-out test split.
y_pred_log_reg = grid_search_log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)

print(f"Accuracy of Logistic Regression after tuning: {accuracy_log_reg:.2f}")
print(classification_report(y_test, y_pred_log_reg))
print("Best parameters for Logistic Regression:", best_params_log_reg)
print("Best cross-validation score for Logistic Regression:", best_score_log_reg)



Fitting 5 folds for each of 48 candidates, totalling 240 fits
Accuracy of Logistic Regression after tuning: 0.56
              precision    recall  f1-score   support

           0       0.55      0.60      0.57       345
           1       0.26      0.03      0.06       254
           2       0.59      0.79      0.67       541

    accuracy                           0.56      1140
   macro avg       0.46      0.47      0.43      1140
weighted avg       0.50      0.56      0.51      1140

Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation score for Logistic Regression: 0.5018796992481203

In [None]:
# Hyperparameter tuning for logistic regression seasonal data using randomsearch

# Randomized search: evaluates n_iter=10 configurations sampled from
# param_grid_log_reg with 5-fold cross-validation, scored on accuracy.
# random_state is pinned (same seed as the SVM/KNN randomized searches) so the
# sampled candidates are identical on every run.
log_reg_model_random = LogisticRegression()
random_search_log_reg = RandomizedSearchCV(
    estimator=log_reg_model_random,
    param_distributions=param_grid_log_reg,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_log_reg.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params_log_reg_random = random_search_log_reg.best_params_
best_score_log_reg_random = random_search_log_reg.best_score_

# Score the refitted best estimator on the held-out test split.
y_pred_log_reg_random = random_search_log_reg.predict(X_test)
accuracy_log_reg_random = accuracy_score(y_test, y_pred_log_reg_random)

print(f"Accuracy of Logistic Regression (RandomizedSearchCV) after tuning: {accuracy_log_reg_random:.2f}")
print(classification_report(y_test, y_pred_log_reg_random))
print("Best parameters for Logistic Regression (RandomizedSearchCV):", best_params_log_reg_random)
print("Best cross-validation score for Logistic Regression (RandomizedSearchCV):", best_score_log_reg_random)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy of Logistic Regression (RandomizedSearchCV) after tuning: 0.57
              precision    recall  f1-score   support

           0       0.55      0.59      0.57       345
           1       0.50      0.01      0.02       254
           2       0.58      0.82      0.68       541

    accuracy                           0.57      1140
   macro avg       0.54      0.47      0.42      1140
weighted avg       0.55      0.57      0.50      1140

Best parameters for Logistic Regression (RandomizedSearchCV): {'solver': 'saga', 'penalty': 'l2', 'max_iter': 200, 'C': 10}
Best cross-validation score for Logistic Regression (RandomizedSearchCV): 0.5011278195488722

In [None]:
# Hyperparameter tuning for logistic regression last 5 games data using gridsearch

# Exhaustive 5-fold search over param_grid_log_reg, scored on accuracy,
# using every available core.
log_reg_model_last_5 = LogisticRegression()
grid_search_log_reg_last_5 = GridSearchCV(
    estimator=log_reg_model_last_5,
    param_grid=param_grid_log_reg,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_log_reg_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_log_reg_last_5, best_score_log_reg_last_5 = (
    grid_search_log_reg_last_5.best_params_,
    grid_search_log_reg_last_5.best_score_,
)

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_log_reg_last_5 = grid_search_log_reg_last_5.predict(X_test_last_5)
accuracy_log_reg_last_5 = accuracy_score(y_test_last_5, y_pred_log_reg_last_5)

print(f"Accuracy of Logistic Regression after tuning on last 5 games data: {accuracy_log_reg_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_log_reg_last_5))
print("Best parameters for Logistic Regression on last 5 games data:", best_params_log_reg_last_5)
print("Best cross-validation score for Logistic Regression on last 5 games data:", best_score_log_reg_last_5)



Fitting 5 folds for each of 48 candidates, totalling 240 fits
Accuracy of Logistic Regression after tuning on last 5 games data: 0.54
              precision    recall  f1-score   support

           0       0.52      0.56      0.54       236
           1       0.19      0.04      0.06       170
           2       0.57      0.76      0.65       354

    accuracy                           0.54       760
   macro avg       0.43      0.45      0.42       760
weighted avg       0.47      0.54      0.48       760

Best parameters for Logistic Regression on last 5 games data: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation score for Logistic Regression on last 5 games data: 0.5048872180451127


In [None]:
# Hyperparameter tuning for logistic regression last 5 games data using randomsearch

# Randomized search: evaluates n_iter=10 configurations sampled from
# param_grid_log_reg with 5-fold cross-validation, scored on accuracy.
# random_state is pinned (same seed as the SVM/KNN randomized searches) so the
# sampled candidates are identical on every run.
log_reg_model_random_last_5 = LogisticRegression()
random_search_log_reg_last_5 = RandomizedSearchCV(
    estimator=log_reg_model_random_last_5,
    param_distributions=param_grid_log_reg,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_log_reg_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_log_reg_random_last_5 = random_search_log_reg_last_5.best_params_
best_score_log_reg_random_last_5 = random_search_log_reg_last_5.best_score_

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_log_reg_random_last_5 = random_search_log_reg_last_5.predict(X_test_last_5)
accuracy_log_reg_random_last_5 = accuracy_score(y_test_last_5, y_pred_log_reg_random_last_5)

print(f"Accuracy of Logistic Regression (RandomizedSearchCV) after tuning on last 5 games data: {accuracy_log_reg_random_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_log_reg_random_last_5))
print("Best parameters for Logistic Regression (RandomizedSearchCV) on last 5 games data:", best_params_log_reg_random_last_5)
print("Best cross-validation score for Logistic Regression (RandomizedSearchCV) on last 5 games data:", best_score_log_reg_random_last_5)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy of Logistic Regression (RandomizedSearchCV) after tuning on last 5 games data: 0.54
              precision    recall  f1-score   support

           0       0.52      0.60      0.56       236
           1       0.26      0.07      0.11       170
           2       0.58      0.72      0.64       354

    accuracy                           0.54       760
   macro avg       0.45      0.46      0.44       760
weighted avg       0.49      0.54      0.50       760

Best parameters for Logistic Regression (RandomizedSearchCV) on last 5 games data: {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 1}
Best cross-validation score for Logistic Regression (RandomizedSearchCV) on last 5 games data: 0.5041353383458647



In [None]:
# Hyperparameter tuning for KNN seasonal data using gridsearch

# Search space shared by every KNN tuning cell:
# 4 * 2 * 4 * 3 = 96 candidate configurations.
# NOTE(review): scikit-learn documents `algorithm` and `leaf_size` as controls
# on how neighbours are searched (speed/memory) — confirm whether they need to
# be part of the tuned grid.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[10, 20, 30],
)

# Exhaustive 5-fold search scored on accuracy, using every available core.
knn_model = KNeighborsClassifier()
grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_knn.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params_knn, best_score_knn = grid_search_knn.best_params_, grid_search_knn.best_score_

# Score the refitted best estimator on the held-out test split.
y_pred_knn = grid_search_knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

print(f"Accuracy of KNN after tuning: {accuracy_knn:.2f}")
print(classification_report(y_test, y_pred_knn))
print("Best parameters for KNN:", best_params_knn)
print("Best cross-validation score for KNN:", best_score_knn)




Fitting 5 folds for each of 96 candidates, totalling 480 fits
Accuracy of KNN after tuning: 0.51
              precision    recall  f1-score   support

           0       0.50      0.48      0.49       345
           1       0.23      0.14      0.18       254
           2       0.58      0.70      0.64       541

    accuracy                           0.51      1140
   macro avg       0.44      0.44      0.44      1140
weighted avg       0.48      0.51      0.49      1140

Best parameters for KNN: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 9, 'weights': 'distance'}
Best cross-validation score for KNN: 0.4334586466165414


In [None]:
# Hyperparameter tuning for KNN last 5 games data using gridsearch

# Exhaustive 5-fold search over param_grid_knn, scored on accuracy,
# using every available core.
knn_model_last_5 = KNeighborsClassifier()
grid_search_knn_last_5 = GridSearchCV(
    estimator=knn_model_last_5,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_knn_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_knn_last_5, best_score_knn_last_5 = (
    grid_search_knn_last_5.best_params_,
    grid_search_knn_last_5.best_score_,
)

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_knn_last_5 = grid_search_knn_last_5.predict(X_test_last_5)
accuracy_knn_last_5 = accuracy_score(y_test_last_5, y_pred_knn_last_5)

print(f"Accuracy of KNN after tuning on last 5 games data: {accuracy_knn_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_knn_last_5))
print("Best parameters for KNN on last 5 games data:", best_params_knn_last_5)
print("Best cross-validation score for KNN on last 5 games data:", best_score_knn_last_5)




Fitting 5 folds for each of 96 candidates, totalling 480 fits
Accuracy of KNN after tuning on last 5 games data: 0.48
              precision    recall  f1-score   support

           0       0.46      0.44      0.45       236
           1       0.24      0.17      0.20       170
           2       0.57      0.66      0.61       354

    accuracy                           0.48       760
   macro avg       0.42      0.42      0.42       760
weighted avg       0.46      0.48      0.47       760

Best parameters for KNN on last 5 games data: {'algorithm': 'ball_tree', 'leaf_size': 10, 'n_neighbors': 9, 'weights': 'distance'}
Best cross-validation score for KNN on last 5 games data: 0.43496240601503755





In [None]:
# Hyperparameter tuning for KNN seasonal data using randomsearch

# Randomized search: n_iter=10 configurations sampled from param_grid_knn,
# 5-fold cross-validation scored on accuracy, with a fixed seed.
knn_model_random = KNeighborsClassifier()
random_search_knn = RandomizedSearchCV(
    estimator=knn_model_random,
    param_distributions=param_grid_knn,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_knn.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy.
best_params_knn_random, best_score_knn_random = (
    random_search_knn.best_params_,
    random_search_knn.best_score_,
)

# Score the refitted best estimator on the held-out test split.
y_pred_knn_random = random_search_knn.predict(X_test)
accuracy_knn_random = accuracy_score(y_test, y_pred_knn_random)

print(f"Accuracy of KNN (RandomizedSearchCV) after tuning: {accuracy_knn_random:.2f}")
print(classification_report(y_test, y_pred_knn_random))
print("Best parameters for KNN (RandomizedSearchCV):", best_params_knn_random)
print("Best cross-validation score for KNN (RandomizedSearchCV):", best_score_knn_random)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy of KNN (RandomizedSearchCV) after tuning: 0.51
              precision    recall  f1-score   support

           0       0.50      0.48      0.49       345
           1       0.23      0.14      0.18       254
           2       0.58      0.70      0.64       541

    accuracy                           0.51      1140
   macro avg       0.44      0.44      0.44      1140
weighted avg       0.48      0.51      0.49      1140

Best parameters for KNN (RandomizedSearchCV): {'weights': 'distance', 'n_neighbors': 9, 'leaf_size': 10, 'algorithm': 'brute'}
Best cross-validation score for KNN (RandomizedSearchCV): 0.4334586466165414

In [None]:
# Hyperparameter tuning for KNN last 5 games data using randomsearch

# Randomized search: evaluates n_iter=10 configurations sampled from
# param_grid_knn with 5-fold cross-validation, scored on accuracy.
# random_state matches the seasonal-data KNN randomized search so both runs
# sample the same candidates and are repeatable.
knn_model_random_last_5 = KNeighborsClassifier()
random_search_knn_last_5 = RandomizedSearchCV(
    estimator=knn_model_random_last_5,
    param_distributions=param_grid_knn,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_knn_last_5.fit(X_train_last_5, y_train_last_5)

# Winning configuration and its mean cross-validated accuracy.
best_params_knn_random_last_5 = random_search_knn_last_5.best_params_
best_score_knn_random_last_5 = random_search_knn_last_5.best_score_

# Score the refitted best estimator on the held-out last-5-games test split.
y_pred_knn_random_last_5 = random_search_knn_last_5.predict(X_test_last_5)
accuracy_knn_random_last_5 = accuracy_score(y_test_last_5, y_pred_knn_random_last_5)

print(f"Accuracy of KNN (RandomizedSearchCV) after tuning on last 5 games data: {accuracy_knn_random_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_knn_random_last_5))
print("Best parameters for KNN (RandomizedSearchCV) on last 5 games data:", best_params_knn_random_last_5)
print("Best cross-validation score for KNN (RandomizedSearchCV) on last 5 games data:", best_score_knn_random_last_5)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy of KNN (RandomizedSearchCV) after tuning on last 5 games data: 0.48
              precision    recall  f1-score   support

           0       0.45      0.44      0.45       236
           1       0.24      0.17      0.20       170
           2       0.57      0.66      0.61       354

    accuracy                           0.48       760
   macro avg       0.42      0.42      0.42       760
weighted avg       0.46      0.48      0.47       760

Best parameters for KNN (RandomizedSearchCV) on last 5 games data: {'weights': 'distance', 'n_neighbors': 9, 'leaf_size': 10, 'algorithm': 'brute'}
Best cross-validation score for KNN (RandomizedSearchCV) on last 5 games data: 0.43458646616541347

In [None]:

# Initialize the models with the best parameters from seasonal data tuning.
#
# XGBoost, Logistic Regression and KNN are built from the RandomizedSearchCV
# results and SVM from the GridSearchCV result, unpacking the `best_params_*`
# dictionaries produced by the tuning cells above. Reading the dictionaries
# (instead of hand-copying values) keeps these models in step with the searches
# that actually ran and rules out settings the grids never contained
# (param_grid_svm only offers C in {0.1, 1, 10} and degree in {2, 3, 4}).
# NOTE(review): the Random Forest tuning cell lies outside this section, so its
# hand-copied values are kept unchanged — TODO replace them with that search's
# best_params_ as well.
rf_model_best = RandomForestClassifier(n_estimators=200, max_depth=30, min_samples_split=2, min_samples_leaf=4, max_features='log2')
xgb_model_best = xgb.XGBClassifier(**best_params_xgb_random)
svm_model_best = SVC(**best_params_svm)
log_reg_model_best = LogisticRegression(**best_params_log_reg_random)
knn_model_best = KNeighborsClassifier(**best_params_knn_random)

# Train the models on the seasonal data
rf_model_best.fit(X_train, y_train)
xgb_model_best.fit(X_train, y_train)
svm_model_best.fit(X_train, y_train)
log_reg_model_best.fit(X_train, y_train)
knn_model_best.fit(X_train, y_train)

# Evaluate every model on the held-out test split: overall accuracy followed by
# the per-class precision / recall / F1 report.

# Random Forest
y_pred_rf_best = rf_model_best.predict(X_test)
accuracy_rf_best = accuracy_score(y_test, y_pred_rf_best)
print(f"Accuracy of Random Forest (best params) on seasonal data: {accuracy_rf_best:.2f}")
print(classification_report(y_test, y_pred_rf_best))

# XGBoost
y_pred_xgb_best = xgb_model_best.predict(X_test)
accuracy_xgb_best = accuracy_score(y_test, y_pred_xgb_best)
print(f"Accuracy of XGBoost (best params) on seasonal data: {accuracy_xgb_best:.2f}")
print(classification_report(y_test, y_pred_xgb_best))

# SVM
y_pred_svm_best = svm_model_best.predict(X_test)
accuracy_svm_best = accuracy_score(y_test, y_pred_svm_best)
print(f"Accuracy of SVM (best params) on seasonal data: {accuracy_svm_best:.2f}")
print(classification_report(y_test, y_pred_svm_best))

# Logistic Regression
y_pred_log_reg_best = log_reg_model_best.predict(X_test)
accuracy_log_reg_best = accuracy_score(y_test, y_pred_log_reg_best)
print(f"Accuracy of Logistic Regression (best params) on seasonal data: {accuracy_log_reg_best:.2f}")
print(classification_report(y_test, y_pred_log_reg_best))

# KNN
y_pred_knn_best = knn_model_best.predict(X_test)
accuracy_knn_best = accuracy_score(y_test, y_pred_knn_best)
print(f"Accuracy of KNN (best params) on seasonal data: {accuracy_knn_best:.2f}")
print(classification_report(y_test, y_pred_knn_best))







In [None]:
# Initialize the models with the best parameters from last 5 games data tuning.
#
# XGBoost, SVM and Logistic Regression are built from the last-5-games
# GridSearchCV results and KNN from the last-5-games RandomizedSearchCV result,
# unpacking the `best_params_*_last_5` dictionaries produced by the tuning cells
# above. Reading the dictionaries (instead of hand-copying values) guarantees
# each model uses settings found on the last-5-games data — not values borrowed
# from the seasonal-data tuning or ones the grids never contained
# (param_grid_svm only offers degree in {2, 3, 4}).
# NOTE(review): the Random Forest tuning cell lies outside this section, so its
# hand-copied values are kept unchanged — TODO replace them with that search's
# best_params_ as well.
rf_model_best_last_5 = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=2, min_samples_leaf=1, max_features='log2')
xgb_model_best_last_5 = xgb.XGBClassifier(**best_params_xgb_last_5)
svm_model_best_last_5 = SVC(**best_params_svm_last_5)
log_reg_model_best_last_5 = LogisticRegression(**best_params_log_reg_last_5)
knn_model_best_last_5 = KNeighborsClassifier(**best_params_knn_random_last_5)

# Train the models on the last 5 games data
rf_model_best_last_5.fit(X_train_last_5, y_train_last_5)
xgb_model_best_last_5.fit(X_train_last_5, y_train_last_5)
svm_model_best_last_5.fit(X_train_last_5, y_train_last_5)
log_reg_model_best_last_5.fit(X_train_last_5, y_train_last_5)
knn_model_best_last_5.fit(X_train_last_5, y_train_last_5)

# Evaluate every model on the held-out last-5-games test split: overall
# accuracy followed by the per-class precision / recall / F1 report.

# Random Forest
y_pred_rf_best_last_5 = rf_model_best_last_5.predict(X_test_last_5)
accuracy_rf_best_last_5 = accuracy_score(y_test_last_5, y_pred_rf_best_last_5)
print(f"Accuracy of Random Forest (best params) on last 5 games data: {accuracy_rf_best_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_rf_best_last_5))

# XGBoost
y_pred_xgb_best_last_5 = xgb_model_best_last_5.predict(X_test_last_5)
accuracy_xgb_best_last_5 = accuracy_score(y_test_last_5, y_pred_xgb_best_last_5)
print(f"Accuracy of XGBoost (best params) on last 5 games data: {accuracy_xgb_best_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_xgb_best_last_5))

# SVM
y_pred_svm_best_last_5 = svm_model_best_last_5.predict(X_test_last_5)
accuracy_svm_best_last_5 = accuracy_score(y_test_last_5, y_pred_svm_best_last_5)
print(f"Accuracy of SVM (best params) on last 5 games data: {accuracy_svm_best_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_svm_best_last_5))

# Logistic Regression
y_pred_log_reg_best_last_5 = log_reg_model_best_last_5.predict(X_test_last_5)
accuracy_log_reg_best_last_5 = accuracy_score(y_test_last_5, y_pred_log_reg_best_last_5)
print(f"Accuracy of Logistic Regression (best params) on last 5 games data: {accuracy_log_reg_best_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_log_reg_best_last_5))

# KNN
y_pred_knn_best_last_5 = knn_model_best_last_5.predict(X_test_last_5)
accuracy_knn_best_last_5 = accuracy_score(y_test_last_5, y_pred_knn_best_last_5)
print(f"Accuracy of KNN (best params) on last 5 games data: {accuracy_knn_best_last_5:.2f}")
print(classification_report(y_test_last_5, y_pred_knn_best_last_5))

