In [1]:
import pandas as pd # type: ignore

# Load the dataset
matches = pd.read_csv('laligadata.csv')

# Drop rows with missing values
matches.dropna(inplace=True)

# Define the target variables for scores
y_home = matches['FTHG']
y_away = matches['FTAG']

# Define features
matches['goal_difference'] = matches['FTHG'] - matches['FTAG']
matches['home_team_form'] = matches.groupby('HomeTeam')['goal_difference'].rolling(5).mean().reset_index(level=0, drop=True)
matches['away_team_form'] = matches.groupby('AwayTeam')['goal_difference'].rolling(5).mean().reset_index(level=0, drop=True)


features = ['HomeTeam', 'AwayTeam', 'home_team_form', 'away_team_form']
X = matches[features]
X.fillna(0, inplace=True)

from sklearn.preprocessing import StandardScaler, OneHotEncoder # type: ignore
from sklearn.compose import ColumnTransformer # type: ignore
from sklearn.model_selection import train_test_split # type: ignore

# Split the data into training and testing sets
X_train, X_test, y_home_train, y_home_test = train_test_split(X, y_home, test_size=0.2, random_state=42)
_, _, y_away_train, y_away_test = train_test_split(X, y_away, test_size=0.2, random_state=42)

# Preprocessing pipeline
numeric_features = ['home_team_form', 'away_team_form']
categorical_features = ['HomeTeam', 'AwayTeam']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

from sklearn.ensemble import RandomForestRegressor # type: ignore
from sklearn.metrics import root_mean_squared_error # type: ignore

# Train a RandomForestRegressor for home team goals
home_goal_model = RandomForestRegressor(n_estimators=100, random_state=42)
home_goal_model.fit(X_train, y_home_train)

# Train a RandomForestRegressor for away team goals
away_goal_model = RandomForestRegressor(n_estimators=100, random_state=42)
away_goal_model.fit(X_train, y_away_train)

# Evaluate the models
y_home_pred = home_goal_model.predict(X_test)
y_away_pred = away_goal_model.predict(X_test)

# print(f'Home Goal Prediction RMSE: {root_mean_squared_error(y_home_test, y_home_pred,)}')
# print(f'Away Goal Prediction RMSE: {root_mean_squared_error(y_away_test, y_away_pred,)}')

def custom_round(value):
    # Check if the decimal part is .8 or higher
    if value - int(value) >= 0.8:
        return int(value) + 1
    else:
        return int(value)


def calculate_average_form(team, historical_matches, home_or_away):
    
    if home_or_away == 'home':
        avg_form = historical_matches[historical_matches['HomeTeam'] == team]['goal_difference'].rolling(5).mean().iloc[-1]
    else:
        avg_form = historical_matches[historical_matches['AwayTeam'] == team]['goal_difference'].rolling(5).mean().iloc[-1]
    
    return avg_form

#A numerical value representing the average goal difference over the last 5 matches. This could be positive, negative, or zero.

def predict_match(HomeTeam, AwayTeam, historical_matches):
    # Check if there's enough data to calculate form
    if historical_matches[historical_matches['HomeTeam'] == HomeTeam].shape[0] < 5:
        home_team_form = historical_matches[historical_matches['HomeTeam'] == HomeTeam]['goal_difference'].mean()
        # print(home_team_form)
    else:
        home_team_form = calculate_average_form(HomeTeam, historical_matches, 'home')
        # print(home_team_form)
    
    if historical_matches[historical_matches['AwayTeam'] == AwayTeam].shape[0] < 5:
        away_team_form = historical_matches[historical_matches['AwayTeam'] == AwayTeam]['goal_difference'].mean()
        # print(away_team_form)
    else:
        away_team_form = calculate_average_form(AwayTeam, historical_matches, 'away')
        # print(away_team_form)

    # Handle case with no previous meetings
    if pd.isna(home_team_form):
        home_team_form = 0  # or some default value
    if pd.isna(away_team_form):
        away_team_form = 0  # or some default value

    # Create a dataframe for the new match
    new_match = pd.DataFrame({
        'HomeTeam': [HomeTeam],
        'AwayTeam': [AwayTeam],
        'home_team_form': [home_team_form],
        'away_team_form': [away_team_form]
    })

    # Preprocess the new match data
    new_match_preprocessed = preprocessor.transform(new_match)

    # Predict the goals
    home_goals = custom_round(home_goal_model.predict(new_match_preprocessed)[0])
    away_goals = custom_round(away_goal_model.predict(new_match_preprocessed)[0])

    print(f'Predicted goals: {HomeTeam} {home_goals:} - {away_goals:} {AwayTeam}')

    # Determine the outcome based on predicted goals
    if home_goals > away_goals:
        return 'Home Win'
    elif home_goals < away_goals:
        return 'Away Win'
    else:
        return 'Draw'

# Example usage
historical_matches = matches.copy()
HomeTeam = 'Barcelona'
AwayTeam = 'Sevilla'
result = predict_match(HomeTeam, AwayTeam, historical_matches)

# Assuming result is determined elsewhere, e.g., result = 'Home Win' or 'Away Win'
if result == 'Home Win':
    winning_team = HomeTeam
    print(f'The match result prediction: {winning_team} wins the match!')

elif result == 'Away Win':
    winning_team = AwayTeam
    print(f'The match result prediction: {winning_team} wins the match!')
else:
    winning_team = 'Draw'
    print(f'The match result prediction: The match ends in a draw!')




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


Predicted goals: Barcelona 2 - 0 Sevilla
The match result prediction: Barcelona wins the match!
