In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the historical match results, one row per match. The functions below
# read the columns 'home_team', 'away_team', 'home_score' and 'away_score'
# from this table. The path is relative to the current working directory.
match_results = pd.read_csv('all_matches.csv')

# Function to clean and normalize team names
def normalize_team_name(name):
    """Return *name* without surrounding whitespace and in lower case.

    The result is used as the canonical key for a team, so that spellings
    differing only in case or padding refer to the same team.
    """
    trimmed = name.strip()
    return trimmed.lower()

# Step 1: Calculate team points and other stats
def calculate_team_points(match_results):
    """Aggregate per-team statistics from a table of match results.

    Args:
        match_results: DataFrame with the columns ``home_team``,
            ``away_team``, ``home_score`` and ``away_score``, one row per
            match.

    Returns:
        DataFrame indexed by normalized team name with the columns
        ``points`` (3 per win, 1 per draw), ``matches_played``,
        ``goals_scored``, ``goals_conceded``, ``home_avg_goals`` and
        ``away_avg_goals``.

        Despite their names, ``home_avg_goals`` is the team's goals scored
        per match and ``away_avg_goals`` its goals conceded per match, both
        over all of its matches (home and away). The names are kept because
        other code looks the columns up by these keys.

        An input with no usable rows yields an empty DataFrame that still
        has all of the columns above.
    """
    base_columns = ['points', 'matches_played', 'goals_scored', 'goals_conceded']
    team_stats = {}

    # Iterating the four columns in lockstep avoids building a Series per
    # row, which is what iterrows() does.
    fixtures = zip(
        match_results['home_team'],
        match_results['away_team'],
        match_results['home_score'],
        match_results['away_score'],
    )

    for raw_home, raw_away, home_score, away_score in fixtures:
        # Skip incomplete rows. A missing score compares False both ways, so
        # it would otherwise be counted as a draw and turn the goal totals
        # into NaN; a missing name cannot be normalized at all.
        if any(pd.isna(value) for value in (raw_home, raw_away, home_score, away_score)):
            continue

        home_team = normalize_team_name(raw_home)
        away_team = normalize_team_name(raw_away)

        # Register teams on first sight (home first, so the row order of the
        # result follows first appearance in the input).
        for team in (home_team, away_team):
            if team not in team_stats:
                team_stats[team] = dict.fromkeys(base_columns, 0)

        home_entry = team_stats[home_team]
        away_entry = team_stats[away_team]

        # Update matches played
        home_entry['matches_played'] += 1
        away_entry['matches_played'] += 1

        # Update goals scored and conceded
        home_entry['goals_scored'] += home_score
        home_entry['goals_conceded'] += away_score
        away_entry['goals_scored'] += away_score
        away_entry['goals_conceded'] += home_score

        # Update points
        if home_score > away_score:
            home_entry['points'] += 3  # Home team wins
        elif home_score < away_score:
            away_entry['points'] += 3  # Away team wins
        else:
            home_entry['points'] += 1  # Draw
            away_entry['points'] += 1  # Draw

    # reindex() guarantees the expected columns exist even when no team was
    # recorded, so the derived columns below never raise KeyError.
    stats_df = pd.DataFrame.from_dict(team_stats, orient='index').reindex(columns=base_columns)

    # Per-match averages; matches_played is at least 1 for every listed team.
    stats_df['home_avg_goals'] = stats_df['goals_scored'] / stats_df['matches_played']
    stats_df['away_avg_goals'] = stats_df['goals_conceded'] / stats_df['matches_played']
    return stats_df

# Step 2: Prepare features for machine learning
def prepare_features(match_results, team_stats_df):
    """Build the feature matrix and goal targets for the regression models.

    Args:
        match_results: DataFrame with the columns ``home_team``,
            ``away_team``, ``home_score`` and ``away_score``.
        team_stats_df: DataFrame indexed by normalized team name providing
            the columns ``points``, ``home_avg_goals`` and
            ``away_avg_goals``.

    Returns:
        A tuple ``(features, home_goals, away_goals)``. ``features`` is a
        DataFrame with the columns ``home_points``, ``away_points``,
        ``point_diff``, ``home_avg_goals``, ``away_avg_goals`` and
        ``is_home``; the other two are lists holding the home and away score
        of each row, in the same order. Matches that involve a team missing
        from ``team_stats_df``, or that have a missing name or score, are
        left out.
    """
    feature_columns = ['home_points', 'away_points', 'point_diff',
                       'home_avg_goals', 'away_avg_goals', 'is_home']
    features = []
    home_goals = []
    away_goals = []

    fixtures = zip(
        match_results['home_team'],
        match_results['away_team'],
        match_results['home_score'],
        match_results['away_score'],
    )

    for raw_home, raw_away, home_score, away_score in fixtures:
        # Rows without both names and both scores cannot provide a usable
        # training sample (a NaN target makes the model fit fail).
        if any(pd.isna(value) for value in (raw_home, raw_away, home_score, away_score)):
            continue

        home_team = normalize_team_name(raw_home)
        away_team = normalize_team_name(raw_away)

        # Ensure teams are in the stats DataFrame
        if home_team not in team_stats_df.index or away_team not in team_stats_df.index:
            continue

        # Single-step .loc[row, column] lookups instead of chained indexing.
        home_points = team_stats_df.loc[home_team, 'points']
        away_points = team_stats_df.loc[away_team, 'points']
        point_diff = home_points - away_points
        home_avg_goals = team_stats_df.loc[home_team, 'home_avg_goals']
        away_avg_goals = team_stats_df.loc[away_team, 'away_avg_goals']

        # Every sample is described from the home side's perspective, so the
        # flag is always 1. Comparing the normalized name with the raw one
        # made it 0 for any name that was not already lower-case and
        # trimmed, which disagreed with the 1 used at prediction time.
        is_home = 1

        # Append the features and target values
        features.append([home_points, away_points, point_diff,
                         home_avg_goals, away_avg_goals, is_home])
        home_goals.append(home_score)
        away_goals.append(away_score)

    return pd.DataFrame(features, columns=feature_columns), home_goals, away_goals

# Step 3: Train and predict using regression models
def train_and_predict(match_results, team_stats_df, *, test_size=0.2,
                      random_state=42, n_estimators=100):
    """Train one random-forest regressor per side and report test-set MSE.

    Args:
        match_results: DataFrame of matches, passed on to
            ``prepare_features``.
        team_stats_df: Per-team statistics, passed on to
            ``prepare_features``.
        test_size: Fraction of the samples held out for evaluation.
        random_state: Seed used for both the split and the forests.
        n_estimators: Number of trees in each forest.

    Returns:
        A tuple ``(home_model_rf, away_model_rf)`` of fitted
        ``RandomForestRegressor`` instances predicting home and away goals.

    Raises:
        ValueError: If no training samples could be built from the input.

    NOTE(review): when ``team_stats_df`` is computed from the same
    ``match_results``, the features already include the outcome of every
    held-out match, so the printed MSE is presumably optimistic - confirm
    whether that is intended.
    """
    X, y_home, y_away = prepare_features(match_results, team_stats_df)

    if len(X) == 0:
        raise ValueError("No usable matches to train on.")

    # Split the features and both targets in ONE call so that the train/test
    # rows are aligned by construction, instead of relying on two separate
    # splits happening to share the same random_state.
    (X_train, X_test,
     y_home_train, y_home_test,
     y_away_train, y_away_test) = train_test_split(
        X, y_home, y_away, test_size=test_size, random_state=random_state)

    # Train a Random Forest model for home goals and away goals
    home_model_rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    away_model_rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)

    home_model_rf.fit(X_train, y_home_train)
    away_model_rf.fit(X_train, y_away_train)

    # Predictions
    home_rf_predictions = home_model_rf.predict(X_test)
    away_rf_predictions = away_model_rf.predict(X_test)

    # Evaluate the model using Mean Squared Error
    home_rf_mse = mean_squared_error(y_home_test, home_rf_predictions)
    away_rf_mse = mean_squared_error(y_away_test, away_rf_predictions)

    print(f"Home Goals MSE (RF): {home_rf_mse}")
    print(f"Away Goals MSE (RF): {away_rf_mse}")

    return home_model_rf, away_model_rf

# Step 4: Use the trained model to predict a new match outcome
def predict_match(home_team, away_team, team_stats_df, home_model_rf, away_model_rf):
    """Print the predicted score of a single match.

    Args:
        home_team: Name of the home team; case and surrounding whitespace
            are ignored.
        away_team: Name of the away team; case and surrounding whitespace
            are ignored.
        team_stats_df: DataFrame indexed by normalized team name providing
            the columns ``points``, ``home_avg_goals`` and
            ``away_avg_goals``.
        home_model_rf: Fitted model with a ``predict`` method, used for the
            home goals.
        away_model_rf: Fitted model with a ``predict`` method, used for the
            away goals.

    Raises:
        KeyError: If either team is not present in ``team_stats_df``.
    """
    # Normalize the names the same way the statistics index was built, so
    # that e.g. 'Chelsea FC' and 'chelsea fc' address the same row.
    home_key = normalize_team_name(home_team)
    away_key = normalize_team_name(away_team)

    for side, key in (('home', home_key), ('away', away_key)):
        if key not in team_stats_df.index:
            raise KeyError(f"Unknown {side} team: {key!r}")

    home_points = team_stats_df.loc[home_key, 'points']
    away_points = team_stats_df.loc[away_key, 'points']

    # Column names and order must match the frame the models were fitted on.
    new_match = pd.DataFrame({
        'home_points': [home_points],
        'away_points': [away_points],
        'point_diff': [home_points - away_points],
        'home_avg_goals': [team_stats_df.loc[home_key, 'home_avg_goals']],
        'away_avg_goals': [team_stats_df.loc[away_key, 'away_avg_goals']],
        'is_home': [1]  # Assume we're predicting for the home team
    })

    predicted_home_goals = home_model_rf.predict(new_match)[0]
    predicted_away_goals = away_model_rf.predict(new_match)[0]

    # The regressors return fractional goals; report whole numbers.
    print(f"Predicted Home Goals: {round(predicted_home_goals)}")
    print(f"Predicted Away Goals: {round(predicted_away_goals)}")

# Build the per-team statistics table from the loaded match results
team_stats_df = calculate_team_points(match_results)

# Fit the home-goal and away-goal models (this prints their test-set MSE)
home_model_rf, away_model_rf = train_and_predict(match_results, team_stats_df)

# Example: predict one fixture, with team names written the way they appear
# in the statistics index (lower-case, no surrounding whitespace)
home_team, away_team = 'leicester city fc', 'chelsea fc'

predict_match(
    home_team,
    away_team,
    team_stats_df,
    home_model_rf,
    away_model_rf,
)


Home Goals MSE (RF): 1.4882004844961239
Away Goals MSE (RF): 1.7521941645133507
Predicted Home Goals: 1
Predicted Away Goals: 2
