Research - Nguyen Hung Anh (Robert Nguyen)

In [45]:
# Importing various libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
def load_and_preprocess_data(file_path):
    """
    Load the match play-by-play CSV and normalise its description columns.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with 'Home Play' / 'Away Play' renamed to
        'Home - Description' / 'Away - Description', the 'Team Logo' and
        'Team Indicator' columns removed when they are present, and both
        description columns converted to ``str`` with missing values
        replaced by the empty string.

    Raises
    ------
    KeyError
        If a description column is still missing after the rename.
    """
    # Columns that are dropped before analysis. They are optional:
    # errors='ignore' lets files that lack one of them load instead of
    # failing with a KeyError.
    columns_to_drop = ['Team Logo', 'Team Indicator']

    # rename() and drop() return new DataFrames, so the steps are chained
    # rather than applied with inplace=True (which returns None and cannot
    # be chained).
    df = (
        pd.read_csv(file_path)
        .rename(columns={'Home Play': 'Home - Description',
                         'Away Play': 'Away - Description'})
        .drop(columns=columns_to_drop, errors='ignore')
    )

    # The parsers run regular expressions over these columns, so make sure
    # every cell is a string (empty cells are read as NaN by read_csv).
    for column in ('Home - Description', 'Away - Description'):
        df[column] = df[column].fillna('').astype(str)
    return df

In [47]:
import re

def parse_goalkeeper(row):
    """Extract the text that follows "at goalie for" in a row's descriptions.

    The Home description is checked first, then the Away description. Each
    description is searched on its own so that a match can never run on
    into the other team's text.

    Parameters
    ----------
    row : mapping
        Must provide 'Home - Description' and 'Away - Description'.
        Values that are not strings (e.g. NaN) are skipped.

    Returns
    -------
    str or None
        The captured text with surrounding whitespace and the sentence's
        closing period removed, or None when neither description contains
        "at goalie for".
        NOTE(review): the captured text is presumably the team name, yet
        the caller stores it in a 'Goalkeeper' column -- confirm against
        the data which of the two is intended.
    """
    for team in ['Home', 'Away']:
        description = row[f'{team} - Description']
        if not isinstance(description, str):
            # Missing values arrive as float NaN; there is nothing to parse.
            continue
        # Stop at the first period that ends a sentence (followed by
        # whitespace or the end of the text), otherwise at the end of text.
        match = re.search(r'at goalie for (.+?)(?:\.(?:\s|$)|$)', description)
        if match:
            return match.group(1).strip()
    return None

In [48]:
def parse_corner_kicks(row):
    """Find the first corner-kick entry in a row's Home/Away descriptions.

    The Home description is searched before the Away description. The
    pattern splits the text between "Corner kick by " and " [" at its first
    space: the part before that space is returned as the player and the
    remainder as the team name.

    NOTE(review): parse_shots treats the first token after "Shot by" as the
    team, which is the opposite order -- confirm against the data which
    order the descriptions actually use.

    Returns
    -------
    tuple
        ``(team_name, player)``, or ``(None, None)`` when no description
        contains a corner kick.
    """
    corner_pattern = r'Corner kick by (.+?) (.+?) \['
    for side in ('Home', 'Away'):
        found = re.search(corner_pattern, str(row[f'{side} - Description']))
        if found is None:
            continue
        leading_token, remainder = (part.strip() for part in found.groups())
        return remainder, leading_token
    return None, None

In [49]:
def parse_shots(row):
    """Find the first shot entry in a row's Home/Away descriptions.

    The Home description is searched before the Away description. The text
    after "Shot by " is split at its first space into team and player; the
    player runs up to the first following comma, and everything after that
    comma is the result (e.g. "Wide", "On target").

    Returns
    -------
    tuple
        ``(team_name, player, result)``, or ``(None, None, None)`` when no
        description contains a shot.
    """
    shot_pattern = r'Shot by (.+?) (.+?), (.+)'
    for side in ('Home', 'Away'):
        found = re.search(shot_pattern, str(row[f'{side} - Description']))
        if found is not None:
            # Groups are already in (team, player, result) order.
            return tuple(part.strip() for part in found.groups())
    return None, None, None

In [50]:
def parse_substitutions(row):
    """Find the first substitution entry in a row's Home/Away descriptions.

    The Home description is searched before the Away description. The
    pattern expects three two-word names:
    "<team> substitution: <player in> for <player out>".

    Returns
    -------
    tuple
        ``(team_name, in_player, out_player)``, or ``(None, None, None)``
        when no description matches.
    """
    substitution_pattern = r'(\w+ \w+) substitution: (\w+ \w+) for (\w+ \w+)'
    descriptions = (str(row[f'{side} - Description']) for side in ('Home', 'Away'))
    for text in descriptions:
        found = re.search(substitution_pattern, text)
        if found is None:
            continue
        squad, coming_on, going_off = (part.strip() for part in found.groups())
        return squad, coming_on, going_off
    return None, None, None

In [51]:
def parse_goals(row):
    """Find the first goal entry in a row's Home/Away descriptions.

    The Home description is searched before the Away description. For the
    first description that contains "GOAL by ...":

    * the team is reported as the side searched ('Home' or 'Away'), not a
      name taken from the text;
    * the scorer is the text between "GOAL by " and the next space;
    * the assister is the text between "Assist by " and the next comma in
      the same description, or None when there is no such text;
    * the first-goal flag is True when the description contains
      "FIRST GOAL".

    NOTE(review): the pattern's second capture group is never used -- confirm
    that the first token really is the scorer for this data format.

    Returns
    -------
    tuple
        ``(team_name, scorer, assister, first_goal)``, or
        ``(None, None, None, False)`` when no description contains a goal.
    """
    for side in ('Home', 'Away'):
        text = str(row[f'{side} - Description'])
        goal = re.search(r'GOAL by (.+?) (.+?)(?:\s*\(FIRST GOAL\)|\(|$)', text)
        if goal is None:
            continue
        assist = re.search(r'Assist by (.+?),', text)
        assister = None if assist is None else assist.group(1).strip()
        return side, goal.group(1).strip(), assister, 'FIRST GOAL' in text
    return None, None, None, False

In [52]:
def analyze_match_data(df):
    """Run every description parser over each row and store the results.

    Thirteen result columns are added to ``df`` (modified in place): the
    twelve text columns start as '' and 'First Goal' starts as False. Each
    row is then passed to parse_goalkeeper, parse_corner_kicks, parse_shots,
    parse_substitutions and parse_goals, in that order, and the returned
    values (which may be None) overwrite the row's cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Home - Description' and 'Away - Description'
        columns the parsers read.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Each parser paired with the columns that receive its return values,
    # in the order the values are returned.
    tuple_parsers = (
        (parse_corner_kicks, ('Corner Kick Team', 'Corner Kick Player')),
        (parse_shots, ('Shot Team', 'Shot Player', 'Shot Result')),
        (parse_substitutions, ('Substitution Team', 'Substitution In', 'Substitution Out')),
        (parse_goals, ('Goal Team', 'Goal Scorer', 'Goal Assister', 'First Goal')),
    )

    # Create the result columns up front, in a fixed order.
    df['Goalkeeper'] = ''
    for _, target_columns in tuple_parsers:
        for column in target_columns:
            if column != 'First Goal':
                df[column] = ''
    df['First Goal'] = False

    for label, record in df.iterrows():
        df.at[label, 'Goalkeeper'] = parse_goalkeeper(record)
        for parser, target_columns in tuple_parsers:
            for column, value in zip(target_columns, parser(record)):
                df.at[label, column] = value

    return df

In [53]:
def generate_player_statistics(df):
    """Count corner kicks, shots, goals and assists per player.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed match data containing the columns 'Corner Kick Team',
        'Corner Kick Player', 'Shot Team', 'Shot Player', 'Goal Team',
        'Goal Scorer' and 'Goal Assister'. Rows whose player cell is
        missing are ignored for that statistic.

    Returns
    -------
    pandas.DataFrame
        One row per player name with the columns 'Player', 'Team',
        'Corner Kicks', 'Shots', 'Goals' and 'Assists' -- always these six,
        in this order, even when a statistic never occurs or there are no
        events at all. Counts are integers; a statistic a player never
        recorded is 0.

    Notes
    -----
    Players are keyed by name only, and 'Team' is taken from the first
    statistic in which the name appears. If the same name is grouped under
    more than one team for one statistic, the last group's count wins.
    """
    # (team column, player column, name of the resulting count column)
    stat_sources = [
        ('Corner Kick Team', 'Corner Kick Player', 'Corner Kicks'),
        ('Shot Team', 'Shot Player', 'Shots'),
        ('Goal Team', 'Goal Scorer', 'Goals'),
        ('Goal Team', 'Goal Assister', 'Assists'),
    ]
    stat_columns = [stat_name for _, _, stat_name in stat_sources]

    player_stats = {}
    for team_column, player_column, stat_name in stat_sources:
        events = df[df[player_column].notna()]
        counts = events.groupby([team_column, player_column]).size()
        for (team, player), count in counts.items():
            entry = player_stats.setdefault(player, {'Team': team})
            entry[stat_name] = int(count)

    # Name the columns explicitly instead of relabelling whatever columns
    # happen to exist: a statistic nobody recorded would otherwise be
    # missing (length-mismatch ValueError) or the labels could end up on
    # the wrong columns.
    records = [{'Player': player, **stats} for player, stats in player_stats.items()]
    player_stats_df = pd.DataFrame(records, columns=['Player', 'Team'] + stat_columns)
    for column in stat_columns:
        player_stats_df[column] = player_stats_df[column].astype(float).fillna(0).astype(int)

    return player_stats_df

In [54]:
def _fill_score_column(df, column):
    """Fill the known gaps of one running-score column of ``df`` in place.

    * Every row after the first occurrence of the column's maximum is set to
      that maximum (the score cannot change once the final value is reached).
    * Missing values up to and including the first row with a score above 0
      are replaced by 0 (nothing had been scored yet).

    Rows are addressed by position, so the frame does not need a 0..n-1
    integer index. A column with no values, or one that never exceeds 0,
    simply skips the step that does not apply.
    """
    scores = df[column].reset_index(drop=True)
    column_position = df.columns.get_loc(column)

    if scores.notna().any():
        peak_position = int(scores.idxmax())
        df.iloc[peak_position + 1:, column_position] = scores.iloc[peak_position]

    scored = scores.gt(0)  # NaN compares as False
    if scored.any():
        first_goal_position = int(scored.idxmax())  # first True
        head = scores.iloc[:first_goal_position + 1].fillna(0)
        df.iloc[:first_goal_position + 1, column_position] = head.to_numpy()


def analyze_goals(df):
    """
    Analyze goal-related statistics.

    Fills gaps in the 'Home Team Score' and 'Visiting Team Score' columns of
    ``df`` in place (see ``_fill_score_column``), then prints which team's
    description first mentions a goal, or "No goal was scored.".

    Each score column is processed independently: a team that never scored
    no longer causes the other team's column to be overwritten with zeros,
    and a scoreless visiting team no longer raises IndexError.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Home Team Score', 'Visiting Team Score',
        'Home - Description', 'Away - Description' and 'Clock'.

    Returns
    -------
    None
    """
    for column in ("Home Team Score", "Visiting Team Score"):
        _fill_score_column(df, column)

    home_mentions_goal = df['Home - Description'].str.contains('GOAL|own goal', na=False)
    away_mentions_goal = df['Away - Description'].str.contains('GOAL|own goal', na=False)

    # NOTE(review): min() compares the Clock values as they are stored; if
    # they are strings such as "MM:SS" the comparison is lexicographic --
    # confirm that this matches chronological order for the data.
    first_goal_minute = df.loc[home_mentions_goal | away_mentions_goal, 'Clock'].min()
    at_first_goal = df['Clock'] == first_goal_minute

    if (at_first_goal & home_mentions_goal).any():
        print(f"Home team scored first in minute {first_goal_minute}")
    elif (at_first_goal & away_mentions_goal).any():
        print(f"Away team scored first in minute {first_goal_minute}")
    else:
        print("No goal was scored.")

In [55]:
def count_events(df, event, home_team, away_team):
    """
    Count the rows whose description mentions an event, for each team.

    ``event`` is matched case-insensitively with ``Series.str.contains``
    against 'Home - Description' and 'Away - Description'. A row counts once
    however many times the text occurs in it. Both totals are printed on one
    line and returned as ``(home_count, away_count)``.
    """
    description_columns = ('Home - Description', 'Away - Description')
    home_count, away_count = [
        df[column].str.contains(event, case=False).sum()
        for column in description_columns
    ]

    home_line = f"{home_team} had {home_count} {event}s."
    away_line = f"{away_team} had {away_count} {event}s."
    print(home_line, end=' | ')
    print(away_line)

    return home_count, away_count

In [56]:
def analyze_statistics(df, home_team, away_team):
    """
    Count a fixed set of match events for both teams and compare them.

    For each statistic the search keyword is passed to ``count_events``
    (which prints the two totals), and a sentence saying which team had
    more, and by how many, is printed afterwards.

    Returns
    -------
    dict
        ``{statistic: {'<home_team> Count': n, '<away_team> Count': m}}``,
        in the order the statistics are defined below.
    """
    # Statistic label -> text searched for in the descriptions.
    keyword_by_stat = {
        'Corner kicks': 'Corner kick',
        'Substitutions': 'substitution',
        'Shots': 'Shot by',
        'Fouls': 'foul',
        'Yellow cards': 'Yellow card',
        'Red cards': 'Red card',
        'Offsides': 'Offside',
    }

    results = {}
    for label, keyword in keyword_by_stat.items():
        n_home, n_away = count_events(df, keyword, home_team, away_team)
        results[label] = {
            f'{home_team} Count': n_home,
            f'{away_team} Count': n_away,
        }

        lowered = label.lower()
        if n_home == n_away:
            print(f"{home_team} and {away_team} had the same number of {lowered}: {n_home}.", end='\n\n')
        elif n_home > n_away:
            print(f"{home_team} had more {lowered} than {away_team} by {n_home - n_away}.", end='\n\n')
        else:
            print(f"{away_team} had more {lowered} than {home_team} by {n_away - n_home}.", end='\n\n')

    return results

In [57]:
def create_summary_dataframe(results, home_team, away_team):
    """
    Turn the per-statistic counts into a summary table.

    ``results`` maps each statistic name to a dict holding
    '<home_team> Count' and '<away_team> Count'. The returned DataFrame has
    one row per statistic with the columns 'Statistics', the two count
    columns, and '<home_team> Percentage' / '<away_team> Percentage' giving
    each team's share of the combined count (times 100).
    """
    home_key = f'{home_team} Count'
    away_key = f'{away_team} Count'

    summary = pd.DataFrame({
        'Statistics': list(results.keys()),
        home_key: [counts[home_key] for counts in results.values()],
        away_key: [counts[away_key] for counts in results.values()],
    })

    combined = summary[home_key] + summary[away_key]
    for team in (home_team, away_team):
        summary[f'{team} Percentage'] = summary[f'{team} Count'] / combined * 100

    return summary

In [58]:
def plot_statistics(df, home_team, away_team):
    """
    Plot one bar chart per statistic comparing the two teams' counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table with a 'Statistics' column and one '<team> Count'
        column per team; any other columns (e.g. '<team> Percentage') are
        ignored.
    home_team, away_team : str
        Currently unused; the bars are taken from every column whose name
        ends in ' Count'.
    """
    sns.set_style("whitegrid")

    # Loop through each statistic
    for stat in df['Statistics'].unique():
        plt.figure(figsize=(8, 6))

        # Reshape the statistic's row(s) to long form: one row per column.
        df_stat = df[df['Statistics'] == stat].melt(id_vars='Statistics', var_name='Team', value_name='Count')

        # Keep only the "<team> Count" rows. Matching the suffix instead of
        # any occurrence of "Count" keeps "<team> Percentage" rows out even
        # when the team name itself contains "Count" (e.g. "County").
        df_stat = df_stat[df_stat['Team'].str.endswith(' Count')]

        # Plot the statistics for each team
        sns.barplot(x='Team', y='Count', data=df_stat, hue='Team', palette='muted', legend=False)
        plt.title(stat)
        plt.ylabel('Count')
        plt.show()

In [59]:
def main(file_path, parsed_output='parsed_match_data.csv', stats_output='player_statistics.csv'):
    """Run the full analysis for one match file.

    Loads and preprocesses the CSV, parses every row's descriptions, builds
    the per-player statistics, prints them, and writes both tables to CSV.

    Parameters
    ----------
    file_path : str
        Path of the match CSV to analyse.
    parsed_output : str, optional
        Where the row-by-row parsed data is written. Defaults to
        'parsed_match_data.csv' in the current working directory.
    stats_output : str, optional
        Where the per-player statistics are written. Defaults to
        'player_statistics.csv' in the current working directory.
    """
    df = load_and_preprocess_data(file_path)
    df = analyze_match_data(df)
    player_stats = generate_player_statistics(df)

    print("Player Statistics:")
    print(player_stats.to_string(index=False))

    # Save the results
    df.to_csv(parsed_output, index=False)
    player_stats.to_csv(stats_output, index=False)
    print(f"\nDetailed match data saved to '{parsed_output}'")
    print(f"Player statistics saved to '{stats_output}'")

# Entry point: when this code runs as the main module, analyse the match
# file named below. `file_path` is left as a module-level name.
if __name__ == "__main__":
    file_path = 'Centre_v_Emory.csv'  # Replace with your file path
    main(file_path)

ValueError: too many values to unpack (expected 2)