# Data Processing

Here, we build the pipeline to perform the necessary feature transformations prior to model fitting. The necessary changes are listed below. 

* Removing observations where 'Minutes Played' == 0
* Include 'Venue' as important feature
* Include 'Minutes Played' as important feature
* One-hot encode 'Position', and then group the individual positions into 'Defenders', 'Midfielders' and 'Attackers'
* Create 'Season' feature from 'kickoff_time', as this feature is needed to calculate 'Designated Penalty Taker'
* Use 'Penalties Attempted' to calculate 'Designated Penalty Taker'
* Include 'Shots on Target' as important feature 
* Include 'npxG' as important feature
* Include 'Penalty Area Touches' as important feature
* Compute 'Rolling xG' which will replace 'npxG', then drop 'npxG'
* Calculate 'Team Rolling xG Matchup': first use team data to calculate 'Team Rolling xG' and 'Team Rolling xGA'. From these we can calculate 'Team Rolling xG Difference', which in turn allows us to calculate 'Team Rolling xG Matchup'
* Calculate 'Rolling Shots on Target' to replace 'Shots on Target', then drop 'Shots on Target'
* Calculate 'Rolling Penalty Area Touches' to replace 'Penalty Area Touches', then drop 'Penalty Area Touches'

This should leave the final dataframe with the following features - Venue, Designated Penalty Taker, Rolling Shots on Target, Rolling xG, Rolling Penalty Area Touches, Team Rolling xG Matchup, Defenders, Midfielders, Attackers


In [3]:
#import necessary packages
import pandas as pd 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline, FunctionTransformer


In [4]:
#load the source data from CSV; index_col = 0 makes the first CSV column the DataFrame index
att_train = pd.read_csv('att_explore_original.csv', index_col = 0)

In [39]:
#keep only the columns that the later pipeline steps read
def select_columns(dataframe):
    """Return a copy of ``dataframe`` restricted to the pipeline's input columns.

    The columns are returned in a fixed order. A ``KeyError`` is raised if any
    of them is missing from ``dataframe``.
    """
    identifier_cols = ['Player ID', 'Team']
    match_cols = ['Venue', 'Minutes Played', 'Position', 'kickoff_time']
    stat_cols = ['Penalties Attempted', 'Shots on Target', 'npxG', 'Penalty Area Touches']

    wanted = identifier_cols + match_cols + stat_cols
    subset = dataframe.loc[:, wanted]

    #copy so that later steps never write into the caller's frame
    return subset.copy()

#transformer that removes rows whose 'Position' is the string '0'
#(presumably the source's marker for "no position recorded" - confirm against the data)
class DropEmptyPositions(BaseEstimator, TransformerMixin):
    """Stateless transformer that filters out rows where 'Position' equals '0'."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` whose 'Position' value is not the string '0'."""
        has_position = X['Position'].ne('0')
        return X.loc[has_position]

#transformer for one-hot encoding 'Position' and collapsing it into position categories
class PositionEncoder(BaseEstimator, TransformerMixin):
    """Add 'Defender', 'Midfielder' and 'Attacker' indicator columns to the frame.

    'Position' is expected to hold comma-separated position codes (e.g. 'FW,AM').
    A row gets a 1 in a category when at least one of its codes belongs to that
    category, so a row can belong to several categories at once.
    """

    #position codes that make up each category, in the order the columns are added
    _POSITION_GROUPS = {
        'Defender': ['RB', 'LB', 'CB'],
        'Midfielder': ['DM', 'CM', 'LM', 'RM', 'AM'],
        'Attacker': ['LW', 'RW', 'FW'],
    }

    #codes whose one-hot column is removed without contributing to any category
    _UNGROUPED_CODES = ['WB']

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` (with a fresh 0..n-1 index) plus the category indicator columns.

        One-hot columns for codes that are neither grouped nor listed in
        ``_UNGROUPED_CODES`` are left in the output unchanged.
        """
        #one hot encode positions
        positions_encode = X['Position'].str.get_dummies(sep=',')

        #create new position categories; reindex supplies an all-zero column for any
        #code that does not occur in this data, instead of raising a KeyError
        raw_codes = list(self._UNGROUPED_CODES)
        for category, codes in self._POSITION_GROUPS.items():
            members = positions_encode.reindex(columns=codes, fill_value=0)
            positions_encode[category] = members.any(axis=1).astype(int)
            raw_codes.extend(codes)

        #drop original position columns (some may be absent, hence errors='ignore')
        positions_encode = positions_encode.drop(columns=raw_codes, errors='ignore')

        #both indexes are reset so the frames are joined row-by-row by position
        return pd.concat([X.reset_index(drop=True), positions_encode.reset_index(drop=True)], axis=1)

#transformer for determining the season
class SeasonDeterminer(BaseEstimator, TransformerMixin):
    """Add a 'Season' column (e.g. '2022-2023') derived from 'kickoff_time'.

    A season is taken to run from August of one year to July of the next.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    @staticmethod
    def _season_label(kickoff_time):
        """Return the '<start year>-<end year>' label for a single kickoff timestamp."""
        if kickoff_time.month >= 8:  # August to December: season starts this year
            start_year = kickoff_time.year
        else:  # January to July: season started the previous year
            start_year = kickoff_time.year - 1
        return f'{start_year}-{start_year + 1}'

    def transform(self, X):
        """Return a copy of ``X`` with 'kickoff_time' as datetime and a new 'Season' column.

        The input frame is left untouched.
        """
        #work on a copy so the caller's frame is never modified (this also avoids
        #SettingWithCopyWarning when X is a filtered slice of another frame)
        X = X.copy()

        #convert 'kickoff_time' to datetime if not already done
        X['kickoff_time'] = pd.to_datetime(X['kickoff_time'])

        #label every row with its season
        X['Season'] = X['kickoff_time'].apply(self._season_label)
        return X
    
    
#transformer for calculating 'Designated Penalty Taker'
class DesigPenTaker(BaseEstimator, TransformerMixin):
    """Add a 0/1 'Designated Penalty Taker' column.

    For every player, the penalties they attempted are divided by the total
    penalties attempted by their team in the team/season combinations in which
    the player attempted at least one penalty. Players whose share exceeds 50%
    are flagged, and the flag is applied to all of that player's rows.

    Reads the columns 'Player ID', 'Team', 'Season' and 'Penalties Attempted'.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the 'Designated Penalty Taker' column added.

        The input frame is left untouched. If no row has a positive
        'Penalties Attempted', every row is flagged 0.
        """
        #work on a copy so the caller's frame is never modified
        X = X.copy()

        #penalties attempted by each team in each season
        team_totals = (
            X.groupby(['Season', 'Team'], as_index=False)['Penalties Attempted']
            .sum()
            .rename(columns={'Penalties Attempted': 'Team Penalties'})
        )

        #only observations with at least 1 penalty taken matter for the proportions
        took_penalty = X.loc[
            X['Penalties Attempted'] > 0,
            ['Team', 'Season', 'Player ID', 'Penalties Attempted'],
        ]

        designated = []
        if not took_penalty.empty:
            #penalties each player took for each team in each season, next to that
            #team's total for the same season
            player_season = took_penalty.groupby(
                ['Team', 'Season', 'Player ID'], as_index=False
            )['Penalties Attempted'].sum()
            player_season = player_season.merge(team_totals, on=['Team', 'Season'], how='left')

            #one row per player: both counts are summed over that player's
            #team/season rows before the proportion is taken
            per_player = player_season.groupby('Player ID')[
                ['Penalties Attempted', 'Team Penalties']
            ].sum()
            proportion = per_player['Penalties Attempted'] / per_player['Team Penalties']

            #players that took more than 50% of their team's penalties
            #(this includes those that took 100%)
            designated = proportion.index[proportion > 0.5].tolist()

        #construct 'Designated Penalty Taker' feature
        X['Designated Penalty Taker'] = X['Player ID'].isin(designated).astype(int)

        return X


#transformer for calculating 'Rolling xG'
class RollingxG(BaseEstimator, TransformerMixin):
    """Add 'Rolling xG': a player's mean 'npxG' over their earlier games in the same season.

    The current game is excluded from its own value, so the first game of each
    player/season combination gets NaN.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    @staticmethod
    def _prior_mean(npxg):
        """Return, per row, the cumulative 'npxG' of the preceding rows divided by their count."""
        #0, 1, 2, ... = number of games that precede each row within the group
        games_before = pd.Series(range(len(npxg)), index=npxg.index)

        #shifting by one excludes the current game; the first row is NaN / 0 = NaN
        return npxg.cumsum().shift(1) / games_before

    def transform(self, X):
        """Return ``X`` sorted by 'Player ID' and 'kickoff_time' with 'Rolling xG' added.

        The result has a fresh 0..n-1 index and the input frame is left untouched.
        """
        #sort chronologically within each player; sort_values returns a new frame,
        #so the caller's frame keeps its order and index
        X = X.sort_values(by=['Player ID', 'kickoff_time']).reset_index(drop=True)

        #transform aligns the result to X by index, so values stay on the right rows
        #even with a single group or with rows that have missing group keys
        X['Rolling xG'] = X.groupby(['Player ID', 'Season'])['npxG'].transform(self._prior_mean)
        return X


#wrap the column selection function so it can be used as a pipeline step
select_transformer = FunctionTransformer(select_columns)

#assemble the pipeline; 'determine_season' has to run before 'desig_pen_taker' and
#'rolling_xg' because both of those steps read the 'Season' column
pipe = Pipeline(
    steps=[
        ("select", select_transformer),
        ("drop_empty_pos", DropEmptyPositions()),
        ("encode_positions", PositionEncoder()),
        ("determine_season", SeasonDeterminer()),
        ("desig_pen_taker", DesigPenTaker()),
        ("rolling_xg", RollingxG()),
    ]
)

#run the full pipeline on the loaded DataFrame
test_df = pipe.fit_transform(att_train)

#show the first rows of the result
test_df.head()

  X['Rolling xG'] = X.groupby(['Player ID', 'Season']).apply(calculate_rolling_xg).reset_index(drop = True)


Unnamed: 0,Player ID,Team,Venue,Minutes Played,Position,kickoff_time,Penalties Attempted,Shots on Target,npxG,Penalty Area Touches,Defender,Midfielder,Attacker,Season,Designated Penalty Taker,Rolling xG
0,1,Brighton,Home,23,AM,2022-02-19 15:00:00+00:00,0,0,0.0,1,0,1,0,2021-2022,0,
1,1,Brighton,Away,8,FW,2022-12-26 15:00:00+00:00,0,0,0.0,0,0,0,1,2022-2023,0,
2,1,Brighton,Home,31,FW,2022-12-31 17:30:00+00:00,0,2,0.3,3,0,0,1,2022-2023,0,0.0
3,1,Brighton,Home,65,FW,2023-01-14 15:00:00+00:00,0,1,0.1,4,0,0,1,2022-2023,0,0.15
4,1,Brighton,Away,25,FW,2023-01-21 15:00:00+00:00,0,1,0.1,2,0,0,1,2022-2023,0,0.133333
