# 02 - Feature Engineering: Player-Level Stats
This notebook processes World Cup 2018 event data to compute per-player, per-match features for modeling.
Features include xG, pass accuracy, pressures, and minutes played.

In [3]:
import pandas as pd
from pathlib import Path

In [4]:
# Load event data
data_dir = Path("../data/raw")
events_df = pd.read_csv(data_dir / "events_worldcup_2018.csv")
events_df = events_df[~events_df['type.name'].isin(['Starting XI', 'Half Start'])]
events_df['player.name'] = events_df['player.name'].fillna('Unknown')
events_df['team.name'] = events_df['team.name'].fillna('Unknown')
events_df = events_df.dropna(subset=['player.name', 'team.name'])

  events_df = pd.read_csv(data_dir / "events_worldcup_2018.csv")


In [5]:
# Feature: Total xG and Shot Count
shots = events_df[events_df['type.name'] == 'Shot']
xg_stats = shots.groupby(['match_id', 'player.name'])['shot.statsbomb_xg'].agg(['sum', 'count']).reset_index()
xg_stats.columns = ['match_id', 'player.name', 'total_xg', 'num_shots']

In [6]:
# Feature: Passes and Completion
passes = events_df[events_df['type.name'] == 'Pass']
passes['complete'] = passes['pass.outcome.name'].isna().astype(int)
pass_stats = passes.groupby(['match_id', 'player.name'])['complete'].agg(['count', 'sum']).reset_index()
pass_stats.columns = ['match_id', 'player.name', 'total_passes', 'completed_passes']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes['complete'] = passes['pass.outcome.name'].isna().astype(int)


In [7]:
# Feature: Pressures
pressures = events_df[events_df['type.name'] == 'Pressure']
pressure_stats = pressures.groupby(['match_id', 'player.name']).size().reset_index(name='num_pressures')

In [8]:
# Combine all features
features_df = xg_stats.merge(pass_stats, on=['match_id', 'player.name'], how='outer')
features_df = features_df.merge(pressure_stats, on=['match_id', 'player.name'], how='outer')
features_df = features_df.fillna(0)
features_df['pass_accuracy'] = features_df['completed_passes'] / features_df['total_passes'].replace(0, 1)

In [9]:
# Save engineered features
features_df.to_csv(data_dir / "features_player_match_worldcup_2018.csv", index=False)
features_df.head()

Unnamed: 0,match_id,player.name,total_xg,num_shots,total_passes,completed_passes,num_pressures,pass_accuracy
0,7525,Abdullah Ibrahim Al Maiouf,0.0,0.0,35.0,15.0,0.0,0.428571
1,7525,Abdullah Ibrahim Otayf,0.0,0.0,54.0,47.0,14.0,0.87037
2,7525,Alan Dzagoev,0.075106,1.0,10.0,8.0,8.0,0.8
3,7525,Aleksandr Golovin,0.074455,1.0,36.0,24.0,33.0,0.666667
4,7525,Aleksandr Samedov,0.080169,2.0,19.0,16.0,18.0,0.842105
