# Intuition
Evaluating Player Vectors is challenging as no objective ground truth exists for characterizing playing style.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import csv
import matplotlib.pyplot as plt
import warnings
from sklearn.decomposition import NMF

from src.playervectors import PlayerVectors

from src.ExtractData import (
    ExtractCoordinates, 
    ExtractPlayers, 
    ExtractTeams, 
    ExtractMinutesPlayed, 
    ExtractGoalkeepers 
)

## Load: Players

In [2]:
df_players = pd.read_csv('event_streams/players.csv')
df_playerank = pd.read_csv('event_streams/playerank.csv')

# Player lookup
# Mapping {player_id -> [firstName, lastName, currentTeamId]}
players = ExtractPlayers(
    df=df_players,
    wy_id='wyId',
    attributes=['firstName', 'lastName', 'currentTeamId'],
)

# Goalkeeper lookup
# Mapping player_id -> None
keepers = ExtractGoalkeepers(df_players, wy_code='name', keeper_str='Goalkeeper')

# Mapping {matchId -> [row labels of that match in df_playerank]}
minutes = (
    df_playerank
    .groupby('matchId')
    .apply(lambda match_rows: match_rows.index.tolist(), include_groups=False)
    .to_dict()
)

# One ExtractMinutesPlayed result per match, in the iteration order of `minutes`
minutes_played = [
    ExtractMinutesPlayed(df=df_playerank.loc[row_labels],
                         column_player='playerId',
                         column_minutes='minutesPlayed')
    for row_labels in minutes.values()
]

## Load: Event Streams

In [3]:
df_events = pd.read_csv('event_streams/actions.csv')

# Normalize playing direction: every start coordinate c is replaced by 100 - c
# NOTE(review): the sample pass coordinates printed later in this notebook look like
# 100 minus multiples of 1.05 (x) and 0.68 (y), which hints at a 105 x 68 pitch
# rather than a 0-100 scale -- confirm that 100 is the right mirror constant.
for axis_column in ('start_x', 'start_y'):
    df_events[axis_column] = 100 - df_events[axis_column]

# Collect the start coordinates of the selected action types
# {action -> {playerID -> ([list of x coordinates], [list of y coordinates])}}
coordinates = ExtractCoordinates(
    df=df_events,
    column_player_id='player_id',
    column_event_name='type_name',
    column_x='start_x',
    column_y='start_y',
    actions=['pass', 'cross', 'dribble', 'shot'],
)

# Fine tuning: drop every goalkeeper from each action's player mapping
for player_coordinates in coordinates.values():
    # Collect the ids first so the dict is not resized while it is being iterated
    keeper_ids = [player_id for player_id in player_coordinates if player_id in keepers]
    for keeper_id in keeper_ids:
        del player_coordinates[keeper_id]

# Mapping game_id to list of all player_ids in that game
game_player_mapping = (
    df_events
    .groupby('game_id')['player_id']
    .unique()
    .apply(list)
    .to_dict()
)

#### Select Relevant Actions with respect to their x,y coordinates

##### How does ExtractCoordinates work?

```python
>>> df
       playerID action   x   y
    0         1   pass  55  45
    1         1   pass  50  40
    2         1   shot  53  43
    3         1   shot  54  44
    4         2   pass  43  33
    5         2  cross  43  23
    6         2  cross  42  32
    7         3   pass  17  57
    8         3   pass  10  50
    9         3   pass  15  55
    
    >>> action_coordinates = ExtractCoordinates(df=df,
    ...                                         column_player_id='playerID',
    ...                                         column_event_name='action',
    ...                                         column_x='x',
    ...                                         column_y='y',
    ...                                         actions=['pass', 'shot', 'cross'])
    >>> action_coordinates
    {'pass': {1: ([55, 50], [45, 40]), 2: ([43], [33]), 3: ([17, 10, 15], [57, 50, 55])},
     'shot': {1: ([53, 54], [43, 44])},
     'cross': {2: ([43, 42], [23, 32])}}
```

## Load: Teams

In [4]:
df_teams = pd.read_csv('event_streams/teams.csv')

# Team lookup
# Mapping {teamID -> [selected attributes, here only 'name']}
teams = ExtractTeams(df=df_teams, wy_id='wyId', attributes=['name'])

# Mapping {teamID -> [list of playerIDs]}; team ids are stored as strings
teamID_to_playerIDs = {}

# Mapping {playerID -> Player Name}
playersID_to_name = {}

for playerID, attributes in players.items():
    # Attribute order follows the list passed to ExtractPlayers:
    # firstName, lastName, currentTeamId
    first = attributes[0]
    last = str(attributes[1])
    teamID = str(attributes[2])

    # Start a new roster on first sight of a team, then add the player
    teamID_to_playerIDs.setdefault(teamID, []).append(playerID)

    # Keep the first name registered for a player id
    playersID_to_name.setdefault(playerID, f'{first} {last}')

## Create player vectors for every game

In [5]:
# Distinct game ids found in the event stream
game_ids = pd.unique(df_events['game_id'])
print(f'Number of Games in Dataset: {len(game_ids)}')

Number of Games in Dataset: 1941


In [6]:
# Group action indices by game id in a list: {game_id -> [row labels of its actions]}
# include_groups=False matches the playerank grouping earlier in this notebook and
# avoids the pandas deprecation warning about apply operating on the grouping column.
games = df_events.groupby('game_id').apply(lambda x: x.index.tolist(), include_groups=False).to_dict()

  games = df_events.groupby('game_id').apply(lambda x: x.index.tolist()).to_dict()


In [7]:
# Get unique game_ids in the order they appear in the DataFrame
game_ids_in_order = df_events['game_id'].unique()

# Mapping {game_id -> DataFrame with that game's events} (a dict, despite the name).
# A single groupby pass replaces one full boolean scan of df_events per game;
# sort=False keeps the groups in order of first appearance, i.e. the same
# order as game_ids_in_order.
games_df_list = {
    game_id: game_events
    for game_id, game_events in df_events.groupby('game_id', sort=False)
}

In [8]:
# Extract the action coordinates of one example game; work on a copy of its events
sample_game_events = games_df_list[2500089].copy()
coords = ExtractCoordinates(
    df=sample_game_events,
    column_player_id='player_id',
    column_event_name='type_name',
    column_x='start_x',
    column_y='start_y',
    actions=['pass', 'cross', 'dribble', 'shot'],
)

In [9]:
# Player ids with pass entries in the sample game, then one player's (x list, y list)
passes_by_player = coords['pass']
print(passes_by_player.keys())
print(passes_by_player[9637])

dict_keys([93, 7989, 8125, 8284, 8296, 8351, 8433, 8643, 8726, 8925, 8980, 9127, 9179, 9206, 9277, 9285, 9433, 9637, 9739, 10108, 11669, 12242, 62224, 134102, 239411, 245813, 259531, 532949])
([47.5, 64.3, 42.25, 53.8, 72.7, 63.25, 80.05, 40.15, 59.05, 96.85, 47.5, 59.05, 56.95, 60.099999999999994], [66.0, 80.96000000000001, 68.03999999999999, 57.16, 82.32, 73.47999999999999, 70.76, 93.88, 92.52000000000001, 83.0, 89.11999999999999, 93.2, 95.92, 92.52000000000001])


In [10]:
# Per-game action coordinates, appended in the iteration order of games_df_list
# NOTE(review): unlike the global `coordinates` mapping, goalkeepers are not
# removed from these per-game mappings -- confirm that this is intended.
game_coordinates = []
for game_events in games_df_list.values():
    # Hand ExtractCoordinates a copy so the stored per-game frame stays untouched
    coords = ExtractCoordinates(df=game_events.copy(),
                                column_player_id='player_id',
                                column_event_name='type_name',
                                column_x='start_x',
                                column_y='start_y',
                                actions=['pass', 'cross', 'dribble', 'shot'])
    game_coordinates.append(coords)

In [11]:
# Check if games are missing
print(f'Number of Games: {len(games_df_list)}\t(via games_df_list)')
print(f'Number of Games: {len(game_coordinates)}\t(via game_coordinates)')
print(f'Number of Games: {len(game_ids)}\t(via game_ids)')

# Fail loudly on a mismatch instead of relying on a visual comparison:
# later cells index the per-game lists by position.
assert len(games_df_list) == len(game_coordinates) == len(game_ids), \
    'per-game collections differ in length'

Number of Games: 1941	(via games_df_list)
Number of Games: 1941	(via game_coordinates)
Number of Games: 1941	(via game_ids)


In [12]:
# Total number of extracted actions: for every game, action type and player,
# count the entries of the first coordinate list (one entry per action)
num_actions = sum(
    len(xy_lists[0])
    for per_game in game_coordinates
    for per_player in per_game.values()
    for xy_lists in per_player.values()
)

print(f'Number of actions: {num_actions}')

Number of actions: 1946101


In [13]:
# List of Player Vectors: one fitted PlayerVectors model per game,
# in the same order as game_coordinates / game_ids_in_order
pvs = []
verbose = True

# minutes_played was filled by iterating `minutes` (the groupby('matchId') mapping),
# so entry k belongs to the k-th key of `minutes`. game_coordinates instead follows
# game_ids_in_order (order of first appearance in df_events). Re-key the minutes by
# match id so every game is fitted with its own minutes rather than whatever
# happens to sit at the same list position.
minutes_by_game = dict(zip(minutes, minutes_played))

for i, (game_id, coord) in enumerate(zip(game_ids_in_order, game_coordinates)):
    PVs = PlayerVectors(actions=['shot', 'cross', 'dribble', 'pass'],
                        sigma=4.0,
                        components=[4, 4, 5, 5])
    # A KeyError here means playerank.csv has no rows for this game_id
    PVs.fit(coordinates=coord, minutes_played=minutes_by_game[game_id], player_names=playersID_to_name)
    pvs.append(PVs)
    if verbose and i % 100 == 0:
        # Same denominator as the count shown in the message
        progress = (i / len(game_coordinates)) * 100
        print(f'Processing progress: {progress:.2f}% completed ({i}/{len(game_coordinates)} games)')

Processing progress: 0.00% completed (0/1941 games)
Processing progress: 5.15% completed (100/1941 games)
Processing progress: 10.30% completed (200/1941 games)
Processing progress: 15.46% completed (300/1941 games)
Processing progress: 20.61% completed (400/1941 games)
Processing progress: 25.76% completed (500/1941 games)
Processing progress: 30.91% completed (600/1941 games)
Processing progress: 36.06% completed (700/1941 games)
Processing progress: 41.22% completed (800/1941 games)
Processing progress: 46.37% completed (900/1941 games)
Processing progress: 51.52% completed (1000/1941 games)
Processing progress: 56.67% completed (1100/1941 games)
Processing progress: 61.82% completed (1200/1941 games)
Processing progress: 66.98% completed (1300/1941 games)
Processing progress: 72.13% completed (1400/1941 games)
Processing progress: 77.28% completed (1500/1941 games)
Processing progress: 82.43% completed (1600/1941 games)
Processing progress: 87.58% completed (1700/1941 games)
Proces

In [14]:
# Rows for the CSV export: header first, then one row per (game, player) vector
names = ['game_id', 'player_id'] + [f'component{i}' for i in range(1, 19)]
data = [names]
for game_index, game_id in enumerate(game_ids_in_order):
    # pvs is positionally aligned with game_ids_in_order
    for player_id, component_values in pvs[game_index].player_vectors.items():
        # Keep only 18-component vectors that belong to a player with a known name
        if len(component_values) != 18 or player_id not in playersID_to_name:
            continue
        data.append([game_id, player_id] + component_values)

In [15]:
# Write the header and all vector rows to player_vectors.csv
# (newline='' leaves line-ending handling to the csv module)
with open('player_vectors.csv', mode='w', newline='') as csv_file:
    csv.writer(csv_file).writerows(data)