In [1]:
import glob
import numpy as np
import ipywidgets as widgets

### Ideas
* Transformation of data using transAxes / transFigure instead of calculating the positions

In [136]:
location_columns = [col for col in match.df.columns if 'location' in col]

In [149]:
for col in location_columns:
    display(match.df[col].dropna().apply(lambda x: x[0]))

4        60.0
5        69.0
6        38.0
7        38.0
8        92.0
9        91.0
10       29.0
11       86.0
12       41.0
13       40.0
14       36.0
15       88.0
16       33.0
17       21.0
18       95.0
19       21.0
20       15.0
21       13.0
22       16.0
23       16.0
24       28.0
25       28.0
26       27.0
27       40.0
28       40.0
29       56.0
30       64.0
31       62.0
32       46.0
33       46.0
        ...  
2644    107.0
2645    106.0
2646      2.0
2647     10.0
2648     53.0
2649     54.0
2650     42.0
2651     45.0
2652     56.0
2653     75.0
2654     88.0
2655     18.0
2656     10.0
2657    111.0
2658    113.0
2659    109.0
2660    110.0
2661    111.0
2662    111.0
2663      1.0
2664     29.0
2665    109.0
2666     17.0
2667     25.0
2668     98.0
2669     97.0
2670     24.0
2671     93.0
2672     28.0
2673     33.0
Name: location, Length: 2610, dtype: float64

99      2.0
138     3.0
555     3.0
585     3.0
620     3.0
1342    4.0
1496    3.0
1519    2.0
1885    3.0
1954    2.0
2194    3.0
2209    3.0
2360    3.0
2604    2.0
Name: goalkeeper_end_location, dtype: float64

4        38.0
6        93.0
10       41.0
13       36.0
16       21.0
19       15.0
21       16.0
23       28.0
25       27.0
27       40.0
29       64.0
31       46.0
33       56.0
35       87.0
37       86.0
39      101.0
42       89.0
44       98.0
46       85.0
48      101.0
51      111.0
53       37.0
55       82.0
57       89.0
59       81.0
62       70.0
65       62.0
67       61.0
69       81.0
72       65.0
        ...  
2588     97.0
2591    115.0
2594    118.0
2596    105.0
2598    100.0
2601     99.0
2607     67.0
2609     92.0
2613     53.0
2615     49.0
2617     54.0
2619     57.0
2621     53.0
2623     87.0
2625     93.0
2628    100.0
2630    100.0
2634     80.0
2636     85.0
2638    117.0
2640    120.0
2642    106.0
2647     53.0
2649     42.0
2651     56.0
2653     88.0
2658    109.0
2660    111.0
2665     98.0
2673     93.0
Name: pass_end_location, Length: 948, dtype: float64

96      118.0
136     105.0
436     110.0
553     101.0
583     107.0
619     106.0
1001    120.0
1147    118.0
1341    119.0
1373    118.0
1485    120.0
1494    107.0
1517    120.0
1665    119.0
1883    107.0
1952     97.0
2192    107.0
2208    106.0
2246    119.0
2358    120.0
2379    118.0
2502    119.0
2603    119.0
2644    119.0
2662    120.0
Name: shot_end_location, dtype: float64

In [135]:
match.df.goalkeeper_end_location.dropna()

99      [2.0, 40.0]
138     [3.0, 43.0]
555     [3.0, 40.0]
585     [3.0, 38.0]
620     [3.0, 41.0]
1342    [4.0, 38.0]
1496    [3.0, 41.0]
1519    [2.0, 38.0]
1885    [3.0, 41.0]
1954    [2.0, 40.0]
2194    [3.0, 42.0]
2209    [3.0, 40.0]
2360    [3.0, 38.0]
2604    [2.0, 41.0]
Name: goalkeeper_end_location, dtype: object

In [165]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Rectangle
from sklearn.preprocessing import minmax_scale, MinMaxScaler

from pybomb.data import df_matches
from pybomb.data import unstack_dict_column, unstack_dict_column_repeated

class Match:
    def __init__(self, match_id, pitch_x=105.0, pitch_y=68.0, x_min=0.04338, x_max=0.95652, y_min=0.0641, y_max=0.9359,
                 pitch_color='#78AB46', home_color='red', away_color='white', 
                 event_data_path='data/events'):
        self.event_data_path = event_data_path

        # TODO: Get pitch_x and pitch_y from data?
        self.pitch_x = pitch_x
        self.pitch_y = pitch_y
        self.x_min = x_min
        self.x_max = x_max
        self.y_min = y_min
        self.y_max = y_max
        self.pitch_color = pitch_color
        self.team_colors = {
            'H': home_color,
            'A': away_color
        }
        self.outcome_colors = {
            'Incomplete': 'C3'
        }
        
        self.match_id = match_id
        self.df = self._get_match_df()
        
        x_loc_range, y_loc_range = self._get_locations_range()
        self.x_loc_range = x_loc_range
        self.y_loc_range = y_loc_range
        

        meta_info = self._get_meta_info()
        self.home_score = meta_info['home_score'].values[0]
        self.away_score = meta_info['away_score'].values[0]
        self.competition = meta_info['competition'].values[0]
        self.date = meta_info['match_date'].values[0]
        self.kickoff_time = meta_info['kick_off'].values[0]
        self.stadium = meta_info['stadium_name'].values[0]
        self.home_team_name = meta_info['home_team'].values[0]['home_team_name']
        self.away_team_name = meta_info['away_team'].values[0]['away_team_name']
        self.home_team_id = meta_info['home_team'].values[0]['home_team_id']
        self.away_team_id = meta_info['away_team'].values[0]['away_team_id']

    
    def _get_meta_info(self):
        return df_matches[df_matches['match_id'] == self.match_id]
    
    def _get_match_df(self):
        df = pd.read_json(f'{self.event_data_path}/{self.match_id}.json', typ='frame')

        dict_columns = [c for c in df.columns if isinstance(df[c].dropna().iloc[0], dict)]
        for column in dict_columns:
            df = unstack_dict_column_repeated(df, column)
        return df
    
    def _get_locations_range(self):
        location_columns = [col for col in match.df.columns if 'location' in col]
        x_locations = []
        y_locations = []

        for col in location_columns:
            x_locations.extend(self.df[col].dropna().apply(lambda x: x[0]).values)
            y_locations.extend(self.df[col].dropna().apply(lambda x: x[1]).values)
        
        x_loc_range = np.max(x_locations) - np.min(x_locations) 
        y_loc_range = np.max(y_locations) - np.min(y_locations)
        print(x_loc_range, y_loc_range)

        return x_loc_range, y_loc_range


    def correct_positions(self, column='location'):
        xs, ys = get_normalized_positions(self.df[column])
        self.df[column] = [[x, y] for x, y in zip(xs, ys)]

    def get_passes_df(self):
        return self.df[self.df['type_name'] == 'Pass']

    def get_shot_df(self):
        return self.df[self.df['type_name'] == 'Shot']
                
    def draw_pitch(self):
        #set up field    
        fig = plt.figure(figsize=(self.pitch_x/10, self.pitch_y/10))

        axes = fig.add_subplot(1, 1, 1)

        axes.xaxis.set_visible(False)
        axes.yaxis.set_visible(False)

        plt.xlim([-5,self.pitch_x+5])
        plt.ylim([-5,self.pitch_y+5])

        box_height = ((16.5*2 + 7.32)/self.pitch_y)/1.15
        box_width = (16.5/self.pitch_x)/1.15 

        # pitch field
        field = plt.Rectangle(
            (0, 0), 1, 1, edgecolor=self.pitch_color, facecolor=self.pitch_color, 
            alpha=1, transform=axes.transAxes, zorder=1
        ) 
        
        #pitch boundary lines
        boundary = plt.Rectangle(
            (self.x_min, self.y_min), (self.x_max - self.x_min), (self.y_max - self.y_min),
            edgecolor="white", facecolor="none", alpha=1, transform=axes.transAxes, zorder=1
        ) 
        
        #half-way line
        mid_line = plt.Line2D(
            [0.5, 0.5], [self.y_max, self.y_min],
            c='w', transform=axes.transAxes, zorder=1) 
        
        #middle circle
        mid_circle = Ellipse(
            (0.5, 0.5), 9.15*2 / self.pitch_x, 9.15*2 / self.pitch_y,
            ec='w', fc='none', transform=axes.transAxes, zorder=1
        ) 

        #penalty area
        penalty_left = plt.Rectangle(
            (self.x_min, (1-box_height)/2), box_width, box_height,
            ec='w', fc='none', transform=axes.transAxes, zorder=1
        ) 
        
        #penalty area
        penalty_right = plt.Rectangle(
            (self.x_max - box_width, (1 - box_height) / 2), box_width, box_height,
            ec='w', fc='none', transform=axes.transAxes, zorder=1) 

        # extend lines to plot
        fig.lines.extend([field, boundary, mid_line, mid_circle, penalty_left, penalty_right])
    
        return fig, axes
    
    
    def get_normalized_positions(self, positions):
        x = positions.apply(lambda x: x[0]).values
        y = positions.apply(lambda x: x[1]).values
        #x_range = (self.x_min, self.x_max)
        #y_range = (self.y_min, self.y_max)
        #x_scaled = minmax_scale(x, feature_range=x_range)
        #y_scaled = minmax_scale(y, feature_range=y_range)
        
        # TODO: change range to max value, since pitch positions should start at 0.
        
        print(self.x_loc_range, (self.x_max - self.x_min))
        print(self.y_loc_range, (self.y_max - self.y_min))

        x_scaled = x / self.x_loc_range * (self.x_max - self.x_min)
        y_scaled = y / self.y_loc_range * (self.y_max - self.y_min)

        return x_scaled, y_scaled

    def filter_column(self, df, column, value):
        if column in df.columns:
            if value in df[column].unique():
                df = df.loc[df[column] == value]
            else:
                raise ValueError(f'Value {value} is not in column {column} of the match data.')
        else:
            raise ValueError(f'Column {column} is not in the match data.') 
        
        return df
        

    def get_passes(self, team_id=None, player_id=None, recipient_id=None, return_df=False):
        '''
        Returns start_x_scaled, start_y_scaled, end_x_scaled, end_y_scaled, outcomes
        '''
        df = self.get_passes_df()
        
        if team_id is not None:
            df = self.filter_column(df, 'team_id', team_id)
            
        if player_id is not None:
            df = self.filter_column(df, 'player_id', player_id)

        if recipient_id is not None:
            df = self.filter_column(df, 'pass_recipient_id', recipient_id)

        start_x_scaled, start_y_scaled = self.get_normalized_positions(df['location'])
        end_x_scaled, end_y_scaled = self.get_normalized_positions(df['pass_end_location'])
        outcomes = df['pass_outcome_name']
        if return_df:
            return df

        return start_x_scaled, start_y_scaled, end_x_scaled, end_y_scaled, outcomes
    

    def draw_passes(self, passes, ax):
        for x1, y1, x2, y2, outcome in zip(*passes):
            color = self.outcome_colors.get(outcome, 'C0')

            ax.annotate(
                '', xy=(x2, y2), xytext=(x1, y1), xycoords='axes fraction', textcoords='axes fraction', 
                arrowprops=dict(arrowstyle="->",connectionstyle="arc3", color=color), zorder=100
            )
        return ax

    
    def plot_match_passes_home_away(self):
        for team_id, team_name in zip(
            [self.home_team_id, self.away_team_id],
            [self.home_team_name, self.away_team_name]
        ):
            fig, ax = self.draw_pitch()
            self.draw_passes(self.get_passes(team_id), ax)
            ax.set_title(f'Passes for {team_name}', fontdict=dict(color='C5'))
            ax.plot([0, 1], [0, 1])
            while ax.texts:
                fig.texts.append(ax.texts.pop())

        return fig, ax
    
    def plot_passes(self, team_id=None, player_id=None, recipient_id=None):
        fig, ax = self.draw_pitch()
        passes = self.get_passes(team_id, player_id, recipient_id)
        print(passes)
        self.draw_passes(passes, ax)
        ax.plot([0, 1], [0, 1])
        while ax.texts:
            fig.texts.append(ax.texts.pop())
        return fig, ax
    

    def plot_shot(self):
        pass

In [166]:
world_cup_matches = pd.read_json('data/matches/43.json').sort_values('match_id')
match_ids = world_cup_matches['match_id']


In [167]:
world_cup_matches['home_team'].iloc[0]

{'home_team_id': 796, 'home_team_name': 'Russia'}

In [168]:
for i, row in world_cup_matches[['match_id', 'home_team', 'away_team']].iterrows():
    home = row[1]['home_team_name']
    away = row[2]['away_team_name']
    print(f'{row[0]}: {home} vs. {away}')

7525: Russia vs. Saudi Arabia
7529: Croatia vs. Nigeria
7530: France vs. Australia
7531: Argentina vs. Iceland
7532: Peru vs. Denmark
7533: Brazil vs. Switzerland
7534: Germany vs. Mexico
7535: Costa Rica vs. Serbia
7536: Belgium vs. Panama
7537: Tunisia vs. England
7538: Sweden vs. South Korea
7539: Poland vs. Senegal
7540: Russia vs. Egypt
7541: Colombia vs. Japan
7542: Portugal vs. Morocco
7543: Iran vs. Spain
7544: Uruguay vs. Saudi Arabia
7545: Argentina vs. Croatia
7546: France vs. Peru
7547: Denmark vs. Australia
7548: Brazil vs. Costa Rica
7549: Nigeria vs. Iceland
7550: Serbia vs. Switzerland
7551: Germany vs. Sweden
7552: Belgium vs. Tunisia
7553: South Korea vs. Mexico
7554: England vs. Panama
7555: Poland vs. Colombia
7556: Japan vs. Senegal
7557: Iran vs. Portugal
7558: Uruguay vs. Russia
7559: Saudi Arabia vs. Egypt
7560: Spain vs. Morocco
7561: Iceland vs. Croatia
7562: Australia vs. Peru
7563: Denmark vs. France
7564: Nigeria vs. Argentina
7565: Serbia vs. Brazil
7566: 

In [171]:
match = Match(7551)

119.0 79.0


In [172]:
def show_pass_plot(match, team_id=None, player_id=None, recipient_id=None):
    print(team_id, player_id, recipient_id)
    match.plot_passes(team_id=team_id, player_id=player_id, recipient_id=recipient_id)

In [173]:
def get_dropdown(df, id_column, value_column):
    options = [('None', None)] + [tuple(option) for option in df[[value_column, id_column]].drop_duplicates().values]
    return widgets.Dropdown(options=options, value=None)

In [174]:
team_dropdown = get_dropdown(match.df, 'team_id', 'team_name')
player_dropdown = get_dropdown(match.df, 'player_id', 'player_name')
recipient_dropdown = get_dropdown(match.df, 'pass_recipient_id', 'pass_recipient_name')

In [175]:
widgets.interact_manual(
    show_pass_plot,
    match=widgets.fixed(match),
    team_id=team_dropdown,
    player_id=player_dropdown,
    recipient_id=recipient_dropdown,
)

interactive(children=(Dropdown(description='team_id', options=(('None', None), ('Germany', 770), ('Sweden', 79…

<function __main__.show_pass_plot(match, team_id=None, player_id=None, recipient_id=None)>

In [63]:
passes_df[['player_id', 'player_name']].drop_duplicates()

Unnamed: 0,player_id,player_name
4,5625.0,Emil Forsberg
6,5607.0,Andreas Granqvist
10,3167.0,Antonio Rüdiger
13,3186.0,Julian Draxler
16,5574.0,Toni Kroos
19,6040.0,Jonas Hector
23,5578.0,Jérôme Boateng
25,5579.0,Joshua Kimmich
31,6039.0,Sebastian Rudy
46,5559.0,Marco Reus


#### Analysis of passes

In [85]:
df_passes = match.get_passes_df()

In [86]:
df_filtered = df_passes[df_passes['player_id']==6039.0][['location', 'pass_end_location', 'pass_outcome_name', 'player_name', 'minute']]

In [84]:
match.x_max, match.x_min

(0.95652, 0.04338)

In [80]:
df_filtered.reset_index()

Unnamed: 0,index,location,pass_end_location,pass_outcome_name,player_name,minute
0,31,"[62.0, 21.0]","[46.0, 28.0]",,Sebastian Rudy,0
1,55,"[84.0, 76.0]","[82.0, 65.0]",,Sebastian Rudy,1
2,128,"[87.0, 44.0]","[80.0, 51.0]",,Sebastian Rudy,2
3,191,"[47.0, 20.0]","[56.0, 53.0]",,Sebastian Rudy,4
4,263,"[28.0, 32.0]","[20.0, 4.0]",,Sebastian Rudy,6
5,301,"[70.0, 41.0]","[66.0, 18.0]",,Sebastian Rudy,6
6,362,"[75.0, 43.0]","[67.0, 20.0]",,Sebastian Rudy,9
7,378,"[28.0, 19.0]","[30.0, 3.0]",,Sebastian Rudy,10
8,399,"[92.0, 43.0]","[103.0, 68.0]",,Sebastian Rudy,11
9,415,"[91.0, 24.0]","[88.0, 17.0]",,Sebastian Rudy,11


In [79]:
pd.DataFrame(match.get_normalized_positions(df_filtered['location'])).transpose()

Unnamed: 0,0,1
0,0.506764,0.094689
1,0.806601,0.9359
2,0.847488,0.446468
3,0.30233,0.079395
4,0.04338,0.262932
5,0.615796,0.400584
6,0.683941,0.431174
7,0.04338,0.0641
8,0.915633,0.431174
9,0.902004,0.140574


In [82]:
pd.DataFrame(match.get_normalized_positions(df_filtered['pass_end_location'])).transpose()

Unnamed: 0,0,1
0,0.307176,0.3547
1,0.672432,0.784788
2,0.65214,0.622052
3,0.408636,0.6453
4,0.04338,0.075724
5,0.510096,0.23846
6,0.520242,0.261708
7,0.14484,0.0641
8,0.885498,0.81966
9,0.733308,0.226836


In [55]:
match.get_normalized_positions(df_filtered['location'])

(array([0.50676448, 0.80660149, 0.84748836, 0.30233015, 0.04338   ,
        0.61579612, 0.6839409 , 0.04338   , 0.91563313, 0.90200418,
        0.76571463, 0.95652   , 0.64305403, 0.45224866, 0.20692746,
        0.20692746, 0.28870119]),
 array([0.09468947, 0.9359    , 0.44646842, 0.07939474, 0.26293158,
        0.40058421, 0.43117368, 0.0641    , 0.43117368, 0.14057368,
        0.23234211, 0.76765789, 0.66059474, 0.59941579, 0.14057368,
        0.52294211, 0.43117368]))

Notes what to check:
* Passes of players Brandt, Rudy, etc. look strange as well as coordinate transformations for recipient passes.
* Look into that if something goes wrong

In [None]:
df = a.get_passes_df()

In [None]:
df[df['player_id'] == 5002]['player_name']