# __INTERACTIVE PASSING NETWORK | Data Loading & Preparation__

_Ryan Ferrera | ryanferrera.com | Twitter: @RyanFerrera_

## __SETUP__

In [1]:
import numpy as np
import pandas as pd
import json
import requests
import random

## __DATA LOADING__

Adapted from a script written by Devin Pleuler: 
https://github.com/devinpleuler/analytics-handbook/blob/master/notebooks/data_extraction_and_transformation.ipynb

In [2]:
url = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"
comp_url = url + "matches/{}/{}.json"
match_url = url + "events/{}.json"

In [3]:
def get_data(competition_id, season_id):
    matches = requests.get(url=comp_url.format(competition_id, season_id)).json()
    match_ids = [m['match_id'] for m in matches]

    all_events = []
    for match_id in match_ids:

        events = requests.get(url=match_url.format(match_id)).json()

        passes = [x for x in events if x['type']['name'] == "Pass"]
        for p in passes:
            try:
                
                try:
                    outcome = p['pass']['outcome']['name']
                except:
                    outcome = 'Complete'
    
                attributes = {
                    "match_id": match_id,
                    "team": p["possession_team"]["name"],
                    "player_from": p['player']['name'],
                    "player_to": p['pass']['recipient']['name'],
                    'location': p['location'],
                    'pass_type': p['pass']['height']['name'],
                    'outcome': outcome
                }
                
                all_events.append(attributes)
            
            except KeyError:
                next
            
    return pd.DataFrame(all_events)

In [4]:
competition_id = 43 # world cup 2018
season_id = 3

In [5]:
df = get_data(competition_id, season_id)

In [6]:
df.head()

Unnamed: 0,match_id,team,player_from,player_to,location,pass_type,outcome
0,7562,Peru,José Paolo Guerrero González,Renato Fabrizio Tapia Cortijo,"[60.0, 41.0]",Ground Pass,Complete
1,7562,Peru,Renato Fabrizio Tapia Cortijo,Víctor Yoshimar Yotún Flores,"[46.0, 41.0]",Ground Pass,Complete
2,7562,Peru,Víctor Yoshimar Yotún Flores,Luis Jan Piers Advíncula Castrillón,"[40.0, 28.0]",Ground Pass,Complete
3,7562,Peru,Luis Jan Piers Advíncula Castrillón,Anderson Santamaría Bardales,"[43.0, 76.0]",Ground Pass,Complete
4,7562,Peru,Anderson Santamaría Bardales,Víctor Yoshimar Yotún Flores,"[38.0, 38.0]",Ground Pass,Complete


In [7]:
df.shape

(58964, 7)

## __DATA CLEANING__

Pass Outcome: simplifying to complete/incomplete

In [8]:
df['outcome'].unique()

array(['Complete', 'Incomplete', 'Out', 'Pass Offside', 'Unknown'],
      dtype=object)

In [9]:
def outcome_clean(x):
    if x == "Complete":
        outcome = x
    else:
        outcome = "Incomplete"
    return outcome

In [10]:
df['outcome_c'] = df['outcome'].apply(outcome_clean)

In [11]:
df.head()

Unnamed: 0,match_id,team,player_from,player_to,location,pass_type,outcome,outcome_c
0,7562,Peru,José Paolo Guerrero González,Renato Fabrizio Tapia Cortijo,"[60.0, 41.0]",Ground Pass,Complete,Complete
1,7562,Peru,Renato Fabrizio Tapia Cortijo,Víctor Yoshimar Yotún Flores,"[46.0, 41.0]",Ground Pass,Complete,Complete
2,7562,Peru,Víctor Yoshimar Yotún Flores,Luis Jan Piers Advíncula Castrillón,"[40.0, 28.0]",Ground Pass,Complete,Complete
3,7562,Peru,Luis Jan Piers Advíncula Castrillón,Anderson Santamaría Bardales,"[43.0, 76.0]",Ground Pass,Complete,Complete
4,7562,Peru,Anderson Santamaría Bardales,Víctor Yoshimar Yotún Flores,"[38.0, 38.0]",Ground Pass,Complete,Complete


In [12]:
df['outcome_c'].unique()

array(['Complete', 'Incomplete'], dtype=object)

X/Y Location: Parsing the location into separate x, y columns

In [13]:
def location_x(a):
    x = a[0]
    return x

def location_y(a):
    y = a[1]
    return y

In [14]:
df['location_x'] = df["location"].apply(location_x)
df['location_y'] = df["location"].apply(location_y)

In [15]:
df.head()

Unnamed: 0,match_id,team,player_from,player_to,location,pass_type,outcome,outcome_c,location_x,location_y
0,7562,Peru,José Paolo Guerrero González,Renato Fabrizio Tapia Cortijo,"[60.0, 41.0]",Ground Pass,Complete,Complete,60.0,41.0
1,7562,Peru,Renato Fabrizio Tapia Cortijo,Víctor Yoshimar Yotún Flores,"[46.0, 41.0]",Ground Pass,Complete,Complete,46.0,41.0
2,7562,Peru,Víctor Yoshimar Yotún Flores,Luis Jan Piers Advíncula Castrillón,"[40.0, 28.0]",Ground Pass,Complete,Complete,40.0,28.0
3,7562,Peru,Luis Jan Piers Advíncula Castrillón,Anderson Santamaría Bardales,"[43.0, 76.0]",Ground Pass,Complete,Complete,43.0,76.0
4,7562,Peru,Anderson Santamaría Bardales,Víctor Yoshimar Yotún Flores,"[38.0, 38.0]",Ground Pass,Complete,Complete,38.0,38.0


## __GRAPH PREPARATION__

### __Node Table__
- Node ID
- Degree Centrality
- X Position
- Y Position
- Player Name
- Number of Pass Attempts
    - By type? 
- Completion Percentage
    - By type?

Filtering the passing data for only passes between France teammates:

In [16]:
FR_players = list(pd.DataFrame({'passes':df.query("team=='France'").query("outcome_c=='Complete'").groupby(['player_from']).size()}).sort_values(by='passes', ascending=False).head(20).index.values)
df_FR = df[df['player_from'].isin(FR_players) & df['player_to'].isin(FR_players)]
df_FR.head(3)

Unnamed: 0,match_id,team,player_from,player_to,location,pass_type,outcome,outcome_c,location_x,location_y
12101,8655,France,Antoine Griezmann,Raphaël Varane,"[60.0, 41.0]",Ground Pass,Complete,Complete,60.0,41.0
12102,8655,France,Raphaël Varane,Benjamin Pavard,"[38.0, 50.0]",Ground Pass,Complete,Complete,38.0,50.0
12103,8655,France,Benjamin Pavard,Kylian Mbappé Lottin,"[50.0, 72.0]",Ground Pass,Complete,Complete,50.0,72.0


Filtering only for the World Cup Final

In [17]:
df_FR = df_FR.query("match_id == 8658")
df_FR.head()

Unnamed: 0,match_id,team,player_from,player_to,location,pass_type,outcome,outcome_c,location_x,location_y
45314,8658,France,Benjamin Pavard,"N""Golo Kanté","[49.0, 80.0]",Low Pass,Complete,Complete,49.0,80.0
45315,8658,France,"N""Golo Kanté",Paul Pogba,"[65.0, 64.0]",High Pass,Incomplete,Incomplete,65.0,64.0
45318,8658,France,Antoine Griezmann,Kylian Mbappé Lottin,"[63.0, 73.0]",Ground Pass,Complete,Complete,63.0,73.0
45319,8658,France,Benjamin Pavard,Raphaël Varane,"[58.0, 79.0]",Low Pass,Complete,Complete,58.0,79.0
45320,8658,France,Raphaël Varane,Hugo Lloris,"[26.0, 69.0]",Ground Pass,Complete,Complete,26.0,69.0


Assemble the node table

In [19]:
dict_FR = {
    'id': ['n{}'.format(i) for i in range(1,len(df_FR['player_from'].unique())+1)],
    'x_pos_abs': df_FR[['player_from','location_x']].groupby(['player_from']).mean()['location_x'],                              
    'y_pos_abs': df_FR[['player_from','location_y']].groupby(['player_from']).mean()['location_y'],                              
    'x_pos_perc': df_FR[['player_from','location_x']].groupby(['player_from']).mean()['location_x']/120, # x position as a proportion of pitch length
    'y_pos_perc': df_FR[['player_from','location_y']].groupby(['player_from']).mean()['location_y']/80, # y position as a proportion of pitch width
    'name': sorted(df_FR['player_from'].unique()),
    'centrality': df_FR.groupby(['player_from']).size().values / df_FR.groupby(['player_from']).size().values.sum() / (df_FR.groupby(['player_from']).size().values / df_FR.groupby(['player_from']).size().values.sum()).max(),
    'pass_attempts': df_FR.groupby(['player_from']).size().values,
    'pass_completed': df_FR.query("outcome_c=='Complete'").groupby(['player_from']).size().values,
    'comp_percentage': df_FR.query("outcome_c=='Complete'").groupby(['player_from']).size().values / df_FR.groupby(['player_from']).size().values,
}

node_FR = pd.DataFrame(dict_FR)

node_FR

Unnamed: 0_level_0,id,x_pos_abs,y_pos_abs,x_pos_perc,y_pos_perc,name,centrality,pass_attempts,pass_completed,comp_percentage
player_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Antoine Griezmann,n1,68.727273,37.363636,0.572727,0.467045,Antoine Griezmann,0.6875,22,18,0.818182
Benjamin Pavard,n2,60.52381,75.809524,0.504365,0.947619,Benjamin Pavard,0.65625,21,17,0.809524
Blaise Matuidi,n3,52.45,15.3,0.437083,0.19125,Blaise Matuidi,0.625,20,18,0.9
Corentin Tolisso,n4,76.5,23.75,0.6375,0.296875,Corentin Tolisso,0.125,4,3,0.75
Hugo Lloris,n5,11.833333,40.166667,0.098611,0.502083,Hugo Lloris,0.5625,18,16,0.888889
Kylian Mbappé Lottin,n6,76.545455,63.818182,0.637879,0.797727,Kylian Mbappé Lottin,0.34375,11,8,0.727273
Lucas Hernández Pi,n7,54.555556,5.962963,0.45463,0.074537,Lucas Hernández Pi,0.84375,27,22,0.814815
"N""Golo Kanté",n8,38.1,20.4,0.3175,0.255,"N""Golo Kanté",0.3125,10,7,0.7
Nabil Fekir,n9,72.5,67.5,0.604167,0.84375,Nabil Fekir,0.0625,2,1,0.5
Olivier Giroud,n10,66.444444,43.666667,0.553704,0.545833,Olivier Giroud,0.5625,18,14,0.777778


In [148]:
node_FR.to_csv("player_nodes.csv", index=False)

### __Directed Link Table__
- Player 1
- Player 2
- Num Completed Passes (Link Weight)
- Completion Percentage
- Num Completed Passes 1->2

In [20]:
query_c = df_FR.query("outcome_c=='Complete'").groupby(['player_from','player_to'], as_index=False).size()

In [21]:
query_c

player_from                                 player_to           
Antoine Griezmann                           Benjamin Pavard         1
                                            Blaise Matuidi          2
                                            Corentin Tolisso        3
                                            Kylian Mbappé Lottin    2
                                            Lucas Hernández Pi      2
                                                                   ..
Steven N"Kemboanza Mike Christopher Nzonzi  Lucas Hernández Pi      1
                                            Nabil Fekir             1
                                            Paul Pogba              3
                                            Raphaël Varane          1
                                            Samuel Yves Umtiti      1
Length: 96, dtype: int64

Assemble the majority of the links table

In [24]:
links_FR = pd.DataFrame()
for i in range(query_c.shape[0]):
    row_Dict = {
    'id': 'l{}'.format(i+1),
    'source': query_c.index[i][0],
    'source_x': node_FR.loc[query_c.index[i][0],'x_pos_perc'],
    'source_y': node_FR.loc[query_c.index[i][0],'y_pos_perc'],
    'target': query_c.index[i][1],
    'target_x': node_FR.loc[query_c.index[i][1],'x_pos_perc'],
    'target_y': node_FR.loc[query_c.index[i][1],'y_pos_perc'],
    'pass_completed': query_c[i]
    }
    links_FR = links_FR.append(row_Dict, ignore_index=True)

In [25]:
links_FR

Unnamed: 0,id,pass_completed,source,source_x,source_y,target,target_x,target_y
0,l1,1.0,Antoine Griezmann,0.572727,0.467045,Benjamin Pavard,0.504365,0.947619
1,l2,2.0,Antoine Griezmann,0.572727,0.467045,Blaise Matuidi,0.437083,0.191250
2,l3,3.0,Antoine Griezmann,0.572727,0.467045,Corentin Tolisso,0.637500,0.296875
3,l4,2.0,Antoine Griezmann,0.572727,0.467045,Kylian Mbappé Lottin,0.637879,0.797727
4,l5,2.0,Antoine Griezmann,0.572727,0.467045,Lucas Hernández Pi,0.454630,0.074537
...,...,...,...,...,...,...,...,...
91,l92,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Lucas Hernández Pi,0.454630,0.074537
92,l93,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Nabil Fekir,0.604167,0.843750
93,l94,3.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Paul Pogba,0.427604,0.580859
94,l95,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Raphaël Varane,0.379167,0.718056


Add pass attempts to the links table

In [26]:
query_a = df_FR.groupby(['player_from','player_to'], as_index=False).size()
query_a

player_from                                 player_to           
Antoine Griezmann                           Benjamin Pavard         1
                                            Blaise Matuidi          4
                                            Corentin Tolisso        3
                                            Kylian Mbappé Lottin    3
                                            Lucas Hernández Pi      3
                                                                   ..
Steven N"Kemboanza Mike Christopher Nzonzi  Lucas Hernández Pi      1
                                            Nabil Fekir             1
                                            Paul Pogba              3
                                            Raphaël Varane          1
                                            Samuel Yves Umtiti      1
Length: 102, dtype: int64

In [27]:
att_FR = pd.DataFrame()
for i in range(query_a.shape[0]):
    row_Dict = {
    'source': query_a.index[i][0],
    'target': query_a.index[i][1],
    'pass_attempts': query_a[i]
    }
    att_FR = att_FR.append(row_Dict, ignore_index=True)

att_FR

Unnamed: 0,pass_attempts,source,target
0,1.0,Antoine Griezmann,Benjamin Pavard
1,4.0,Antoine Griezmann,Blaise Matuidi
2,3.0,Antoine Griezmann,Corentin Tolisso
3,3.0,Antoine Griezmann,Kylian Mbappé Lottin
4,3.0,Antoine Griezmann,Lucas Hernández Pi
...,...,...,...
97,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",Lucas Hernández Pi
98,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",Nabil Fekir
99,3.0,"Steven N""Kemboanza Mike Christopher Nzonzi",Paul Pogba
100,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",Raphaël Varane


In [28]:
links_FR = links_FR.merge(att_FR,how='left', on=['source','target'])
links_FR['comp_percentage'] = links_FR['pass_completed'] / links_FR['pass_attempts']
links_FR

Unnamed: 0,id,pass_completed,source,source_x,source_y,target,target_x,target_y,pass_attempts,comp_percentage
0,l1,1.0,Antoine Griezmann,0.572727,0.467045,Benjamin Pavard,0.504365,0.947619,1.0,1.000000
1,l2,2.0,Antoine Griezmann,0.572727,0.467045,Blaise Matuidi,0.437083,0.191250,4.0,0.500000
2,l3,3.0,Antoine Griezmann,0.572727,0.467045,Corentin Tolisso,0.637500,0.296875,3.0,1.000000
3,l4,2.0,Antoine Griezmann,0.572727,0.467045,Kylian Mbappé Lottin,0.637879,0.797727,3.0,0.666667
4,l5,2.0,Antoine Griezmann,0.572727,0.467045,Lucas Hernández Pi,0.454630,0.074537,3.0,0.666667
...,...,...,...,...,...,...,...,...,...,...
91,l92,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Lucas Hernández Pi,0.454630,0.074537,1.0,1.000000
92,l93,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Nabil Fekir,0.604167,0.843750,1.0,1.000000
93,l94,3.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Paul Pogba,0.427604,0.580859,3.0,1.000000
94,l95,1.0,"Steven N""Kemboanza Mike Christopher Nzonzi",0.423214,0.468750,Raphaël Varane,0.379167,0.718056,1.0,1.000000


In [165]:
links_FR.to_csv('player_links.csv', index=False)