# INSY670 Social Media Analytics: Final Project

## Notebook 2: Training Graph Neural Network model

In [1]:
import json
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

import torch
from torch_geometric.data import Data

from sklearn.model_selection import StratifiedShuffleSplit

import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
events_path = 'data/events'
frames_path = 'data/frames'

events = pd.DataFrame()
frames = pd.DataFrame()

def append_json_to_df(folder_path, dataframe):
    for file in os.listdir(folder_path):
        if file.endswith('.json'):
            try:
                file_path = os.path.join(folder_path, file)
                data = pd.read_json(file_path)
                if folder_path=='data/events':
                    data['team_name'] = data['team'].apply(lambda x: x['name'] if isinstance(x, dict) and 'name' in x else np.nan)
                    unique_teams = data['team_name'].unique().tolist()
                    data['team1'] = unique_teams[0]
                    data['team2'] = unique_teams[1]
                    data.drop(['team_name'],axis=1,inplace=True)
                dataframe = pd.concat([dataframe, data], ignore_index=True)
                dataframe.reset_index(drop=True, inplace=True)
            except:
                continue
    return dataframe

events = append_json_to_df(events_path, events)
frames = append_json_to_df(frames_path, frames)

In [84]:
len(events), len(frames)

(469072, 401328)

In [4]:
events['pass_type'] = events['pass'].apply(lambda x: x['type']['name'] if isinstance(x, dict) and 'type' in x and 'name' in x['type'] else np.nan)
events['pass_shot_assist'] = events['pass'].apply(lambda x: x['shot_assist'] if isinstance(x, dict) and 'shot_assist' in x else False)
events['attack_team'] = events['team'].apply(lambda x: x['name'] if isinstance(x, dict) and 'name' in x else np.nan)

### Mapping from EA FC dataset to add player and team attributes

In [5]:
events['defense_team'] = events['team2']
events.loc[events['attack_team']==events['team2'],'defense_team'] = events['team1']

In [6]:
fifa_data = pd.read_csv('data/male_teams.csv')
fifa_data = fifa_data[['team_name', 'attack', 'defence']]

mean_attack = fifa_data['attack'].mean()
mean_defense = fifa_data['defence'].mean()

To create the network with the corner frame and target variable

In [7]:
corners = events[events['pass_type']=='Corner']
corners_wframes = pd.merge(corners, frames, left_on='id', right_on='event_uuid', how='inner')

network_data = corners_wframes[['freeze_frame', 'event_uuid', 'pass_shot_assist', 'attack_team', 'defense_team']]
network_data['id'] = network_data.index+1

In [8]:
network_data = pd.merge(network_data, fifa_data[['team_name','attack']], left_on='attack_team', right_on='team_name',how='left')
network_data['attack'].fillna(mean_attack, inplace=True)
network_data.drop(['team_name'], axis=1, inplace=True)
network_data.drop_duplicates(subset=['id'], inplace=True)
network_data.reset_index(drop=True, inplace=True)

network_data = pd.merge(network_data, fifa_data[['team_name','defence']], left_on='defense_team', right_on='team_name',how='left')
network_data['defence'].fillna(mean_defense, inplace=True)
network_data.drop_duplicates(subset=['id'], inplace=True)
network_data.drop(['attack_team','defense_team','team_name','id'], axis=1, inplace=True)
network_data.reset_index(drop=True, inplace=True)

In [9]:
network_data['pass_shot_assist'].value_counts()

pass_shot_assist
False    558
True     141
Name: count, dtype: int64

In [133]:
## Stratified split on Target variable (Pass Shot Assist)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=123)
for train_index, test_index in split.split(network_data, network_data["pass_shot_assist"]):
    train = network_data.loc[train_index]
    test = network_data.loc[test_index]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

## Convert the data to pytorch geometric dataset

In [11]:
def create_graph_data(row):
    players = row['freeze_frame']
    attack_rating = row['attack']
    defense_rating = row['defence']

    features = [
        [player['location'][0], player['location'][1],
         attack_rating if player['teammate'] else defense_rating]
        for player in players
    ]
    features_tensor = torch.tensor(features, dtype=torch.float)
    
    edges = []
    for i in range(len(players)):
        for j in range(i + 1, len(players)):
            if players[i]['teammate'] == players[j]['teammate']:
                edges.append([i, j])
                edges.append([j, i])
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    
    target = torch.tensor([int(row['pass_shot_assist'])], dtype=torch.long)
    
    graph_data = Data(x=features_tensor, edge_index=edge_index, y=target)
    return graph_data

In [135]:
train_graph = [create_graph_data(row) for index, row in train.iterrows()]
test_graph = [create_graph_data(row) for index, row in test.iterrows()]

## Training the model

In [136]:
torch.manual_seed(12345)

train_dataset = train_graph.copy()
test_dataset = test_graph.copy()

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 489
Number of test graphs: 210


In [137]:
print(train_dataset[0])

Data(x=[17, 3], edge_index=[2, 128], y=[1])


In [138]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(x=[1174, 3], edge_index=[2, 10016], y=[64], batch=[1174], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(x=[1166, 3], edge_index=[2, 9808], y=[64], batch=[1166], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(x=[1162, 3], edge_index=[2, 9718], y=[64], batch=[1162], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(x=[1181, 3], edge_index=[2, 10054], y=[64], batch=[1181], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(x=[1151, 3], edge_index=[2, 9662], y=[64], batch=[1151], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(x=[1189, 3], edge_index=[2, 10216], y=[64], batch=[1189], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(x=[1171, 3], edge_index=[2, 9912], y=[64], batch=[1171], ptr=[65])

Step 8:
Number of graphs in the current batch: 41
DataBatch(x=[741, 3], edge_index=[2, 6220], y=[41],

In [139]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
import torch

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_node_features, num_classes):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

model = GCN(hidden_channels=64, num_node_features=3, num_classes=2)
print(model)

GCN(
  (conv1): GCNConv(3, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [140]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  
         out = model(data.x, data.edge_index, data.batch) 
         loss = criterion(out, data.y)  
         loss.backward()  
         optimizer.step()  
         optimizer.zero_grad()

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  
         out = model(data.x, data.edge_index, data.batch)
         pred = out.argmax(dim=1)  
         correct += int((pred == data.y).sum())  
     return correct / len(loader.dataset)


for epoch in range(1, 1000):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)

print("Model training completed")
print(f'Train Accuracy: {train_acc:.4f}')
print(f'Test Accuracy: {test_acc:.4f}')

Model training completed
Train Accuracy: 0.7975
Test Accuracy: 0.8000


In [150]:
torch.save(model.state_dict(), 'gnn_model.pth')
model.load_state_dict(torch.load('gnn_model.pth'))
model.eval()

GCN(
  (conv1): GCNConv(3, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)

In [156]:
def predict(data):
    with torch.no_grad():
        out = model(data.x.unsqueeze(0), data.edge_index, torch.zeros(data.x.size(0), dtype=torch.long))
        prob = F.softmax(out.squeeze(), dim=0)
        return prob

# Make prediction
probability = predict(data)
print("Probability of leading to a shot:", probability[1].item())

Probability of leading to a shot: 0.18485227227210999
