In [1]:
import torch
from torch import Tensor
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from rdflib import Graph, Namespace, URIRef, RDF
import numpy as np
from gensim.models import Word2Vec


print(torch.__version__)

1.13.1+cpu


#### Step 1: Vectorize data

In [2]:
g = Graph()
g.parse("../kg/players.ttl")

fb = Namespace("https://footballerontology.com/")

num_players = len({s for s, _, _ in g.triples((None, RDF.type, fb.player))})
print(f"The number of players in the graph is: {num_players}")

The number of players in the graph is: 1144


In [None]:
# Trying to learn graph embeddings using random walks 

In [3]:
fb = Namespace("https://footballerontology.com/")

players = dict()

# Store each player & attributes as a dictionary of dictionaries 
for s, _, _ in g.triples((None, RDF.type, fb.player)):

    player_id = s.__str__().split("/")[-1] # Extract player ID (unique)
    players[player_id] = {} # Initialize empty dictionary for given player

    # Iterate over all attributes for the given player
    for _, p, o in g.triples((s, None, None)):
        attribute = p.__str__().split("/")[-1]  # Extract attribute name (not unique)
        value = o.__str__()
        # Since some attributes (club, matchesplayed, goals ,..) can have multiple values over each year, store both 
        if attribute in players[player_id]:
            players[player_id][attribute].append(value)
        else:
            players[player_id][attribute] = [value]


In [4]:
# Since I have numerical data (goals, assists, matchesplayed, etc.) 
# I need to ensure that their true datatypes are preserved 

def ensure_numeric(player_data):
    parsed_data = {}
    
    for player_id, attributes in player_data.items():
        parsed_attributes = {}
        
        for key, value in attributes.items():
            if isinstance(value, list):
                # Process each item in the list
                processed_values = []
                for item in value:
                    try:
                        # Try to convert to float, then to int if possible
                        num_value = float(item)
                        if num_value.is_integer():
                            processed_values.append(int(num_value))
                        else:
                            processed_values.append(num_value)
                    except ValueError:
                        # Keep the original string if conversion fails
                        processed_values.append(item)
                
                # Add the processed values to the parsed attributes
                parsed_attributes[key] = processed_values
            else:
                # If it's not a list, keep it as is
                parsed_attributes[key] = value
        
        # Add an attributed that stores whether or not a player is dropped
        parsed_attributes['dropped'] = [ 1 if len(parsed_attributes.get('club', [])) > 1 else 0 ]
        parsed_data[player_id] = parsed_attributes
    
    return parsed_data

players = ensure_numeric(players)


In [5]:
players

{'106795': {'22-rdf-syntax-ns#type': ['https://footballerontology.com/player'],
  'aerialswon': [0],
  'birthyear': [1981],
  'clearancessuccessful': [0, 0.5],
  'club': ['U.S. Sassuolo Calcio'],
  'contractuntil': [2022, 2023],
  'foulscommitted': [0],
  'foulssuffered': [0, 0.5],
  'fts': [1, 2],
  'gkdiving': [68],
  'gkhandling': [68],
  'gkkicking': [60],
  'gkpositioning': [70],
  'gkreflexes': [70],
  'height': [183],
  'intreputation': [1],
  'matchesplayed': [1, 2],
  'minutes': [90, 180],
  'name': ['Gianluca Pegolo'],
  'nationality': ['Italy'],
  'overallrating': [70],
  'passescompletelong': [9.5, 12],
  'passescompletepercentage': [83.1, 85.3],
  'position': ['GK'],
  'potential': [70],
  'preferredfoot': ['Left'],
  'starts': [1, 2],
  'touches': [36, 40.5],
  'value': [180000],
  'dropped': [0]},
 '138412': {'22-rdf-syntax-ns#type': ['https://footballerontology.com/player'],
  'aerialswon': [1.15, 1.47],
  'assiststotal': [0.11, 0.16],
  'attackingworkrate': ['Medium'],

In [6]:
import numpy as np
from torch_geometric.data import HeteroData
import torch
import torch_geometric.transforms as T

players_set = set(players.keys())  # Unique player IDs
attributes_set = set(a for p in players.values() for a in p.keys())  # Unique attributes

# Create index mappings
player_indices = {p: i for i, p in enumerate(players_set)}
attribute_indices = {a: i for i, a in enumerate(attributes_set)}

# Create edge sets (player → attribute)
player_attribute_edges = {
    (player_indices[p], attribute_indices[a]) for p, attrs in players.items() for a in attrs
}

players_np = np.array(list(player_indices.keys()))
attributes_np = np.array(list(attribute_indices.keys()))
player_attribute_edges_np = np.array(list(player_attribute_edges))

print("Players:", players_np[:3])
print("Attributes:", attributes_np[:3])
print("\nPlayer → Attribute Edges:\n", player_attribute_edges_np[:3])


Players: ['230977' '225850' '204638']
Attributes: ['dropped' 'height' 'defensiveworkrate']

Player → Attribute Edges:
 [[509  26]
 [113  26]
 [946  27]]


In [7]:
data = HeteroData()

# Create nodes
data["player"].node_id = torch.arange(len(players_np))
data["attribute"].node_id = torch.arange(len(attributes_np))

# Create edges
data["player", "has", "attribute"].edge_index = torch.tensor(player_attribute_edges_np.T, dtype=torch.long)

# Convert to an undirected graph and validate
data = T.ToUndirected()(data)
data.validate()


True

#### Step 2: Learn embeddings

#### Step 3: Train GNN and tune hyperparameter 

#### Step 4: Make predictions on test set