In [1]:
import random
import torch
from torch import Tensor
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from rdflib import Graph, Namespace, URIRef, RDF
import numpy as np


print(torch.__version__)

1.13.1+cpu


#### Step 1: Data Analysis using SPARQL Queries

In [2]:
g = Graph()
g.parse("../kg/players.ttl")

fb = Namespace("https://footballerontology.com/")
g.bind("fb", fb)

num_players = len({s for s, _, _ in g.triples((None, RDF.type, fb.player))})
print(f"The number of players in the graph is: {num_players}")

The number of players in the graph is: 1145


#### Checking for Data Quality



In [3]:
# Query 1: Check for players with missing attribute values 

q1 = """
PREFIX fb: <https://footballerontology.com/>

SELECT ?player
WHERE {
    ?player a fb:player .
    FILTER NOT EXISTS { ?player ?attribute ?value}
}
"""

results1 = g.query(q1)
print("Players with missing attributes:")
print(results1.serialize(format='txt').decode('utf-8'))

# Query 2: Find all literals whose datatype does not match range of predicate

q2 = """ 

PREFIX fb: <https://footballerontology.com/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?player ?attribute ?value ?expectedDatatype ?actualDatatype
WHERE {
    ?player ?attribute ?value .
    ?attribute rdfs:range ?expectedDatatype .
    
    FILTER (isLiteral(?value))
    BIND (datatype(?value) AS ?actualDatatype)

    # Ensure only incorrect datatypes are returned
    FILTER (?expectedDatatype != ?actualDatatype)
}

"""

results2 = g.query(q2)
print("Incorrect literal datatype: ")
print(results2.serialize(format='txt').decode('utf-8'))

# Query 3: Find all attributes that have negative values 

q3 = """
PREFIX fb: <https://footballerontology.com/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?player ?attribute ?value
WHERE {
    ?player a fb:player .
    ?player ?attribute ?value .
    FILTER (datatype(?value) = xsd:integer || datatype(?value) = xsd:float) .
    FILTER (?value < 0)
}
"""

results3 = g.query(q3)
print("Players with negative attribute values:")
print(results3.serialize(format='txt').decode('utf-8'))

# Query 4: 

Players with missing attributes:
(no results)

Incorrect literal datatype: 
(no results)

Players with negative attribute values:
(no results)



#### SPARQL Queries for Data Exploration

#### Find most improved players, rating & stats 
#### Find the stats of the best players 
#### 

In [57]:
query_multiple_clubs = """
PREFIX fb: <https://footballerontology.com/>

SELECT ?player (COUNT(?club) AS ?numClubs)
WHERE {
    ?player a fb:player .
    ?player fb:club ?club .
}
GROUP BY ?player
HAVING (COUNT(?club) > 1)
"""

results_multiple_clubs = g.query(query_multiple_clubs)
for row in results_multiple_clubs:
    print(f"Player: {row.player}, Number of Clubs: {row.numClubs}")

Player: https://footballerontology.com/player/158121, Number of Clubs: 2
Player: https://footballerontology.com/player/163587, Number of Clubs: 2
Player: https://footballerontology.com/player/163705, Number of Clubs: 2
Player: https://footballerontology.com/player/170084, Number of Clubs: 2
Player: https://footballerontology.com/player/172203, Number of Clubs: 2
Player: https://footballerontology.com/player/173221, Number of Clubs: 2
Player: https://footballerontology.com/player/177413, Number of Clubs: 2
Player: https://footballerontology.com/player/178160, Number of Clubs: 2
Player: https://footballerontology.com/player/179813, Number of Clubs: 2
Player: https://footballerontology.com/player/179847, Number of Clubs: 2
Player: https://footballerontology.com/player/181291, Number of Clubs: 2
Player: https://footballerontology.com/player/182212, Number of Clubs: 2
Player: https://footballerontology.com/player/183394, Number of Clubs: 2
Player: https://footballerontology.com/player/18357

In [28]:
query = """
PREFIX fb: <https://footballerontology.com/>

SELECT ?club (COUNT(?player) AS ?numPlayers)
WHERE {
    ?player a fb:player .
    ?player fb:club ?club .
}
GROUP BY ?club
ORDER BY DESC(?numPlayers)
LIMIT 10
"""

results = g.query(query)

for row in results:
        print(f"Club: {row.club}, Number of Players: {row.numPlayers}")

Club: Chelsea, Number of Players: 25
Club: AC Milan, Number of Players: 24
Club: Borussia Dortmund, Number of Players: 23
Club: VfL Wolfsburg, Number of Players: 23
Club: OGC Nice, Number of Players: 22
Club: Atalanta, Number of Players: 22
Club: Arsenal, Number of Players: 22
Club: Aston Villa, Number of Players: 21
Club: U.C. Sampdoria, Number of Players: 21
Club: Stade Rennais FC, Number of Players: 21


#### Step 2: Vectorize data

I want to store the difference in player performance over 2 seasons. This is the metric that makes the difference. If a player performs significantly worse than the previous season, he is released by the club. On the other end, if a player performs significantly better, he is sought after by bigger and better clubs that can afford him. 

There are a lot of things that determine a player transfer, and there is no simple linear relationship between the features and the outcome. Hence, it makes sense to use a neural network for this classification task. 

In [30]:
fb = Namespace("https://footballerontology.com/")

players = dict()

# List only attributes that are needed. Things like name, nationality can become confounding variables. 
attributes_list = ['potential', 'club', 'defending', 'dribbling', 'fts', 'gkdiving', 'gkhandling', 'gkkicking',
                   'gkpositioning', 'gkreflexes', 'matchesplayed', 'minutes', 'overallrating', 'pace', 'passing', 'physicality', 'shooting', 'skillmoves', 'starts', 
                   'value', 'aerialswon', 'assiststotal', 'clearancessuccessful', 'foulscommitted', 'foulssuffered', 'goalstotal', 'interceptions', 
                   'passescompletelong', 'passescompletepercentage', 'shotsongoaltotal', 'touches']

# Store each player & attributes as a dictionary of dictionaries 
for s, _, _ in g.triples((None, RDF.type, fb.player)):

    player_id = s.__str__().split("/")[-1] # Extract player ID (unique)
    players[player_id] = {} # Initialize empty dictionary for given player

    # Iterate over all attributes of the player 
    for _, p, o in g.triples((s, None, None)):
        attribute = p.__str__().split("/")[-1]  # Extract attribute name (not unique)
        if attribute in attributes_list:
            value = o.__str__()
            # Since some attributes (club, matchesplayed, goals ,..) can have multiple values over each year, store both 
            if attribute in players[player_id]:
                players[player_id][attribute].append(value)
            else:
                players[player_id][attribute] = [value]


In [31]:
# Since I have numerical data (goals, assists, matchesplayed, etc.) 
# I need to ensure that their true datatypes are preserved 

def ensure_numeric(player_data):
    parsed_data = {}
    
    for player_id, attributes in player_data.items():
        parsed_attributes = {}
        
        for key, value in attributes.items():
            if isinstance(value, list):
                # Process each item in the list
                processed_values = []
                for item in value:
                    try:
                        # Try to convert to float, then to int if possible
                        num_value = float(item)
                        if num_value.is_integer():
                            processed_values.append(int(num_value))
                        else:
                            processed_values.append(num_value)
                    except ValueError:
                        # Keep the original string if conversion fails
                        processed_values.append(item)
                
                # Add the processed values to the parsed attributes
                parsed_attributes[key] = processed_values
            else:
                # If it's not a list, keep it as is
                parsed_attributes[key] = value
        
        # Add an attributed that stores whether or not a player is dropped
        parsed_attributes['dropped'] = [ 1 if len(parsed_attributes.get('club', [])) > 1 else 0 ]
        parsed_data[player_id] = parsed_attributes
    
    return parsed_data

players = ensure_numeric(players)


In [32]:
for player_id, attributes in players.items():
    if 'club' in attributes:
        del attributes['club']  # Remove the club since it can be a confounder, sure some clubs buy more players than others but without context the neural network will just confound 


In [33]:
for player_id, attributes in players.items():
    for key, value in attributes.items():
        if key == 'dropped':
            pass
        elif isinstance(value, list):  
            if len(value) > 1:
                attributes[key] = [value[1]- value[0]]  # Find a difference in performance if exists 
            else:
                attributes[key] = [0] # Duplicate scores are not stored in the KG, hence best to remove 

In [34]:
first_player = dict(list(players.items())[:1])
print(first_player)

{'106795': {'aerialswon': [0], 'clearancessuccessful': [0.5], 'foulscommitted': [0], 'foulssuffered': [0.5], 'fts': [1], 'gkdiving': [0], 'gkhandling': [0], 'gkkicking': [0], 'gkpositioning': [0], 'gkreflexes': [0], 'matchesplayed': [1], 'minutes': [90], 'overallrating': [0], 'passescompletelong': [2.5], 'passescompletepercentage': [2.200000000000003], 'passing': [0], 'physicality': [0], 'potential': [0], 'starts': [1], 'touches': [4.5], 'value': [0], 'dropped': [0]}}


I now have a representation where each player has only one value per attribute, and this value is the difference in performance over one year. High performers will have large attribute values, players who got worse will have negative values & players who haven't improved will have 0 values. 

In [None]:
# Find all attributes and remove 'dropped' since that is the target vector 
all_attributes = set()
for attributes in players.values():
    all_attributes.update(attributes.keys())
    all_attributes.discard('dropped')
all_attributes = sorted(all_attributes)

all_attributes

['aerialswon',
 'assiststotal',
 'clearancessuccessful',
 'defending',
 'dribbling',
 'foulscommitted',
 'foulssuffered',
 'fts',
 'gkdiving',
 'gkhandling',
 'gkkicking',
 'gkpositioning',
 'gkreflexes',
 'goalstotal',
 'interceptions',
 'matchesplayed',
 'minutes',
 'overallrating',
 'pace',
 'passescompletelong',
 'passescompletepercentage',
 'passing',
 'physicality',
 'potential',
 'shooting',
 'shotsongoaltotal',
 'skillmoves',
 'starts',
 'touches',
 'value']

In [66]:
# Create feature matrix directly
feature_matrix = []
for player_id, attributes in players.items():
    feature_vector = []
    for attribute in all_attributes:
        if attribute in attributes:
            feature_vector.append(attributes[attribute][0])
        else:
            feature_vector.append(0)
    feature_matrix.append(feature_vector)

X = torch.tensor(np.array(feature_matrix), dtype = torch.float)
print(f"The feature matrix has the following shape {X.shape}")

The feature matrix has the following shape torch.Size([1144, 30])


In [67]:
# Create target vector 
target_vector = []
for player_id, attributes in players.items():
    target_vector.append(attributes['dropped'][0])

y = torch.tensor(np.array(target_vector), dtype= torch.float)
print(f"The target vector has the following shape {y.shape}")

The target vector has the following shape torch.Size([1144])


In [74]:
# Create a new players dictionary without the 'dropped' attribute
new_players = {player_id: {k: v for k, v in attributes.items() if k != 'dropped'} for player_id, attributes in players.items()}

# Extract unique sets of players and attributes
players_set = set(new_players.keys())

# Create index mappings for players and attributes
player_indices = {p: i for i, p in enumerate(players_set)}
attribute_indices = {a: i for i, a in enumerate(sorted(all_attributes))}

# Create edge sets for player-attribute relationships
player_attribute_edges = {(player_indices[p], attribute_indices[a]) for p, attributes in new_players.items() for a in attributes}

players_np = np.array(list(player_indices.keys()))
attributes_np = np.array(list(attribute_indices.keys()))
player_attribute_edges_np = np.array(list(player_attribute_edges))

print(f"The edge index has the following shape :{player_attribute_edges_np}")

The edge index has the following shape :[[509  26]
 [550  27]
 [946  27]
 ...
 [934  17]
 [631  16]
 [812  27]]


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
    X, y, edge_index, test_size=0.2, random_state=42
)

#### Step 3: Set up neural network, and find train & test scores

In [68]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.linear = nn.Linear(hidden_channels, out_channels)
    
    def forward(self, x, edge_index):
        # First layer 
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        # Second layer 
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        # Output layer 
        x = self.linear(x)
        return torch.sigmoid(x)
