In [1]:
import torch
from torch import Tensor
import torch.nn as nn
import torch.optim as optim
from rdflib import Graph, Namespace, URIRef, RDF
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

print(torch.__version__)

1.13.1+cpu


#### Step 1: Data Analysis using SPARQL Queries

In [2]:
g = Graph()
g.parse("../kg/players.ttl")

fb = Namespace("https://footballerontology.com/")
g.bind("fb", fb)

num_players = len({s for s, _, _ in g.triples((None, RDF.type, fb.player))})
print(f"The number of players in the graph is: {num_players}")

The number of players in the graph is: 1145


##### (a) Data quality assessment queries



In [3]:
# Query 1: Check for players with missing attribute links 

q1 = """
PREFIX fb: <https://footballerontology.com/>

SELECT ?player
WHERE {
    ?player a fb:player .
    FILTER NOT EXISTS { ?player ?attribute ?value}
}
"""

results1 = g.query(q1)
print("Players with missing attributes:")
print(results1.serialize(format='txt').decode('utf-8'))

Players with missing attributes:
(no results)



In [4]:
# Query 2: Find all literals whose datatype does not match range of predicate

q2 = """ 

PREFIX fb: <https://footballerontology.com/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?player ?attribute ?value ?expectedDatatype ?actualDatatype
WHERE {
    ?player ?attribute ?value .
    ?attribute rdfs:range ?expectedDatatype .
    
    FILTER (isLiteral(?value))
    BIND (datatype(?value) AS ?actualDatatype)

    # Ensure only incorrect datatypes are returned
    FILTER (?expectedDatatype != ?actualDatatype)
}

"""

results2 = g.query(q2)
print("Incorrect literal datatype: ")
print(results2.serialize(format='txt').decode('utf-8'))

Incorrect literal datatype: 
(no results)



In [5]:
# Query 3: Find inconsistent birth years 

q3 = """
PREFIX fb: <https://footballerontology.com/>
SELECT ?player ?birthyear
WHERE {
  ?player a fb:player ;
          fb:birthyear ?birthyear .
  FILTER (?birthyear < 1900 || ?birthyear > 2023)
}
"""

results3 = g.query(q3)
print("Players with non-sensical birth years:")
print(results3.serialize(format='txt').decode('utf-8'))

Players with non-sensical birth years:
(no results)



In [6]:
# Query 4: Find any attributes with missing values 

q4 = """
PREFIX fb: <https://footballerontology.com/>

SELECT ?subject ?predicate ?object
WHERE {
    ?subject ?predicate ?object .
    FILTER (str(?object) = "NaN" || str(?object) = "nan")
}
"""

results4 = g.query(q4)
print("Triples with NaN values:")
print(results4.serialize(format='txt').decode('utf-8'))

Triples with NaN values:
                        subject                         |                   predicate                   |                    object                     
---------------------------------------------------------------------------------------------------------------------------------------------------------
<https://footballerontology.com/contractuntilSeq/247026>|<http://www.w3.org/1999/02/22-rdf-syntax-ns#_1>|"NaN"^^<http://www.w3.org/2001/XMLSchema#float>



In [7]:
# Query 5: Find any duplicate players

q5 = """
PREFIX fb: <https://footballerontology.com/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?player (COUNT(?player) AS ?count)
WHERE {
    ?player a fb:player .
}
GROUP BY ?player
HAVING (COUNT(?subject) > 1)
"""

results5 = g.query(q5)
print("Duplicate players:")
print(results5.serialize(format='txt').decode('utf-8'))

Duplicate players:
(no results)



##### (b) Inference queries

In [8]:
# Query 6: Find the most improved players by passing rating and their passes completed percentage 

q6 = """
PREFIX fb: <https://footballerontology.com/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?name ?overallRatingGrowth ?passesCompletedGrowth
WHERE {
    ?player a fb:player ;
            fb:name ?name ;
            fb:passing ?passingSeq ;
            fb:passescompletepercentage ?passescompletepercentageSeq .
    
    ?passingSeq rdf:_1 ?initialRating ;
                rdf:_2 ?finalRating .
    BIND((?finalRating - ?initialRating) AS ?overallRatingGrowth)
    
    ?passescompletepercentageSeq rdf:_1 ?initialStat ;
                                 rdf:_2 ?finalStat .
    BIND((?finalStat - ?initialStat) AS ?passesCompletedGrowth)
}
ORDER BY DESC(?overallRatingGrowth)
LIMIT 10
"""

results6 = g.query(q6)
print("Most improved players by passing rating and their growth in passes completed percentage:")
print(results6.serialize(format='txt').decode('utf-8'))


Most improved players by passing rating and their growth in passes completed percentage:
        name        |              overallRatingGrowth               |                     passesCompletedGrowth                     
--------------------------------------------------------------------------------------------------------------------------------------
"Anthony Gordon"    |"12"^^<http://www.w3.org/2001/XMLSchema#integer>|"14.599999999999994"^^<http://www.w3.org/2001/XMLSchema#float> 
"Castello Lukeba"   |"15"^^<http://www.w3.org/2001/XMLSchema#integer>|"-1.9000000000000057"^^<http://www.w3.org/2001/XMLSchema#float>
"Gianluca Scamacca" |"12"^^<http://www.w3.org/2001/XMLSchema#integer>|"-7.8999999999999915"^^<http://www.w3.org/2001/XMLSchema#float>
"Hugo Ekitike"      |"13"^^<http://www.w3.org/2001/XMLSchema#integer>|"11.800000000000011"^^<http://www.w3.org/2001/XMLSchema#float> 
"Luca Ranieri"      |"16"^^<http://www.w3.org/2001/XMLSchema#integer>|"8.700000000000003"^^<http://www.w3.

In [9]:
# Query 7 : Find players whose shooting rating in FIFA dropped despite improvement in shots on goals 

q7 = """
PREFIX fb: <https://footballerontology.com/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?player ?initialFifaRating ?finalFifaRating ?initialSoT ?finalSoT
WHERE {
    ?player a fb:player ;
            fb:shotsongoaltotal ?shotsongoalSeq ;
            fb:shooting ?shootingSeq .

    ?shootingSeq rdf:_1 ?initialFifaRating ;
               rdf:_2 ?finalFifaRating .

    ?shotsongoalSeq rdf:_1 ?initialSoT ;
                 rdf:_2 ?finalSoT .

    FILTER (?finalFifaRating < ?initialFifaRating && ?finalSoT > ?initialSoT)
}
ORDER BY DESC(?initialFifaRating)

"""

results7 = g.query(q7)
print("Players whose FIFA rating dropped despite improvement in real life (shooting attribute):")
print(results7.serialize(format='txt').decode('utf-8')) 

Players whose FIFA rating dropped despite improvement in real life (shooting attribute):
                    player                    |               initialFifaRating                |                finalFifaRating                 |                   initialSoT                   |                    finalSoT                    
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
<https://footballerontology.com/player/158023>|"92"^^<http://www.w3.org/2001/XMLSchema#integer>|"89"^^<http://www.w3.org/2001/XMLSchema#integer>|"1.17"^^<http://www.w3.org/2001/XMLSchema#float>|"1.94"^^<http://www.w3.org/2001/XMLSchema#float>
<https://footballerontology.com/player/159261>|"83"^^<http://www.w3.org/2001/XMLSchema#integer>|"80"^^<http://www.w3.org/2001/XMLSchema#integer>|"0.54"^^<http://www.w3.org/2001/XMLS

In [10]:
# Query 8: Find the teams with most expensive players in 2021 whose value dropped next year 

q8 = """
PREFIX fb: <https://footballerontology.com/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?team ?name ?initialValue ?finalValue
WHERE {
    ?player a fb:player ;
            fb:club ?clubSeq ;
            fb:value ?valueSeq ;
            fb:name ?name.
    ?clubSeq rdf:_1 ?initialTeam ;
             rdf:_2 ?finalTeam .
    ?valueSeq rdf:_1 ?initialValue ;
              rdf:_2 ?finalValue .
    BIND(IF(?initialValue > ?finalValue, ?initialTeam, ?finalTeam) AS ?team)
}
ORDER BY DESC(?initialValue)
LIMIT 10
"""

results8 = g.query(q8)
print("Most expensive players in 2021 whose values dropped in 2022")
print(results8.serialize(format='txt').decode('utf-8'))

Most expensive players in 2021 whose values dropped in 2022
        team         |          name          |                     initialValue                      |                      finalValue                       
---------------------------------------------------------------------------------------------------------------------------------------------------------------
"Atlético de Madrid" |"Jan Oblak"             |"112000000.0"^^<http://www.w3.org/2001/XMLSchema#float>|"85500000.0"^^<http://www.w3.org/2001/XMLSchema#float> 
"FC Barcelona"       |"Frenkie de Jong"       |"119500000.0"^^<http://www.w3.org/2001/XMLSchema#float>|"116500000.0"^^<http://www.w3.org/2001/XMLSchema#float>
"FC Bayern München"  |"Robert Lewandowski"    |"119500000.0"^^<http://www.w3.org/2001/XMLSchema#float>|"84000000.0"^^<http://www.w3.org/2001/XMLSchema#float> 
"Liverpool"          |"Trent Alexander-Arnold"|"114000000.0"^^<http://www.w3.org/2001/XMLSchema#float>|"100500000.0"^^<http://www.w3.org/2001/XM

#### Step 2: Vectorize data

I want to store the difference in player performance over 2 seasons. This is the metric that makes the difference. If a player performs significantly worse than the previous season, he is released by the club. On the other end, if a player performs significantly better, he is sought after by bigger and better clubs that can afford him. In general, this rule holds true.  

There are a lot of things that determine a player transfer, and there is no simple linear relationship between the features and the outcome. I will compare two models for this prediction task, simple logistics regression and a vanilla neural network. 

In [11]:
# Create dictionary that holds player id along with {attribute : change in value}
players = {}

# Get all player URIs
playerids = g.subjects(RDF.type, fb.player)

for player in playerids:
    # Extract player ID from URI
    player_id = player.split('/')[-1]
    player_data = {}
    
    # Process each predicate (attribute) of the player
    for predicate in g.predicates(subject=player):
        obj = g.value(subject=player, predicate=predicate)
        if isinstance(obj, URIRef):  # Check if the object is a sequence URI
            attr_name = predicate.split('/')[-1]  # Extract attribute name
            seq_uri = obj
            
            # Get the two values from the sequence
            v1 = g.value(seq_uri, RDF._1)
            v2 = g.value(seq_uri, RDF._2)
            
            if v1 is not None and v2 is not None:
                try:
                    diff = float(v2) - float(v1)
                    player_data[attr_name] = [diff]
                except (ValueError, TypeError):
                    # Skip values that are non-numeric (name, nationality, height, ... these are confounders)
                    pass
    
    # Create dropped feature based on player's club values
    club_seq = g.value(player, fb.club)
    dropped = 0
    if club_seq:
        c1 = g.value(club_seq, RDF._1)
        c2 = g.value(club_seq, RDF._2)
        if c1 is not None and c2 is not None and c1 != c2:
            dropped = 1
    player_data['dropped'] = [dropped]
    
    players[player_id] = player_data


In [12]:
for i, (key, value) in enumerate(players.items()):
        if i >= 2:
            break
        print(f"{key}: {value}")

106795: {'aerialswon': [0.0], 'clearancessuccessful': [0.5], 'contractuntil': [1.0], 'foulscommitted': [0.0], 'foulssuffered': [0.5], 'fts': [1.0], 'gkdiving': [0.0], 'gkhandling': [0.0], 'gkkicking': [0.0], 'gkpositioning': [0.0], 'gkreflexes': [0.0], 'intreputation': [0.0], 'matchesplayed': [1.0], 'minutes': [90.0], 'overallrating': [0.0], 'passescompletelong': [-2.5], 'passescompletepercentage': [-2.200000000000003], 'potential': [0.0], 'starts': [1.0], 'touches': [4.5], 'value': [0.0], 'dropped': [0]}
138412: {'aerialswon': [-0.32000000000000006], 'assiststotal': [0.05], 'clearancessuccessful': [-0.8399999999999999], 'contractuntil': [1.0], 'defending': [0.0], 'dribbling': [-1.0], 'foulscommitted': [-0.6400000000000001], 'foulssuffered': [-0.7999999999999998], 'fts': [-3.4000000000000004], 'goalstotal': [0.0], 'interceptions': [-0.6799999999999999], 'intreputation': [0.0], 'matchesplayed': [-10.0], 'minutes': [-307.0], 'overallrating': [-1.0], 'pace': [-4.0], 'passescompletelong': 

I now have a representation where each player has only one value per attribute, and this value is the difference in performance over one year. High performers will have large attribute values, players who got worse will have negative values & players who haven't improved will have 0 values. 

In [13]:
# Order all the attributes, since they occur in different positions for goalkeeprs and outfield players
# Remove 'dropped' feature since it is the target value

all_attributes = set()
for attributes in players.values():
    all_attributes.update(attributes.keys())
    all_attributes.discard('dropped')
all_attributes = sorted(all_attributes)

all_attributes

['aerialswon',
 'assiststotal',
 'clearancessuccessful',
 'contractuntil',
 'defending',
 'dribbling',
 'foulscommitted',
 'foulssuffered',
 'fts',
 'gkdiving',
 'gkhandling',
 'gkkicking',
 'gkpositioning',
 'gkreflexes',
 'goalstotal',
 'interceptions',
 'intreputation',
 'matchesplayed',
 'minutes',
 'overallrating',
 'pace',
 'passescompletelong',
 'passescompletepercentage',
 'passing',
 'physicality',
 'potential',
 'shooting',
 'shotsongoaltotal',
 'skillmoves',
 'starts',
 'touches',
 'value']

In [14]:
import pandas as pd

# Assuming `players` is your dictionary of player data and `all_attributes` is the list of all possible attributes
feature_matrix = []
player_ids = []

for player_id, attributes in players.items():
    feature_vector = []
    for attribute in all_attributes:
        # Check if the attribute exists in the player's attributes and is not an empty list
        if attribute in attributes and attributes[attribute]:
            feature_vector.append(attributes[attribute][0])  # Assuming the attribute value is a list
        else:
            feature_vector.append(0)  # Append 0 if the attribute is missing or empty, since some properties are not shared between goalkeepers and outfield players 
    feature_matrix.append(feature_vector)
    player_ids.append(player_id)


X = pd.DataFrame(feature_matrix, index=player_ids, columns=all_attributes)
# Drop the player with missing contract value 

X.head()

Unnamed: 0,aerialswon,assiststotal,clearancessuccessful,contractuntil,defending,dribbling,foulscommitted,foulssuffered,fts,gkdiving,...,passescompletepercentage,passing,physicality,potential,shooting,shotsongoaltotal,skillmoves,starts,touches,value
106795,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.5,1.0,0.0,...,-2.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.5,0.0
138412,-0.32,0.05,-0.84,1.0,0.0,-1.0,-0.64,-0.8,-3.4,0.0,...,-9.2,-1.0,-1.0,-1.0,0.0,-0.05,0.0,-4.0,-10.8,-2600000.0
152908,-0.18,-0.14,1.0,1.0,3.0,-1.0,-0.38,-1.62,-0.7,0.0,...,-2.3,2.0,2.0,-4.0,-1.0,-0.07,0.0,3.0,11.8,-2900000.0
156616,-0.32,-0.13,-0.19,1.0,0.0,-3.0,-2.03,-0.28,-15.4,0.0,...,8.7,-1.0,0.0,-3.0,-2.0,-0.13,0.0,-16.0,12.8,-3400000.0
158023,-0.25,-0.02,0.0,0.0,0.0,-1.0,-0.16,-0.05,-6.4,0.0,...,-4.8,-1.0,-1.0,-2.0,-3.0,0.77,0.0,-6.0,-2.6,-24000000.0


In [15]:
# Create target vector 
target_vector = [attributes['dropped'][0] for attributes in players.values()]
y = pd.DataFrame(target_vector, index=player_ids, columns=['dropped'])

y.head()

Unnamed: 0,dropped
106795,0
138412,0
152908,0
156616,0
158023,0


In [16]:
# Drop the player with NaN value for contract 
X.drop(index='247026', inplace=True)
y.drop(index='247026', inplace=True)

# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Step 3: Set up and train simple logistic regression and a simple neural network, compare results on test set

In [17]:
# Logistic regression 

logit = LogisticRegression()
logit.fit(X_train, y_train.values)

# Get the probabilities for the positive class (1)
y_prob = logit.predict_proba(X_test)[:, 1]

# Apply the bayesian threshold 
y_pred = (y_prob > 0.5).astype(int)

# Calculate the accuracy
logit_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {logit_accuracy}')

Accuracy: 0.6724890829694323


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
# Vanilla neural network 

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Define a simple neural network with one hidden layer
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.hidden = nn.Linear(input_dim, 8)  # Hidden layer with 8 neurons
        self.output = nn.Linear(8, 1)  # Output layer with 1 neuron

    def forward(self, x):
        x = torch.relu(self.hidden(x))  # Apply ReLU activation to hidden layer
        x = torch.sigmoid(self.output(x))  # Apply Sigmoid activation to output
        return x

# Initialize model
model = SimpleNN(X_train.shape[1])

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
epochs = 50
for epoch in range(epochs):
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Apply bayesian threshold 
with torch.no_grad():
    y_pred_prob = model(X_test_tensor)
    y_pred = (y_pred_prob > 0.5).float()
    
nn_accuracy = accuracy_score(y_test_tensor, y_pred.numpy())
print(f"Test Accuracy: {nn_accuracy:.4f}")

Test Accuracy: 0.6900


I observe similar performance from logistic regression and the vanilla neural network (~67% accuracy). Sticking to Occam's razor principle, I would personally chose logistic regression since it is more interpretable. It is also easy to tweak the parameters and see their effect on the predictions, helping identify the main predictors. 