In [1]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS, OWL
from rdflib.namespace import XSD, RDFS

#### Step 1: Define ontology 

I chose to have only one class for all players, https://dbpedia.org/page/Football_player 

The case can be made that each position can be a different class. This is possible because dbpedia does indeed have different classes for each position. I chose not to do this because:

(1) Many statistics and properties are shared between both types of players (club, matches played, starts, etc.)

(2) For my use case, which is predicting which players will be at the same club in a given year, the type of player shouldn't make a difference to the predictions.  

(3) Based on the way I have cleaned the data, there can be no inconsistencies. That is, outfield players cannot have goalkeeper statistics and vice versa.  

Cons:

(1) All players share the same superclass, which compromises the semantics because I chose to ignore the position subclasses that exist.

Note: queries can still find goalkeepers or players in any other position; they would filter on each player's BestPosition property instead of on the type of the node itself. If I were only building a KG, I would have to be more stringent when defining the main class and its subclasses. 

In [2]:
# Define namespaces
#
# rdflib builds a term by appending the local name to the namespace string, so each
# base IRI has to end with a separator. Without the trailing "/" a lookup such as
# dbpediaclass['Football_player'] yields ".../pageFootball_player" instead of
# ".../page/Football_player".
# NOTE(review): "/" is assumed as the separator for the sportschema ontology; confirm
# against the published vocabulary (it may use "#").

sofifa = Namespace("https://sofifa.com/")
dbpediaclass = Namespace("https://dbpedia.org/page/")
wikidata = Namespace("https://www.wikidata.org/wiki/")
sportsschema = Namespace("https://sportschema.org/ontologies/soccer/")
dbpedia = Namespace("https://dbpedia.org/property/")

# Single graph that holds both the schema and, later, the player data
g = Graph()
g.bind("sofifa", sofifa)
g.bind("dbpediaclass", dbpediaclass)
g.bind("wikidata", wikidata)
g.bind("sportsschema", sportsschema)
g.bind("dbpedia", dbpedia)

In [3]:
# Add main class

player_uri = sofifa['player']
# player_uri is used as the rdfs:domain of every property below, i.e. it is a class.
# A class is related to the broader DBpedia class with rdfs:subClassOf; rdf:type would
# instead declare it to be one individual football player.
g.add((player_uri, RDF.type, RDFS.Class))
g.add((player_uri, RDFS.subClassOf, dbpediaclass['Football_player']))

# Add player properties that don't change in 2 years (name, height, etc.)

# Wikidata identifier -> human-readable label, for properties with plain literal values
constant_literals = {'Property:P1477':'name', 'Property:P27':'nationality', 'Property:P413':'position',
                     'Property:P8006':'preferredfoot'}

# (Wikidata identifier, rdfs:range, label) for the properties with a typed value.
# birthyear is ranged as xsd:integer because that is the datatype used when the
# values are written into the graph; xsd:dateTime would not match a bare year.
typed_constants = [
    ('Property:P2048', XSD.float, 'height'),
    ('Q21821348', XSD.integer, 'birthyear'),
]

constant_specs = [(key, RDFS.Literal, label) for key, label in constant_literals.items()]

for key, value_range, label in constant_specs + typed_constants:
    constant_property = wikidata[key]  # Namespace lookup already returns a URIRef
    g.add((constant_property, RDF.type, RDF.Property))
    g.add((constant_property, RDFS.domain, player_uri))
    g.add((constant_property, RDFS.range, value_range))
    g.add((constant_property, RDFS.label, Literal(label)))


<Graph identifier=Na30c12bd04c141809e317d0a7dee2fd6 (<class 'rdflib.graph.Graph'>)>

In [4]:
# FIFA attributes that are re-rated every year.
# Each one is declared as an rdf:Property of the player class with range rdf:Seq.

dbpedia_properties = [
    "fts", "IntReputation", "Potential", "attackingworkrate", "contractuntil", "defending",
    "defensiveworkrate", "dribbling", "gkdiving", "gkhandling", "gkkicking", "gkpositioning",
    "gkreflexes", "matchesplayed", "minutes", "overallRating", "pace", "passing",
    "physicality", "shooting", "skillmoves", "starts", "value", "club",
]

for fifa_name in dbpedia_properties:
    fifa_property = dbpedia[fifa_name]
    declarations = (
        (RDF.type, RDF.Property),
        (RDFS.domain, player_uri),
        (RDFS.range, RDF.Seq),
    )
    for predicate, target in declarations:
        g.add((fifa_property, predicate, target))

In [5]:
# Match statistics that are recorded per season.
# Declared the same way as the FIFA attributes: rdf:Property, domain = player class,
# range = rdf:Seq.

sportsschema_properties = [
    "aerialsWon", "assistsTotal", "clearancesSuccessful", "foulsCommited", "foulsSuffered",
    "goalsTotal", "interceptions", "passesCompleteLong", "passesCompletePercentage",
    "shotsOnGoalTotal", "touches",
]

for stat_name in sportsschema_properties:
    stat_property = sportsschema[stat_name]
    for predicate, target in ((RDF.type, RDF.Property),
                              (RDFS.domain, player_uri),
                              (RDFS.range, RDF.Seq)):
        g.add((stat_property, predicate, target))

# Wikidata P54, declared like the properties above and additionally labelled 'club'
club_property = wikidata['Property:P54']
for predicate, target in ((RDF.type, RDF.Property),
                          (RDFS.domain, player_uri),
                          (RDFS.range, RDF.Seq),
                          (RDFS.label, Literal('club'))):
    g.add((club_property, predicate, target))

<Graph identifier=Na30c12bd04c141809e317d0a7dee2fd6 (<class 'rdflib.graph.Graph'>)>

In [6]:
# Define a local namespace & make all properties and classes equivalent to make querying easier
# NOTE(review): this base IRI has no trailing "/", so local names are appended directly
# after ".com". It is deliberately left unchanged here: the cells that populate the graph
# build their IRIs from the same base (including "/player/<id>"), so the base can only be
# changed in all of those places at once.
fb = Namespace("https://footballerontology.com")
g.bind("fb", fb)

# The graph is populated with all-lowercase fb predicates (fb.intreputation,
# fb.overallrating, fb.aerialswon, fb.foulscommitted, ...). IRIs are case-sensitive, so
# the fb term for each source property must use that same spelling or the equivalence
# would point at a predicate no player triple ever uses.
FB_NAME_OVERRIDES = {"foulsCommited": "foulscommitted"}  # source list has a single "t"


def fb_term(source_name):
    """Return the fb IRI under which `source_name` is stored on player nodes."""
    return fb[FB_NAME_OVERRIDES.get(source_name, source_name.lower())]


# Link main player class
g.add((sofifa.player, OWL.equivalentClass, fb.player))

# Link properties
for prop in dbpedia_properties:
    g.add((dbpedia[prop], OWL.equivalentProperty, fb_term(prop)))

for prop in sportsschema_properties:
    g.add((sportsschema[prop], OWL.equivalentProperty, fb_term(prop)))

for key, value in constant_literals.items():
    g.add((wikidata[key], OWL.equivalentProperty, fb_term(value)))

g.add((wikidata['Property:P2048'], OWL.equivalentProperty, fb_term('height')))
g.add((wikidata['Property:P54'], OWL.equivalentProperty, fb_term('club')))
g.add((wikidata['Q21821348'], OWL.equivalentProperty, fb_term('birthyear')))

# Persist the schema on its own before any player data is added to the graph
g.serialize(destination='../kg/schema.ttl', format='turtle')

<Graph identifier=Na30c12bd04c141809e317d0a7dee2fd6 (<class 'rdflib.graph.Graph'>)>

#### Step 2: Populate KG 

In [7]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd 

fb = Namespace("https://footballerontology.com")
data_21 = pd.read_csv("../data/cleaned/Data 21.csv")

# Column layout of the cleaned CSV, written as
#   (fb predicate name, 0-based column position, XSD datatype)
# A datatype of None passes no datatype to Literal, exactly like Literal(value).

# Properties common to all players that do not change
profile_columns = [
    ("name", 1, None),
    ("height", 2, XSD.float),
    ("nationality", 3, None),
    ("position", 6, None),
    ("preferredfoot", 11, None),
    ("birthyear", 26, XSD.integer),
]

# Properties that change every year, common to all players
# NOTE(review): contractuntil is typed xsd:dateTime; confirm the column holds full
# timestamps rather than a bare year.
yearly_columns = [
    ("fts", 30, XSD.float),
    ("intreputation", 10, XSD.integer),
    ("potential", 5, XSD.integer),
    ("matchesplayed", 27, XSD.integer),
    ("starts", 28, XSD.integer),
    ("value", 8, XSD.float),
    ("minutes", 29, XSD.integer),
    ("overallrating", 4, XSD.integer),
    ("contractuntil", 9, XSD.dateTime),
    ("club", 7, None),
]

# Goalkeeper properties
goalkeeper_columns = [
    ("gkdiving", 21, XSD.integer),
    ("gkhandling", 22, XSD.integer),
    ("gkkicking", 23, XSD.integer),
    ("gkpositioning", 24, XSD.integer),
    ("gkreflexes", 25, XSD.integer),
    ("touches", 37, XSD.integer),
    ("foulscommitted", 39, XSD.integer),
    ("foulssuffered", 40, XSD.integer),
    ("clearancessuccessful", 35, XSD.integer),
    ("aerialswon", 36, XSD.integer),
    ("passescompletelong", 38, XSD.integer),
    ("passescompletepercentage", 33, XSD.float),
]

# Outfield player properties
outfield_columns = [
    ("defending", 19, XSD.integer),
    ("attackingworkrate", 13, None),
    ("defensiveworkrate", 14, None),
    ("dribbling", 18, XSD.integer),
    ("pace", 15, XSD.integer),
    ("passing", 17, XSD.integer),
    ("physicality", 20, XSD.integer),
    ("shooting", 16, XSD.integer),
    ("skillmoves", 12, XSD.integer),
    ("goalstotal", 31, XSD.integer),
    ("assiststotal", 32, XSD.integer),
    ("shotsongoaltotal", 41, XSD.integer),
    ("touches", 37, XSD.integer),
    ("foulscommitted", 39, XSD.integer),
    ("foulssuffered", 40, XSD.integer),
    ("interceptions", 34, XSD.integer),
    ("clearancessuccessful", 35, XSD.integer),
    ("aerialswon", 36, XSD.integer),
    ("passescompletelong", 38, XSD.integer),
    ("passescompletepercentage", 33, XSD.float),
]

for index, row in data_21.iterrows():
    # Positions are read with .iloc: row[<int>] on a label-indexed Series only works
    # through a deprecated positional fallback. The node gets its own name so the
    # schema-level `player_uri` (the player class) is not overwritten by this loop.
    player_node = fb[f"/player/{row.iloc[0]}"]
    g.add((player_node, RDF.type, fb.player))

    if row['BestPosition'] == 'GK':
        position_columns = goalkeeper_columns
    else:
        position_columns = outfield_columns

    for local_name, column, datatype in profile_columns + yearly_columns + position_columns:
        g.add((player_node, fb[local_name], Literal(row.iloc[column], datatype=datatype)))

In [9]:
# Add 2022 data onto existing players 
#
# NOTE(review): the 2022 values are added under the same predicates as the 2021 values.
# A graph is a set of triples, so a value that is identical in both seasons is stored
# once, and nothing records which season a value belongs to.
# NOTE(review): no check is made that an ID already exists in the graph; a row whose ID
# was absent from the 2021 file gets yearly statistics but no rdf:type, name, etc.

data_22 = pd.read_csv("../data/cleaned/Data 22.csv")

# Column layout of the cleaned CSV, written as
#   (fb predicate name, 0-based column position, XSD datatype)
# A datatype of None passes no datatype to Literal, exactly like Literal(value).

# Properties that change every year, common to all players
yearly_columns = [
    ("fts", 30, XSD.float),
    ("intreputation", 10, XSD.integer),
    ("potential", 5, XSD.integer),
    ("matchesplayed", 27, XSD.integer),
    ("starts", 28, XSD.integer),
    ("value", 8, XSD.float),
    ("minutes", 29, XSD.integer),
    ("overallrating", 4, XSD.integer),
    ("contractuntil", 9, XSD.dateTime),
    ("club", 7, None),
]

# Goalkeeper properties
goalkeeper_columns = [
    ("gkdiving", 21, XSD.integer),
    ("gkhandling", 22, XSD.integer),
    ("gkkicking", 23, XSD.integer),
    ("gkpositioning", 24, XSD.integer),
    ("gkreflexes", 25, XSD.integer),
    ("touches", 37, XSD.integer),
    ("foulscommitted", 39, XSD.integer),
    ("foulssuffered", 40, XSD.integer),
    ("clearancessuccessful", 35, XSD.integer),
    ("aerialswon", 36, XSD.integer),
    ("passescompletelong", 38, XSD.integer),
    ("passescompletepercentage", 33, XSD.float),
]

# Outfield player properties
outfield_columns = [
    ("defending", 19, XSD.integer),
    ("attackingworkrate", 13, None),
    ("defensiveworkrate", 14, None),
    ("dribbling", 18, XSD.integer),
    ("pace", 15, XSD.integer),
    ("passing", 17, XSD.integer),
    ("physicality", 20, XSD.integer),
    ("shooting", 16, XSD.integer),
    ("skillmoves", 12, XSD.integer),
    ("goalstotal", 31, XSD.integer),
    ("assiststotal", 32, XSD.integer),
    ("shotsongoaltotal", 41, XSD.integer),
    ("touches", 37, XSD.integer),
    ("foulscommitted", 39, XSD.integer),
    ("foulssuffered", 40, XSD.integer),
    ("interceptions", 34, XSD.integer),
    ("clearancessuccessful", 35, XSD.integer),
    ("aerialswon", 36, XSD.integer),
    ("passescompletelong", 38, XSD.integer),
    ("passescompletepercentage", 33, XSD.float),
]

for index, row in data_22.iterrows():
    # Same IRI pattern as the 2021 pass, so the triples land on the same node.
    # Positions are read with .iloc: row[<int>] on a label-indexed Series only works
    # through a deprecated positional fallback. The node gets its own name so the
    # schema-level `player_uri` (the player class) is not overwritten by this loop.
    player_node = fb[f"/player/{row.iloc[0]}"]

    if row['BestPosition'] == 'GK':
        position_columns = goalkeeper_columns
    else:
        position_columns = outfield_columns

    # Append new values to existing player
    for local_name, column, datatype in yearly_columns + position_columns:
        g.add((player_node, fb[local_name], Literal(row.iloc[column], datatype=datatype)))

In [12]:
# Write the whole graph (schema triples plus all player triples) out as Turtle
g.serialize(
    format='ttl',
    destination='../kg/players.ttl',
)

<Graph identifier=Na30c12bd04c141809e317d0a7dee2fd6 (<class 'rdflib.graph.Graph'>)>