In [1]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS, OWL
from rdflib.namespace import XSD, RDFS

#### Step 1: Define ontology 

I chose to use a single class for all players: https://dbpedia.org/page/Football_player

A case can be made for modelling each position as a separate class. This is possible because DBpedia does have a different class for each position. I chose not to do this because:

(1) Many statistics and properties are shared between both types of players, i.e. outfield players and goalkeepers (club, matches played, starts, etc.)

(2) For my use case, which is predicting which players will be at the same club in a given year, the type of player shouldn't make a difference to the predictions.

(3) Based on the way I have cleaned the data, there can be no inconsistencies. That is, outfield players cannot have goalkeeper statistics and vice versa.  

Cons:

(1) Different kinds of players share the same superclass, which means there is a compromise in the semantics, since I chose to ignore subclasses that exist.

Note: queries can still be made to find goalkeepers or players in other positions; they would query the BestPosition property of each player instead of the type of the node itself. If I were only building a KG, I would have to be more stringent when defining my main classes and subclasses.

In [2]:
# Define namespaces
#
# Every base IRI ends with "/" because rdflib's Namespace builds a term by plain
# string concatenation of the base and the key. Without the separator,
# Namespace("https://sofifa.com")["player"] yields "https://sofifa.complayer"
# instead of "https://sofifa.com/player".
# NOTE(review): "/" is assumed as the separator for every vocabulary; switch a
# base to "#" if the published ontology uses hash IRIs - confirm.

sofifa = Namespace("https://sofifa.com/")
dbpediaclass = Namespace("https://dbpedia.org/page/")
wikidata = Namespace("https://www.wikidata.org/wiki/")
sportsschema = Namespace("https://sportschema.org/ontologies/soccer/")
dbpedia = Namespace("https://dbpedia.org/property/")

# Single graph that holds the schema; the prefixes bound here are the ones
# used when the graph is serialized.
g = Graph()
g.bind("sofifa", sofifa)
g.bind("dbpediaclass", dbpediaclass)
g.bind("wikidata", wikidata)
g.bind("sportsschema", sportsschema)
g.bind("dbpedia", dbpedia)

In [3]:
# Add main class

player_uri = sofifa['player']
# NOTE(review): player_uri is used below as an rdfs:domain (i.e. as a class),
# yet here it is typed as an *instance* of Football_player; rdfs:subClassOf may
# be what is intended - confirm before changing.
g.add((player_uri, RDF.type, dbpediaclass['Football_player']))

# Add player properties that don't change in 2 years (name, height, etc.)

# Wikidata property id -> human-readable label; all of these take plain literals.
constant_literals = {'Property:P1477':'Name', 'Property:P27':'Nationality', 'Property:P413':'Position',
                     'Property:P8006':'Preferred Foot'}

for key, value in constant_literals.items():
    # Namespace indexing already returns a URIRef, so no extra wrapping is needed.
    literal_prop = wikidata[key]
    g.add((literal_prop, RDF.type, RDF.Property))
    g.add((literal_prop, RDFS.domain, player_uri))
    g.add((literal_prop, RDFS.range, RDFS.Literal))
    g.add((literal_prop, RDFS.label, Literal(value)))

# Height is the one constant property with a numeric (float) range.
height_prop = wikidata['Property:P2048']
g.add((height_prop, RDF.type, RDF.Property))
g.add((height_prop, RDFS.domain, player_uri))
g.add((height_prop, RDFS.range, XSD.float))
g.add((height_prop, RDFS.label, Literal('Height')))

# The IRI is spelled out in full: indexing the wikidata namespace with
# 'wiki/Q21821348' repeated the "wiki" path segment that the namespace base
# already contains and produced a malformed IRI. Any code that attaches birth
# years to players must use this same IRI.
birth_year_prop = URIRef('https://www.wikidata.org/wiki/Q21821348')
g.add((birth_year_prop, RDF.type, RDF.Property))
g.add((birth_year_prop, RDFS.domain, player_uri))
g.add((birth_year_prop, RDFS.range, XSD.dateTime))
g.add((birth_year_prop, RDFS.label, Literal('Birth Year')))


<Graph identifier=N50ab2c1ed1584fdfbbf61ffeed79f28d (<class 'rdflib.graph.Graph'>)>

In [6]:
# Add fifa properties that change every year

# Attributes whose range is an integer
fifa_int = ['overallRating', 'Potential', 'IntReputation', 'skillmoves', 'attackingworkrate',
            'defensiveworkrate', 'pace', 'shooting', 'passing', 'dribbling', 'defending',
            'physicality', 'gkdiving', 'gkhandling', 'gkkicking', 'gkpositioning', 'gkreflexes', 
            'matchesplayed', 'starts', '90s']

for prop in fifa_int:
    int_prop = dbpedia[prop]
    g.add((int_prop, RDF.type, RDF.Property))
    g.add((int_prop, RDFS.domain, player_uri))
    g.add((int_prop, RDFS.range, XSD.integer))

# Remaining attributes, each paired with the XSD datatype used as its range
# (two floats, then one datetime)
fifa_other = [('value', XSD.float),
              ('minutes', XSD.float),
              ('contractuntil', XSD.dateTime)]

for other_name, other_range in fifa_other:
    other_prop = dbpedia[other_name]
    g.add((other_prop, RDF.type, RDF.Property))
    g.add((other_prop, RDFS.domain, player_uri))
    g.add((other_prop, RDFS.range, other_range))

# Show the graph object as the cell result
g

<Graph identifier=N50ab2c1ed1584fdfbbf61ffeed79f28d (<class 'rdflib.graph.Graph'>)>

In [10]:
# Add statistics properties that change every year

# The club property IRI is spelled out in full: indexing the wikidata namespace
# with a complete URL appended that URL to the namespace base and produced a
# malformed IRI. Any code that attaches clubs to players must use this same IRI.
club_prop = URIRef('https://www.wikidata.org/wiki/Property:P54')
g.add((club_prop, RDF.type, RDF.Property))
g.add((club_prop, RDFS.domain, player_uri))
g.add((club_prop, RDFS.range, XSD.string))
g.add((club_prop, RDFS.label, Literal('Club')))

# Integers 
stats_int = ['goalsTotal', 'assistsTotal','interceptions', 'clearancesSuccessful', 'aerialsWon', 
             'touches', 'passesCompleteLong', 'foulsCommited', 'foulsSuffered', 'shotsOnGoalTotal']

for prop in stats_int:
    # Namespace indexing already returns a URIRef, so no extra wrapping is needed.
    stat_prop = sportsschema[prop]
    g.add((stat_prop, RDF.type, RDF.Property))
    g.add((stat_prop, RDFS.domain, player_uri))
    g.add((stat_prop, RDFS.range, XSD.integer))

# Float 
pass_pct_prop = sportsschema['passesCompletePercentage']
g.add((pass_pct_prop, RDF.type, RDF.Property))
g.add((pass_pct_prop, RDFS.domain, player_uri))
g.add((pass_pct_prop, RDFS.range, XSD.float))

# Write the complete schema to disk as Turtle; the path is relative to the
# notebook's working directory.
g.serialize(destination='../kg/schema.ttl', format='turtle')

<Graph identifier=N50ab2c1ed1584fdfbbf61ffeed79f28d (<class 'rdflib.graph.Graph'>)>

#### Step 2: Populate KG 