In [1]:
import os
import datetime
import pandas as pd
from pathlib import Path

In [2]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import FOAF, XSD

Get the path in which the notebook is executing

In [3]:
base_path = str(Path(os.path.abspath(os.getcwd())))

Initialize external ontologies

In [4]:
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
F1 = Namespace("http://www.dei.unipd.it/database2/Formula1Ontology#")

Load the CSV files that maps between denominations and country

In [5]:
denom_csv = base_path + '/utils/denom.csv'
denom_df = pd.read_csv(denom_csv, sep=',')
denom_df['country'] = denom_df['country'].str.replace(' ','')     #remove white spaces from country names

# Import circuit data

Create the graph

In [None]:
g = Graph()
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("f1", F1)

In [36]:
circuits_csv = base_path + '/data/circuits.csv'

circuits_df = pd.read_csv(circuits_csv, sep=',', index_col='circuitId')
circuits_df['alt'].replace(to_replace='\\N',value=None,inplace=True)    #remove '\N' values from the dataset
circuits_df['country'] = circuits_df['country'].str.replace(' ','')     #remove white spaces from country names


Unnamed: 0_level_0,circuitRef,name,location,country,lat,lng,alt,url
circuitId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.84970,144.96800,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.73800,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.03250,50.51060,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57000,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
5,istanbul,Istanbul Park,Istanbul,Turkey,40.95170,29.40500,130,http://en.wikipedia.org/wiki/Istanbul_Park
...,...,...,...,...,...,...,...,...
75,portimao,Autódromo Internacional do Algarve,Portimão,Portugal,37.22700,-8.62670,108,http://en.wikipedia.org/wiki/Algarve_Internati...
76,mugello,Autodromo Internazionale del Mugello,Mugello,Italy,43.99750,11.37190,255,http://en.wikipedia.org/wiki/Mugello_Circuit
77,jeddah,Jeddah Corniche Circuit,Jeddah,SaudiArabia,21.63190,39.10440,15,http://en.wikipedia.org/wiki/Jeddah_Street_Cir...
78,losail,Losail International Circuit,Al Daayen,Qatar,25.49000,51.45420,,http://en.wikipedia.org/wiki/Losail_Internatio...


Iterate over the circuit's data frame

In [39]:
for index, row in circuits_df.iterrows():
    idC = "circuit"+str(index)  # create a unique identifier for the circuit
    Circuit = URIRef(F1[idC])   # create the circuit

    g.add((Circuit, RDF.type, F1.Circuit))
    g.add((Circuit, F1['name'], Literal(row['name'], datatype=XSD.string)))

    if not row['alt'] == None:
        g.add((Circuit, F1['altitude'], Literal(row['alt'], datatype=XSD.int)))

    Country = URIRef(CNS[row['country']])
    g.add((Circuit, F1['hasCountry'], Country))


Export the serialized graph

In [41]:
with open(base_path + '/rdf/circuits.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

# Import constructor data

Create the graph

In [15]:
g = Graph()
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("f1", F1)

In [16]:
constructors_csv = base_path + '/data/constructors.csv'
constructors_df = pd.read_csv(constructors_csv, sep=',', index_col='constructorId')

Load constructor participation

In [18]:
constructors_df = pd.merge(constructors_df, denom_df, on='nationality', how='inner')

Load constructor's data

In [27]:
constructor_standings_csv = base_path + '/data/constructor_standings.csv'
constructor_standings_df = pd.read_csv(constructor_standings_csv, sep=',', index_col='constructorStandingsId')

In [30]:
constructor_results_csv = base_path + '/data/constructor_results.csv'
constructor_results_df = pd.read_csv(constructor_results_csv, sep=',', index_col='constructorResultsId')

Iterate over the constructor's data frame

In [34]:
for constructorId, row in constructors_df.iterrows():
    idCons = "constructor"+str(constructorId)   #unique identifier for the constructor
    Constructor = URIRef(F1[idCons])

    g.add((Constructor, RDF.type, F1.Constructor))
    g.add((Constructor, F1['name'], Literal(row['name'], datatype=XSD.string)))

    Country = URIRef(CNS[row['country']])
    g.add((Constructor, F1['hasCountry'], Country))

    # Find all the races in which a constructor has participated
    participatedRaces = constructor_results_df.loc[constructor_results_df['constructorId'] == constructorId]

    for participationId, row in participatedRaces.iterrows():
        idPart = "participation"+str(participationId)
        Participate = URIRef(F1[idPart])
        g.add((Participate, RDF.type, F1.Participate))

        #searching the constructor standing data (points, number of victories and position) after the participation 
        g.add((Participate, F1['points_after_race'], Literal(constructor_standings_df['points'].iloc[0], datatype=XSD.int)))
        g.add((Participate, F1['position_after_race'], Literal(constructor_standings_df['position'].iloc[0], datatype=XSD.int)))
        g.add((Participate, F1['number_of_wins'], Literal(constructor_standings_df['wins'].iloc[0], datatype=XSD.int)))

        #add the race_weekend associated to the drive
        idRWE = "raceWeekEnd"+str(row['raceId'])
        g.add((Participate, F1['during'], URIRef(F1[idRWE])))

        #add the drive
        g.add((Constructor, F1['appearIn'], Participate))

Saving data

In [36]:
with open(base_path + '/rdf/constructors.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

# Import driver data

Create the graph

In [10]:
g = Graph()
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("f1", F1)

In [16]:
drivers_csv = base_path + '/data/drivers.csv'   #Read the drivers
driver_df = pd.read_csv(drivers_csv, sep=',', index_col='driverId')
driver_df['number'].replace(to_replace='\\N',value=None,inplace=True)       #remove '\N' values from the dataset
driver_df['code'].replace(to_replace='\\N',value=None,inplace=True)         #remove '\N' values from the dataset
driver_df = pd.merge(driver_df, denom_df, on='nationality', how='inner')    #associate the nationality

Unnamed: 0,driverRef,number,code,forename,surname,dob,nationality,url,country
0,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,UnitedKingdom
1,coulthard,,COU,David,Coulthard,1971-03-27,British,http://en.wikipedia.org/wiki/David_Coulthard,UnitedKingdom
2,button,22,BUT,Jenson,Button,1980-01-19,British,http://en.wikipedia.org/wiki/Jenson_Button,UnitedKingdom
3,davidson,,DAV,Anthony,Davidson,1979-04-18,British,http://en.wikipedia.org/wiki/Anthony_Davidson,UnitedKingdom
4,wilson,,\N,Justin,Wilson,1978-07-31,British,http://en.wikipedia.org/wiki/Justin_Wilson_(ra...,UnitedKingdom
...,...,...,...,...,...,...,...,...,...
846,kvyat,26,KVY,Daniil,Kvyat,1994-04-26,Russian,http://en.wikipedia.org/wiki/Daniil_Kvyat,Russia
847,sirotkin,35,SIR,Sergey,Sirotkin,1995-08-27,Russian,http://en.wikipedia.org/wiki/Sergey_Sirotkin_(...,Russia
848,mazepin,9,MAZ,Nikita,Mazepin,1999-03-02,Russian,http://en.wikipedia.org/wiki/Nikita_Mazepin,Russia
849,haryanto,88,HAR,Rio,Haryanto,1993-01-22,Indonesian,http://en.wikipedia.org/wiki/Rio_Haryanto,Indonesia


In [18]:
races_csv = base_path + '/data/races.csv'   #Read the races
races_df = pd.read_csv(races_csv, sep=',', index_col='raceId')

In [19]:
results_csv = base_path + '/data/results.csv'
results_df = pd.read_csv(results_csv, sep=',', index_col='resultId')

In [20]:
qualifying_csv = base_path + '/data/qualifying.csv'
qualifying_df = pd.read_csv(qualifying_csv, sep=',', index_col='qualifyId')

In [21]:
sprint_races_csv = base_path + '/data/sprint_results.csv'
sprint_races_df = pd.read_csv(sprint_races_csv, sep=',', index_col='resultId')

In [22]:
driver_standings_csv = base_path + '/data/driver_standings.csv'
driver_standings_df = pd.read_csv(driver_standings_csv, sep=',', index_col='driverStandingsId')

In [23]:
for driverId, row in driver_df.iterrows():
    idD = "driver"+str(driverId)
    Driver = URIRef(F1[idD])
    g.add((Driver, RDF.type, F1.Driver))
    g.add((Driver, F1['firstName'], Literal(row['forename'], datatype=XSD.string)))
    g.add((Driver, F1['lastName'], Literal(row['surname'], datatype=XSD.string)))

    if row['number'] != None:
        g.add((Driver, F1['driverNumber'], Literal(row['number'], datatype=XSD.int)))
    
    if row['code'] != None:
        g.add((Driver, F1['driverCode'], Literal(row['code'], datatype=XSD.string)))       

    Country = URIRef(CNS[row['country']])
    g.add((Driver, F1['hasCountry'], Country))

    #get all the race results of the given driver, this is the driver's drive
    driverDrives = results_df.loc[results_df['driverId'] == driverId] 

    for driveId, row in driverDrives.iterrows():
        idDrive = "drive"+str(driveId)
        Drive = URIRef(F1[idDrive])

        g.add((Drive, RDF.type, F1.Drive))
        g.add((Drive, F1['race_position'], Literal(row['positionOrder'], datatype=XSD.string))) 
        g.add((Drive, F1['fast_lap'], Literal(row['fastestLapTime'], datatype=XSD.string)))

        #id of the constructor for which this driver drives
        idCons = "constructor"+str(row['constructorId'])
        g.add((Drive, F1['driveFor'], URIRef(F1[idCons])))

        #searching the qualify of the given driver in the given drive 
        qualifyDriver = qualifying_df.loc[(qualifying_df['driverId'] == driverId) & (qualifying_df['raceId'] == row['raceId'])] 

        #check for qualify times
        if qualifyDriver.size > 0:
            g.add((Drive, F1['quali_position'], Literal(qualifyDriver['position'].iloc[0], datatype=XSD.int)))
            g.add((Drive, F1['q1'], Literal(qualifyDriver['q1'].iloc[0], datatype=XSD.string)))
            g.add((Drive, F1['q2'], Literal(qualifyDriver['q2'].iloc[0], datatype=XSD.string)))
            g.add((Drive, F1['q3'], Literal(qualifyDriver['q3'].iloc[0], datatype=XSD.string)))

        #searching the sprint_race position of the given driver in the given drive if present
        sprintDriver = sprint_races_df.loc[(sprint_races_df['driverId'] == driverId) & (sprint_races_df['raceId'] == row['raceId'])]

        #check for sprint_race position
        if sprintDriver.size > 0:
            g.add((Drive, F1['quali_position'], Literal(sprintDriver['position'].iloc[0], datatype=XSD.string)))

        #searching the driver standing data (points, number of victories and position) after the drive 
        g.add((Drive, F1['points_after_race'], Literal(driver_standings_df['points'].iloc[0], datatype=XSD.int)))
        g.add((Drive, F1['position_after_race'], Literal(driver_standings_df['position'].iloc[0], datatype=XSD.int)))
        g.add((Drive, F1['number_of_wins'], Literal(driver_standings_df['wins'].iloc[0], datatype=XSD.int)))

        #add the race_weekend associated to the drive
        idRWE = "raceWeekEnd"+str(row['raceId'])
        g.add((Drive, F1['during'], URIRef(F1[idRWE])))  

        #add the drive
        g.add((Driver, F1['appearIn'], Drive))


Saving the data

In [None]:
with open(base_path + '/rdf/drivers.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))