In [22]:
import os
import datetime
import pandas as pd
import numpy as np
from pathlib import Path

In [23]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS
from rdflib.namespace import FOAF, XSD

Get the path in which the notebook is executing

In [24]:
# Absolute path of the current working directory, kept as a plain string
base_path = str(Path.cwd().absolute())

Initialize external ontologies

In [25]:
# Namespaces of the vocabularies used by the exported graphs.
# Each base IRI ends with '#', so NAMESPACE.term / NAMESPACE['term'] expands to '<base>#term'.
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
# The base must stop at the '#': with the former "...person.owl#Person" base,
# W3ID_PERSON.Person expanded to "...#PersonPerson" and
# W3ID_PERSON['first_name'] to "...#Personfirst_name".
W3ID_PERSON = Namespace("https://w3id.org/MON/person.owl#")
DBPEDIA_F1 = Namespace("http://dbpedia.org/resource/FormulaOneTeam#")
SCHEMA_CITY = Namespace("https://schema.org/City#")
F1 = Namespace("http://www.dei.unipd.it/database2/Formula1Ontology#")

Load the CSV file that maps each nationality (demonym) to its country

In [28]:
# Lookup table with a 'country' column; spaces are stripped from the country names
denom_csv = base_path + '/data/utils/denom.csv'
denom_df = pd.read_csv(denom_csv)
denom_df = denom_df.assign(country=denom_df['country'].str.replace(' ', ''))

# Import circuit data

Create the graph

In [41]:
# Fresh graph for the circuit triples, with the prefixes used when serializing
g = Graph()
for prefix, namespace in (
    ("xsd", XSD),
    ("countries", CNS),
    ("cities", SCHEMA_CITY),
    ("f1", F1),
):
    g.bind(prefix, namespace)

In [42]:
circuits_csv = base_path + '/data/circuits.csv'

circuits_df = pd.read_csv(circuits_csv, sep=',', index_col='circuitId')

# Replace the '\N' missing-value marker in 'alt' with None.
# The result is assigned back instead of calling replace(..., inplace=True) on the column
# selection: the chained in-place form acts on a temporary Series (no effect under pandas
# Copy-on-Write), and older pandas treats a scalar to_replace with value=None as a request
# to pad-fill from the previous row. The dict form states the replacement value explicitly.
circuits_df['alt'] = circuits_df['alt'].replace({'\\N': None})

circuits_df['country'] = circuits_df['country'].str.replace(' ','')     #remove white spaces from country names
circuits_df['location'] = circuits_df['location'].str.replace(' ','')     #remove white spaces from city names

Iterate over the circuit's data frame

In [43]:
# Emit one F1.Circuit individual per row of circuits_df (the index is the circuitId)
for circuit_id, circuit_row in circuits_df.iterrows():
    idC = "circuit" + str(circuit_id)  # create a unique identifier for the circuit
    Circuit = URIRef(F1[idC])          # create the circuit

    g.add((Circuit, RDF.type, F1.Circuit))
    g.add((Circuit, F1['name'], Literal(circuit_row['name'], datatype=XSD.string)))

    #add altitude info only when it is known: a missing altitude may show up as
    #None, NaN or the raw '\N' marker, and none of them must reach the graph
    altitude = circuit_row['alt']
    altitude_missing = pd.isna(altitude) or (isinstance(altitude, str) and altitude == '\\N')
    if not altitude_missing:
        g.add((Circuit, F1['altitude'], Literal(altitude, datatype=XSD.int)))

    #add latitude info
    g.add((Circuit, F1['latitude'], Literal(circuit_row['lat'], datatype=XSD.decimal)))

    #add longitude info
    g.add((Circuit, F1['longitude'], Literal(circuit_row['lng'], datatype=XSD.decimal)))

    #add circuit city
    City = URIRef(SCHEMA_CITY[circuit_row["location"]])
    g.add((Circuit, F1['hasCity'], City))

    #add circuit country
    Country = URIRef(CNS[circuit_row['country']])
    g.add((Circuit, F1['hasCountry'], Country))


Export the serialized graph

In [44]:
# Write the circuit graph as Turtle.
# The output directory is created if missing, and the file is opened explicitly as UTF-8:
# the platform default encoding can fail on, or mis-encode, non-ASCII names.
os.makedirs(base_path + '/rdf', exist_ok=True)
with open(base_path + '/rdf/circuits.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

# Import constructor data

Create the graph

In [45]:
# Fresh graph for the constructor triples, with the prefixes used when serializing
g = Graph()
for prefix, namespace in (
    ("xsd", XSD),
    ("countries", CNS),
    ("f1", F1),
    ("dbpedia_f1", DBPEDIA_F1),
):
    g.bind(prefix, namespace)

In [46]:
# Constructors table, indexed by constructorId
constructors_csv = base_path + '/data/constructors.csv'
constructors_df = pd.read_csv(
    constructors_csv,
    index_col='constructorId',
)

Load constructor participation: associate constructor nationality to country

In [47]:
# Attach the country of each constructor through its nationality.
# A column-on-column merge discards the left index, which here holds the constructorId.
# The id is therefore moved into a column for the merge and restored as the index
# afterwards, so that iterating over constructors_df still yields the real constructor ids.
constructors_df = (
    constructors_df
    .reset_index()
    .merge(denom_df, on='nationality', how='inner')
    .set_index('constructorId')
)

Load the constructors' championship standings and race results

In [48]:
# Constructor standings table, indexed by constructorStandingsId
constructor_standings_csv = base_path + '/data/constructor_standings.csv'
constructor_standings_df = pd.read_csv(
    constructor_standings_csv,
    index_col='constructorStandingsId',
)

In [49]:
# Constructor results table, indexed by constructorResultsId
constructor_results_csv = base_path + '/data/constructor_results.csv'
constructor_results_df = pd.read_csv(
    constructor_results_csv,
    index_col='constructorResultsId',
)

Iterate over the constructor's data frame

In [52]:
# Emit one team individual per constructor, plus one F1.Participate individual for every
# row of constructor_results_df that belongs to that constructor.
for constructorId, constructor_row in constructors_df.iterrows():
    idCons = "constructor" + str(constructorId)   #unique identifier for the constructor
    FormulaOneTeam = URIRef(DBPEDIA_F1[idCons])

    g.add((FormulaOneTeam, RDF.type, DBPEDIA_F1.FormulaOneTeam))
    g.add((FormulaOneTeam, F1['name'], Literal(constructor_row['name'], datatype=XSD.string)))

    Country = URIRef(CNS[constructor_row['country']])
    g.add((FormulaOneTeam, F1['hasCountry'], Country))

    # Find all the races in which a constructor has participated
    participatedRaces = constructor_results_df.loc[constructor_results_df['constructorId'] == constructorId]

    # Standings rows of this constructor, selected once per constructor rather than per race
    constructorStandings = constructor_standings_df.loc[constructor_standings_df['constructorId'] == constructorId]

    # A separate name is used for the inner row so the constructor row is not shadowed
    for participationId, result_row in participatedRaces.iterrows():
        idPart = "participation" + str(participationId)
        Participate = URIRef(F1[idPart])
        g.add((Participate, RDF.type, F1.Participate))

        #search the constructor standing data (points, number of victories and position) after this race.
        #The values previously came from the first row of the whole standings table, i.e. the
        #same numbers were attached to every participation of every constructor.
        standing = constructorStandings.loc[constructorStandings['raceId'] == result_row['raceId']]
        if not standing.empty:
            g.add((Participate, F1['cp_points_after_race'], Literal(standing['points'].iloc[0], datatype=XSD.int)))
            g.add((Participate, F1['cp_position_after_race'], Literal(standing['position'].iloc[0], datatype=XSD.int)))
            g.add((Participate, F1['number_of_wins_after_race'], Literal(standing['wins'].iloc[0], datatype=XSD.int)))

        #add the race_weekend associated to the participation
        idRWE = "raceWeekEnd" + str(result_row['raceId'])
        g.add((Participate, F1['during'], URIRef(F1[idRWE])))

        #link the team to the participation
        g.add((FormulaOneTeam, F1['partecipateIn'], Participate))

Saving data

In [51]:
# Write the constructor graph as Turtle.
# The output directory is created if missing, and the file is opened explicitly as UTF-8:
# the platform default encoding can fail on, or mis-encode, non-ASCII names.
os.makedirs(base_path + '/rdf', exist_ok=True)
with open(base_path + '/rdf/constructors.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

# Import driver data

Create the graph and bind the ontologies namespaces

In [26]:
# Fresh graph for the driver triples, with the prefixes used when serializing
g = Graph()
for prefix, namespace in (
    ("person", W3ID_PERSON),
    ("xsd", XSD),
    ("countries", CNS),
    ("f1", F1),
):
    g.bind(prefix, namespace)

Read the drivers from the csv file

In [49]:
drivers_csv = base_path + '/data/drivers.csv'
driver_df = pd.read_csv(drivers_csv, sep=',')

# Replace the '\N' missing-value marker in 'number' and 'code' with None.
# The result is assigned back instead of calling replace(..., inplace=True) on the column
# selection: the chained in-place form acts on a temporary Series (no effect under pandas
# Copy-on-Write), and older pandas treats a scalar to_replace with value=None as a request
# to pad-fill from the previous row. The dict form states the replacement value explicitly.
driver_df['number'] = driver_df['number'].replace({'\\N': None})
driver_df['code'] = driver_df['code'].replace({'\\N': None})

driver_df = pd.merge(driver_df, denom_df, on='nationality', how='inner')    #associate the nationality

Read the drivers race results from the csv file

In [30]:
# Race results table, indexed by resultId
results_csv = base_path + '/data/results.csv'
results_df = pd.read_csv(
    results_csv,
    index_col='resultId',
)

Read the drivers qualifying results from the csv file

In [31]:
# Qualifying results table, indexed by qualifyId
qualifying_csv = base_path + '/data/qualifying.csv'
qualifying_df = pd.read_csv(
    qualifying_csv,
    index_col='qualifyId',
)

Read the drivers sprint race results from the csv file (Warning: not all the race week ends have a sprint race)

In [32]:
# Sprint race results table, indexed by resultId
sprint_races_csv = base_path + '/data/sprint_results.csv'
sprint_races_df = pd.read_csv(
    sprint_races_csv,
    index_col='resultId',
)

Read the driver championship standing from the csv file

In [33]:
# Driver standings table, indexed by driverStandingsId
driver_standings_csv = base_path + '/data/driver_standings.csv'
driver_standings_df = pd.read_csv(
    driver_standings_csv,
    index_col='driverStandingsId',
)

Read the race status from the csv file

In [34]:
# Status lookup table, indexed by statusId
status_csv = base_path + '/data/status.csv'
status_df = pd.read_csv(
    status_csv,
    index_col='statusId',
)

1) Create the drivers and attach all their information
2) Create the drives of each driver and attach all their information

In [54]:
def _add_literal(subject, predicate, value, datatype):
    """Add a typed literal to the graph g, skipping missing values.

    A value counts as missing when it is None/NaN or the '\\N' marker that the
    CSV files use for absent data; in that case no triple is added.
    """
    if pd.isna(value) or (isinstance(value, str) and value == '\\N'):
        return
    g.add((subject, predicate, Literal(value, datatype=datatype)))


# Constructors are created under the DBPEDIA_F1 namespace, so the drives must point there too
g.bind("dbpedia_f1", DBPEDIA_F1)

#Driver is a subclass of Person (stated once: it does not depend on the individual driver)
g.add((F1.Driver, RDFS.subClassOf, W3ID_PERSON.Person))

for _, driver_row in driver_df.iterrows():
    # driver_df is read without index_col and then merged, so its index is only a row
    # position. The real id is the 'driverId' column, and that is what the results,
    # qualifying, sprint and standings tables must be filtered on.
    driverId = driver_row['driverId']
    idD = "driver" + str(driverId)
    Driver = URIRef(F1[idD])

    g.add((Driver, RDF.type, F1.Driver))

    #Add first_name, last_name and birth_date (Person data properties) to Driver
    g.add((Driver, W3ID_PERSON['first_name'], Literal(driver_row['forename'], datatype=XSD.string)))
    g.add((Driver, W3ID_PERSON['last_name'], Literal(driver_row['surname'], datatype=XSD.string)))
    g.add((Driver, W3ID_PERSON['birth_date'], Literal(driver_row['dob'], datatype=XSD.date)))

    #Add driver number and code when they are known
    _add_literal(Driver, F1['number'], driver_row['number'], XSD.int)
    _add_literal(Driver, F1['code'], driver_row['code'], XSD.string)

    #Add driver country
    Country = URIRef(CNS[driver_row['country']])
    g.add((Driver, W3ID_PERSON['nationality'], Country))

    #get all the race results of the given driver, these are the driver's drives
    driverDrives = results_df.loc[results_df['driverId'] == driverId]

    #add the race status information. join() keeps the resultId index of the results,
    #whereas pd.merge(on='statusId') replaced it with row positions, which made the
    #drive identifiers (drive0, drive1, ...) collide between drivers.
    driverDrives = driverDrives.join(status_df, on='statusId', how='inner')

    #rows of this driver in the other tables, selected once per driver
    driverQualifying = qualifying_df.loc[qualifying_df['driverId'] == driverId]
    driverSprints = sprint_races_df.loc[sprint_races_df['driverId'] == driverId]
    driverStandings = driver_standings_df.loc[driver_standings_df['driverId'] == driverId]

    for resultId, drive_row in driverDrives.iterrows():
        idDrive = "drive" + str(resultId)
        raceId = drive_row['raceId']

        #create the Drive object, a Drive contains all the race weekend performance informations of a given driver
        Drive = URIRef(F1[idDrive])

        #id of the constructor for which the driver drives (same URI as in the constructor graph)
        idCons = "constructor" + str(drive_row['constructorId'])
        g.add((Drive, F1['driveFor'], URIRef(DBPEDIA_F1[idCons])))

        g.add((Drive, RDF.type, F1.Drive))
        g.add((Drive, F1['race_position'], Literal(drive_row['positionOrder'], datatype=XSD.string)))
        _add_literal(Drive, F1['race_fastest_lap'], drive_row['fastestLapTime'], XSD.string)

        #add race status
        g.add((Drive, F1['status'], Literal(drive_row['status'], datatype=XSD.string)))

        #searching the qualify of the given driver in the given drive
        qualifyDriver = driverQualifying.loc[driverQualifying['raceId'] == raceId]

        #check for qualify times
        if not qualifyDriver.empty:
            _add_literal(Drive, F1['quali_position'], qualifyDriver['position'].iloc[0], XSD.int)
            _add_literal(Drive, F1['q1_time'], qualifyDriver['q1'].iloc[0], XSD.string)
            _add_literal(Drive, F1['q2_time'], qualifyDriver['q2'].iloc[0], XSD.string)
            _add_literal(Drive, F1['q3_time'], qualifyDriver['q3'].iloc[0], XSD.string)

        #searching the sprint_race position of the given driver in the given drive if present
        sprintDriver = driverSprints.loc[driverSprints['raceId'] == raceId]

        #check for sprint_race position
        if not sprintDriver.empty:
            _add_literal(Drive, F1['sprint_position'], sprintDriver['position'].iloc[0], XSD.int)

        #searching the driver standing data (points, number of victories and position) after the drive.
        #The values previously came from the first row of the whole standings table, i.e. the
        #same numbers were attached to every drive of every driver.
        standing = driverStandings.loc[driverStandings['raceId'] == raceId]
        if not standing.empty:
            g.add((Drive, F1['cp_points_after_race'], Literal(standing['points'].iloc[0], datatype=XSD.int)))
            g.add((Drive, F1['cp_position_after_race'], Literal(standing['position'].iloc[0], datatype=XSD.int)))
            g.add((Drive, F1['number_of_wins'], Literal(standing['wins'].iloc[0], datatype=XSD.int)))

        #add the race_weekend associated to the drive
        idRWE = "raceWeekEnd" + str(raceId)
        g.add((Drive, F1['during'], URIRef(F1[idRWE])))

        #add the drive
        g.add((Driver, F1['hasDrivenIn'], Drive))


driver1


Saving the data

In [53]:
# Write the driver graph as Turtle.
# The output directory is created if missing, and the file is opened explicitly as UTF-8:
# the platform default encoding can fail on, or mis-encode, non-ASCII names.
os.makedirs(base_path + '/rdf', exist_ok=True)
with open(base_path + '/rdf/drivers.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

# Import Race Weekends

Create the graph

In [60]:
# Fresh graph for the race weekend triples, with the prefixes used when serializing
g = Graph()
for prefix, namespace in (("xsd", XSD), ("f1", F1)):
    g.bind(prefix, namespace)

Read the race weekends from the csv file

In [62]:
# Races table, indexed by raceId; the frame is displayed as the cell output
races_csv = base_path + '/data/races.csv'
races_df = pd.read_csv(
    races_csv,
    index_col='raceId',
)
races_df

Unnamed: 0_level_0,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
raceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092,2022,18,22,Japanese Grand Prix,2022-10-09,05:00:00,http://en.wikipedia.org/wiki/2022_Japanese_Gra...,2022-10-07,04:00:00,2022-10-07,08:00:00,2022-10-08,04:00:00,2022-10-08,07:00:00,\N,\N
1093,2022,19,69,United States Grand Prix,2022-10-23,19:00:00,http://en.wikipedia.org/wiki/2022_United_State...,2022-10-21,19:00:00,2022-10-21,22:00:00,2022-10-22,19:00:00,2022-10-22,22:00:00,\N,\N
1094,2022,20,32,Mexico City Grand Prix,2022-10-30,20:00:00,http://en.wikipedia.org/wiki/2022_Mexican_Gran...,2022-10-28,18:00:00,2022-10-28,21:00:00,2022-10-29,17:00:00,2022-10-29,20:00:00,\N,\N
1095,2022,21,18,Brazilian Grand Prix,2022-11-13,18:00:00,http://en.wikipedia.org/wiki/2022_Brazilian_Gr...,2022-11-11,15:30:00,2022-11-12,15:30:00,\N,\N,2022-11-11,19:00:00,2022-11-12,19:30:00


Read the fan ratings for the race weekends

In [63]:
# Fan ratings table; the frame is displayed as the cell output
fan_ratings_csv = base_path + '/data/fan_ratings.csv'
fan_ratings_df = pd.read_csv(fan_ratings_csv)
fan_ratings_df

Unnamed: 0,Y,R,GPNAME,P1,P2,P3,RATING
0,2008,1,Australian GP,Hamilton,Heidfeld,Rosberg,7.609
1,2008,10,German GP,Hamilton,Piquet,Massa,7.180
2,2008,11,Hungarian GP,Kovalainen,Glock,Raikkonen,6.202
3,2008,12,European GP,Massa,Hamilton,Kubica,3.977
4,2008,13,Belgian GP,Massa,Heidfeld,Hamilton,7.736
...,...,...,...,...,...,...,...
197,2018,5,Spanish GP,Hamilton,Bottas,Verstappen,4.850
198,2018,6,Monaco GP,Ricciardo,Vettel,Hamilton,5.230
199,2018,7,Canadian GP,Vettel,Bottas,Verstappen,4.240
200,2018,8,French GP,Hamilton,Verstappen,Raikkonen,6.470


In [64]:
# Rename columns to be able to merge
fan_ratings_df = fan_ratings_df.rename(columns={'Y': 'year', 'R': 'round', 'RATING': 'rating'})

# Left-merge the ratings onto the races by (year, round).
# A column-on-column merge discards the left index, which here holds the raceId.
# The id is therefore moved into a column for the merge and restored as the index
# afterwards, so that iterating over fan_rated_races yields the real race ids.
fan_rated_races = (
    races_df
    .reset_index()
    .merge(fan_ratings_df, how='left', on=['year', 'round'])
    .set_index('raceId')
)

In [67]:
# Emit one F1.RaceWeekend individual per row of fan_rated_races
for raceId, race_row in fan_rated_races.iterrows():
    race_node = URIRef(F1["raceWeekEnd" + str(raceId)])

    g.add((race_node, RDF.type, F1.RaceWeekend))

    # data properties that are written for every race weekend
    g.add((race_node, F1['name'], Literal(race_row['name'], datatype=XSD.string)))
    g.add((race_node, F1['year'], Literal(race_row['year'], datatype=XSD.int)))
    g.add((race_node, F1['round'], Literal(race_row['round'], datatype=XSD.int)))
    g.add((race_node, F1['date'], Literal(race_row['date'], datatype=XSD.date)))

    # the rating is NaN for races without a match in the fan ratings table
    fan_rating = race_row['rating']
    if not np.isnan(fan_rating):
        g.add((race_node, F1['fans_rating'], Literal(fan_rating, datatype=XSD.float)))

    # link the race weekend to its circuit node
    circuit_node = URIRef(F1['circuit' + str(race_row['circuitId'])])
    g.add((race_node, F1['takePlaceIn'], circuit_node))

Saving the data

In [68]:
# Write the race weekend graph as Turtle.
# The output directory is created if missing, and the file is opened explicitly as UTF-8:
# the platform default encoding can fail on, or mis-encode, non-ASCII names.
os.makedirs(base_path + '/rdf', exist_ok=True)
with open(base_path + '/rdf/raceWeekend.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))