In [425]:
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path

In [426]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS
from rdflib.namespace import FOAF, XSD

Get the path in which the notebook is executing

In [427]:
# Absolute path of the current working directory, kept as a plain string
# because the cells below build file paths by string concatenation.
base_path = str(Path.cwd())

Initialize external ontologies

In [428]:
# Countries vocabulary; country terms are addressed by lower-cased alpha-2 code in the cells below
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
# Person vocabulary; firstName / lastName / birthDate are used to describe drivers
W3ID_PERSON = Namespace("https://w3id.org/MON/person.owl#")
# Namespace under which the FormulaOneTeam class and the constructor individuals are addressed
DBPEDIA_F1 = Namespace("https://dbpedia.org/ontology/FormulaOneTeam#")
# Namespace under which city individuals are minted (city name with spaces removed)
SCHEMA_CITY = Namespace("http://schema.org/City#")
# schema.org root namespace; its `name` property labels the city individuals
SCHEMA_ORG = Namespace("http://schema.org/")
# Project ontology: classes and properties for circuits, drivers, drives, participations and race weekends
F1 = Namespace("http://www.dei.unipd.it/database2/Formula1Ontology#")

Load the CSV files that map between denominations (nationalities) and countries.
The content of these files is used to map a nationality to its country, and a country to its two-letter ISO (alpha-2) code.

In [429]:
# Nationality -> country lookup table (merged on the 'nationality' column further down)
denom_csv = base_path + '/data/utils/denom.csv'
denom_df = pd.read_csv(denom_csv)   # ',' is already pandas' default separator

In [430]:
country_codes_csv = base_path + '/data/utils/iso_country_codes.csv'

# pandas' default NA markers include the literal string "NA", which is also a
# valid ISO alpha-2 code (Namibia). Restrict NA detection to empty fields so
# that such a code is kept as text instead of silently becoming NaN.
country_codes_df = pd.read_csv(country_codes_csv, sep=',', keep_default_na=False, na_values=[''])

# Country IRIs are built from the lower-case alpha-2 code
country_codes_df['alpha-2'] = country_codes_df['alpha-2'].str.lower()

# Import circuit data

Create the graph

In [431]:
# Fresh graph for the circuit triples; the prefixes keep the Turtle output readable
g = Graph()
for ns_prefix, ns_uri in (
    ("xsd", XSD),
    ("countries", CNS),
    ("cities", SCHEMA_CITY),
    ("schema_org", SCHEMA_ORG),
    ("f1", F1),
):
    g.bind(ns_prefix, ns_uri)

In [432]:
circuits_csv = base_path + '/data/circuits.csv'

# Load the circuits, turn NaN and the '\N' placeholder into None, keep the
# human-readable city name and derive a space-free variant for use in IRIs.
circuits_df = (
    pd.read_csv(circuits_csv)
    .replace({np.nan: None, '\\N': None})
    .assign(
        location_real_name=lambda frame: frame['location'],
        location=lambda frame: frame['location'].str.replace(' ', ''),
    )
)

Iterate over the circuit's data frame

In [433]:
# Emit one F1.Circuit individual per row of circuits_df, together with its
# name, coordinates, city and country.
for i, row in circuits_df.iterrows():

    circuit_id = "circuit"+str(row['circuitId'])    # create a unique identifier for the circuit
    Circuit = URIRef(F1[circuit_id])                # create the circuit

    g.add((Circuit, RDF.type, F1.Circuit))
    g.add((Circuit, F1['name'], Literal(row['name'], datatype=XSD.string)))

    # Optional numeric attributes: pd.notna covers both None (what the '\N'
    # placeholder was mapped to on load) and NaN, so no literal is emitted
    # for a missing value.
    if pd.notna(row['alt']):
        g.add((Circuit, F1['altitude'], Literal(row['alt'], datatype=XSD.int)))

    if pd.notna(row['lat']):
        g.add((Circuit, F1['latitude'], Literal(row['lat'], datatype=XSD.decimal)))

    if pd.notna(row['lng']):
        g.add((Circuit, F1['longitude'], Literal(row['lng'], datatype=XSD.decimal)))

    # Circuit city: the IRI uses the space-free name, the label keeps the original one
    if pd.notna(row['location']):
        City = URIRef(SCHEMA_CITY[row["location"]])
        g.add((City, SCHEMA_ORG['name'], Literal(row['location_real_name'], datatype=XSD.string)))
        g.add((Circuit, F1['hasCity'], City))

    # Convert the country to the associated alpha-2 code
    country = country_codes_df.loc[country_codes_df['name'] == row['country']]

    # Circuits whose country is not in the lookup table simply get no hasCountry triple
    if not country.empty:
        country_code = country['alpha-2'].iloc[0]
        Country = URIRef(CNS[country_code])
        g.add((Circuit, F1['hasCountry'], Country))


Export the serialized graph

In [434]:
# Write the circuit graph as Turtle. The encoding is set explicitly because
# open() otherwise uses the platform default, which may not be able to encode
# non-ASCII names on non-UTF-8 locales.
with open(base_path + '/rdf/circuits.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

# Import constructor data

Create the graph

In [435]:
# Fresh graph for the constructor triples, with the prefixes used in its Turtle output
g = Graph()
for ns_prefix, ns_uri in (
    ("xsd", XSD),
    ("countries", CNS),
    ("f1", F1),
    ("dbpedia_f1", DBPEDIA_F1),
):
    g.bind(ns_prefix, ns_uri)

In [436]:
# Constructors table (one row per constructor)
constructors_csv = base_path + '/data/constructors.csv'
constructors_df = pd.read_csv(constructors_csv)   # ',' is already pandas' default separator

Prepare the constructor data: associate each constructor's nationality with its country

In [437]:
# Attach the country matching each constructor's nationality; the inner join
# drops constructors whose nationality is not listed in denom_df.
constructors_df = constructors_df.merge(denom_df, on='nationality', how='inner')

Load constructor's data

In [438]:
# Constructor standings, looked up by raceId and constructorId in the loop below
constructor_standings_csv = base_path + '/data/constructor_standings.csv'
constructor_standings_df = pd.read_csv(constructor_standings_csv)   # ',' is already pandas' default separator

In [439]:
# Constructor results: one row per race a constructor took part in
constructor_results_csv = base_path + '/data/constructor_results.csv'
constructor_results_df = pd.read_csv(constructor_results_csv)   # ',' is already pandas' default separator

Iterate over the constructor's data frame

In [440]:
# Emit one FormulaOneTeam individual per constructor, plus one Participate
# individual for every race the constructor took part in.
for const_i, const_row in constructors_df.iterrows():
    constructor_id = const_row['constructorId']

    # IRI of the team, unique per constructor id
    FormulaOneTeam = URIRef(DBPEDIA_F1["constructor" + str(constructor_id)])
    g.add((FormulaOneTeam, RDF.type, DBPEDIA_F1.FormulaOneTeam))
    g.add((FormulaOneTeam, F1['name'], Literal(const_row['name'], datatype=XSD.string)))

    # Country of the constructor, expressed through its alpha-2 code (skipped when unknown)
    matching_codes = country_codes_df.loc[country_codes_df['name'] == const_row['country'], 'alpha-2']
    if len(matching_codes) > 0:
        g.add((FormulaOneTeam, F1['hasCountry'], URIRef(CNS[matching_codes.iloc[0]])))

    # Every constructor_results row of this constructor is one participation
    is_own_result = constructor_results_df['constructorId'] == constructor_id
    for result_index, result_row in constructor_results_df[is_own_result].iterrows():
        # The participation IRI is derived from the row index of the results frame (1-based)
        Participate = URIRef(F1["participation" + str(result_index + 1)])
        g.add((Participate, RDF.type, F1.Participate))

        # Championship standing of the constructor for this race, if recorded
        same_race = constructor_standings_df['raceId'] == result_row['raceId']
        same_constructor = constructor_standings_df['constructorId'] == constructor_id
        standing_rows = constructor_standings_df.loc[same_race & same_constructor]
        if len(standing_rows) > 0:
            for predicate, column, literal_type in (
                ('cp_points_after_race', 'points', XSD.decimal),
                ('cp_position_after_race', 'position', XSD.int),
                ('number_of_wins_after_race', 'wins', XSD.int),
            ):
                g.add((Participate, F1[predicate], Literal(standing_rows[column].iloc[0], datatype=literal_type)))

        # Link the participation to its race weekend ...
        g.add((Participate, F1['during'], URIRef(F1["raceWeekEnd" + str(result_row['raceId'])])))

        # ... and to the team that took part
        g.add((FormulaOneTeam, F1['participateIn'], Participate))

Saving data

In [441]:
# Write the constructor graph as Turtle. The encoding is set explicitly because
# open() otherwise uses the platform default, which may not be able to encode
# non-ASCII names on non-UTF-8 locales.
with open(base_path + '/rdf/constructors.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

# Import driver data

Create the graph and bind the ontologies namespaces

In [442]:
# Fresh graph for the driver triples, with the prefixes used in its Turtle output
g = Graph()
for ns_prefix, ns_uri in (
    ("person", W3ID_PERSON),
    ("xsd", XSD),
    ("countries", CNS),
    ("f1", F1),
    ("dbpedia_f1", DBPEDIA_F1),
):
    g.bind(ns_prefix, ns_uri)

Read the drivers from the csv file

In [443]:
drivers_csv = base_path + '/data/drivers.csv'

# Load the drivers, turn NaN and the '\N' placeholder into None, then attach
# the country matching each nationality (the inner join drops drivers whose
# nationality is not listed in denom_df).
driver_df = (
    pd.read_csv(drivers_csv)
    .replace({np.nan: None, '\\N': None})
    .merge(denom_df, on='nationality', how='inner')
)

Read the drivers race results from the csv file

In [444]:
# Race results: one row per driver and race, the basis of every Drive individual
results_csv = base_path + '/data/results.csv'
results_df = pd.read_csv(results_csv)   # ',' is already pandas' default separator

Read the drivers qualifying results from the csv file

In [445]:
qualifying_csv = base_path + '/data/qualifying.csv'

# Load the qualifying results and turn NaN and the '\N' placeholder into None
qualifying_df = pd.read_csv(qualifying_csv).replace({np.nan: None, '\\N': None})

Read the drivers' sprint race results from the CSV file (warning: not every race weekend has a sprint race)

In [446]:
sprint_races_csv = base_path + '/data/sprint_results.csv'

# Load the sprint race results and turn NaN and the '\N' placeholder into None
sprint_races_df = pd.read_csv(sprint_races_csv).replace({np.nan: None, '\\N': None})

Read the driver championship standing from the csv file

In [447]:
# Driver standings, looked up by raceId and driverId in the driver loop
driver_standings_csv = base_path + '/data/driver_standings.csv'
driver_standings_df = pd.read_csv(driver_standings_csv)   # ',' is already pandas' default separator

Read the race status from the csv file

In [448]:
# statusId -> status lookup, joined onto the race results in the driver loop
status_csv = base_path + '/data/status.csv'
status_df = pd.read_csv(status_csv)   # ',' is already pandas' default separator

Read data about laps

In [449]:
# Lap times; the 'milliseconds' column is used to find each drive's fastest lap
lap_times_csv = base_path + '/data/lap_times.csv'
lap_times_df = pd.read_csv(lap_times_csv)   # ',' is already pandas' default separator

Read data about pit stops

In [450]:
# Pit stops; the 'milliseconds' column gives the fastest and longest stop of each drive
pit_stops_csv = base_path + '/data/pit_stops.csv'
pit_stops_df = pd.read_csv(pit_stops_csv)   # ',' is already pandas' default separator

1) Create the drivers and attach all their information
2) Create the drives related to each driver and attach all their information

In [451]:
# Separators accepted between the minutes, seconds and fraction fields
time_tokenizer = re.compile("[: .]")


def str_time_to_decimal(time: str) -> float:
    """Convert a lap time written as ``MM:SS.FFFF`` into seconds.

    The string is split on ``:``, space and ``.``; exactly three fields
    (minutes, seconds, fractional part) are expected.

    Args:
        time: lap time text, e.g. ``"1:26.572"``.

    Returns:
        The time in seconds as a float, or NaN when the input is not a
        string, does not have exactly three fields, or contains a field
        that is not numeric.
    """
    if not isinstance(time, str):
        return float('NaN')

    tokens = time_tokenizer.split(time)
    if len(tokens) != 3:
        return float('NaN')

    minutes, seconds, fraction = tokens
    try:
        # The fraction is re-attached as "0.<digits>" so that leading zeros keep their weight
        return int(minutes) * 60 + float(seconds) + float("0." + fraction)
    except ValueError:
        # Non-numeric field: honour the documented "NaN when no match" contract
        return float('NaN')

In [320]:
# Emit one F1.Driver individual per driver and one F1.Drive individual for
# every race result of that driver, enriched with qualifying, sprint,
# championship standing, lap time and pit stop information.
for driver_index, driver_row in driver_df.iterrows():

    driverId = driver_row['driverId']
    uriId = "driver"+str(driverId)

    Driver = URIRef(F1[uriId])

    g.add((Driver, RDF.type, F1.Driver))

    #Add first_name, last_name and birth_date (Person data properties) to Driver
    g.add((Driver, W3ID_PERSON['firstName'], Literal(driver_row['forename'], datatype=XSD.string)))
    g.add((Driver, W3ID_PERSON['lastName'], Literal(driver_row['surname'], datatype=XSD.string)))
    g.add((Driver, W3ID_PERSON['birthDate'], Literal(driver_row['dob'], datatype=XSD.date)))

    #Add driver number and code; both are optional ('\N' was mapped to None on load)
    if pd.notna(driver_row['number']):
        g.add((Driver, F1['number'], Literal(driver_row['number'], datatype=XSD.int)))

    if pd.notna(driver_row['code']):
        g.add((Driver, F1['code'], Literal(driver_row['code'], datatype=XSD.string)))

    #Add driver country

    # Convert the country to the associated alpha-2 code
    country = country_codes_df.loc[country_codes_df['name'] == driver_row['country']]

    if not country.empty:
        country_code = country['alpha-2'].iloc[0]
        Country = URIRef(CNS[country_code])
        g.add((Driver, F1['nationality'], Country))

    #get all the race results of the given driver, each one is a drive
    driverDrives = results_df.loc[results_df['driverId'] == driverId]

    #add the race status information
    driverDrives = pd.merge(driverDrives, status_df, on='statusId', how='inner')

    for drive_i, drive_row in driverDrives.iterrows():
        driveId = drive_row['resultId']
        raceId = drive_row['raceId']

        uriDriveId = "drive"+str(driveId)

        #create the Drive object, a Drive contains all the race weekend performance information of a given driver
        Drive = URIRef(F1[uriDriveId])

        g.add((Drive, RDF.type, F1.Drive))

        #id of the constructor for which the driver drives
        idCons = "constructor"+str(drive_row['constructorId'])
        g.add((Drive, F1['driveFor'], URIRef(DBPEDIA_F1[idCons])))

        g.add((Drive, F1['race_position'], Literal(drive_row['positionOrder'], datatype=XSD.int)))

        #add race status
        g.add((Drive, F1['status'], Literal(drive_row['status'], datatype=XSD.string)))

        #searching the qualifying result of the given driver in the given race
        qualifyDriver = qualifying_df.loc[(qualifying_df['driverId'] == driverId) & (qualifying_df['raceId'] == raceId)]

        if not qualifyDriver.empty:
            quali_position = qualifyDriver['position'].iloc[0]
            if pd.notna(quali_position):
                g.add((Drive, F1['quali_position'], Literal(quali_position, datatype=XSD.int)))

            #qualifying session times: a session may be missing, and a time that
            #cannot be parsed comes back as NaN, which is not a valid xsd:decimal
            for session in ('q1', 'q2', 'q3'):
                raw_time = qualifyDriver[session].iloc[0]
                if pd.isna(raw_time):
                    continue
                seconds = str_time_to_decimal(raw_time)
                if pd.notna(seconds):
                    g.add((Drive, F1[session + '_time'], Literal(seconds, datatype=XSD.decimal)))

        #searching the sprint race result of the given driver in the given race, if present.
        #The lookup key is the race id: the original compared against the result id (driveId),
        #which matched unrelated rows or none at all.
        sprintDriver = sprint_races_df.loc[(sprint_races_df['driverId'] == driverId) & (sprint_races_df['raceId'] == raceId)]

        #check for sprint_race position; it may be missing ('\N' was mapped to None on load)
        if not sprintDriver.empty:
            sprint_position = sprintDriver['position'].iloc[0]
            if pd.notna(sprint_position):
                g.add((Drive, F1['sprint_position'], Literal(sprint_position, datatype=XSD.int)))

        #get data from driver standings for the given race
        driver_standing = driver_standings_df.loc[(driver_standings_df['raceId'] == raceId) & (driver_standings_df['driverId'] == driverId)]

        if not driver_standing.empty:
            g.add((Drive, F1['cp_points_after_race'], Literal(driver_standing['points'].iloc[0], datatype=XSD.decimal )))
            g.add((Drive, F1['cp_position_after_race'], Literal(driver_standing['position'].iloc[0], datatype=XSD.int)))
            g.add((Drive, F1['number_of_wins'], Literal(driver_standing['wins'].iloc[0], datatype=XSD.int)))

        # Get info about race laps (milliseconds converted to seconds)
        lap_times = lap_times_df.loc[(lap_times_df['raceId'] == raceId) & (lap_times_df['driverId'] == driverId)]

        if not lap_times.empty:
            best = lap_times['milliseconds'].min() / 1000
            g.add((Drive, F1['race_fastest_lap'], Literal(best, datatype=XSD.decimal)))

        # Get info about pit stops (milliseconds converted to seconds)
        pit_stops = pit_stops_df.loc[(pit_stops_df['raceId'] == raceId) & (pit_stops_df['driverId'] == driverId)]
        if not pit_stops.empty:
            pit_stop_count = pit_stops.shape[0]
            best = pit_stops['milliseconds'].min() / 1000
            worst = pit_stops['milliseconds'].max() / 1000
            g.add((Drive, F1['pitstop_count'], Literal(pit_stop_count, datatype=XSD.int)))
            g.add((Drive, F1['fastest_pitstop'], Literal(best, datatype=XSD.decimal)))
            g.add((Drive, F1['longest_pitstop'], Literal(worst, datatype=XSD.decimal)))

        #add the race_weekend associated to the drive
        idRWE = "raceWeekEnd"+str(raceId)
        g.add((Drive, F1['during'], URIRef(F1[idRWE])))

        #add the drive
        g.add((Driver, F1['hasDrivenIn'], Drive))

    



Saving the data

In [321]:
# Write the driver graph as Turtle. The encoding is set explicitly because
# open() otherwise uses the platform default, which may not be able to encode
# non-ASCII names on non-UTF-8 locales.
with open(base_path + '/rdf/drivers.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

# Import Race Weekends

Create the graph

In [401]:
# Fresh graph for the race weekend triples, with the prefixes used in its Turtle output
g = Graph()
for ns_prefix, ns_uri in (("xsd", XSD), ("f1", F1)):
    g.bind(ns_prefix, ns_uri)

Read the race weekends from the csv file

In [402]:
# Races table: one row per race weekend
races_csv = base_path + '/data/races.csv'
races_df = pd.read_csv(races_csv)   # ',' is already pandas' default separator

Read the fans rating for the race weekends 

In [403]:
# Fan ratings table (matched to races by year and round in the next cell)
fan_ratings_csv = base_path + '/data/fan_ratings.csv'
fan_ratings_df = pd.read_csv(fan_ratings_csv)   # ',' is already pandas' default separator

In [404]:
# Give the fan-rating columns the same names races_df uses, so both frames share merge keys
fan_ratings_df = fan_ratings_df.rename(columns={'Y': 'year', 'R': 'round','RATING':'rating'})

# Left join: every race is kept, races without a rating get a missing 'rating'
fan_rated_races = races_df.merge(fan_ratings_df, how='left', on=['year', 'round'])

In [405]:

# Emit one F1.RaceWeekend individual per race, with its name, year, round,
# date, optional fan rating and the circuit it takes place in.
for race_index, race_row in fan_rated_races.iterrows():
    race_weekend_token = "raceWeekEnd"+str(race_row['raceId'])
    Race = URIRef(F1[race_weekend_token])

    g.add((Race, RDF.type, F1.RaceWeekend))
    g.add((Race, F1['name'], Literal(race_row['name'], datatype=XSD.string)))
    g.add((Race, F1['year'], Literal(race_row['year'], datatype=XSD.int)))
    g.add((Race, F1['round'], Literal(race_row['round'], datatype=XSD.int)))
    g.add((Race, F1['date'], Literal(race_row['date'], datatype=XSD.date)))

    # Races without a fan rating come out of the left merge with a missing value.
    # pd.notna accepts any missing marker (NaN, None), whereas np.isnan raises
    # TypeError on values that are not floats.
    if pd.notna(race_row['rating']):
        g.add((Race, F1['fans_rating'], Literal(race_row['rating'], datatype=XSD.float)))

    # Reference the circuit by the same IRI scheme used when the circuits are created
    circuit_token = 'circuit' + str(race_row['circuitId'])
    Circuit = URIRef(F1[circuit_token])

    g.add((Race, F1['takePlaceIn'], Circuit))

Saving the data

In [406]:
# Write the race weekend graph as Turtle. The encoding is set explicitly because
# open() otherwise uses the platform default, which may not be able to encode
# non-ASCII names on non-UTF-8 locales.
with open(base_path + '/rdf/raceWeekend.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))