In [2]:
from rdflib import Graph
import pandas as pd

# RDF/XML exports to load (paths relative to the notebook's working directory).
# Raw strings are used because these Windows-style paths contain backslashes:
# in a normal literal, "\c" and "\C" are invalid escape sequences, which Python
# only tolerates with a warning and plans to reject. The resulting values are
# byte-for-byte the same as before.
RDF_FILES = [
    r"data\chicago_rdf\CTA Chicago - Ridership - Bus Routes - Daily Type Averages & Totals (RDF).rdf",
    r"data\chicago_rdf\CTA Chicago - Ridership - Bus Routes - Monthly Day-Type Averages & Totals (RDF).rdf",
]


In [3]:

# Parse the first RDF/XML file into a fresh graph, then report how many
# triples it contains.
g = Graph()
g.parse(source=RDF_FILES[0], format="xml")

n_triples = len(g)
print("Nombre de triplets :", n_triples)


file:///c:/Users/pc/ridership-analytics-Chicago-vs-Philadelphie/data\chicago_rdf\CTA Chicago - Ridership - Bus Routes - Daily Type Averages & Totals (RDF).rdf does not look like a valid URI, trying to serialize this will break.


Nombre de triplets : 3500


In [4]:
# Distinct predicate URIs used by the triples of the currently loaded graph.
predicates = {p for _, p, _ in g}

# Show at most 20 of them. The set is sorted first because set iteration
# order is arbitrary and may differ between kernel sessions; sorting makes
# both the order and the selection of the first 20 reproducible.
for p in sorted(predicates, key=str)[:20]:
    print(p)

https://data.cityofchicago.org/resource/jyb9-n7fm/daytype
http://www.w3.org/2000/01/rdf-schema#member
https://data.cityofchicago.org/resource/jyb9-n7fm/route
http://www.socrata.com/rdf/terms#rowID
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
https://data.cityofchicago.org/resource/jyb9-n7fm/rides
https://data.cityofchicago.org/resource/jyb9-n7fm/date


In [5]:
# Replace the current graph with one parsed from the second RDF/XML file,
# then report how many triples it contains.
g = Graph()
g.parse(source=RDF_FILES[1], format="xml")

n_triples = len(g)
print("Nombre de triplets :", n_triples)

file:///c:/Users/pc/ridership-analytics-Chicago-vs-Philadelphie/data\chicago_rdf\CTA Chicago - Ridership - Bus Routes - Monthly Day-Type Averages & Totals (RDF).rdf does not look like a valid URI, trying to serialize this will break.


Nombre de triplets : 5000


In [6]:
# Distinct predicate URIs used by the triples of the currently loaded graph.
predicates1 = {p for _, p, _ in g}

# Show at most 20 of them. The set is sorted first because set iteration
# order is arbitrary and may differ between kernel sessions; sorting makes
# both the order and the selection of the first 20 reproducible.
for p in sorted(predicates1, key=str)[:20]:
    print(p)

https://data.cityofchicago.org/resource/bynn-gwxy/avg_weekday_rides
http://www.w3.org/2000/01/rdf-schema#member
https://data.cityofchicago.org/resource/bynn-gwxy/route
http://www.socrata.com/rdf/terms#rowID
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
https://data.cityofchicago.org/resource/bynn-gwxy/routename
https://data.cityofchicago.org/resource/bynn-gwxy/avg_sunday_holiday_rides
https://data.cityofchicago.org/resource/bynn-gwxy/month_beginning
https://data.cityofchicago.org/resource/bynn-gwxy/monthtotal
https://data.cityofchicago.org/resource/bynn-gwxy/avg_saturday_rides


In [13]:
from rdflib import Graph,URIRef

# Convert the daily ridership RDF export into a flat CSV:
# one row per RDF subject, one column per predicate of interest.
RDF_DAILY = RDF_FILES[0]

# Predicate URIs to extract, keyed by the name of the output column they feed.
NS_DAILY = {
    "route": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/route"),
    "date": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/date"),
    "day_type": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/daytype"),
    "ridership": URIRef("https://data.cityofchicago.org/resource/jyb9-n7fm/rides")
}

# Column order of the exported CSV.
DAILY_COLUMNS = ["date", "route", "day_type", "ridership"]

g = Graph()
g.parse(RDF_DAILY, format="xml")
print("Triplets RDF (daily):", len(g))

# Reverse lookup (predicate URI -> column name) so each triple is dispatched
# with a single dictionary access.
daily_column_by_predicate = {uri: column for column, uri in NS_DAILY.items()}

# subject -> {column: value}; every column starts as None until a matching
# triple is seen for that subject.
records = {}

for s, p, o in g:
    column = daily_column_by_predicate.get(p)
    if column is None:
        # Predicate not part of the export.
        continue

    row = records.setdefault(s, dict.fromkeys(DAILY_COLUMNS))

    if column == "ridership":
        # Only conversion failures are tolerated; the row is then removed
        # by the dropna() below. Any other error should surface.
        try:
            row[column] = int(o)
        except (TypeError, ValueError):
            row[column] = None
    else:
        row[column] = str(o)

# Passing `columns` keeps the column order fixed and makes sure the columns
# exist even when no triple matched (otherwise the cleaning step below would
# fail with a KeyError on an empty frame).
df_daily = pd.DataFrame.from_dict(records, orient="index", columns=DAILY_COLUMNS)

# Cleaning: unparseable dates become NaT, rows missing a date, route or
# ridership value are dropped, and text columns are normalised.
df_daily["date"] = pd.to_datetime(df_daily["date"], errors="coerce")
df_daily = df_daily.dropna(subset=["date", "route", "ridership"])
df_daily["route"] = df_daily["route"].str.upper().str.strip()
df_daily["day_type"] = df_daily["day_type"].str.capitalize()

# CSV export. Raw string: "\d" and "\c" are invalid escape sequences in a
# normal literal; the value is unchanged.
# NOTE(review): this path is relative to "..\" whereas the RDF inputs are read
# from "data\..." relative to the working directory - confirm both resolve to
# the intended project folder.
OUTPUT_DAILY = r"..\data\chicago_csv\chicago_ridership_daily.csv"
df_daily.to_csv(OUTPUT_DAILY, index=False)

print("CSV DAILY généré :", OUTPUT_DAILY)

file:///c:/Users/pc/ridership-analytics-Chicago-vs-Philadelphie/data\chicago_rdf\CTA Chicago - Ridership - Bus Routes - Daily Type Averages & Totals (RDF).rdf does not look like a valid URI, trying to serialize this will break.


Triplets RDF (daily): 3500
CSV DAILY généré : ..\data\chicago_csv\chicago_ridership_daily.csv


In [15]:
# Convert the monthly ridership RDF export into a flat CSV:
# one row per RDF subject, one column per predicate of interest.
RDF_MONTHLY = RDF_FILES[1]

# Predicate URIs to extract, keyed by a short field name.
NS_MONTHLY = {
    "route": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/route"),
    "route_name": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/routename"),
    "avg_weekday": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_weekday_rides"),
    "avg_saturday": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_saturday_rides"),
    "avg_sunday": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/avg_sunday_holiday_rides"),
    "month_total": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/monthtotal"),
    "month_beginning": URIRef("https://data.cityofchicago.org/resource/bynn-gwxy/month_beginning")
}

# NS_MONTHLY key -> exported column name. The dict order is the CSV column order.
MONTHLY_COLUMN_BY_KEY = {
    "month_beginning": "month_beginning",
    "route": "route",
    "route_name": "route_name",
    "avg_weekday": "avg_weekday_rides",
    "avg_saturday": "avg_saturday_rides",
    "avg_sunday": "avg_sunday_holiday_rides",
    "month_total": "month_total",
}
MONTHLY_COLUMNS = list(MONTHLY_COLUMN_BY_KEY.values())

# Columns stored as text; every other column is converted to float.
MONTHLY_TEXT_COLUMNS = {"month_beginning", "route", "route_name"}

g = Graph()
g.parse(RDF_MONTHLY, format="xml")
print("Triplets RDF (monthly):", len(g))

# Reverse lookup (predicate URI -> column name) so each triple is dispatched
# with a single dictionary access.
monthly_column_by_predicate = {
    NS_MONTHLY[key]: column for key, column in MONTHLY_COLUMN_BY_KEY.items()
}

# subject -> {column: value}; every column starts as None until a matching
# triple is seen for that subject.
records = {}

for s, p, o in g:
    column = monthly_column_by_predicate.get(p)
    if column is None:
        # Predicate not part of the export.
        continue

    row = records.setdefault(s, dict.fromkeys(MONTHLY_COLUMNS))

    if column in MONTHLY_TEXT_COLUMNS:
        row[column] = str(o)
    else:
        # A malformed numeric literal is stored as a missing value instead of
        # aborting the whole export, matching how the daily export treats
        # unparseable ride counts.
        try:
            row[column] = float(o)
        except (TypeError, ValueError):
            row[column] = None

# Passing `columns` keeps the column order fixed and makes sure the columns
# exist even when no triple matched (otherwise the cleaning step below would
# fail with a KeyError on an empty frame).
df_monthly = pd.DataFrame.from_dict(records, orient="index", columns=MONTHLY_COLUMNS)

# Cleaning: unparseable dates become NaT and route codes are normalised.
df_monthly["month_beginning"] = pd.to_datetime(df_monthly["month_beginning"], errors="coerce")
df_monthly["route"] = df_monthly["route"].str.upper().str.strip()

# CSV export. Raw string: "\d" and "\c" are invalid escape sequences in a
# normal literal; the value is unchanged.
# NOTE(review): this path is relative to "..\" whereas the RDF inputs are read
# from "data\..." relative to the working directory - confirm both resolve to
# the intended project folder.
OUTPUT_MONTHLY = r"..\data\chicago_csv\chicago_ridership_monthly.csv"
df_monthly.to_csv(OUTPUT_MONTHLY, index=False)

print("CSV MONTHLY généré :", OUTPUT_MONTHLY)

file:///c:/Users/pc/ridership-analytics-Chicago-vs-Philadelphie/data\chicago_rdf\CTA Chicago - Ridership - Bus Routes - Monthly Day-Type Averages & Totals (RDF).rdf does not look like a valid URI, trying to serialize this will break.


Triplets RDF (monthly): 5000
CSV MONTHLY généré : ..\data\chicago_csv\chicago_ridership_monthly.csv
