In [None]:
# Import the required libraries
import os
import pandas as pd
import rdflib
from rdflib import Graph

In [None]:
# Convert one RDF/XML daily-ridership file into a pandas DataFrame
def rdf_daily_to_df(file_path):
    """Parse an RDF/XML ridership file into one DataFrame row per subject.

    Every triple in the file is visited. The text after the last "/" of the
    predicate URI selects the output column:

    * "route"   -> "route"
    * "date"    -> "date"
    * "daytype" -> "daytype"
    * "rides"   -> "ridership"

    Triples with any other predicate only guarantee that their subject has a
    row. If a predicate occurs several times for one subject, the value seen
    last wins.

    Parameters
    ----------
    file_path : str or path-like
        Location of the RDF file; it is parsed with ``format="xml"``.

    Returns
    -------
    pandas.DataFrame
        One row per subject with the columns "subject", "city" (always
        "Chicago"), "route", "date", "daytype" and "ridership". "date" is
        converted with ``pd.to_datetime`` and "ridership" with
        ``pd.to_numeric``, both using ``errors="coerce"``, so values that
        cannot be parsed become NaT / NaN. A file without any triples yields
        an empty frame that still carries these columns.
    """
    # Last path segment of the predicate URI -> name of the output column.
    column_for_predicate = {
        "route": "route",
        "date": "date",
        "daytype": "daytype",
        "rides": "ridership",
    }
    expected_columns = ("subject", "city", "route", "date", "daytype", "ridership")

    g = Graph()
    g.parse(file_path, format="xml")

    rows = {}
    for s, p, o in g:
        subject = str(s)

        row = rows.get(subject)
        if row is None:
            row = {"subject": subject, "city": "Chicago"}
            rows[subject] = row

        target = column_for_predicate.get(str(p).split("/")[-1])
        if target is not None:
            row[target] = str(o)

    df = pd.DataFrame(list(rows.values()))

    # An empty file, or one whose subjects never carry a given predicate, would
    # otherwise make the conversions below fail with KeyError. Missing columns
    # are appended after the existing ones so the column order of files that
    # already worked is unchanged.
    missing = [name for name in expected_columns if name not in df.columns]
    if missing:
        df = df.reindex(columns=[*df.columns, *missing])

    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["ridership"] = pd.to_numeric(df["ridership"], errors="coerce")

    return df

In [None]:
# Folder that contains the RDF files.
# Set the CTA_RDF_DIR environment variable to point at another location;
# without it the original local download folder is used.
folder_path = os.environ.get(
    "CTA_RDF_DIR",
    r"C:\Users\dohas\Downloads\rdf_CTA__Ridership__Daily_by_Route_routes_2001_2025",
)

In [4]:
# Read every RDF file found in the folder
dfs = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined data identical from one run (and one machine) to the next.
for file in sorted(os.listdir(folder_path)):
    if file.endswith(".rdf"):
        file_path = os.path.join(folder_path, file)
        print(f"Lecture du fichier : {file}")
        df_temp = rdf_daily_to_df(file_path)
        dfs.append(df_temp)

Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_1.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_10.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_11.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_12.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_13.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_14.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_15.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_16.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_17.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_18.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_19.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_2.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_3.rdf
Lecture du fichier : rdf_CTA__Ridership__Daily_by_Route_routes_4.rdf
Lecture du fichier : rdf

In [17]:
# Stack the per-file DataFrames into a single frame with a fresh 0..n-1 index
df_Final = pd.concat(objs=dfs, ignore_index=True)

In [18]:
# Quick sanity check: preview the first five rows
display(df_Final.head(5))

Unnamed: 0,subject,city,daytype,ridership,route,date
0,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,2601,108,2001-06-06
1,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,1030,100,2007-05-09
2,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,A,279,11,2020-11-21
3,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,1379,108,2018-03-15
4,https://data.cityofchicago.org/resource/jyb9-n...,Chicago,W,1391,106,2019-03-05


In [19]:
# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() only added a stray "None" to the cell output.
df_Final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092474 entries, 0 to 1092473
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   subject    1092474 non-null  object        
 1   city       1092474 non-null  object        
 2   daytype    1092474 non-null  object        
 3   ridership  1092474 non-null  int64         
 4   route      1092474 non-null  object        
 5   date       1092474 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 50.0+ MB


None

In [None]:
# Drop the "subject" and "city" columns.
# errors="ignore" keeps the cell re-runnable: a second execution, when the
# columns are already gone, no longer raises KeyError.
df_Final = df_Final.drop(columns=["city", "subject"], errors="ignore")

In [21]:
# Preview the first five rows of the trimmed frame
df_Final.head(n=5)

Unnamed: 0,daytype,ridership,route,date
0,W,2601,108,2001-06-06
1,W,1030,100,2007-05-09
2,A,279,11,2020-11-21
3,W,1379,108,2018-03-15
4,W,1391,106,2019-03-05


In [22]:
# Number of missing values in each column
df_Final.isna().sum(axis=0)

daytype      0
ridership    0
route        0
date         0
dtype: int64

In [None]:
# Make sure "date" holds datetime values; entries that cannot be parsed become NaT
df_Final["date"] = pd.to_datetime(arg=df_Final["date"], errors="coerce")

In [None]:
# Keep only the rows dated from 2019 through 2025 (both years included)
df_Final = df_Final.loc[
    (df_Final["date"].dt.year >= 2019) & (df_Final["date"].dt.year <= 2025)
]

In [None]:
# Final checks: remaining columns, row count after the filter, and a preview
display("Colonnes restantes :")
display(df_Final.columns)
display("Nombre de lignes après filtre :")
display(df_Final.shape[0])
display(df_Final.head(5))

'Colonnes restantes :'

Index(['daytype', 'ridership', 'route', 'date'], dtype='object')

'Nombre de lignes après filtre :'

288929

Unnamed: 0,daytype,ridership,route,date
2,A,279,11,2020-11-21
4,W,1391,106,2019-03-05
5,W,360,108,2021-06-23
6,W,860,103,2021-06-15
9,U,642,111,2021-07-04


In [None]:
# Save the final DataFrame as a CSV file inside the data folder.
# The destination is built from folder_path instead of repeating the absolute
# path, so the input and output locations cannot drift apart.
df_Final.to_csv(os.path.join(folder_path, "Daily_All.csv"), index=False)