In [0]:
# Import libraries
from pyspark.sql.types import LongType, DoubleType, StringType, TimestampType
from pyspark.sql.functions import col
from pyspark.sql import DataFrame
from functools import reduce
import os

In [0]:
# Read every parquet file in the source folder, normalise a few columns,
# and combine the results into a single DataFrame named `df`.
folder = "/Volumes/workspace/trips/yellow_taxi"
files = dbutils.fs.ls(folder)


def _normalize_schema(raw_df):
    """Return `raw_df` with a uniform type/name for the columns handled below.

    - VendorID, PULocationID, DOLocationID are cast to long.
    - passenger_count is cast to double.
    - Airport_fee is renamed to airport_fee (uniform lowercase spelling).

    NOTE(review): presumably these columns differ in type or casing between
    the source files; confirm against the actual data.
    """
    return (
        raw_df
        .withColumn("VendorID", col("VendorID").cast(LongType()))
        .withColumn("passenger_count", col("passenger_count").cast(DoubleType()))
        .withColumn("PULocationID", col("PULocationID").cast(LongType()))
        .withColumn("DOLocationID", col("DOLocationID").cast(LongType()))
        .withColumnRenamed("Airport_fee", "airport_fee")
    )


dfs = []

for f in files:
    # Only parquet files are loaded; anything else in the folder is skipped.
    if f.path.endswith(".parquet"):
        tmp = _normalize_schema(spark.read.parquet(f.path))
        dfs.append(tmp)

# Fail with an explicit message instead of the opaque
# "reduce() of empty iterable" TypeError when nothing was loaded.
if not dfs:
    raise FileNotFoundError(f"No .parquet files found in {folder}")

# Combine all DataFrames by column name. allowMissingColumns=True fills a
# column absent from some files with nulls instead of aborting the union;
# when every file has the same columns the result is unchanged.
df = reduce(
    lambda left, right: left.unionByName(right, allowMissingColumns=True),
    dfs,
)
df.printSchema()
df.show(5)


In [0]:
# Rewrite the combined data as Parquet with a uniform schema,
# replacing whatever already exists at the destination.
(
    df.write
    .parquet("/Volumes/workspace/trips/yellow_taxi_clean", mode="overwrite")
)

In [0]:
# Check the result: read the rewritten dataset back and preview a few rows.
df = (
    spark.read
    .format("parquet")
    .load("/Volumes/workspace/trips/yellow_taxi_clean")
)
df.show(n=5)