In [0]:
from pyspark.sql.functions import col, regexp_replace, when, lit
from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType, DecimalType, TimestampType

path = "/Volumes/workspace/raw-zone/taxi_amarelo/"
arquivos = dbutils.fs.ls(path)

# Dicionário para renomear colunas antigas para novas
renomear_colunas = {
    "vendorid": "cod_motorista",
    "tpep_pickup_datetime": "dt_hr_inicio",
    "tpep_dropoff_datetime": "dt_hr_fim",
    "passenger_count": "qtd_pessoas",
    "trip_distance": "dist_percorrida",
    "ratecodeid": "cod_taxa",
    "store_and_fwd_flag": "ind_armazenamento",
    "pulocationid": "cod_bairro_origem",
    "dolocationid": "cod_bairro_destino",
    "payment_type": "forma_pagamento",
    "fare_amount": "vlr_taxa_corrida",
    "extra": "vlr_taxa_extra",
    "mta_tax": "vlr_taxa_mta",
    "tip_amount": "vlr_troco",
    "tolls_amount": "vlr_pedagio",
    "improvement_surcharge": "cod_taxa_melhoria",
    "total_amount": "vlr_total",
    "congestion_surcharge": "vlr_taxa_congestao",
    "airport_fee": "vlr_taxa_aeroporto",
}

# Dicionário de tipos com nomes finais das colunas
tipos_colunas = {
    "cod_motorista": StringType(),
    "dt_hr_inicio": TimestampType(),
    "dt_hr_fim": TimestampType(),
    "qtd_pessoas": IntegerType(),
    "dist_percorrida": DecimalType(10, 2),
    "cod_taxa": IntegerType(),
    "ind_armazenamento": StringType(),
    "cod_bairro_origem": StringType(),
    "cod_bairro_destino": StringType(),
    "tipo_pagamento": StringType(),
    "vlr_taxa_corrida": DecimalType(10, 2),
    "vlr_taxa_extra": DecimalType(10, 2),
    "vlr_taxa_mta": DecimalType(10, 2),
    "vlr_troco": DecimalType(10, 2),
    "vlr_pedagio": DecimalType(10, 2),
    "cod_taxa_melhoria": IntegerType(),
    "vlr_total": DecimalType(10, 2),
    "vlr_taxa_congestao": DecimalType(10, 2),
    "vlr_taxa_aeroporto": DecimalType(10, 2),
}

for i, arquivo in enumerate(arquivos, start=1):
    if arquivo.name.endswith(".parquet"):
        df = spark.read.parquet(arquivo.path)

        # Padronizar nomes para lowercase
        df = df.toDF(*[c.lower() for c in df.columns])

        # Renomear colunas
        for antigo, novo in renomear_colunas.items():
            if antigo in df.columns:
                df = df.withColumnRenamed(antigo, novo)

        # Converter para string para manipular vírgulas e pontos
        df = df.select([col(c).cast(StringType()).alias(c) for c in df.columns])

        # Substituir vírgula por ponto para decimais
        df = df.select([regexp_replace(col(c), ",", ".").alias(c) for c in df.columns])

        # Safe cast usando o dicionário com nomes finais
        for c in df.columns:
            if c in tipos_colunas:
                t = tipos_colunas[c]
                if isinstance(t, (IntegerType, LongType)):
                    df = df.withColumn(
                        c,
                        when(col(c).rlike(r"^[0-9]+(\.[0-9]+)?$"), col(c).cast("double").cast(t))
                        .otherwise(None)
                    )
                elif isinstance(t, (DecimalType, DoubleType)):
                    df = df.withColumn(
                        c,
                        when(col(c).rlike(r"^[0-9]+(\.[0-9]+)?$"), col(c).cast(t))
                        .otherwise(None)
                    )
                elif isinstance(t, TimestampType):
                    df = df.withColumn(c, col(c).cast(TimestampType()))
                else:
                    df = df.withColumn(c, col(c).cast(StringType()))

        # Adicionar coluna de origem
        df_final = df.withColumn("origem_taxi", lit("taxi_amarelo"))

        # Criar variável com nome dinâmico
        globals()[f"df_taxi_amarelo_{i}"] = df_final

        # Salvar na tabela
        df_final.write.mode("overwrite").saveAsTable("`trusted-zone`.tb_corrida_taxi_amarelo")


In [0]:
from pyspark.sql.functions import col, regexp_replace, when, lit
from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType, DecimalType, TimestampType

path = "/Volumes/workspace/raw-zone/taxi_verde/"
arquivos = dbutils.fs.ls(path)

# Dicionário para renomear colunas antigas para novas
renomear_colunas = {
    "vendorid": "cod_motorista",
    "lpep_pickup_datetime": "dt_hr_inicio",
    "lpep_dropoff_datetime": "dt_hr_fim",
    "passenger_count": "qtd_pessoas",
    "trip_distance": "dist_percorrida",
    "ratecodeid": "cod_taxa",
    "store_and_fwd_flag": "ind_armazenamento",
    "pulocationid": "cod_bairro_origem",
    "dolocationid": "cod_bairro_destino",
    "payment_type": "tipo_pagamento",
    "trip_type": "tipo_viagem",
    "fare_amount": "vlr_taxa_corrida",
    "extra": "vlr_taxa_extra",
    "mta_tax": "vlr_taxa_mta",
    "tip_amount": "vlr_troco",
    "tolls_amount": "vlr_pedagio",
    "ehail_fee":"vlr_gorjeta",
    "improvement_surcharge": "cod_taxa_melhoria",
    "total_amount": "vlr_total",
    "congestion_surcharge": "vlr_taxa_congestao",
    "airport_fee": "vlr_taxa_aeroporto",
}

# Dicionário de tipos com nomes finais das colunas
tipos_colunas = {
    "cod_motorista": StringType(),
    "dt_hr_inicio": TimestampType(),
    "dt_hr_fim": TimestampType(),
    "qtd_pessoas": IntegerType(),
    "dist_percorrida": DecimalType(10, 2),
    "cod_taxa": IntegerType(),
    "ind_armazenamento": StringType(),
    "cod_bairro_origem": StringType(),
    "cod_bairro_destino": StringType(),
    "tipo_pagamento": StringType(),
    "tipo_viagem": IntegerType(),
    "vlr_taxa_corrida": DecimalType(10, 2),
    "vlr_taxa_extra": DecimalType(10, 2),
    "vlr_taxa_mta": DecimalType(10, 2),
    "vlr_troco": DecimalType(10, 2),
    "vlr_pedagio": DecimalType(10, 2),
    "vlr_gorjeta":DecimalType(10, 2),
    "cod_taxa_melhoria": IntegerType(),
    "vlr_total": DecimalType(10, 2),
    "vlr_taxa_congestao": DecimalType(10, 2),
    "vlr_taxa_aeroporto": DecimalType(10, 2),
}

for i, arquivo in enumerate(arquivos, start=1):
    if arquivo.name.endswith(".parquet"):
        df = spark.read.parquet(arquivo.path)

        # Padronizar nomes para lowercase
        df = df.toDF(*[c.lower() for c in df.columns])

        # Renomear colunas
        for antigo, novo in renomear_colunas.items():
            if antigo in df.columns:
                df = df.withColumnRenamed(antigo, novo)

        # Converter para string para manipular vírgulas e pontos
        df = df.select([col(c).cast(StringType()).alias(c) for c in df.columns])

        # Substituir vírgula por ponto para decimais
        df = df.select([regexp_replace(col(c), ",", ".").alias(c) for c in df.columns])

        # Safe cast usando o dicionário com nomes finais
        for c in df.columns:
            if c in tipos_colunas:
                t = tipos_colunas[c]
                if isinstance(t, (IntegerType, LongType)):
                    df = df.withColumn(
                        c,
                        when(col(c).rlike(r"^[0-9]+(\.[0-9]+)?$"), col(c).cast("double").cast(t))
                        .otherwise(None)
                    )
                elif isinstance(t, (DecimalType, DoubleType)):
                    df = df.withColumn(
                        c,
                        when(col(c).rlike(r"^[0-9]+(\.[0-9]+)?$"), col(c).cast(t))
                        .otherwise(None)
                    )
                elif isinstance(t, TimestampType):
                    df = df.withColumn(c, col(c).cast(TimestampType()))
                else:
                    df = df.withColumn(c, col(c).cast(StringType()))

        # Adicionar coluna de origem
        df_final = df.withColumn("origem_taxi", lit("taxi_verde"))

        # Criar variável com nome dinâmico (caso queira usar depois)
        globals()[f"df_taxi_verde_{i}"] = df_final

        # Salvar na tabela
        df_final.write.mode("overwrite").option("mergeSchema", "true").saveAsTable("`trusted-zone`.tb_corrida_taxi_verde")


In [0]:
display(df_taxi_verde_5)

cod_motorista,lpep_pickup_datetime,lpep_dropoff_datetime,ind_armazenamento,cod_taxa,cod_bairro_origem,cod_bairro_destino,qtd_pessoas,dist_percorrida,vlr_taxa_corrida,vlr_taxa_extra,vlr_taxa_mta,vlr_troco,vlr_pedagio,vlr_gorjeta,cod_taxa_melhoria,vlr_total,forma_pagamento,trip_type,vlr_taxa_congestao,origem_taxi
2,2023-03-01 00:25:10,2023-03-01 00:35:47,N,1,82,196,1,2.36,13.5,1.0,0.5,0.0,0.0,,1.0,16.0,2,1.0,0.0,taxi_verde
2,2023-03-01 00:14:29,2023-03-01 00:25:04,N,1,7,7,1,0.78,,,,0.0,0.0,,,,3,1.0,0.0,taxi_verde
2,2023-03-01 00:14:29,2023-03-01 00:25:04,N,1,7,7,1,0.78,6.5,1.0,0.5,0.0,0.0,,1.0,9.0,3,1.0,0.0,taxi_verde
2,2023-02-28 22:59:46,2023-02-28 23:08:38,N,1,166,74,1,1.66,11.4,1.0,0.5,2.78,0.0,,1.0,16.68,1,1.0,0.0,taxi_verde
2,2023-03-01 00:54:03,2023-03-01 01:03:14,N,1,236,229,1,3.14,15.6,1.0,0.5,4.17,0.0,,1.0,25.02,1,1.0,2.75,taxi_verde
2,2023-03-01 01:00:09,2023-03-01 01:14:37,N,1,75,235,1,5.69,23.3,1.0,0.5,4.0,0.0,,1.0,29.8,1,1.0,0.0,taxi_verde
2,2023-03-01 00:09:45,2023-03-01 00:26:06,N,1,260,160,1,2.92,17.7,1.0,0.5,4.04,0.0,,1.0,24.24,1,1.0,0.0,taxi_verde
2,2023-03-01 00:39:30,2023-03-01 00:39:33,N,5,95,264,5,0.0,35.0,0.0,0.0,1.0,0.0,,1.0,37.0,1,2.0,0.0,taxi_verde
2,2023-03-01 00:03:07,2023-03-01 00:14:44,N,1,244,41,1,3.34,16.3,1.0,0.5,5.64,0.0,,1.0,24.44,1,1.0,0.0,taxi_verde
2,2023-03-01 00:42:56,2023-03-01 00:49:57,N,1,83,7,5,1.75,10.7,1.0,0.5,2.64,0.0,,1.0,15.84,1,1.0,0.0,taxi_verde


In [0]:

from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType, LongType, DecimalType, DoubleType, TimestampType, StringType

# exemplo de df
df = spark.createDataFrame([
    ("1.0", "2023-08-01 10:00:00"),
    ("2.5", "2023-08-01 11:00:00"),
    ("abc", "invalid")
], ["qtd_pessoas", "dt_hr_inicio"])

tipos_colunas = {
    "qtd_pessoas": IntegerType(),
    "dt_hr_inicio": TimestampType()
}

for c in df.columns:
    if c in tipos_colunas:
        t = tipos_colunas[c]
        tipo_t = type(t)
        if tipo_t == IntegerType or tipo_t == LongType:
            df = df.withColumn(
                c,
                when(col(c).rlike(r"^[0-9]+(\.[0-9]+)?$"), col(c).cast("double").cast(t))
                .otherwise(None)
            )
        elif tipo_t == DecimalType or tipo_t == DoubleType:
            df = df.withColumn(
                c,
                when(col(c).rlike(r"^[0-9]+(\.[0-9]+)?$"), col(c).cast(t))
                .otherwise(None)
            )
        elif tipo_t == TimestampType:
            df = df.withColumn(c, col(c).cast(TimestampType()))
        else:
            df = df.withColumn(c, col(c).cast(StringType()))

df.printSchema()
df.show()


root
 |-- qtd_pessoas: integer (nullable = true)
 |-- dt_hr_inicio: timestamp (nullable = true)



[0;31m---------------------------------------------------------------------------[0m
[0;31mDateTimeException[0m                         Traceback (most recent call last)
File [0;32m<command-7565651743343005>, line 38[0m
[1;32m     35[0m             df [38;5;241m=[39m df[38;5;241m.[39mwithColumn(c, col(c)[38;5;241m.[39mcast(StringType()))
[1;32m     37[0m df[38;5;241m.[39mprintSchema()
[0;32m---> 38[0m df[38;5;241m.[39mshow()

File [0;32m/databricks/python/lib/python3.11/site-packages/pyspark/sql/connect/dataframe.py:1193[0m, in [0;36mDataFrame.show[0;34m(self, n, truncate, vertical)[0m
[1;32m   1192[0m [38;5;28;01mdef[39;00m [38;5;21mshow[39m([38;5;28mself[39m, n: [38;5;28mint[39m [38;5;241m=[39m [38;5;241m20[39m, truncate: Union[[38;5;28mbool[39m, [38;5;28mint[39m] [38;5;241m=[39m [38;5;28;01mTrue[39;00m, vertical: [38;5;28mbool[39m [38;5;241m=[39m [38;5;28;01mFalse[39;00m) [38;5;241m-[39m[38;5;241m>[39m [38;5;28;01mNone[39;

In [0]:
from pyspark.sql.functions import col

df = spark.read.parquet(arquivo.path)

filtered_df = df.filter(col("ehail_fee").isNotNull())
display(filtered_df)

VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge


In [0]:
from pyspark.sql.functions import col

display(df)#.filter(col("passenger_count").like("%1.0%")).select("passenger_count"))