In [0]:
import re
from pyspark.sql.functions import (
    col,            # Para referenciar colunas
    lit,            # Para criar valores literais (como None/Null)
    when,           # Para condições (CASE WHEN)
    regexp_replace, # Útil se precisar limpar texto dentro da coluna
    to_date,        # Para converter string em data
    to_timestamp    # Para converter string em timestamp
)


In [0]:
# Namespace (the prefix placed before the table name) of the bronze tables
# read by this notebook.
bronze_path = "bronze.f1_world"

In [0]:
def replace_coluns(text):
    """Convert a camelCase column name to snake_case.

    An underscore is inserted between every lowercase ASCII letter that is
    immediately followed by an uppercase ASCII letter, then the whole name is
    lowercased. Names without such a boundary are only lowercased.

    Args:
        text: Column name to convert.

    Returns:
        The converted, fully lowercase name.
    """
    boundary = r"([a-z])([A-Z])"
    with_underscores = re.sub(boundary, r"\1_\2", text)
    return with_underscores.lower()


def tratar_nulos(df):
    r"""Replace the ``\N`` placeholder with real NULLs in every column.

    Each column is compared against the two-character string ``\N``; matching
    values become NULL and every other value is kept unchanged.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new DataFrame with the same column names in the same order.
    """
    # The condition must be the first argument of when() and the fallback is
    # chained with .otherwise(); both were misplaced before, so the function
    # raised as soon as it was called.
    # A single select is used instead of one withColumn per column so the
    # query plan does not grow with the number of columns.
    colunas_tratadas = [
        when(col(coluna) == "\\N", lit(None)).otherwise(col(coluna)).alias(coluna)
        for coluna in df.columns
    ]
    return df.select(colunas_tratadas)






In [0]:
# Column-name heuristics used by gerar_map_tipagem (names are snake_case).
# "win" is kept for backward compatibility; "wins" is the name that actually
# appears in the standings rules further down in this notebook.
_COLUNAS_INT = frozenset(
    {"year", "position", "win", "wins", "number", "lap", "grid", "milliseconds"}
)
# "fastest_lap_time" is intentionally NOT listed here: lap times in this
# dataset are presumably formatted as "m:ss.mmm", which a float cast cannot
# parse — confirm against the bronze data before treating it as numeric.
_COLUNAS_FLOAT = frozenset(
    {"lat", "lng", "fastest_lap_speed", "points", "msecs", "alt"}
)
_COLUNAS_DATE = frozenset(
    {"date", "race_date", "dob", "quali_date", "fp1_date", "fp2_date",
     "fp3_date", "sprint_date"}
)


def _sugerir_tipo(col_nova):
    """Return the suggested type name for one snake_case column name."""
    if "_id" in col_nova or col_nova in _COLUNAS_INT:
        return "int"
    if col_nova in _COLUNAS_FLOAT:
        return "float"
    if col_nova in _COLUNAS_DATE:
        return "date"
    return "string"


def gerar_map_tipagem(table_name):
    """Print a ready-to-paste ``regras_<table>`` dictionary skeleton.

    Reads the column names of ``<bronze_path>.<table_name>``, converts them
    with ``replace_coluns`` and prints one ``"column": "type"`` entry per
    column, where the type is a name-based suggestion ("int", "float", "date"
    or "string"). The output is only a starting point and is meant to be
    reviewed by hand.

    Args:
        table_name: Name of the table inside the bronze namespace.

    Returns:
        None. The result is written to standard output.
    """
    # Reuse the notebook-level bronze_path instead of repeating the literal.
    df = spark.table(f"{bronze_path}.{table_name}")

    print(f"# Regras para {table_name.upper()}")
    print(f"regras_{table_name} = {{")
    for col_nova in (replace_coluns(c) for c in df.columns):
        print(f'    "{col_nova}": "{_sugerir_tipo(col_nova)}",')

    print("}")
    print("-" * 30)
# Bronze tables for which a typing-rule skeleton is printed, one per call.
tabelas = [
    "circuits",
    "drivers",
    "races",
    "results",
    "constructors",
    "qualifying",
    "pit_stops",
    "constructor_results",
    "constructor_standings",
    "driver_standings",
    "lap_times",
    "status",
    "sprint_results",
    "seasons",
]

for t in tabelas:
    gerar_map_tipagem(t)


In [0]:
# Each dictionary below maps a snake_case column name to the type name that
# aplicar_tipagem passes to Column.cast for that column.

# Rules for CIRCUITS
regras_circuits = {
    "circuit_id": "int",
    "circuit_ref": "string",
    "name": "string",
    "location": "string",
    "country": "string",
    "lat": "float",
    "lng": "float",
    "alt": "float",
    "url": "string",
}

# Rules for DRIVERS
regras_drivers = {
    "driver_id": "int",
    "driver_ref": "string",
    "number": "int",
    "code": "string",
    "forename": "string",
    "surname": "string",
    "dob": "date",
    "nationality": "string",
    "url": "string",
}

# Rules for RACES
# The *_time columns are kept as strings; only the *_date columns are cast.
regras_races = {
    "race_id": "int",
    "year": "int",
    "round": "string",
    "circuit_id": "int",
    "name": "string",
    "date": "date",
    "time": "string",
    "url": "string",
    "fp1_date": "date",
    "fp1_time": "string",
    "fp2_date": "date",
    "fp2_time": "string",
    "fp3_date": "date",
    "fp3_time": "string",
    "quali_date": "date",
    "quali_time": "string",
    "sprint_date": "date",
    "sprint_time": "string",
}

# Rules for RESULTS
# Maps each snake_case column name to the type name used when casting it.
# "fastest_lap_time" stays a string: lap times in this dataset are presumably
# formatted as "m:ss.mmm", which a float cast cannot parse, so the value would
# be lost — confirm against the bronze data before converting it numerically.
regras_results = {
    "result_id": "int",
    "race_id": "int",
    "driver_id": "int",
    "constructor_id": "int",
    "number": "int",
    "grid": "int",
    "position": "int",
    "position_text": "string",
    "position_order": "string",
    "points": "float",
    "laps": "string",
    "time": "string",
    "milliseconds": "int",
    "fastest_lap": "string",
    "rank": "string",
    "fastest_lap_time": "string",
    "fastest_lap_speed": "float",
    "status_id": "int",
}

# Each dictionary below maps a snake_case column name to the type name used
# when casting that column.

# Rules for CONSTRUCTORS
regras_constructors = {
    "constructor_id": "int",
    "constructor_ref": "string",
    "name": "string",
    "nationality": "string",
    "url": "string",
}

# Rules for QUALIFYING
regras_qualifying = {
    "qualify_id": "int",
    "race_id": "int",
    "driver_id": "int",
    "constructor_id": "int",
    "number": "int",
    "position": "int",
    "q1": "string",
    "q2": "string",
    "q3": "string",
}

# Rules for PIT_STOPS
regras_pit_stops = {
    "race_id": "int",
    "driver_id": "int",
    "stop": "string",
    "lap": "int",
    "time": "string",
    "duration": "string",
    "milliseconds": "int",
}

# Rules for CONSTRUCTOR_RESULTS
regras_constructor_results = {
    "constructor_results_id": "int",
    "race_id": "int",
    "constructor_id": "int",
    "points": "float",
    "status": "string",
}

# Rules for CONSTRUCTOR_STANDINGS
# NOTE(review): "wins" is kept as a string although it looks like a count —
# confirm whether "int" is intended.
regras_constructor_standings = {
    "constructor_standings_id": "int",
    "race_id": "int",
    "constructor_id": "int",
    "points": "float",
    "position": "int",
    "position_text": "string",
    "wins": "string",
}

# Rules for DRIVER_STANDINGS
# NOTE(review): "wins" is kept as a string although it looks like a count —
# confirm whether "int" is intended.
regras_driver_standings = {
    "driver_standings_id": "int",
    "race_id": "int",
    "driver_id": "int",
    "points": "float",
    "position": "int",
    "position_text": "string",
    "wins": "string",
}

# Rules for LAP_TIMES
regras_lap_times = {
    "race_id": "int",
    "driver_id": "int",
    "lap": "int",
    "position": "int",
    "time": "string",
    "milliseconds": "int",
}

# Rules for STATUS
regras_status = {
    "status_id": "int",
    "status": "string",
}

# Rules for SPRINT_RESULTS
# Maps each snake_case column name to the type name used when casting it.
# "fastest_lap_time" stays a string: lap times in this dataset are presumably
# formatted as "m:ss.mmm", which a float cast cannot parse, so the value would
# be lost — confirm against the bronze data before converting it numerically.
regras_sprint_results = {
    "result_id": "int",
    "race_id": "int",
    "driver_id": "int",
    "constructor_id": "int",
    "number": "int",
    "grid": "int",
    "position": "int",
    "position_text": "string",
    "position_order": "string",
    "points": "float",
    "laps": "string",
    "time": "string",
    "milliseconds": "int",
    "fastest_lap": "string",
    "fastest_lap_time": "string",
    "status_id": "int",
}

# Rules for SEASONS
# Maps each snake_case column name to the type name used when casting it.
regras_seasons = {
    "year": "int",
}


# All per-table rule dictionaries, listed in the same order as the `tabelas`
# list defined earlier in this notebook.
# NOTE(review): aplicar_tipagem looks column names up directly in the mapping
# it receives, so it needs ONE of these dictionaries per call, not this list;
# its parameter is also called `map_type` and shadows this name inside it.
map_type = [
    regras_circuits,
    regras_drivers,
    regras_races,
    regras_results,
    regras_constructors,
    regras_qualifying,
    regras_pit_stops,
    regras_constructor_results,
    regras_constructor_standings,
    regras_driver_standings,
    regras_lap_times,
    regras_status,
    regras_sprint_results,
    regras_seasons]


In [0]:


def aplicar_tipagem(df, map_type):
    """Cast the columns of ``df`` according to a column-to-type mapping.

    Columns whose name is a key of ``map_type`` are cast to the mapped type;
    every other column is selected unchanged. Column order is preserved.

    Args:
        df: Spark DataFrame to project.
        map_type: Mapping of column name to the type passed to ``Column.cast``.
            It must be a single mapping: with a list of mappings no column
            name would be found and nothing would be cast.

    Returns:
        A new DataFrame containing the same columns as ``df``.
    """
    projecao = [
        col(nome).cast(map_type[nome]) if nome in map_type else col(nome)
        for nome in df.columns
    ]
    return df.select(projecao)