In [1]:
import polars as pl
import os

from config import COLS_TO_FLATTEN, COLS_TO_SELECT

In [2]:
def get_flatten_df(original_df):
    """Expand the struct columns named in COLS_TO_FLATTEN into top-level columns.

    Each struct column from COLS_TO_FLATTEN that exists in ``original_df`` has
    its fields renamed to ``<column>_<field>`` and is then unnested, so the
    fields become ordinary columns of the returned frame.

    Parameters
    ----------
    original_df : polars frame
        Frame whose struct columns should be flattened.

    Returns
    -------
    polars frame
        A frame with the matching struct columns replaced by their prefixed
        fields. When none of COLS_TO_FLATTEN is present the input is returned
        unchanged.
    """
    available = set(original_df.columns)
    cols_in_df = [col for col in COLS_TO_FLATTEN if col in available]

    # Nothing to flatten: skip the rename/unnest round trip entirely.
    if not cols_in_df:
        return original_df

    # prefix_fields builds the prefix eagerly for each column, so the rename
    # cannot pick up a later value of the loop variable the way a lambda
    # closing over it could if the expression were evaluated lazily.
    prefixed = [pl.col(col).name.prefix_fields(f"{col}_") for col in cols_in_df]

    return original_df.with_columns(prefixed).unnest(cols_in_df)

In [3]:
def get_selected_cols_df(flatten_df, page):
    """Project ``flatten_df`` onto COLS_TO_SELECT, reporting absent columns first.

    Parameters
    ----------
    flatten_df : polars frame
        Frame to take the columns from.
    page : any
        Page identifier, used only in the printed message.

    Returns
    -------
    polars frame
        ``flatten_df`` restricted to COLS_TO_SELECT.

    Notes
    -----
    The message is informational only: the full COLS_TO_SELECT list is still
    passed to ``select``, so an absent column is expected to make that call
    fail rather than be skipped.
    """
    absent = set(COLS_TO_SELECT).difference(flatten_df.columns)

    if absent:
        print(f"The df page {page} is missing the following columns: {absent}")

    return flatten_df.select(COLS_TO_SELECT)

In [4]:
def get_df_all_pages(path):
    """Read every ``page_<n>.json`` file under ``path`` and stack them into one frame.

    Each page is read with ``pl.read_json``, given a ``km`` column of zeros if
    it has none, flattened with ``get_flatten_df`` and reduced to the wanted
    columns with ``get_selected_cols_df``. Pages are processed in ascending
    page-number order and concatenated once at the end.

    Parameters
    ----------
    path : str
        Directory holding the page files, resolved as ``./<path>``.

    Returns
    -------
    polars.DataFrame
        All pages stacked vertically in page order.

    Raises
    ------
    ValueError
        If the directory contains no ``page_<n>.json`` file.
    """
    base_dir = f"./{path}"

    # Discover pages from the file names themselves rather than assuming that
    # N json files means page_1..page_N exist: unrelated .json files or gaps
    # in the numbering no longer break the load.
    page_files = []
    for file_name in os.listdir(base_dir):
        stem, ext = os.path.splitext(file_name)
        page_id = stem[len("page_"):]
        if ext == ".json" and stem.startswith("page_") and page_id.isdecimal():
            page_files.append((int(page_id), file_name))
    page_files.sort()

    if not page_files:
        raise ValueError(f"No page_<n>.json files found in {base_dir!r}")

    print(f'Combining {len(page_files)} json files')

    frames = []
    for page, file_name in page_files:
        original_df = pl.read_json(os.path.join(base_dir, file_name))
        if 'km' not in original_df.columns:
            # Int64 is the dtype read_json infers for JSON integers, so the
            # placeholder lines up with pages that carry a real km column.
            original_df = original_df.with_columns(pl.lit(0).cast(pl.Int64).alias('km'))
        flatten_df = get_flatten_df(original_df)
        frames.append(get_selected_cols_df(flatten_df, page))

    # One concat over the collected frames instead of re-concatenating the
    # growing result on every iteration.
    return pl.concat(frames)


In [5]:
# Combine all page files from the scraping output directory into one frame.
df_all_scraped = get_df_all_pages('../01_extract/01_scraping_json_files') 

Combining 8441 json files


In [6]:
# Keep one row per distinct (id, price_amount) pair: the earliest occurrence
# wins and the surviving rows stay in their original order.
df_all_scraped = df_all_scraped.unique(
    subset=['id', 'price_amount'],
    keep='first',
    maintain_order=True,
)

In [7]:
# Combine all page files from the rescraping output directory into one frame.
df_all_rescraped = get_df_all_pages('../01_extract/01_rescraping_json_files')

Combining 1504 json files


In [8]:
# Keep one row per distinct (id, price_amount) pair: the earliest occurrence
# wins and the surviving rows stay in their original order.
df_all_rescraped = df_all_rescraped.unique(
    subset=['id', 'price_amount'],
    keep='first',
    maintain_order=True,
)

In [9]:
# Combine all page files from the new-car scraping output directory into one frame.
df_all_new = get_df_all_pages('../01_extract/01_new_car_scraping_json_files')

Combining 260 json files


In [10]:
# Keep one row per distinct (id, price_amount) pair: the earliest occurrence
# wins and the surviving rows stay in their original order.
df_all_new = df_all_new.unique(
    subset=['id', 'price_amount'],
    keep='first',
    maintain_order=True,
)

In [18]:
# Widen every Int32 column to Int64; a no-op when the frame has no Int32 columns.
# NOTE(review): presumably done so the schema matches the other frames before
# the vertical concat below, which needs matching dtypes - confirm.
df_all_new = df_all_new.with_columns(pl.col(pl.Int32).cast(pl.Int64))

In [19]:
# Stack the rescraped rows ahead of the scraped ones; row order matters to any
# later keep='first' de-duplication, which will then favour the rescraped rows.
df_load = pl.concat([df_all_rescraped, df_all_scraped])

In [20]:
# Append the new-car rows after the rows already in df_load.
df_load = pl.concat([df_load, df_all_new])

In [21]:
# De-duplicate across all sources: one row per distinct (id, price_amount)
# pair, keeping the earliest occurrence and preserving row order.
df_load = df_load.unique(
    subset=['id', 'price_amount'],
    keep='first',
    maintain_order=True,
)

In [24]:
# df_load.write_parquet('car_ads_load.parquet')