In [1]:
import os
from src.cleaning.common_functions import *
os.chdir("../..")
from sibr_module import BigQuery, Logger, CStorage

import pandas as pd
pd.set_option('display.max_columns', 500)



In [2]:
replace = True
dataset = 'cars'
logger = Logger(f'pre_process{dataset.capitalize()}')
bq = BigQuery(logger=logger,dataset=dataset)
cs = CStorage(logger=logger, bucket_name='sibr-market-bucket')
logger.debug(f'Replace: {replace} | Dataset: {dataset}')

2025-07-09 14:02:58,896 - pre_processCars - INFO - Cloud Logging is disabled. Using local logging to /Users/sigvardbratlie/Documents/Projects/sibr_market_training/logfiles/pre_processCars.log
2025-07-09 14:02:58,901 - pre_processCars - INFO - BigQuery client initialized with project_id: sibr-market
2025-07-09 14:02:58,903 - pre_processCars - INFO - Google Cloud Storage client initialized with bucket: sibr-market-bucket
2025-07-09 14:02:58,903 - pre_processCars - DEBUG - Replace: True | Dataset: cars


In [3]:
df_ = bq.read_clean(replace = replace)

2025-07-09 14:03:01,492 - pre_processCars - INFO - Reading clean data from dataset: cars
2025-07-09 14:03:42,504 - pre_processCars - INFO - 547870 rows read from cars. Query: 
            SELECT c.* FROM sibr-market.clean.cars c
            ... (truncated)


## Set index and remove empty data

In [7]:
df = df_.dropna(subset='item_id')
df.set_index('item_id',inplace=True)
logger.debug(f'Length of df: {len(df)} | after dropping NaN on item_id')

2025-07-09 14:08:54,642 - pre_processCars - DEBUG - Length of df: 547870 | after dropping NaN on item_id


In [8]:
df.dropna(subset=['total_price','mileage','model_year'],inplace=True)

drop = ['address','description','contact_person','phone','url','country','title','web','email','municipality','rn','FIRST','LAST','postal_code','region','last_updated','salgstid'
    ,'vin','reg_num','co2','color_description','first_registration','cargo_space','prev_owners',
        'last_eu','next_eu','trailer_weight','subtitle','warranty_until','known_issues','engine_tuned',
        'liens','major_repairs','state','battery','price_excl_transfer','clean_date','warranty','color_interior', 'gearbox_type','warranty_length','condition_report','warranty'
]

drop_eq = [x for x in df.columns if x.startswith('eq_') if x not in
           ['eq_rear_view_camera','eq_bluetooth','eq_tow_hitch','eq_360_camera','eq_cruise_control','eq_parking_sensor_behind',
 'eq_parking_sensor_front', 'eq_air_conditioning','eq_navigation','eq_winter_tires','eq_summer_tires' ,
 'eq_skin_interior','eq_apple_carplay','eq_led_lights', 'eq_xenon_lights',]]
df.drop(columns=drop_eq, inplace=True, errors='ignore')

df.drop(columns=drop,inplace=True,errors = 'ignore')
df = rm_empty_features(df)
logger.debug(f'Length of df: {len(df)} | after dropping NaN on price, usable_area and bedrooms')

2025-07-09 14:09:16,190 - pre_processCars - DEBUG - Length of df: 547870 | after dropping NaN on price, usable_area and bedrooms


## DUMMY VARIABLES

In [18]:

# for col in dummy_cols:
#     logger.debug(f'Processing column: {col}')
#     df.loc[:,col] = df[col].str.lower()
#     if col == 'fuel':
#         df.loc[:,col] = df[col].map(fuel_mapping).fillna('annet')
#     elif col == 'body_type':
#         df.loc[:,col] = df[col].map(body_type_mapping).fillna('annet')
#     elif col == 'brand':
#         df.loc[:,col] = df[col].replace(brand_mapping)
#     for cat, num in df[col].value_counts(normalize = True).items():
#         if num < 0.01:
#             df = df[df[col] != cat]
#         #df.loc[:,col] = df[col].astype('category')

In [9]:
dummy_cols = [
    'gearbox',
    'fuel',
    'color',
    'wheel_drive',
    'body_type',
    'sales_type',
    'category',
    'brand',
    #'model',
    'county',
]
fuel_mapping = {
    'diesel': 'diesel',
    'bensin': 'bensin',
    'el': 'el',
    'elektrisitet': 'el',
    'el + bensin': 'el + bensin',
    'hybrid bensin': 'el + bensin',
    'plug-in bensin': 'el + bensin',
    'el + diesel': 'el + diesel',
    'hybrid diesel': 'el + diesel',
    'plug-in diesel': 'el + diesel',
    'hydrogen': 'hydrogen'
}
body_type_mapping = {
    'suv/offroad': 'flerbruksbil (af)',
    'stasjonsvogn': 'stasjonsvogn (ac)',
    'kombi 5-dørs': 'kombikupé (ab)',
    '5': 'kombikupé (ab)',
    'kasse': 'integrert førerhus (bb)',
    'sedan': 'sedan (aa)',
    'annet': 'annet',
    'flerbruksbil': 'flerbruksbil (af)',
    'coupe': 'kupé (ad)',
    'cabriolet': 'kabriolet (ae)',
    'pickup': 'pick-up (be)',
    'kombi 3-dørs': 'kombikupé (ab)',  # endret fra 'kupé (ad)'
    '3': 'kombikupé (ab)'  # endret fra 'kupé (ad)'
}
brand_mapping = {'tesla motors':'tesla',
             'bmw i':'bmw',
             'alfa' : 'alfa romeo',
             'jaguar land rover limited' : 'land rover',
             'automobili lamborghini s.p.a.' : 'lamborghini',
             'land' : 'land rover',
             'rover' : 'land rover',
             'range rover' : 'land rover',
             'mercedes sprinter / kegger' : 'mercedes-benz',
             'mercedes-amg' : 'mercedes-benz',
             'ford-cng-technik' : 'ford',
             'daimler' : 'mercedes-benz',
             'kg mobility' : 'kgm',
             'mitsubishi fuso' : 'mitsubishi',
             'jaguar cars limited' : 'jaguar'}
for col in ['fuel','body_type','brand']:
    #logger.debug(f'Processing column: {col}')
    df.loc[:,col] = df[col].str.lower()
    if col == 'fuel':
        df.loc[:,col] = df[col].map(fuel_mapping).fillna('annet')
    elif col == 'body_type':
        df.loc[:,col] = df[col].map(body_type_mapping).fillna('annet')
    elif col == 'brand':
        df.loc[:,col] = df[col].replace(brand_mapping)

df['dealer'] = df['dealer'].apply(lambda x: False if x.lower() == 'private' else True)

df = process_bool(df)

logger.debug(f'Length of df: {len(df)} | after mapping categories and creating dummy variables')

2025-07-09 14:09:23,503 - pre_processCars - DEBUG - Length of df: 547870 | after mapping categories and creating dummy variables


In [10]:
df_['gearbox'].value_counts()

gearbox
Automat           425123
Manuell           122679
Ikke oppgitt          66
AutomatAutomat         1
MANUAL                 1
Name: count, dtype: int64

In [11]:
def mk_cat(df, col, valid_values):
    """
    Convert a column to a categorical type with specified valid values.
    """
    df[col] = df[col].apply(lambda x: x.lower() if  isinstance(x,str) else x)
    valid_values = [x.lower() for x in valid_values if isinstance(x, str)]
    isin = df[col].isin(valid_values)
    df = df[isin].copy()
    df.loc[:, col] = df[col].astype(str)
    df.loc[:, col] = pd.Categorical(df[col], categories=valid_values, ordered=False)
    return df

col_valid_values = {
    'fuel': ['el + bensin','el + diesel', 'diesel', 'el', 'bensin'],
    'gearbox': ['automat', 'manuell'],
    'color': ['svart', 'grå', 'rød', 'blå', 'sølv', 'grønn', 'brun', 'hvit'],
    'wheel_drive': ['forhjulsdrift', 'firehjulsdrift', 'bakhjulsdrift'],
    'body_type': [
        'flerbruksbil (af)',
        'stasjonsvogn (ac)',
        'kombikupé (ab)',
        'integrert førerhus (bb)',
        'sedan (aa)',
        'pick-up (be)',
        'kupé (ad)',
        'annet'
    ],
    'sales_type': ['bruktbil til salgs', 'auksjon', 'nybil til salgs'],
    'brand': [
        'ford', 'skoda', 'bmw', 'volvo', 'peugeot', 'mercedes-benz',
        'volkswagen', 'kia', 'toyota', 'audi', 'porsche', 'citroen',
        'opel', 'nissan', 'renault', 'hyundai', 'mazda', 'tesla',
        'mitsubishi', 'suzuki'
    ],
    'county': [
        'rogaland', 'innlandet', 'buskerud', 'akershus', 'møre og romsdal',
        'vestfold', 'oslo', 'vestland', 'nordland', 'trøndelag', 'agder',
        'østfold', 'telemark', 'troms'
    ],
    'category': ['personbil', 'varebil']
}
for col, valid_values in col_valid_values.items():
    df = mk_cat(df, col, valid_values)
    logger.debug(f'Length of df: {len(df)} | removing unwanted categories from {col}')

logger.debug(f'Length of df: {len(df)} | after mapping and removing unwanted categories')

2025-07-09 14:09:38,971 - pre_processCars - DEBUG - Length of df: 547405 | removing unwanted categories from fuel
2025-07-09 14:09:39,346 - pre_processCars - DEBUG - Length of df: 547340 | removing unwanted categories from gearbox
2025-07-09 14:09:39,736 - pre_processCars - DEBUG - Length of df: 533514 | removing unwanted categories from color
2025-07-09 14:09:40,111 - pre_processCars - DEBUG - Length of df: 532097 | removing unwanted categories from wheel_drive
2025-07-09 14:09:40,515 - pre_processCars - DEBUG - Length of df: 526989 | removing unwanted categories from body_type
2025-07-09 14:09:40,890 - pre_processCars - DEBUG - Length of df: 526770 | removing unwanted categories from sales_type
2025-07-09 14:09:41,249 - pre_processCars - DEBUG - Length of df: 484335 | removing unwanted categories from brand
2025-07-09 14:09:41,629 - pre_processCars - DEBUG - Length of df: 477602 | removing unwanted categories from county
2025-07-09 14:09:41,949 - pre_processCars - DEBUG - Length of d

In [12]:
df = df[df['sales_type'] != 'leasing']

logger.debug(f'Length of df: {len(df)} | After filtering sales_type')

2025-07-09 14:09:45,925 - pre_processCars - DEBUG - Length of df: 476639 | After filtering sales_type


## Features

In [13]:
df['n_features'] = df['features'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)
df.drop('features', axis=1, inplace=True)
# # Step 1: Find the 10 most common features
# features_lists = df['features'].dropna().apply(lambda x: [f.strip() for f in x.split(',')])
# feature_counts = Counter(chain.from_iterable(features_lists))
# top_10_features = set([f for f, _ in feature_counts.most_common(10)])
#
# # Step 2 & 3: Filter and update the features column
# def filter_top_features(feature_str):
#     if pd.isna(feature_str):
#         return feature_str
#     features = [f.strip() for f in feature_str.split(',')]
#     filtered = [f for f in features if f in top_10_features]
#     return ', '.join(filtered) if filtered else np.nan
#
# df['features'] = df['features'].apply(filter_top_features)

## DATE COLUMNS

In [14]:
df['scrape_date'] = pd.to_datetime(df['scrape_date'], errors='coerce')
df['day'] = df['scrape_date'].dt.day
df['month'] = df['scrape_date'].dt.month
df['year'] = df['scrape_date'].dt.year
df.drop('scrape_date',axis=1,inplace=True)
df['pre_processed_date'] = pd.Timestamp.now()
logger.debug(f'Length of df: {len(df)} | after date columns')

2025-07-09 14:09:58,951 - pre_processCars - DEBUG - Length of df: 476639 | after date columns


## ENSURE CORRECT DATA TYPES

In [15]:
df = ensure_num_types(df,num_types=['int','float'])

## Split electric & fossil

In [16]:
df_el = df[df['fuel'] == 'el']
df_fossil = df[df['fuel'] != 'el']

## ELECTRIC

In [17]:
df_el.dropna(subset=['range'],inplace=True)
df_el.drop(columns=['engine_volume'], inplace=True, errors='ignore')
df_el.info()

<class 'pandas.core.frame.DataFrame'>
Index: 126890 entries, 365946617 to 401238336
Data columns (total 42 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   model_year                     126890 non-null  Int64         
 1   mileage                        126890 non-null  Int64         
 2   gearbox                        126890 non-null  object        
 3   fuel                           126890 non-null  object        
 4   transfer_fee                   126890 non-null  Int64         
 5   color                          126890 non-null  object        
 6   wheel_drive                    126890 non-null  object        
 7   power                          126890 non-null  Int64         
 8   weight                         126890 non-null  Int64         
 9   seats                          126890 non-null  Int64         
 10  body_type                      126890 non-null  object        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_el.dropna(subset=['range'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_el.drop(columns=['engine_volume'], inplace=True, errors='ignore')


In [18]:
logger.debug(f'Length of df_el: {len(df_el)} | before saving to BQ. Replace is {replace}')
if replace:
    bq.to_bq(df_el,
             table_name=f'{dataset}_el',
             dataset_name='pre_processed',
             if_exists='replace')
else:
    bq.to_bq(df_el,
             table_name=f'{dataset}_el',
             dataset_name='pre_processed',
             if_exists='merge',
             merge_on=['item_id'])

2025-07-09 14:10:15,880 - pre_processCars - DEBUG - Length of df_el: 126890 | before saving to BQ. Replace is True
2025-07-09 14:10:21,282 - pre_processCars - INFO - 126890 rader lagret i sibr-market.pre_processed.cars_el


## FOSSIL

In [19]:
df_fossil = df_fossil.drop(columns=['range'], errors='ignore')
df_fossil.info()

<class 'pandas.core.frame.DataFrame'>
Index: 336319 entries, 400706752 to 390060678
Data columns (total 42 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   model_year                     336319 non-null  Int64         
 1   mileage                        336319 non-null  Int64         
 2   gearbox                        336319 non-null  object        
 3   fuel                           336319 non-null  object        
 4   transfer_fee                   336319 non-null  Int64         
 5   color                          336319 non-null  object        
 6   wheel_drive                    336319 non-null  object        
 7   power                          336319 non-null  Int64         
 8   engine_volume                  327449 non-null  Float64       
 9   weight                         336319 non-null  Int64         
 10  seats                          336319 non-null  Int64         

In [20]:
logger.debug(f'Length of df_fossil: {len(df_fossil)} | Before saving to BQ. Replace is {replace}')
if replace:
    bq.to_bq(df_fossil,
             table_name=f'{dataset}_fossil',
             dataset_name='pre_processed',
             if_exists='replace')
else:
    bq.to_bq(df_fossil,
             table_name=f'{dataset}_fossil',
             dataset_name='pre_processed',
             if_exists='merge',
             merge_on=['item_id'])

2025-07-09 14:10:21,500 - pre_processCars - DEBUG - Length of df_fossil: 336319 | Before saving to BQ. Replace is True
2025-07-09 14:10:29,043 - pre_processCars - INFO - 336319 rader lagret i sibr-market.pre_processed.cars_fossil
