In [1]:
import pickle
import os 
import pandas as pd 
import re
import copy

# Inspect one pickled car record to see which fields are available.
# NOTE(security): pickle.load can execute arbitrary code while loading;
# only run this on pickle files from a trusted source.
with open("../data/features/car_number_2.pkl", "rb") as pickle_file:
    # The context manager closes the file handle even if unpickling fails.
    unpickled = pickle.load(pickle_file)

# The first two items are dictionaries and are merged into one;
# the third item is kept separately.
basic_info = unpickled[0]
basic_info.update(unpickled[1])
additional_info = unpickled[2]

print("=== BASIC INFO ===\n", basic_info.keys())
print("\n=== ADDITIONAL INFO ===\n", additional_info)

=== BASIC INFO ===
 dict_keys(['type', 'addressLocality', 'addressCountry', 'streetAddress', 'postalCode', 'telephone', 'name', 'image', 'logo', 'url', 'openingHours', 'latitude', 'longitude', 'priceRange', 'price', 'priceCurrency', 'description', 'vehicleEngine', 'value', 'unitCode', 'mileageFromOdometer', 'vehicleSeatingCapacity', 'productionDate', 'color', 'vehicleTransmission', 'bodyType', 'meetsEmissionStandard', 'numberOfDoors', 'manufacturer', 'steeringPosition', 'knownVehicleDamages', 'fuelType', 'brand', 'model', 'Plivajući zamajac', 'Emisiona klasa motora', 'Pogon', 'Menjač', 'Broj vrata', 'Broj sedišta', 'Strana volana', 'Klima', 'Boja', 'Materijal enterijera', 'Boja enterijera', 'Registrovan do', 'Poreklo vozila', 'Oštećenje', 'Zemlja uvoza'])

=== ADDITIONAL INFO ===
 ['Airbag za vozača', 'Airbag za suvozača', 'Bočni airbag', 'ABS', 'ESP', 'ASR', 'Kodiran ključ', 'Blokada motora', 'Centralno zaključavanje', 'Automatsko kočenje', 'Vazdušni jastuci za kolena', 'Branici u boj

## Collecting car features and unpickling them

In [2]:
def generate_price(basic_info, scale=1e6):
    """
    Derive a synthetic "price" value from a record's odometer reading.

    The value of ``basic_info['mileageFromOdometer']`` has its last four
    characters dropped, every remaining non-digit character removed, and the
    resulting digits parsed as a number ``m``.

    Note that, despite the function name, the result is computed purely from
    the mileage field: it equals ``scale / (m + 1)``.

    Parameters
    ----------
    basic_info : dict
        Record containing a string under the key 'mileageFromOdometer'.
    scale : float, optional
        Numerator of the returned ratio (default 1e6).

    Returns
    -------
    float
        ``scale * 1 / (m + 1)``.
    """
    odometer_text = basic_info['mileageFromOdometer']

    # Drop the 4-character tail first, then strip anything that is not a digit
    # (e.g. the '.' in '188.763').
    digits_only = re.sub("[^0-9]", "", odometer_text[ : -4])
    odometer_reading = float(digits_only)

    # The +1 keeps the denominator non-zero for a reading of 0.
    return scale * 1 / (odometer_reading + 1)



def collect_raw_data(data_folder, max_samples=1000):
    """
    Load pickled car records from ``data_folder``.

    Every entry returned by ``os.listdir(data_folder)`` is opened and
    unpickled, stopping after ``max_samples`` entries. The first two items of
    each unpickled object are dictionaries that get merged into one record;
    the third item is not used here. The record's 'price' key is then set to
    ``generate_price(record)`` (overwriting any existing value).

    NOTE: ``os.listdir`` returns entries in arbitrary order, so which files
    are selected when ``max_samples`` is smaller than the folder size is not
    guaranteed to be stable across machines.

    NOTE(security): ``pickle.load`` can execute arbitrary code; only point
    this at folders containing trusted pickle files.

    Parameters
    ----------
    data_folder : str
        Path of the folder holding the pickle files.
    max_samples : int, optional
        Maximum number of files to load (default 1000).

    Returns
    -------
    list of dict
        A list where each element is a dictionary corresponding to a single car.
    """
    raw_data = []
    file_names = os.listdir(data_folder)

    for i, file_name in enumerate(file_names):

        if i >= max_samples:
            break

        file_path = os.path.join(data_folder, file_name)

        # Use a context manager so the handle is closed after each file;
        # otherwise one open descriptor leaks per loaded record.
        with open(file_path, "rb") as pickle_file:
            unpickled = pickle.load(pickle_file)

        basic_info = unpickled[0]
        basic_info.update(unpickled[1])

        basic_info['price'] = generate_price(basic_info)

        raw_data.append(basic_info)

    return raw_data

In [3]:
# Load a limited number of car records from the features folder.
data_folder = "../data/features"
num_samples = 100
raw_data = collect_raw_data(data_folder, max_samples=num_samples)

# Show the first record as a sanity check of the available keys and values.
first_record = raw_data[0]
print(first_record)

{'url': 'https://www.polovniautomobili.com/auto-oglasi/17942149/peugeot-407-16hdi-restajling ,', 'image': 'https://hcdn.polovniautomobili.com/user-images/thumbs/1794/17942149/a05cf0dd4a24.jpg-800x600.jpg ,', 'brand': 'Peugeot', 'model': '407', 'manufacturer': 'Peugeot', 'productionDate': '2009', 'fuelType': 'Dizel', 'bodyType': 'Limuzina', 'vehicleEngine': '80.00 KWT', 'mileageFromOdometer': '188.763 KMT', 'name': 'Peugeot 407 16HDI RESTAJLING', 'Plivajući zamajac': 'Sa plivajućim zamajcem', 'Emisiona klasa motora': 'Euro 4', 'Pogon': 'Prednji', 'Menjač': 'Manuelni 5 brzina', 'Broj vrata': '4/5 vrata', 'Broj sedišta': '5 sedišta', 'Strana volana': 'Levi volan', 'Klima': 'Automatska klima', 'Boja': 'Braon', 'Materijal enterijera': 'Štof', 'Boja enterijera': 'Crna', 'Registrovan do': 'Nije registrovan', 'Poreklo vozila': 'Na ime kupca', 'Oštećenje': 'Nije oštećen', 'price': 5.2976203089572165}


## Discarding most of the features and keeping only the interesting ones

In [19]:
def preprocess_features(data_raw, keys_to_preprocess=()):
    """
    Return a shallow copy of ``data_raw`` with selected string fields parsed
    into numbers.

    Only keys listed in ``keys_to_preprocess`` are converted; all other
    entries are copied unchanged. ``data_raw`` itself is not modified.

    Supported keys:
      * 'productionDate' -> ``int`` of the whole string (e.g. '2009' -> 2009)
      * 'vehicleEngine'  -> ``float`` of the first whitespace-separated token
                            (e.g. '80.00 KWT' -> 80.0)
      * 'Broj sedišta'   -> ``int`` of the text before the first space
                            (e.g. '5 sedišta' -> 5)

    Parameters
    ----------
    data_raw : dict
        A single car record.
    keys_to_preprocess : iterable of str, optional
        Names of the fields to convert (default: none).

    Returns
    -------
    dict
        The converted copy of the record.

    Raises
    ------
    KeyError
        If a requested key is missing from ``data_raw``.
    ValueError
        If a requested field cannot be parsed as a number.
    """
    data_clean = copy.copy(data_raw)

    if 'productionDate' in keys_to_preprocess:
        data_clean['productionDate'] = int(data_raw['productionDate'])

    if 'vehicleEngine' in keys_to_preprocess:
        # Parse the numeric token in front of the unit. A fixed [:-5] slice
        # removes one character more than the 4-character ' KWT' suffix and
        # therefore drops the last decimal digit ('80.55 KWT' -> 80.5).
        data_clean['vehicleEngine'] = float(data_raw['vehicleEngine'].split()[0])

    if 'Broj sedišta' in keys_to_preprocess:
        data_clean['Broj sedišta'] = int(data_raw['Broj sedišta'].split(' ')[0])

    return data_clean

In [21]:
# Fields that become columns of the table, in column order.
keys_to_keep = [
    'brand',
    'productionDate',
    'bodyType',
    'fuelType',
    'vehicleEngine',
    'Broj sedišta',
    'Oštećenje',
    'Menjač',
    'Klima',
]

# Fields whose string values are parsed into numbers by preprocess_features.
keys_to_preprocess = ['productionDate', 'vehicleEngine', 'Broj sedišta']

# Convert every record, then pull out only the kept fields as one row each.
cleaned_records = [
    preprocess_features(record, keys_to_preprocess) for record in raw_data
]
kept_values = [
    [record[key] for key in keys_to_keep] for record in cleaned_records
]

df_raw = pd.DataFrame(kept_values, columns=keys_to_keep)
df_raw

Unnamed: 0,brand,productionDate,bodyType,fuelType,vehicleEngine,Broj sedišta,Oštećenje,Menjač,Klima
0,Peugeot,2009,Limuzina,Dizel,80.0,5,Nije oštećen,Manuelni 5 brzina,Automatska klima
1,Renault,2020,Džip/SUV,Benzin,103.0,5,Nije oštećen,Manuelni 6 brzina,Automatska klima
2,BMW,2010,Džip/SUV,Dizel,130.0,5,Nije oštećen,Automatski,Automatska klima
3,Volkswagen,2005,Limuzina,Dizel,77.0,5,Nije oštećen,Manuelni 5 brzina,Automatska klima
4,Peugeot,2016,Karavan,Dizel,88.0,5,Nije oštećen,Manuelni 6 brzina,Automatska klima
...,...,...,...,...,...,...,...,...,...
95,Škoda,2016,Hečbek,Dizel,81.0,5,Nije oštećen,Automatski,Automatska klima
96,Audi,2008,Karavan,Dizel,171.0,5,Nije oštećen,Automatski,Automatska klima
97,Volkswagen,2015,Karavan,Dizel,88.0,5,Nije oštećen,Automatski,Automatska klima
98,Citroen,2012,Hečbek,Dizel,68.0,5,Nije oštećen,Manuelni 5 brzina,Automatska klima


## Map categorical features to integer codes

In [38]:
# =======================================================================
# == AC mapping ==

# Each 'Klima' label is assigned its position in this tuple as integer code.
AC_map = {
    label: code
    for code, label in enumerate(
        ('Nema klimu', 'Manuelna klima', 'Automatska klima')
    )
}

# =======================================================================
# == Brands mapping ==

# Only the first line of brands.txt is read. The context manager guarantees
# the file is closed even if reading fails.
with open('../data/brands.txt', 'r', encoding='UTF-8') as brands_file:
    brands_raw_str = brands_file.readline()

# Split the line on 'value="' and skip the first two chunks.
# NOTE(review): the line presumably holds HTML <option> markup of a brand
# <select>; the fixed 17-character trim looks like it removes a trailing
# '</option><option ' -- confirm against the file.
brands_raw = brands_raw_str.split('value="')[2 : ]
brands_clean = [x.replace("</select>", "<option ")[ : -17].replace('">', "$") for x in brands_raw]

# Each cleaned entry has the form '<before>$<after>'; the part after the '$'
# becomes the key and the entry's position becomes its integer code.
brands_map = {
    brand.split('$')[1]: i for i, brand in enumerate(brands_clean)
}

# =======================================================================
# == Map bodyType  ==

# In every mapping below a label's integer code is its position in the list.
body_map = {
    label: code
    for code, label in enumerate([
        'Limuzina',
        'Hečbek',
        'Karavan',
        'Kupe',
        'Kabriolet/Roadster',
        'Monovolumen (MiniVan)',
        'Džip/SUV',
        'Pickup',
    ])
}

# =======================================================================
# == Map fuelType ==

fuel_map = {
    label: code
    for code, label in enumerate([
        'Benzin',
        'Dizel',
        'Benzin + Gas (TNG)',
        'Benzin + Metan (CNG)',
        'Električni pogon',
        'Hibridni pogon',
    ])
}

# =======================================================================
# == Map Oštećenje ==

damage_map = {
    label: code
    for code, label in enumerate([
        'Nije oštećen',
        'Oštećen - u voznom stanju',
        'Oštećen - nije u voznom stanju',
    ])
}

# =======================================================================
# == Map Menjač ==

control_map = {
    label: code
    for code, label in enumerate([
        'Manuelni 4 brzine',
        'Manuelni 5 brzina',
        'Manuelni 6 brzina',
        'Poluautomatski',
        'Automatski / poluautomatski',
        'Automatski',
    ])
}


# Per-column lookup tables: column name -> {label: integer code}.
to_replace = {
    "brand" : brands_map,
    "bodyType" : body_map,
    "fuelType" : fuel_map,
    "Oštećenje" : damage_map,
    "Menjač" : control_map,
    "Klima" : AC_map,
}

# replace() returns a new frame, so df_raw keeps the original labels.
# Labels that are missing from a lookup table are left as they are.
df_clean = df_raw.replace(to_replace=to_replace)
df_clean.head()

Unnamed: 0,brand,productionDate,bodyType,fuelType,vehicleEngine,Broj sedišta,Oštećenje,Menjač,Klima
0,68,2009,0,1,80.0,5,0,1,2
1,77,2020,6,0,103.0,5,0,2,2
2,11,2010,6,1,130.0,5,0,5,2
3,99,2005,0,1,77.0,5,0,1,2
4,68,2016,2,1,88.0,5,0,2,2
