In [13]:
import pickle
import os 
import pandas as pd
import numpy as np  
import re
import copy

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor

pickle_file = open("../data/features/car_number_2.pkl", "rb")
unpickled = pickle.load(pickle_file)

basic_info = unpickled[0]
basic_info.update(unpickled[1])
additional_info = unpickled[2]

print("=== BASIC INFO ===\n", basic_info.keys())
print("\n=== ADDITIONAL INFO ===\n", additional_info)

=== BASIC INFO ===
 dict_keys(['type', 'addressLocality', 'addressCountry', 'streetAddress', 'postalCode', 'telephone', 'name', 'image', 'logo', 'url', 'openingHours', 'latitude', 'longitude', 'priceRange', 'price', 'priceCurrency', 'description', 'vehicleEngine', 'value', 'unitCode', 'mileageFromOdometer', 'vehicleSeatingCapacity', 'productionDate', 'color', 'vehicleTransmission', 'bodyType', 'meetsEmissionStandard', 'numberOfDoors', 'manufacturer', 'steeringPosition', 'knownVehicleDamages', 'fuelType', 'brand', 'model', 'Plivajući zamajac', 'Emisiona klasa motora', 'Pogon', 'Menjač', 'Broj vrata', 'Broj sedišta', 'Strana volana', 'Klima', 'Boja', 'Materijal enterijera', 'Boja enterijera', 'Registrovan do', 'Poreklo vozila', 'Oštećenje', 'Zemlja uvoza'])

=== ADDITIONAL INFO ===
 ['Airbag za vozača', 'Airbag za suvozača', 'Bočni airbag', 'ABS', 'ESP', 'ASR', 'Kodiran ključ', 'Blokada motora', 'Centralno zaključavanje', 'Automatsko kočenje', 'Vazdušni jastuci za kolena', 'Branici u boj

## Collecting car features and unpickling them

In [2]:
def generate_price(basic_info, scale=1e6):
    
    mileage_str_raw = basic_info['mileageFromOdometer'] 
    mileage_str = re.sub("[^0-9]", "", mileage_str_raw[ : -4]) 
    # price = scale * 1 / (float(mileage_str) + 1)
    # price = float(mileage_str)
    price = scale - float(mileage_str)

    return price


def collect_prices(prices_path):
    
    pickle_file = open(prices_path, "rb")
    prices_unpickled = pickle.load(pickle_file)

    prices_dict = {}
    for price_tuple in prices_unpickled:
        idx = price_tuple[0]
        value = re.sub("[^0-9]", "", price_tuple[1])
        if value == "":
            continue 
        prices_dict.update({idx : int(value)})

    return prices_dict


def collect_raw_data(data_folder, prices):
    """
    Returns a list where each element is a dictionary corresponding to a single car.

    """
    
    raw_data = []
    file_names = os.listdir(data_folder)

    # for file_name in file_names:
    for car_idx in prices.keys():

        file_name = f"car_number_{car_idx}.pkl"
        if file_name not in file_names:
            continue

        file_path = os.path.join(data_folder, file_name)
        pickle_file = open(file_path, "rb")
        unpickled = pickle.load(pickle_file)

        try:
            basic_info = unpickled[0]
            basic_info.update(unpickled[1])
            additional_info = unpickled[2]
        except(TypeError):
            return raw_data

        # price = generate_price(basic_info)
        price = prices[car_idx]

        basic_info.update({'price' : price})

        raw_data.append(basic_info)

    return raw_data

In [3]:
features_folder = "../data/features"
prices_path = "../data/prices.pkl"

prices = collect_prices(prices_path)
raw_data = collect_raw_data(features_folder, prices)



## Discarding most of the features and keeping only the interesting ones

In [4]:
def preprocess_features(data_raw, keys_to_preprocess=[]):

    data_clean = copy.copy(data_raw)
    
    if ('productionDate' in keys_to_preprocess):
        data_clean['productionDate'] = int(data_raw['productionDate'])

    if ('vehicleEngine' in  keys_to_preprocess):
        data_clean['vehicleEngine'] = float(data_raw['vehicleEngine'][ : -5])

    if ('Broj sedišta' in keys_to_preprocess):
        data_clean['Broj sedišta'] = int(data_raw['Broj sedišta'].split(' ')[0])

    if ('mileageFromOdometer' in keys_to_preprocess):
        no_dots = re.sub("[^0-9]", "", data_raw['mileageFromOdometer'][ : -4])
        data_clean['mileageFromOdometer'] = float(no_dots)

    return data_clean

In [7]:
keys_to_keep = [
    'brand', 'productionDate', 'bodyType', 'fuelType', 
    'vehicleEngine', 'mileageFromOdometer', 'Broj sedišta', 
    'Oštećenje', 'Menjač', 'Klima', 'price'
]

keys_to_preprocess = ['productionDate', 'vehicleEngine', 'Broj sedišta', 'mileageFromOdometer']
kept_values = []

for i in range(len(raw_data)):
    
    curr_features_clean = preprocess_features(raw_data[i], keys_to_preprocess)
    curr_features = [
        curr_features_clean[key] for key in keys_to_keep
    ] 
    
    kept_values.append(curr_features)

df_raw = pd.DataFrame(kept_values, columns=keys_to_keep)
df_raw.head()

Unnamed: 0,brand,productionDate,bodyType,fuelType,vehicleEngine,mileageFromOdometer,Broj sedišta,Oštećenje,Menjač,Klima,price
0,Peugeot,2009,Limuzina,Dizel,80.0,188763.0,5,Nije oštećen,Manuelni 5 brzina,Automatska klima,35990
1,Renault,2015,Hečbek,Dizel,66.0,160000.0,5,Nije oštećen,Manuelni 5 brzina,Manuelna klima,2599
2,Nissan,2020,Džip/SUV,Dizel,110.0,0.0,7,Nije oštećen,Automatski,Automatska klima,13560
3,Renault,2021,Hečbek,Benzin,48.0,5.0,4,Nije oštećen,Manuelni 5 brzina,Automatska klima,19610
4,Kia,2021,Džip/SUV,Benzin,97.0,5.0,5,Nije oštećen,Manuelni 6 brzina,Manuelna klima,15850


## Map 

In [8]:
# =======================================================================
# == AC mapping ==

AC_map = {'Nema klimu' : 0, 'Manuelna klima' : 1, 'Automatska klima' : 2}

# =======================================================================
# == Brands mapping ==

brands_file = open('../data/brands.txt', 'r', encoding='UTF-8')
brands_raw_str = brands_file.readline()
brands_file.close()

brands_raw = brands_raw_str.split('value="')[2 : ]
brands_clean = [x.replace("</select>", "<option ")[ : -17].replace('">', "$") for x in brands_raw]

brands_map = {}
for i, brand in enumerate(brands_clean):
    key = brand.split('$')[1]
    brands_map.update({key: i})

# =======================================================================
# == Map bodyType  == 

body_map = {
    'Limuzina' : 0, 'Hečbek' : 1, 'Karavan' : 2, 'Kupe' : 3, 'Kabriolet/Roadster' : 4, 
    'Monovolumen (MiniVan)' : 5, 'Džip/SUV' : 6, 'Pickup' : 7
    }

# =======================================================================
# == Map fuelType ==

fuel_map = {
    'Benzin' : 0, 'Dizel' : 1, 'Metan CNG' : 2, 'Benzin + Gas (TNG)' : 3,
    'Benzin + Metan (CNG)' : 4, 'Električni pogon' : 5, 'Hibridni pogon' : 6
}

# =======================================================================
# == Map Oštećenje ==

damage_map = {
    'Nije oštećen' : 0, 'Oštećen - u voznom stanju' : 1,
    'Oštećen - nije u voznom stanju' : 2
    }

# =======================================================================
# == Map Menjač ==

control_map = {
    'Manuelni 4 brzine' : 0, 'Manuelni 5 brzina' : 1, 
    'Manuelni 6 brzina' : 2, 'Poluautomatski' : 3,
    'Automatski / poluautomatski' : 4, 'Automatski' : 5
}


to_replace = {
    "Klima" : AC_map, "brand" : brands_map, "bodyType" : body_map,
    "fuelType" : fuel_map, "Oštećenje" : damage_map, "Menjač" : control_map
    }

df_clean = df_raw.replace(to_replace)
df_clean.head()

Unnamed: 0,brand,productionDate,bodyType,fuelType,vehicleEngine,mileageFromOdometer,Broj sedišta,Oštećenje,Menjač,Klima,price
0,68,2009,0,1,80.0,188763.0,5,0,1,2,35990
1,77,2015,1,1,66.0,160000.0,5,0,1,1,2599
2,63,2020,6,1,110.0,0.0,7,0,5,2,13560
3,77,2021,1,0,48.0,5.0,4,0,1,2,19610
4,43,2021,6,0,97.0,5.0,5,0,2,1,15850


## Transform data to polynomial features

In [9]:
data_np = df_clean.to_numpy()
features = data_np[ : , : -1]
targets = data_np[ : , -1]

In [10]:
degree = 1
poly = PolynomialFeatures(degree)

features_poly = poly.fit_transform(features)

train_features, test_features, train_targets, test_targets = train_test_split(
    features_poly, targets, test_size=0.33 
)

## Linear regression

In [38]:
# Train
linear_regressor = LinearRegression()
linear_regressor.fit(train_features, train_targets)

test_preds_lr = linear_regressor.predict(test_features)
rmse_lr = mean_squared_error(test_targets, test_preds_lr, squared=False)

print("LR RMSE -- ", rmse_lr)

for i in range(0):
    print("PRED -- ", test_preds[i])
    print("REAL -- ", test_targets[i])
    print("\n\n")

# Test

LR RMSE --  10453.1860857016


## K Nearest Neighbors

In [36]:
N_neighbors = 4

# Train
knn = KNeighborsRegressor(n_neighbors=N_neighbors)
knn.fit(train_features, train_targets)

test_preds_knn = knn.predict(test_features)
rmse_knn = mean_squared_error(test_targets, test_preds_knn, squared=False)

print("KNN RMSE -- ", rmse_knn)

for i in range(0):
    print("PRED -- ", test_preds[i])
    print("REAL -- ", test_targets[i])
    print("\n\n")

# Test

KNN RMSE --  10921.407457223086


## Multi-Layer Perceptron

In [37]:
# Train
mlp = MLPRegressor(
    solver='sgd', max_iter=int(10000), activation='relu', 
    learning_rate_init=0.0001
    )
mlp.fit(train_features, train_targets)

test_preds_mlp = mlp.predict(test_features)
rmse_mlp = mean_squared_error(test_targets, test_preds_mlp, squared=False)

print("MLP RMSE -- ", rmse_mlp)

MLP RMSE --  26002253109410.38
