In [1]:
import pickle
import os 
import pandas as pd
import numpy as np  
import re
import copy

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# idx = 10
# for id in [1, 10, 14, 15]:
#     pickle_file = open(f"../data/features/car_number_{id + 1}.pkl", "rb") ##### offset is 1
#     unpickled = pickle.load(pickle_file)

#     basic_info = unpickled[0]
#     basic_info.update(unpickled[1])
#     additional_info = unpickled[2]

#     if 'price' in basic_info:
#         print(id, basic_info['price'])
    # print(prices[id])
# print("=== BASIC INFO ===\n", basic_info.keys())
# print("\n=== ADDITIONAL INFO ===\n", additional_info)

## Collecting car features and unpickling them

In [2]:
def generate_price(basic_info, scale=1e6):
    
    mileage_str_raw = basic_info['mileageFromOdometer'] 
    mileage_str = re.sub("[^0-9]", "", mileage_str_raw[ : -4]) 
    # price = scale * 1 / (float(mileage_str) + 1)
    # price = float(mileage_str)
    price = scale - float(mileage_str)

    return price


def collect_prices(prices_path):
    
    pickle_file = open(prices_path, "rb")
    prices_unpickled = pickle.load(pickle_file)

    prices_dict = {}
    for price_tuple in prices_unpickled:
        idx = price_tuple[0]
        value = re.sub("[^0-9]", "", price_tuple[1])
        if value == "":
            continue 
        prices_dict.update({idx : int(value)})

    return prices_dict


def collect_raw_data(data_folder, prices):
    """
    Returns a list where each element is a dictionary corresponding to a single car.

    """
    
    raw_data = []
    file_names = os.listdir(data_folder)

    # for file_name in file_names:
    for car_idx in prices.keys():

        file_name = f"car_number_{car_idx + 1}.pkl"
        if file_name not in file_names:
            continue

        file_path = os.path.join(data_folder, file_name)
        pickle_file = open(file_path, "rb")
        unpickled = pickle.load(pickle_file)

        try:
            basic_info = unpickled[0]
            basic_info.update(unpickled[1])
            additional_info = unpickled[2]
        except(TypeError):
            return raw_data


        # print(file_name)
        # print(f"{car_idx} -- {prices[car_idx]}\n\n")
        # print(file_path)
        # print(basic_info)

        # price = generate_price(basic_info)
        price = prices[car_idx]

        basic_info.update({'price' : price})

        raw_data.append(basic_info)

    return raw_data

In [3]:
features_folder = "../data/features"
prices_path = "../data/prices_1000.pkl"

prices = collect_prices(prices_path)
raw_data = collect_raw_data(features_folder, prices)

# list(prices.keys())
# prices

# prices = {1 : 35990}

## Discarding most of the features and keeping only the interesting ones

In [4]:
def preprocess_features(data_raw, keys_to_preprocess=[]):

    data_clean = copy.copy(data_raw)
    
    if ('productionDate' in keys_to_preprocess):
        data_clean['productionDate'] = int(data_raw['productionDate'])

    if ('vehicleEngine' in  keys_to_preprocess):
        data_clean['vehicleEngine'] = float(data_raw['vehicleEngine'][ : -5])

    if ('Broj sedišta' in keys_to_preprocess):
        data_clean['Broj sedišta'] = int(data_raw['Broj sedišta'].split(' ')[0])

    if ('mileageFromOdometer' in keys_to_preprocess):
        no_dots = re.sub("[^0-9]", "", data_raw['mileageFromOdometer'][ : -4])
        data_clean['mileageFromOdometer'] = float(no_dots)

    if ('url' in keys_to_preprocess):
        data_clean['url'] = data_raw['url'][46 : ]

    return data_clean

In [5]:
keys_to_keep = [
    'brand', 'productionDate', 'bodyType', 'fuelType', 
    'vehicleEngine', 'mileageFromOdometer', 'Broj sedišta', 
    'Oštećenje', 'Menjač', 'Klima', 'price'
]

keys_to_preprocess = ['productionDate', 'vehicleEngine', 'Broj sedišta', 'mileageFromOdometer']
kept_values = []

for i in range(len(raw_data)):
    
    curr_features_clean = preprocess_features(raw_data[i], keys_to_preprocess)
    curr_features = [
        curr_features_clean[key] for key in keys_to_keep
    ] 
    
    kept_values.append(curr_features)

df_raw = pd.DataFrame(kept_values, columns=keys_to_keep)
df_raw.head()

Unnamed: 0,brand,productionDate,bodyType,fuelType,vehicleEngine,mileageFromOdometer,Broj sedišta,Oštećenje,Menjač,Klima,price
0,Mercedes Benz,2016,Kupe,Dizel,150.0,197600.0,5,Nije oštećen,Automatski / poluautomatski,Automatska klima,35990
1,Renault,2020,Džip/SUV,Benzin,103.0,5.0,5,Nije oštećen,Manuelni 6 brzina,Automatska klima,20690
2,Mercedes Benz,2015,Limuzina,Dizel,80.0,148000.0,4,Nije oštećen,Automatski,Automatska klima,16500
3,Fiat,2005,Monovolumen (MiniVan),Benzin + Metan (CNG),76.0,179000.0,6,Nije oštećen,Manuelni 5 brzina,Manuelna klima,2199
4,Land Rover,2004,Džip/SUV,Dizel,82.0,193000.0,5,Nije oštećen,Manuelni 5 brzina,Manuelna klima,3599


## Map 

In [6]:
# =======================================================================
# == AC mapping ==

AC_map = {'Nema klimu' : 0, 'Manuelna klima' : 1, 'Automatska klima' : 2}

# =======================================================================
# == Brands mapping ==

brands_file = open('../data/brands.txt', 'r', encoding='UTF-8')
brands_raw_str = brands_file.readline()
brands_file.close()

brands_raw = brands_raw_str.split('value="')[2 : ]
brands_clean = [x.replace("</select>", "<option ")[ : -17].replace('">', "$") for x in brands_raw]

brands_map = {}
for i, brand in enumerate(brands_clean):
    key = brand.split('$')[1]
    brands_map.update({key: i})

# =======================================================================
# == Map bodyType  == 

body_map = {
    'Limuzina' : 0, 'Hečbek' : 1, 'Karavan' : 2, 'Kupe' : 3, 'Kabriolet/Roadster' : 4, 
    'Monovolumen (MiniVan)' : 5, 'Džip/SUV' : 6, 'Pickup' : 7
    }

# =======================================================================
# == Map fuelType ==

fuel_map = {
    'Benzin' : 0, 'Dizel' : 1, 'Metan CNG' : 2, 'Benzin + Gas (TNG)' : 3,
    'Benzin + Metan (CNG)' : 4, 'Električni pogon' : 5, 'Hibridni pogon' : 6
}

# =======================================================================
# == Map Oštećenje ==

damage_map = {
    'Nije oštećen' : 0, 'Oštećen - u voznom stanju' : 1,
    'Oštećen - nije u voznom stanju' : 2
    }

# =======================================================================
# == Map Menjač ==

control_map = {
    'Manuelni 4 brzine' : 0, 'Manuelni 5 brzina' : 1, 
    'Manuelni 6 brzina' : 2, 'Poluautomatski' : 3,
    'Automatski / poluautomatski' : 4, 'Automatski' : 5
}


to_replace = {
    "Klima" : AC_map, "brand" : brands_map, "bodyType" : body_map,
    "fuelType" : fuel_map, "Oštećenje" : damage_map, "Menjač" : control_map
    }

df_clean = df_raw.replace(to_replace)
df_clean.head()

Unnamed: 0,brand,productionDate,bodyType,fuelType,vehicleEngine,mileageFromOdometer,Broj sedišta,Oštećenje,Menjač,Klima,price
0,57,2016,3,1,150.0,197600.0,5,0,4,2,35990
1,77,2020,6,0,103.0,5.0,5,0,2,2,20690
2,57,2015,0,1,80.0,148000.0,4,0,5,2,16500
3,28,2005,5,4,76.0,179000.0,6,0,1,1,2199
4,48,2004,6,1,82.0,193000.0,5,0,1,1,3599


## Transform data to polynomial features

In [7]:
data_np = df_clean.to_numpy()
features_raw = data_np[ : , : -1]
targets = data_np[ : , -1]

# scaler = StandardScaler()
# features = scaler.fit_transform(features_raw)

features = features_raw

In [8]:
degree = 1
poly = PolynomialFeatures(degree)

features_poly = poly.fit_transform(features)

train_features, test_features, train_targets, test_targets = train_test_split(
    features_poly, targets, test_size=0.33 
)

## Linear regression

In [9]:
# Train
linear_regressor = LinearRegression()
linear_regressor.fit(train_features, train_targets)

test_preds_lr = linear_regressor.predict(test_features)
rmse_lr = mean_squared_error(test_targets, test_preds_lr, squared=False)

print("LR RMSE -- ", rmse_lr)

for i in range(10):
    print("PRED -- ", test_preds_lr[i])
    print("REAL -- ", test_targets[i])
    print("\n\n")

# Test

LR RMSE --  4801.0602753495405
PRED --  21292.040975358686
REAL --  35990.0



PRED --  4025.078918133164
REAL --  4199.0



PRED --  7241.673478799639
REAL --  5250.0



PRED --  6298.218840551679
REAL --  5000.0



PRED --  10043.780618656543
REAL --  2990.0



PRED --  6529.713546400424
REAL --  3799.0



PRED --  311.29885514150374
REAL --  2999.0



PRED --  24265.299142878735
REAL --  21700.0



PRED --  37850.29308590782
REAL --  37158.0



PRED --  7618.941004263703
REAL --  12790.0





## K Nearest Neighbors

In [10]:
N_neighbors = 4

# Train
knn = KNeighborsRegressor(n_neighbors=N_neighbors)
knn.fit(train_features, train_targets)

test_preds_knn = knn.predict(test_features)
rmse_knn = mean_squared_error(test_targets, test_preds_knn, squared=False)

print("KNN RMSE -- ", rmse_knn)

for i in range(0):
    print("PRED -- ", test_preds[i])
    print("REAL -- ", test_targets[i])
    print("\n\n")

# Test

KNN RMSE --  8321.482870971675
