# Imports

In [41]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
df = pd.read_csv('../Data/merged_vehicle_data.csv')
df.head()

Unnamed: 0,make,model,engine_size,trim,num_doors,mileage,reg_year,transmission_type,fuel_type,price,is_electric
0,BMW,1 Series,1.5,116d Sport,2,65221,2016,Manual,Diesel,9400,0
1,BMW,1 Series,2.0,120d M Sport,2,77873,2015,Manual,Diesel,9950,0
2,BMW,1 Series,1.5,118i Sport,2,57371,2016,Manual,Petrol,10050,0
3,BMW,1 Series,2.0,118d M Sport,2,71342,2016,Manual,Diesel,10450,0
4,BMW,1 Series,1.5,118i M Sport,2,77767,2017,Manual,Petrol,11000,0


# Model 1

In [3]:
cvec1 = CountVectorizer()
trimcvec = cvec1.fit_transform(df['trim'])
model1 = pd.DataFrame(trimcvec.todense(), columns=cvec1.get_feature_names_out())
model1[['model','make','engine_size', 'num_doors', 'mileage', 'reg_year', 'transmission_type',
        'fuel_type','is_electric', 'price']] = df[['model', 'make','engine_size', 'num_doors',
            'mileage','reg_year','transmission_type','fuel_type','is_electric', 'price']].copy()

In [4]:
model1 = pd.get_dummies(model1, columns=['make','model','transmission_type',
                                         'fuel_type'], drop_first=True)

In [5]:
X = model1.drop(columns=['price'])
y = model1['price']

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [6]:
extra = ExtraTreesRegressor(n_estimators=500)
extra.fit(X_train, y_train)
extra.score(X_train, y_train), extra.score(X_val, y_val)

(0.999999422681311, 0.9921996478473136)

In [7]:
extra_pred = extra.predict(X_val)
metrics.mean_absolute_error(y_val, extra_pred)

137.0378930575681

# Model 2

In [8]:
params = {
    'max_features': np.arange(1, X.shape[1] + 1), # p
    'max_depth': np.append(np.arange(1, 10), None), # 10
    'min_samples_leaf': np.arange(1, 31) # 30
}

rf = RandomForestRegressor(
    n_estimators=100,
    random_state=2023
)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
rs = RandomizedSearchCV(rf, params, n_iter=100, cv=kf, n_jobs=-1);

In [9]:
%%time
rs.fit(X_train, y_train)

CPU times: user 21.5 s, sys: 1.17 s, total: 22.6 s
Wall time: 1h 3min 36s


In [10]:
rs.score(X_train, y_train), rs.score(X_val, y_val)

(0.9930351325587348, 0.9868141096489367)

In [11]:
rs_preds = rs.predict(X_val)
metrics.mean_absolute_error(y_val, rs_preds)

245.10179807235133

# Model 3

In [12]:
pgrid = {
    'learning_rate': [0.1, 1],
    'n_estimators': [10, 100, 200, 300],
    'max_depth': [None, 1, 2, 3]
}
grad = GradientBoostingRegressor()
gs = GridSearchCV(grad, pgrid, cv=kf, n_jobs=-1)

In [13]:
%%time
gs.fit(X_train, y_train)
gs.score(X_train, y_train), gs.score(X_val, y_val)

CPU times: user 37 s, sys: 449 ms, total: 37.4 s
Wall time: 1h 8min 9s


(0.9999995712971101, 0.9865043011181779)

In [14]:
gs_preds = gs.predict(X_val)
metrics.mean_absolute_error(y_val, gs_preds)

196.2309044248525

# Model 4

In [15]:
ss = StandardScaler()
X_trainss = ss.fit_transform(X_train)
X_valss = ss.transform(X_val)

In [16]:
et2 = ExtraTreesRegressor(n_estimators=500)
et2.fit(X_trainss, y_train)
et2.score(X_trainss, y_train), et2.score(X_valss, y_val)

(0.9999993985630082, 0.9921060951530838)

In [17]:
et2_preds = et2.predict(X_valss)
metrics.mean_absolute_error(y_val, et2_preds)

137.3924187901367

# Model 5

In [18]:
%%time
tree = DecisionTreeRegressor()
tree.fit(X_trainss, y_train)
tree.score(X_trainss, y_train), tree.score(X_valss, y_val)

CPU times: user 375 ms, sys: 14.7 ms, total: 390 ms
Wall time: 390 ms


(0.9999995720026178, 0.9849092692770431)

In [19]:
tree_preds = tree.predict(X_valss)
metrics.mean_absolute_error(y_val, tree_preds)

206.9469216579817

# Model 6

In [20]:
%%time
ada = AdaBoostRegressor()
ada.fit(X_train, y_train)
ada.score(X_train, y_train), ada.score(X_val, y_val)

CPU times: user 18.8 s, sys: 1.55 s, total: 20.4 s
Wall time: 20.6 s


(0.45409472786158434, 0.47358483206804036)

In [21]:
ada_preds = ada.predict(X_valss)
metrics.mean_absolute_error(y_val, ada_preds)



4824.529269854149

# Model 7

In [22]:
%%time
bag = BaggingRegressor()
bag.fit(X_train, y_train)
bag.score(X_train, y_train), bag.score(X_val, y_val)

CPU times: user 2.64 s, sys: 315 ms, total: 2.95 s
Wall time: 2.97 s


(0.9981385569366159, 0.9892925479518176)

In [23]:
bag_preds = bag.predict(X_val)
metrics.mean_absolute_error(y_val, bag_preds)

178.25615793178156

# Model 8

In [24]:
%%time
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn.score(X_train, y_train), knn.score(X_val, y_val)

CPU times: user 50.5 s, sys: 832 ms, total: 51.3 s
Wall time: 7.04 s


(0.9129008690030188, 0.8751809015267563)

In [25]:
knn_preds = knn.predict(X_val)
metrics.mean_absolute_error(y_val, knn_preds)

857.9554020835772

# Model 9

In [26]:
%%time
las = Lasso(max_iter=10_000)
las.fit(X_train, y_train)
las.score(X_train, y_train), las.score(X_val, y_val)

CPU times: user 2min 43s, sys: 5.39 s, total: 2min 49s
Wall time: 22.1 s


(0.9412194494642548, 0.9373503634864615)

In [27]:
las_preds = las.predict(X_val)
metrics.mean_absolute_error(y_val, las_preds)

1329.71024634881

# Model 10

In [28]:
rid = Ridge()
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_val, y_val)

(0.9513130413847479, 0.947140318018556)

In [29]:
rid_preds = rid.predict(X_val)
metrics.mean_absolute_error(y_val, rid_preds)

1196.941044094705

# Model 11

In [30]:
pgrid = {
    'estimator__max_depth': [None, 1, 2, 3, 4, 5, 6, 7],
    'estimator__min_samples_leaf': np.arange(11, 22, 2) 
}

tree = DecisionTreeRegressor()
bag = BaggingRegressor(tree, n_estimators = 500, random_state=42)
gs = GridSearchCV(bag, pgrid, cv=kf, n_jobs=-1)

In [31]:
%%time
gs.fit(X_train, y_train)

CPU times: user 1min 47s, sys: 3.05 s, total: 1min 50s
Wall time: 4h 18min 32s


In [32]:
gs.best_params_

{'estimator__max_depth': None, 'estimator__min_samples_leaf': 11}

In [33]:
gs.score(X_train, y_train), gs.score(X_val, y_val)

(0.97700484918061, 0.9700940066860261)

In [34]:
gs_preds = gs.predict(X_val)
metrics.mean_absolute_error(y_val, gs_preds)

601.4636436086573

# Pickling best Model

In [69]:
# Preprocessing
cvec1 = CountVectorizer()
trimcvec = cvec1.fit_transform(df['trim'])
model1 = pd.DataFrame(trimcvec.todense(), columns=cvec1.get_feature_names_out())
oh = OneHotEncoder(drop='first', sparse = False)
test = oh.fit_transform(df[['make','model','transmission_type','fuel_type']])

test1 = pd.DataFrame(test, columns=oh.get_feature_names_out())
data = pd.concat([model1,test1], axis=1)
data[['engine_size', 'num_doors', 'mileage', 'reg_year','is_electric','price']] = \
            df[['engine_size','num_doors','mileage','reg_year','is_electric', 'price']].copy()



In [70]:
#train test split
X = data.drop(columns=['price'])
y = data['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

#fitting model
extra = ExtraTreesRegressor(n_estimators=500)
extra.fit(X_train, y_train)

In [71]:
extra.score(X_train, y_train), extra.score(X_val, y_val)

(0.9999994305790598, 0.9921912518588221)

In [76]:
preds = extra.predict(X_val)
metrics.mean_absolute_error(y_val, preds)

136.80797391463292

In [77]:
with open('../Pickled_Model/cvec.pkl', 'wb') as file:
    pickle.dump(cvec1, file)
with open('../Pickled_Model/OneHotEncoder.pkl','wb') as file:
    pickle.dump(oh, file)
with open('../Pickled_Model/Production_Model.pkl','wb') as file:
    pickle.dump(extra, file)