In [1]:
import sys
import os

sys.path.append(os.path.abspath(".."))


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder

from src.transformers.imputer import CleanTitleImputer
from src.transformers.imputer import AccidentReportImputer
from src.transformers.imputer import FuelTypeImputer
from src.transformers.imputer import MultipleTransmissionHandler
from src.transformers.utils import ToDict

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, PowerTransformer
from sklearn.ensemble import GradientBoostingRegressor


In [3]:
set_config(display="diagram")


In [4]:
data = pd.read_csv('../data/train.csv')
data.head()


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [5]:
len(data)


188533

In [6]:
unique_values_per_columns = {
    "Column name": data.columns,
    "Unique Values": [data[col].nunique() for col in data.columns], 
    "Type": [data[col].dtypes for col in data.columns],
    "Transformation": [None, "TargetEncoding", "HashEncoding", "Normalization", "Normalization", "TargetEncoding", "HashEncoding", "OneHotEncoding", "TargetEncoding", "TargetEncoding", "OneHotEncoding", "OneHotEncoding", None],
    }

df = pd.DataFrame(unique_values_per_columns)
df.head(30)


Unnamed: 0,Column name,Unique Values,Type,Transformation
0,id,188533,int64,
1,brand,57,object,TargetEncoding
2,model,1897,object,HashEncoding
3,model_year,34,int64,Normalization
4,milage,6651,int64,Normalization
5,fuel_type,7,object,TargetEncoding
6,engine,1117,object,HashEncoding
7,transmission,52,object,OneHotEncoding
8,ext_col,319,object,TargetEncoding
9,int_col,156,object,TargetEncoding


In [7]:
clean_title_imputer = CleanTitleImputer()
accident_report_imputer = AccidentReportImputer()
fuel_type_imputer = FuelTypeImputer()
transmission_handler = MultipleTransmissionHandler()

new_data = clean_title_imputer.transform(data)
new_data = accident_report_imputer.transform(new_data)
new_data = fuel_type_imputer.transform(new_data)
new_data = transmission_handler.transform(new_data)


In [8]:
from xgboost import XGBRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

model = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.03)


In [9]:
new_data = data


In [18]:
data_imputer = Pipeline([
    ("clean_title_imputer", CleanTitleImputer()),
    ("accident_report_imputer", AccidentReportImputer()),
    ("fuel_type_imputer", FuelTypeImputer()),
    ("transmission_handler", MultipleTransmissionHandler()),
])

hasher_pipeline = Pipeline([
    ("to_dict", ToDict()),
    ("hash_encoder", FeatureHasher(n_features=1024, input_type="dict"))
])

preprocessor = ColumnTransformer([
    #("hash_encoding", hasher_pipeline, ["model"]),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"), ["clean_title", "accident", "transmission"]),
    ("target_encoder", TargetEncoder(), ["brand", "fuel_type"]),
    ("box_cox_transform", PowerTransformer(method="yeo-johnson"), ["model_year", "milage"])
])

pipeline = Pipeline([
    ("preprocessing", preprocessor),
])


In [19]:
new_data = data_imputer.transform(new_data)




In [37]:
X, y = new_data.copy().drop(["price", "id", "engine", "model"], axis=1), data.copy()["price"]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)


In [38]:
X  = pipeline.fit_transform(X, y)


In [None]:
X


array([[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         4.33446667e+04, -1.56278221e+00,  2.09914312e+00],
       [ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         4.33446667e+04, -2.33616770e+00,  1.36400115e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         2.67297555e+04, -2.33616770e+00,  1.28562030e+00],
       ...,
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         4.33446667e+04,  9.45463098e-01, -1.15167465e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         4.33446667e+04,  1.14594199e+00, -1.14061594e+00],
       [ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         4.33446667e+04, -1.24135144e-02,  1.20014585e-01]],
      shape=(188533, 10))

In [40]:
model.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [42]:
y_pred = model.predict(X_test)


In [None]:
np.sqrt(np.mean(np.square(y_pred - y_test)))


np.float64(67712.1674372253)

In [14]:
# print("Best RMSE:", -grid.best_score_)
# print("Best Params:", grid.best_params_)


In [15]:
# best_model = grid.best_estimator_


In [47]:
import joblib
model.save_model("../models/xgb_model.json")  # or .bin
joblib.dump(preprocessor, "../models/preprocessor.pkl")


['../models/preprocessor.pkl']