In [207]:
import sys
import os

# Put the notebook's parent directory (resolved against the current working
# directory) on the import path so the `src.*` imports below can be found.
# The membership check keeps repeated runs of this cell from appending the
# same entry to `sys.path` over and over.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)


In [208]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder

from src.transformers.imputer import CleanTitleImputer
from src.transformers.imputer import AccidentReportImputer
from src.transformers.imputer import FuelTypeImputer
from src.transformers.imputer import MultipleTransmissionHandler
from src.transformers.utils import ToDict

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, PowerTransformer
from sklearn.ensemble import GradientBoostingRegressor


In [209]:
# Ask scikit-learn to render estimators and pipelines as diagrams when they
# are displayed in the notebook.
set_config(display="diagram")


In [210]:
# Read the raw training set from the repository's data folder (path is
# relative to the notebook's working directory) and preview the first rows.
train_csv_path = '../data/train.csv'
data = pd.read_csv(train_csv_path)
data.head()


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [211]:
# Number of rows in the loaded training frame.
len(data)


188533

In [212]:
# Planned encoding per column, keyed by column name rather than by position,
# so the summary stays correct if the column order changes or columns are
# added/removed. Columns without an entry (e.g. `id`, `price`) get None.
planned_transformations = {
    "brand": "TargetEncoding",
    "model": "HashEncoding",
    "model_year": "Normalization",
    "milage": "Normalization",
    "fuel_type": "TargetEncoding",
    "engine": "HashEncoding",
    "transmission": "OneHotEncoding",
    "ext_col": "TargetEncoding",
    "int_col": "TargetEncoding",
    "accident": "OneHotEncoding",
    "clean_title": "OneHotEncoding",
}

# One row per column of `data`: its name, number of distinct non-null values,
# dtype, and the transformation planned for it.
unique_values_per_columns = {
    "Column name": data.columns,
    "Unique Values": data.nunique().to_list(),
    "Type": data.dtypes.to_list(),
    "Transformation": [planned_transformations.get(col) for col in data.columns],
}

df = pd.DataFrame(unique_values_per_columns)
df.head(30)


Unnamed: 0,Column name,Unique Values,Type,Transformation
0,id,188533,int64,
1,brand,57,object,TargetEncoding
2,model,1897,object,HashEncoding
3,model_year,34,int64,Normalization
4,milage,6651,int64,Normalization
5,fuel_type,7,object,TargetEncoding
6,engine,1117,object,HashEncoding
7,transmission,52,object,OneHotEncoding
8,ext_col,319,object,TargetEncoding
9,int_col,156,object,TargetEncoding


In [213]:
# Stand-alone instances of the project's cleaning transformers.
clean_title_imputer = CleanTitleImputer()
accident_report_imputer = AccidentReportImputer()
fuel_type_imputer = FuelTypeImputer()
transmission_handler = MultipleTransmissionHandler()

# Apply them one after another, each consuming the previous step's output;
# the chain starts from the raw `data` frame.
new_data = data
for cleaning_step in (
    clean_title_imputer,
    accident_report_imputer,
    fuel_type_imputer,
    transmission_handler,
):
    new_data = cleaning_step.transform(new_data)


In [214]:
from xgboost import XGBRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameters for the gradient-boosted regressor used as the final
# estimator of the modelling pipeline.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.03,
}
model = XGBRegressor(**xgb_params)


In [215]:
# Rebind `new_data` to the raw `data` frame (same object, not a copy). This
# replaces whatever `new_data` held before, including the result of the
# manual imputer calls made earlier in the notebook.
new_data = data


In [None]:
# Row-level cleaning, applied in this order before any encoding.
data_imputer = Pipeline(
    steps=[
        ("clean_title_imputer", CleanTitleImputer()),
        ("accident_report_imputer", AccidentReportImputer()),
        ("fuel_type_imputer", FuelTypeImputer()),
        ("transmission_handler", MultipleTransmissionHandler()),
    ]
)

# Feature hashing for high-cardinality text columns: rows are first converted
# by `ToDict`, then hashed into 1024 features. Currently not wired into
# `preprocessor` (its entry below is commented out).
hasher_pipeline = Pipeline(
    steps=[
        ("to_dict", ToDict()),
        ("hash_encoder", FeatureHasher(n_features=1024, input_type="dict")),
    ]
)

# Columns routed to each encoder / transformer.
one_hot_columns = ["clean_title", "accident", "transmission"]
target_encoded_columns = ["brand", "fuel_type"]
power_transformed_columns = ["model_year", "milage"]

preprocessor = ColumnTransformer(
    transformers=[
        #("hash_encoding", hasher_pipeline, ["model"]),
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"), one_hot_columns),
        ("target_encoder", TargetEncoder(), target_encoded_columns),
        # The step is named "box_cox_transform", but the method configured
        # here is Yeo-Johnson.
        ("box_cox_transform", PowerTransformer(method="yeo-johnson"), power_transformed_columns),
    ]
)

# Full modelling pipeline: column-wise preprocessing followed by the regressor.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("regressor", model),
    ]
)


In [217]:
# Impute from the raw `data` frame rather than from `new_data` itself, so
# re-running this cell never pushes already-imputed rows through the imputers
# a second time. On a fresh top-to-bottom run the result is unchanged, because
# `new_data` is bound to `data` immediately before this point.
# NOTE(review): `data_imputer` is not fitted before this call; recent
# scikit-learn releases warn when an unfitted Pipeline is used to transform.
# Consider `fit_transform` once it is confirmed the custom imputers implement
# `fit`.
new_data = data_imputer.transform(data)




In [218]:
# Build features and target from the SAME imputed frame so their rows stay
# aligned even if an imputation step drops or reorders rows (the target was
# previously read from the raw `data` frame).
# `drop` already returns a new frame, so no explicit copy is needed for X.
X = new_data.drop(columns=["price", "id", "engine", "model"])
y = new_data["price"].copy()


In [219]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the full pipeline. scikit-learn reports RMSE as a
# negated score, so the sign is flipped back before printing the mean.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
mean_rmse = -scores.mean()
print("RMSE (cross-val):", mean_rmse)


RMSE (cross-val): 73814.121875


In [220]:
# print("Best RMSE:", -grid.best_score_)
# print("Best Params:", grid.best_params_)


In [221]:
# best_model = grid.best_estimator_


In [223]:
import joblib

# `cross_val_score` only fits clones of the pipeline, so at this point neither
# the regressor nor the preprocessor has been fitted. Fit the whole pipeline
# on the full training data first; otherwise the pickled artifacts would be
# untrained and unusable for prediction.
pipeline.fit(X, y)

# Persist the fitted steps to the same two files as before.
# NOTE(review): the row-level `data_imputer` is not saved here; inference code
# must apply the same imputation before using the preprocessor.
joblib.dump(pipeline.named_steps["regressor"], "../models/model.pkl")
joblib.dump(pipeline.named_steps["preprocessing"], "../models/preprocessor.pkl")


['../models/preprocessor.pkl']