In [3]:
import sys
import os

sys.path.append(os.path.abspath(".."))


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder

from src.transformers.imputer import CleanTitleImputer
from src.transformers.imputer import AccidentReportImputer
from src.transformers.imputer import FuelTypeImputer
from src.transformers.imputer import MultipleTransmissionHandler
from src.transformers.utils import ToDict

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.ensemble import GradientBoostingRegressor


In [6]:
set_config(display="diagram")


In [7]:
data = pd.read_csv('../data/train.csv')
data.head()


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [8]:
len(data)


188533

In [9]:
unique_values_per_columns = {
    "Column name": data.columns,
    "Unique Values": [data[col].nunique() for col in data.columns], 
    "Type": [data[col].dtypes for col in data.columns],
    "Transformation": [None, "TargetEncoding", "HashEncoding", "Normalization", "Normalization", "TargetEncoding", "HashEncoding", "OneHotEncoding", "TargetEncoding", "TargetEncoding", "OneHotEncoding", "OneHotEncoding", None],
    }

df = pd.DataFrame(unique_values_per_columns)
df.head(30)


Unnamed: 0,Column name,Unique Values,Type,Transformation
0,id,188533,int64,
1,brand,57,object,TargetEncoding
2,model,1897,object,HashEncoding
3,model_year,34,int64,Normalization
4,milage,6651,int64,Normalization
5,fuel_type,7,object,TargetEncoding
6,engine,1117,object,HashEncoding
7,transmission,52,object,OneHotEncoding
8,ext_col,319,object,TargetEncoding
9,int_col,156,object,TargetEncoding


In [10]:
clean_title_imputer = CleanTitleImputer()
accident_report_imputer = AccidentReportImputer()
fuel_type_imputer = FuelTypeImputer()
transmission_handler = MultipleTransmissionHandler()

new_data = clean_title_imputer.transform(data)
new_data = accident_report_imputer.transform(new_data)
new_data = fuel_type_imputer.transform(new_data)
new_data = transmission_handler.transform(new_data)


In [None]:
from xgboost import XGBRegressor

model = XGBRegressor()


In [32]:
data_imputer = Pipeline([
    ("clean_title_imputer", CleanTitleImputer()),
    ("accident_report_imputer", AccidentReportImputer()),
    ("fuel_type_imputer", FuelTypeImputer()),
    ("transmission_handler", MultipleTransmissionHandler()),
])

hasher_pipeline = Pipeline([
    ("to_dict", ToDict()),
    ("hash_encoder", FeatureHasher(n_features=1024, input_type="dict"))
])

preprocessor = ColumnTransformer([
    ("hash_encoding", hasher_pipeline, ["model", "engine"]),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"), ["clean_title", "accident", "transmission"]),
    ("target_encoder", TargetEncoder(), ["ext_col", "int_col", "brand", "fuel_type"])
])

pipeline = Pipeline([
    ("imputer", data_imputer),
    ("preprocessing", preprocessor),
    ("regressor", model),
])


In [33]:
X, y = data.drop(["price", "id"], axis=1), data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [34]:
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('imputer', ...), ('preprocessing', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('clean_title_imputer', ...), ('accident_report_imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('hash_encoding', ...), ('one_hot_encoder', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_features,1024
,input_type,'dict'
,dtype,<class 'numpy.float64'>
,alternate_sign,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [35]:
y_pred = pipeline.predict(X_test)




In [36]:
res = {"Result": y_pred, "Real Values": y_test}
df_res = pd.DataFrame(res)
df_res


Unnamed: 0,Result,Real Values
25954,30700.509766,20999
90882,16350.233398,14900
29131,34560.261719,13500
29649,56093.707031,81500
93869,35871.437500,58000
...,...,...
142676,13495.969727,11100
186068,37808.277344,19750
141284,27553.500000,18999
59990,40720.984375,6700


In [37]:
rmse = np.sqrt(np.mean((y_pred - y_test) ** 2))


In [38]:
rmse


np.float64(76833.48589329167)

In [39]:
y.mean()


np.float64(43878.01617753921)

In [40]:
rmse / y.mean()


np.float64(1.751070184723212)