In [178]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [179]:
# Read the raw listings. `df` and `df_one2car` are two names for the SAME
# DataFrame object, so the in-place rename below is visible through both.
df = df_one2car = pd.read_csv('./data_one2car.csv')
df_one2car.rename(columns={'Unnamed: 0': 'Id'}, inplace=True)  # 'Unnamed: 0' -> 'Id'
df.head()

Unnamed: 0,Id,car_year,brand,model,sub_model,sub_model_name,car_type,transmission,model_year_start,model_year_end,color,mile,date,webid,cost
0,0,2015,Mazda,2,1.5,Sports Maxx Sports,Hatchback,AT,9,14,white,140000,2023-10-25,mazda-2-sports-maxx-sports-กรุงเทพและปริมณฑล-ม...,245000
1,1,2012,Mazda,3,2.0,Maxx Sports,Hatchback,AT,11,14,brown,82000,2023-10-25,mazda-3-maxx-sports-กรุงเทพและปริมณฑล-มีนบุรี/...,269000
2,2,2021,Mazda,2,1.3,S Leather,Sedan,AT,15,25,gray,37500,2023-10-25,mazda-2-s-leather-ภาคอีสาน-อำเภอเมืองร้อยเอ็ด/...,390000
3,3,2021,Mazda,CX-30,2.0,SP,SUV,AT,20,25,red,82500,2023-10-25,mazda-cx-30-sp-กรุงเทพและปริมณฑล-อำเภอบางพลี/1...,650000
4,4,2016,Mazda,2,1.3,High Connect,Sedan,AT,15,25,red,152500,2023-10-25,mazda-2-high-connect-กรุงเทพและปริมณฑล-กาญจนาภ...,357900


In [180]:
nominal_cols = [
    'brand', 'model', 'sub_model', 'sub_model_name',
    'car_type', 'transmission', 'color',
]

# One space-separated label per variant:
# brand, model, sub_model (cast to str), sub_model_name and car_type.
df['car_model'] = (
    df['brand'] + ' ' + df['model'] + ' ' + df['sub_model'].astype(str)
    + ' ' + df['sub_model_name'] + ' ' + df['car_type']
)

# Keep only rows whose car_model label occurs at least 5 times in the data.
rows_per_car_model = df.groupby('car_model')['car_model'].transform('count')
df = df[rows_per_car_model >= 5]

In [181]:
# Columns that are standardised with StandardScaler
numeric_cols = ['car_year', 'model_year_start', 'model_year_end', 'mile']

# Columns that are one-hot encoded (first level of each column is dropped)
nominal_cols = [
    'brand', 'model', 'sub_model', 'sub_model_name',
    'car_type', 'transmission', 'color',
]

# The transformer names ('num' / 'nom') become the 'num__' / 'nom__' prefixes
# of the output columns. set_output returns the estimator itself, so it can be
# chained; it makes transform/fit_transform return a pandas DataFrame.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        (
            'nom',
            OneHotEncoder(drop='first', sparse_output=False, dtype='int8'),
            nominal_cols,
        ),
    ]
).set_output(transform='pandas')

# Fit on the filtered listings and encode them in the same call.
# Columns not listed above are not part of the result.
df_transformed = preprocessor.fit_transform(df)


In [182]:
# Copy the identifier, the price and the variant label over from `df`;
# these columns were not passed through the preprocessor.
for carried_col in ('Id', 'cost', 'car_model'):
    df_transformed[carried_col] = df[carried_col]

print(df_transformed.shape)
df_transformed.head()

(2172, 83)


Unnamed: 0,num__car_year,num__model_year_start,num__model_year_end,num__mile,nom__model_3,nom__model_BT-50 PRO,nom__model_CX-3,nom__model_CX-30,nom__model_CX-5,nom__model_CX-8,...,nom__color_gray,nom__color_green,nom__color_other,nom__color_red,nom__color_silver,nom__color_sky,nom__color_white,Id,cost,car_model
0,-0.680002,-2.01442,-1.817434,0.851441,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,245000,Mazda 2 1.5 Sports Maxx Sports Hatchback
1,-1.670146,-1.276841,-1.817434,-0.039278,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,269000,Mazda 3 2.0 Maxx Sports Hatchback
2,1.300286,0.198318,0.917991,-0.722674,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2,390000,Mazda 2 1.3 S Leather Sedan
3,1.300286,2.042266,0.917991,-0.031599,0,0,0,1,0,0,...,0,0,0,1,0,0,0,3,650000,Mazda CX-30 2.0 SP SUV
4,-0.349954,0.198318,0.917991,1.043406,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4,357900,Mazda 2 1.3 High Connect Sedan


In [183]:
# Write the encoded table to disk; the DataFrame index is not written.
df_transformed.to_csv(
    'preprocessed_data.csv',
    index=False,
)

In [184]:
import pickle

# Persist the fitted preprocessor (the ColumnTransformer, not a predictive
# model) so the exact same scaling and encoding can be re-applied later.
# The `with` block guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open('preprocessor.model', 'wb') as model_file:
    pickle.dump(preprocessor, model_file)

In [185]:
# Reload the fitted preprocessor from disk (binary read mode).
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load files that were produced by this notebook or another trusted source.
# The `with` block closes the file even if unpickling fails.
with open('preprocessor.model', 'rb') as model_file:
    preprocessor = pickle.load(model_file)

In [186]:
# Run the reloaded preprocessor on the listings without the identifier,
# free-text, date and target columns, and display the encoded result.
df_features_only = df.drop(columns=['Id', 'webid', 'car_model', 'date', 'cost'])
preprocessor.transform(df_features_only)

Unnamed: 0,num__car_year,num__model_year_start,num__model_year_end,num__mile,nom__model_3,nom__model_BT-50 PRO,nom__model_CX-3,nom__model_CX-30,nom__model_CX-5,nom__model_CX-8,...,nom__color_brown,nom__color_cream,nom__color_gold,nom__color_gray,nom__color_green,nom__color_other,nom__color_red,nom__color_silver,nom__color_sky,nom__color_white
0,-0.680002,-2.014420,-1.817434,0.851441,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,-1.670146,-1.276841,-1.817434,-0.039278,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.300286,0.198318,0.917991,-0.722674,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.300286,2.042266,0.917991,-0.031599,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,-0.349954,0.198318,0.917991,1.043406,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2252,0.970238,0.198318,0.917991,-0.876246,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2253,0.640190,0.198318,0.171966,-0.108385,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2254,0.970238,-0.170472,-0.076709,-0.492315,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2255,0.640190,-0.170472,-0.076709,0.582689,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
