# Data Preprocessing
Common preprocessing applied to the data from every website, run after each website's site-specific preprocessing.
### Import Libraries and Read data

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler 

In [4]:
# Load the listings once and keep `df_one2car` as the untouched raw frame.
df_one2car = pd.read_csv('./data_one2car.csv')
# rename() without inplace returns a new DataFrame, so `df` is not an alias of
# `df_one2car`; the in-place transformations applied to `df` further down can
# no longer silently modify the raw frame.
df = df_one2car.rename(columns={'Unnamed: 0': 'Id'})
df.head(5)

Unnamed: 0,Id,car_year,brand,model,sub_model,sub_model_name,car_type,transmission,model_year_start,model_year_end,color,mile,date,webid,cost
0,0,2015,Mazda,2,1.5,Sports Maxx Sports,Hatchback,AT,9,14,white,140000,2023-10-25,mazda-2-sports-maxx-sports-กรุงเทพและปริมณฑล-ม...,245000
1,1,2012,Mazda,3,2.0,Maxx Sports,Hatchback,AT,11,14,brown,82000,2023-10-25,mazda-3-maxx-sports-กรุงเทพและปริมณฑล-มีนบุรี/...,269000
2,2,2021,Mazda,2,1.3,S Leather,Sedan,AT,15,25,gray,37500,2023-10-25,mazda-2-s-leather-ภาคอีสาน-อำเภอเมืองร้อยเอ็ด/...,390000
3,3,2021,Mazda,CX-30,2.0,SP,SUV,AT,20,25,red,82500,2023-10-25,mazda-cx-30-sp-กรุงเทพและปริมณฑล-อำเภอบางพลี/1...,650000
4,4,2016,Mazda,2,1.3,High Connect,Sedan,AT,15,25,red,152500,2023-10-25,mazda-2-high-connect-กรุงเทพและปริมณฑล-กาญจนาภ...,357900


### Data transformation
#### Normalization
Standardize the numeric columns using z-scores (zero mean, unit variance).

In [5]:
# Numeric feature columns that are standardized to z-scores.
numeric_cols = ['car_year', 'model_year_start', 'model_year_end', 'mile']

# Learn the per-column mean and standard deviation first, then apply them.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

#### Data type conversion
Convert the categorical (nominal) columns to numeric features using one-hot encoding.

In [6]:
# Nominal attributes that are one-hot encoded.
nominal_cols = ['brand', 'model', 'sub_model', 'sub_model_name', 'car_type', 'transmission', 'color']

# Indicator columns; the first level of each attribute is dropped (drop_first=True).
encoded = pd.get_dummies(data=df, columns=nominal_cols, drop_first=True, dtype='int8')
# Re-attach the original nominal columns next to the indicators so they stay
# available on `df` after encoding.
df = pd.concat([encoded, df[nominal_cols]], axis=1)

### Data reduction
#### Feature selection
Drop features that are not helpful for appraisal.

In [7]:
# `webid` and `date` are not used as appraisal features.
df = df.drop(columns=['webid', 'date'])

#### Numerosity reduction
Drop rows whose car model is infrequent (appears fewer than 5 times in the data).

In [8]:
# Build a single space-separated label identifying each car model variant.
sub_model_text = df['sub_model'].astype(str)
df['car_model'] = (
    df['brand'] + ' ' + df['model'] + ' ' + sub_model_text
    + ' ' + df['sub_model_name'] + ' ' + df['car_type']
)
# The raw nominal columns are no longer needed once the label exists.
df = df.drop(columns=nominal_cols)
# Keep only rows whose car model occurs at least 5 times.
model_counts = df.groupby('car_model')['car_model'].transform('count')
df = df[model_counts >= 5]

In [9]:
# Preview the first five rows of the preprocessed frame.
df.head(5)

Unnamed: 0,Id,car_year,model_year_start,model_year_end,mile,cost,model_3,model_BT-50 PRO,model_CX-3,model_CX-30,...,color_cream,color_gold,color_gray,color_green,color_other,color_red,color_silver,color_sky,color_white,car_model
0,0,-0.622813,-1.894351,-1.742805,0.831625,245000,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Mazda 2 1.5 Sports Maxx Sports Hatchback
1,1,-1.572176,-1.189571,-1.742805,-0.053644,269000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Mazda 3 2.0 Maxx Sports Hatchback
2,2,1.275912,0.21999,0.922832,-0.732859,390000,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Mazda 2 1.3 S Leather Sedan
3,3,1.275912,1.981942,0.922832,-0.046012,650000,0,0,0,1,...,0,0,0,0,0,1,0,0,0,Mazda CX-30 2.0 SP SUV
4,4,-0.306359,0.21999,0.922832,1.022416,357900,0,0,0,0,...,0,0,0,0,0,1,0,0,0,Mazda 2 1.3 High Connect Sedan


In [10]:
# Dimensions (rows, columns) of the preprocessed frame.
df.shape

(2172, 109)

### Save data as csv file

In [11]:
# Write the preprocessed frame to disk; the row index is not written.
output_path = 'preprocessed_data.csv'
df.to_csv(output_path, index=False)