# Data Preprocessing
Data preprocessing for all websites after specific preprocess for each website 
### Import Libraries and Read data

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler 

In [2]:
df_one2car = pd.read_csv('./data_one2car.csv')
df = df_one2car
df.rename(columns={'Unnamed: 0':'Id'}, inplace=True)
df.head(5)

Unnamed: 0,Id,car_year,brand,model,sub_model,sub_model_name,car_type,transmission,model_year_start,model_year_end,color,mile,date,webid,cost
0,0,2015,Mazda,2,1.5,Sports Maxx Sports,Hatchback,AT,9,14,white,140000,2023-10-25,mazda-2-sports-maxx-sports-กรุงเทพและปริมณฑล-ม...,245000
1,1,2012,Mazda,3,2.0,Maxx Sports,Hatchback,AT,11,14,brown,82000,2023-10-25,mazda-3-maxx-sports-กรุงเทพและปริมณฑล-มีนบุรี/...,269000
2,2,2021,Mazda,2,1.3,S Leather,Sedan,AT,15,25,gray,37500,2023-10-25,mazda-2-s-leather-ภาคอีสาน-อำเภอเมืองร้อยเอ็ด/...,390000
3,3,2021,Mazda,CX-30,2.0,SP,SUV,AT,20,25,red,82500,2023-10-25,mazda-cx-30-sp-กรุงเทพและปริมณฑล-อำเภอบางพลี/1...,650000
4,4,2016,Mazda,2,1.3,High Connect,Sedan,AT,15,25,red,152500,2023-10-25,mazda-2-high-connect-กรุงเทพและปริมณฑล-กาญจนาภ...,357900


### Numerosity reduction
Drop rows that have unfrequent car model

In [3]:
df['car_model'] = df['brand'] + ' ' + df['model'] + ' ' + df['sub_model'].astype(str) + ' ' + df['sub_model_name'] + ' ' + df['car_type']
df = df[df.groupby('car_model')['car_model'].transform('count')>=5]

### Data transformation
#### Normalization
using z-score

In [4]:
numeric_cols = ['car_year', 'model_year_start', 'model_year_end', 'mile']

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


#### Data type conversion
convert categorical columns to numeric features

In [5]:
nominal_cols = ['brand', 'model', 'sub_model', 'sub_model_name', 'car_type', 'transmission', 'color']
df = pd.concat([pd.get_dummies(data = df, columns= nominal_cols, drop_first=True, dtype='int8'), df[nominal_cols]], axis=1)

### Data reduction
#### Feature selection
drop feature that does not helpful when appraisal

In [6]:
df.drop(columns=['webid', 'date'], inplace=True)

In [7]:
df.drop(columns=nominal_cols, inplace=True)

In [8]:
df.head()

Unnamed: 0,Id,car_year,model_year_start,model_year_end,mile,cost,car_model,model_3,model_BT-50 PRO,model_CX-3,...,color_brown,color_cream,color_gold,color_gray,color_green,color_other,color_red,color_silver,color_sky,color_white
0,0,-0.680002,-2.01442,-1.817434,0.851441,245000,Mazda 2 1.5 Sports Maxx Sports Hatchback,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,-1.670146,-1.276841,-1.817434,-0.039278,269000,Mazda 3 2.0 Maxx Sports Hatchback,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2,1.300286,0.198318,0.917991,-0.722674,390000,Mazda 2 1.3 S Leather Sedan,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,3,1.300286,2.042266,0.917991,-0.031599,650000,Mazda CX-30 2.0 SP SUV,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,-0.349954,0.198318,0.917991,1.043406,357900,Mazda 2 1.3 High Connect Sedan,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
df.shape

(2172, 83)

### Save data as csv file

In [10]:
df.to_csv('preprocessed_data.csv', index=False)