<a href="https://colab.research.google.com/github/sohyun329/ML_Study/blob/main/%EB%9E%9C%EB%8D%A4%ED%8F%AC%EB%A0%88%EC%8A%A4%ED%8A%B8_%EC%A4%91%EA%B3%A0%EC%B0%A8%EA%B0%80%EA%B2%A9%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 랜덤포레스트
- 결정트리의 단점인 오버피팅 문제를 완화시켜주는 발전된 형태의 트리 모델
- 랜덤으로 생성된 무수히 많은 트리를 이용하여 예측
- 앙상블 기법 : 여러 모델을 만들고 각 예측값들을 투표/평균 등으로 통합하여 더 정확한 예측을 도모하는 방법
- 장점 :
  - 아웃라이어에 거의 영향을 받지 않는다
  - 선형/비선형 데이터에 상관없이 잘 작동
- 단점 :
  - 학습 속도가 상대적으로 느림
  - 수많은 트리를 동원하기 때문에 모델 해석이 어려움
- 유용한 곳 :
  - 종속변수가 연속형 데이터와 범주형 데이터인 경우 모두 사용 가능
  - 아웃라이어가 문제가 되는 경우 선형 모델보다 좋은 대안이 될 수 있음
  - 오버피팅 문제로 결정 트리를 사용하기 어려울 때, 랜덤 포레스트 사용 가능

## 1. 라이브러리 및 데이터 불러오기, 데이터 확인

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

file_url = 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/car.csv'
data = pd.read_csv(file_url)

In [2]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [4]:
round(data.describe(),2)

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.8,638271.81,69819.51,5.42
std,4.04,806253.4,56550.55,0.96
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


## 2. 전처리 : 텍스트 데이터

1. engine 변수 처리

In [7]:
data[['engine','engine_unie']] = data['engine'].str.split(expand=True)
data['engine'].head()

Unnamed: 0,engine
0,1248
1,1498
2,1497
3,1396
4,1298


In [8]:
data['engine'] = data['engine'].astype('float32')
data['engine'].head()

Unnamed: 0,engine
0,1248.0
1,1498.0
2,1497.0
3,1396.0
4,1298.0


In [10]:
data['engine_unie'].unique()

array(['CC', nan], dtype=object)

In [11]:
data.drop('engine_unie',axis=1, inplace=True)

2. max_power 변수 전처리

In [12]:
data[['max_power','max_power_unit']] = data['max_power'].str.split(expand=True)
data['max_power'].head()

Unnamed: 0,max_power
0,74.0
1,103.52
2,78.0
3,90.0
4,88.2


In [14]:
data['max_power'] = data['max_power'].astype('float32')

ValueError: could not convert string to float: 'bhp'

In [13]:
data[data['max_power']=='bhp']

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,max_power_unit
4933,Maruti Omni CNG,2000,80000,100000,CNG,Individual,Manual,Second Owner,10.9 km/kg,796.0,bhp,,8.0,


In [16]:
def isFloat(value):
  try:
    num = float(value)
    return num
  except ValueError:
    return np.NaN

data['max_power'] = data['max_power'].apply(isFloat)

In [17]:
data['max_power_unit'].unique()

array(['bhp', nan, None], dtype=object)

In [18]:
data.drop('max_power_unit', axis=1, inplace=True)

3. mileage 변수 전처리

In [19]:
data[['mileage','mileage_unit']] = data['mileage'].str.split(expand=True)

In [20]:
data['mileage'] = data['mileage'].astype('float32')

In [21]:
data['mileage_unit'].unique()

array(['kmpl', 'km/kg', nan], dtype=object)

- kmpl = km/l : 리터당 킬로미터 ➡️ 휘발유 & 디젤
- km/kg = km/kg : kg당 킬로미터 ➡️ LGP & CNG

In [22]:
data['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

- Petrol : 리터당 80.43 달러
- Diesel : 리터당 73.56 달러
- LPG : 킬로그램당 40.85 달러
- CNG : 킬로그램당 44.23 달러

In [23]:
def mile(x):
  if x['fuel'] == 'Petrol':
    return x['mileage']/80.43
  elif x['fuel'] == 'Diesel':
    return x['mileage']/73.56
  elif x['fuel'] == 'LPG':
    return x['mileage']/40.85
  else:
    return x['mileage']/44.23

In [24]:
data['mileage'] = data.apply(mile, axis=1)

In [25]:
data.drop('mileage_unit',axis=1,inplace=True)

4. torque 변수 전처리

In [26]:
data['torque'].head()

Unnamed: 0,torque
0,190Nm@ 2000rpm
1,250Nm@ 1500-2500rpm
2,"12.7@ 2,700(kgm@ rpm)"
3,22.4 kgm at 1750-2750rpm
4,"11.5@ 4,500(kgm@ rpm)"


In [27]:
data['torque'] = data['torque'].str.upper()

In [28]:
def torque_unit(x):
  if 'NM' in str(x):
    return 'Nm'
  elif 'KGM' in str(x):
    return 'kgm'

In [29]:
data['torque_unit'] = data['torque'].apply(torque_unit)

In [30]:
data['torque_unit'].unique()

array(['Nm', 'kgm', None], dtype=object)

In [31]:
data['torque_unit'].isna()
data[data['torque_unit'].isna()]
data[data['torque_unit'].isna()]['torque'].unique()

array([nan, '250@ 1250-5000RPM', '510@ 1600-2400', '110(11.2)@ 4800',
       '210 / 1900'], dtype=object)

In [32]:
data['torque_unit'].fillna('Nm',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['torque_unit'].fillna('Nm',inplace=True)


In [33]:
def split_num(x):
  x = str(x)
  for i,j in enumerate(x):
    if j not in '0123456789.':
      cut = i
      break

  return x[:cut]

data['torque'] = data['torque'].apply(split_num)

In [34]:
data['torque']

Unnamed: 0,torque
0,190
1,250
2,12.7
3,22.4
4,11.5
...,...
8123,113.7
8124,24
8125,190
8126,140


In [35]:
data['torque'] = data['torque'].astype('float64')

ValueError: could not convert string to float: ''

In [36]:
data['torque'] = data['torque'].replace('',np.NaN)

In [37]:
data['torque'] = data['torque'].astype('float64')

In [38]:
data['torque'].head()

Unnamed: 0,torque
0,190.0
1,250.0
2,12.7
3,22.4
4,11.5


In [39]:
def torque_trans(x):
  if x['torque_unit'] == 'kgm':
    return x['torque']*9.8066
  else:
    return x['torque']

data['torque'] = data.apply(torque_trans, axis=1)

In [40]:
data.drop('torque_unit',axis=1,inplace=True)

In [41]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,0.318108,1248.0,74.0,190.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,0.287384,1498.0,103.52,250.0,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,0.220067,1497.0,78.0,124.54382,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,0.31267,1396.0,90.0,219.66784,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,0.200174,1298.0,88.2,112.7759,5.0


5. name 변수 전처리

In [42]:
data['name'] = data['name'].str.split(expand=True)[0]

In [43]:
data['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [44]:
data['name'] = data['name'].replace('Land','Land Rover')

## 3. 결측치 처리 & 더미 변수 변환

In [45]:
data.isna().mean()

Unnamed: 0,0
name,0.0
year,0.0
selling_price,0.0
km_driven,0.0
fuel,0.0
seller_type,0.0
transmission,0.0
owner,0.0
mileage,0.02719
engine,0.02719


In [47]:
data.dropna(inplace=True)
len(data)

7906

In [48]:
data = pd.get_dummies(data, columns=['name','fuel','seller_type','transmission','owner'],drop_first=True)
data.head()

Unnamed: 0,year,selling_price,km_driven,mileage,engine,max_power,torque,seats,name_Ashok,name_Audi,...,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2014,450000,145500,0.318108,1248.0,74.0,190.0,5.0,False,False,...,True,False,False,True,False,True,False,False,False,False
1,2014,370000,120000,0.287384,1498.0,103.52,250.0,5.0,False,False,...,True,False,False,True,False,True,False,True,False,False
2,2006,158000,140000,0.220067,1497.0,78.0,124.54382,5.0,False,False,...,False,False,True,True,False,True,False,False,False,True
3,2010,225000,127000,0.31267,1396.0,90.0,219.66784,5.0,False,False,...,True,False,False,True,False,True,False,False,False,False
4,2007,130000,120000,0.200174,1298.0,88.2,112.7759,5.0,False,False,...,False,False,True,True,False,True,False,False,False,False


## 4. 모델링 및 평가

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop('selling_price',axis=1),data['selling_price'],test_size=0.2, random_state=100)

In [50]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=100)
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

In [51]:
from sklearn.metrics import mean_squared_error
print('train_rmse:',mean_squared_error(y_train, train_pred)**0.5, 'test_rmse:',mean_squared_error(y_test, test_pred)**0.5)

train_rmse: 53531.41548125947 test_rmse: 131855.18391308116


## 5. K-폴드 교차검증
- 교차검증 목적 : 모델의 예측력을 더 안정적으로 평가하기 위함
- 데이터를 특정 개수(K개)로 쪼개어서 그 중 하나씩을 선택하여 시험셋으로 사용하되, 이 과정을 K번 만큼 반복
- KFold는 인덱스 값을 이용하여 데이터를 분할하기 때문에 인덱스 값에 빈 값이 있는지 확인해야함

In [52]:
from sklearn.model_selection import KFold

data.reset_index(drop=True, inplace=True)

In [53]:
data

Unnamed: 0,year,selling_price,km_driven,mileage,engine,max_power,torque,seats,name_Ashok,name_Audi,...,fuel_Diesel,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2014,450000,145500,0.318108,1248.0,74.00,190.00000,5.0,False,False,...,True,False,False,True,False,True,False,False,False,False
1,2014,370000,120000,0.287384,1498.0,103.52,250.00000,5.0,False,False,...,True,False,False,True,False,True,False,True,False,False
2,2006,158000,140000,0.220067,1497.0,78.00,124.54382,5.0,False,False,...,False,False,True,True,False,True,False,False,False,True
3,2010,225000,127000,0.312670,1396.0,90.00,219.66784,5.0,False,False,...,True,False,False,True,False,True,False,False,False,False
4,2007,130000,120000,0.200174,1298.0,88.20,112.77590,5.0,False,False,...,False,False,True,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7901,2013,320000,110000,0.230014,1197.0,82.85,113.70000,5.0,False,False,...,False,False,True,True,False,True,False,False,False,False
7902,2007,135000,119000,0.228385,1493.0,110.00,235.35840,5.0,False,False,...,True,False,False,True,False,True,True,False,False,False
7903,2009,382000,120000,0.262371,1248.0,73.90,190.00000,5.0,False,False,...,True,False,False,True,False,True,False,False,False,False
7904,2013,290000,25000,0.320419,1396.0,70.00,140.00000,5.0,False,False,...,True,False,False,True,False,True,False,False,False,False


In [54]:
kf = KFold(n_splits=5)

X = data.drop('selling_price',axis=1)
y = data['selling_price']

for i, j in kf.split(X):
  print(i,j)

[1582 1583 1584 ... 7903 7904 7905] [   0    1    2 ... 1579 1580 1581]
[   0    1    2 ... 7903 7904 7905] [1582 1583 1584 ... 3160 3161 3162]
[   0    1    2 ... 7903 7904 7905] [3163 3164 3165 ... 4741 4742 4743]
[   0    1    2 ... 7903 7904 7905] [4744 4745 4746 ... 6322 6323 6324]
[   0    1    2 ... 6322 6323 6324] [6325 6326 6327 ... 7903 7904 7905]


In [55]:
for train_index, test_index in kf.split(X):
  X_train, X_test = X.loc[train_index], X.loc[test_index]
  y_train, y_test = y[train_index], y[test_index]

In [56]:
train_rmse_total = []
test_rmse_total = []

for train_index, test_index in kf.split(X):
  X_train, X_test = X.loc[train_index], X.loc[test_index]
  y_train, y_test = y[train_index], y[test_index]

  model = RandomForestRegressor(random_state=100)
  model.fit(X_train, y_train)
  train_pred = model.predict(X_train)
  test_pred = model.predict(X_test)

  train_rmse = mean_squared_error(y_train, train_pred)**0.5
  test_rmse = mean_squared_error(y_test, test_pred)**0.5

  train_rmse_total.append(train_rmse)
  test_rmse_total.append(test_rmse)

In [57]:
train_rmse_total

[50825.5556350298,
 58854.04054344074,
 57904.19615940739,
 56218.23740006373,
 58967.150857632456]

In [58]:
print('train_rmse:',sum(train_rmse_total)/5, 'test_rmse:',sum(test_rmse_total)/5)

train_rmse: 56553.836119114814 test_rmse: 142936.58918244042


## 8. 하이퍼파라미터 튜닝

In [59]:
train_rmse_total = []
test_rmse_total = []

for train_index, test_index in kf.split(X):
  X_train, X_test = X.loc[train_index,:], X.loc[test_index,:]
  y_train, y_test = y[train_index], y[test_index]

  model = RandomForestRegressor(n_estimators=300, max_depth=50, min_samples_split=5, min_samples_leaf=1, n_jobs=-1, random_state=100)
  model.fit(X_train, y_train)
  train_pred = model.predict(X_train)
  test_pred = model.predict(X_test)

  train_rmse = mean_squared_error(y_train, train_pred)**0.5
  test_rmse = mean_squared_error(y_test, test_pred)**0.5

  train_rmse_total.append(train_rmse)
  test_rmse_total.append(test_rmse)

In [60]:
print('train_rmse:',sum(train_rmse_total)/5, 'test_rmse:',sum(test_rmse_total)/5)

train_rmse: 66762.84568886801 test_rmse: 142205.83441414658
