In [38]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = sns.load_dataset('mpg')
x_train, x_test, y_train, y_test = train_test_split(df, df['mpg'], test_size = 0.2, random_state = 42)

In [39]:
x_train.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
376,37.0,4,91.0,68.0,2025,18.2,82,japan,mazda glc custom l


In [40]:
x_train.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [41]:
x_train.horsepower.value_counts()

90.0     17
150.0    16
100.0    16
88.0     15
105.0    12
         ..
122.0     1
113.0     1
208.0     1
79.0      1
87.0      1
Name: horsepower, Length: 89, dtype: int64

In [42]:
x_train['horsepower'] = x_train.horsepower.fillna(x_train.horsepower.median())
x_test['horsepower'] = x_test.horsepower.fillna(x_test.horsepower.median())

In [43]:
x_train.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [44]:
# 라벨인코딩
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 3 to 102
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           318 non-null    float64
 1   cylinders     318 non-null    int64  
 2   displacement  318 non-null    float64
 3   horsepower    318 non-null    float64
 4   weight        318 non-null    int64  
 5   acceleration  318 non-null    float64
 6   model_year    318 non-null    int64  
 7   origin        318 non-null    object 
 8   name          318 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 24.8+ KB


In [45]:
x_train.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
376,37.0,4,91.0,68.0,2025,18.2,82,japan,mazda glc custom l


In [46]:
from sklearn.preprocessing import LabelEncoder

label = ['origin','name']
x_train[label] = x_train[label].apply(LabelEncoder().fit_transform)
x_test[label] = x_test[label].apply(LabelEncoder().fit_transform)

In [47]:
# 카테고리 변환, 더미처리
category = ['origin']
for i in category:
    x_train[i] = x_train[i].astype('category')
    x_test[i] = x_test[i].astype('category')
    
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

In [48]:
x_train.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2
3,16.0,8,304.0,150.0,3433,12.0,70,10,0,0,1
18,27.0,4,97.0,88.0,2130,14.5,70,78,0,1,0
376,37.0,4,91.0,68.0,2025,18.2,82,149,0,1,0


In [49]:
x_train.horsepower.describe()

count    318.000000
mean     103.106918
std       37.091938
min       46.000000
25%       75.250000
50%       92.000000
75%      120.000000
max      225.000000
Name: horsepower, dtype: float64

In [50]:
# 파생변수 만들기
x_train['horsepower_qcut'] = pd.qcut(x_train['horsepower'], 5, labels = False)
x_test['horsepower_qcut'] = pd.qcut(x_test['horsepower'], 5, labels = False)

In [51]:
# 스케일작업
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 3 to 102
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mpg              318 non-null    float64
 1   cylinders        318 non-null    int64  
 2   displacement     318 non-null    float64
 3   horsepower       318 non-null    float64
 4   weight           318 non-null    int64  
 5   acceleration     318 non-null    float64
 6   model_year       318 non-null    int64  
 7   name             318 non-null    int32  
 8   origin_0         318 non-null    uint8  
 9   origin_1         318 non-null    uint8  
 10  origin_2         318 non-null    uint8  
 11  horsepower_qcut  318 non-null    int64  
dtypes: float64(4), int32(1), int64(4), uint8(3)
memory usage: 24.5 KB


In [52]:
x_train.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
3,16.0,8,304.0,150.0,3433,12.0,70,10,0,0,1,4
18,27.0,4,97.0,88.0,2130,14.5,70,78,0,1,0,1
376,37.0,4,91.0,68.0,2025,18.2,82,149,0,1,0,0


In [53]:
from sklearn.preprocessing import MinMaxScaler
scaler = ['displacement', 'horsepower','weight']
mm = MinMaxScaler()
mm.fit(x_train[scaler])

x_train[scaler] = mm.transform(x_train[scaler])
x_test[scaler] = mm.transform(x_test[scaler])

In [54]:
# 6.데이터 분리
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state= 42)

In [58]:
# 7. 모형학습

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

model1 = LinearRegression()
model1.fit(x_tr, y_tr)
pred1 = model1.predict(x_val)

model2 = RandomForestRegressor()
model2.fit(x_tr, y_tr)
pred2 = model2.predict(x_val)

# 8. 앙상블 (스태킹)
estimators = [('linear', model1), ('randomF', model2)]
model3 = StackingRegressor(estimators = estimators, final_estimator = RandomForestRegressor())
model3.fit(x_tr, y_tr)
pred3 = model3.predict(x_val)

In [59]:
# 9. 모형평가 
from sklearn.metrics import mean_squared_error
print('선형회귀 RMSE: ', np.sqrt(mean_squared_error(y_val, pred1)))
print('랜덤포레스트 RMSE: ', np.sqrt(mean_squared_error(y_val, pred2)))
print('스태킹 MSE: ', np.sqrt(mean_squared_error(y_val, pred3)))

선형회귀 RMSE:  4.0896505422873116e-13
랜덤포레스트 RMSE:  0.25418945936053416
스태킹 MSE:  0.21041517738746895


In [57]:
# 10. 하이퍼파라미터 튜닝
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': [50, 100], 'max_depth': [4,6]}
model4 = RandomForestRegressor()
gscv = GridSearchCV(estimator = model4, param_grid = parameters, cv = 3)
gscv.fit(x_tr, y_tr)
print("최적 파라미터: ", gscv.best_params_)

최적 파라미터:  {'max_depth': 6, 'n_estimators': 100}


In [62]:
# 11. 저장
result = pd.DataFrame(model2.predict(x_test))
result = result.iloc[:,0]
pd.DataFrame({'id': x_test.index, 'predict': result}).to_csv('000.csv',index = False)
check = pd.read_csv('000.csv')
check.head(3)

Unnamed: 0,id,predict
0,198,33.105
1,396,28.097
2,33,19.032
