# 회귀 모델 완벽 마스터
---

## 1. 펭귄 무게 예측 (펭귄 데이터)
---

In [87]:
import pandas as pd

X_train = pd.read_csv('./datasets/penguin_X_train.csv')
X_test = pd.read_csv('./datasets/penguin_X_test.csv')
y_train = pd.read_csv('./datasets/penguin_y_train.csv')

### (1) 데이터 정보 확인

In [88]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            240 non-null    object 
 1   island             240 non-null    object 
 2   sex                232 non-null    object 
 3   bill_length_mm     238 non-null    float64
 4   bill_depth_mm      238 non-null    float64
 5   flipper_length_mm  238 non-null    float64
dtypes: float64(3), object(3)
memory usage: 11.4+ KB
None


- `sex`, `bill_length_mm`, `bill_depth_mm`, `flipper_length_mm` 컬럼에 결측치가 있음을 확인할 수 있다.

In [89]:
print(X_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            101 non-null    object 
 1   island             101 non-null    object 
 2   sex                101 non-null    object 
 3   bill_length_mm     101 non-null    float64
 4   bill_depth_mm      101 non-null    float64
 5   flipper_length_mm  101 non-null    float64
dtypes: float64(3), object(3)
memory usage: 4.9+ KB
None


In [90]:
print(y_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   body_mass_g  238 non-null    float64
dtypes: float64(1)
memory usage: 2.0 KB
None


In [91]:
print(X_train.describe())

       bill_length_mm  bill_depth_mm  flipper_length_mm
count      238.000000     238.000000         238.000000
mean        43.948739      17.205462         200.684874
std          5.475004       1.973498          14.110080
min         32.100000      13.200000         172.000000
25%         39.200000      15.700000         190.000000
50%         44.700000      17.300000         197.000000
75%         48.700000      18.700000         213.000000
max         58.000000      21.500000         231.000000


In [92]:
print(X_test.describe())

       bill_length_mm  bill_depth_mm  flipper_length_mm
count      101.000000     101.000000         101.000000
mean        43.997030      17.023762         201.623762
std          5.400305       1.982228          13.951238
min         34.000000      13.100000         176.000000
25%         39.600000      15.300000         190.000000
50%         44.000000      17.200000         198.000000
75%         47.700000      18.600000         214.000000
max         59.600000      21.200000         230.000000


In [93]:
print(y_train.describe())

       body_mass_g
count   238.000000
mean   4226.260504
std     802.595662
min    2850.000000
25%    3600.000000
50%    4050.000000
75%    4800.000000
max    6300.000000


### (2) 결측치 처리

- `X_train`과 `y_train`을 합쳐서 결측치를 제거해준다.

In [94]:
# X_train과 y_train을 합쳐서 결측치를 제거해준다.
train = pd.concat([X_train, y_train], axis=1)
# print(train.info())

# 결측치 제거
train = train.dropna()

# 드랍할 경우, 드랍된 부분의 인덱스가 비어지게 된다.
# 따라서 인덱스를 초기화해준다.
train.reset_index(drop=True, inplace=True)   # 인덱스 초기화
print(train.info())

# 다시 X_train, y_train으로 분리
X_train = train[X_train.columns]
y_train = train[y_train.columns]

print(X_train.shape, y_train.shape)

# 기술 통계 확인
print(X_train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            232 non-null    object 
 1   island             232 non-null    object 
 2   sex                232 non-null    object 
 3   bill_length_mm     232 non-null    float64
 4   bill_depth_mm      232 non-null    float64
 5   flipper_length_mm  232 non-null    float64
 6   body_mass_g        232 non-null    float64
dtypes: float64(4), object(3)
memory usage: 12.8+ KB
None
(232, 6) (232, 1)
       bill_length_mm  bill_depth_mm  flipper_length_mm
count      232.000000     232.000000         232.000000
mean        43.990948      17.226293         200.681034
std          5.509760       1.964677          14.064231
min         32.100000      13.200000         172.000000
25%         39.200000      15.700000         190.000000
50%         44.950000      17.350000         197.000

### (3) 자주 사용하는 컬럼을 유형별로 분리

In [95]:
COL_DEL = []  # 삭제할 변수
COL_NUM = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']   # 수치형
COL_CAT = ['species', 'island', 'sex']   # 범주형
COL_Y = ['body_mass_g']   # 종속변수

### (4) 범주형 변수 인코딩

- One Hot Encoding, Label Encoding 2가지 방법을 사용할 수 있다.
- 빠르게 작업하기 위해 보통 Label Encoding 방법을 사용한다.

In [96]:
# X 훈련 데이터와 X 테스트 데이터를 합친 후, 인코딩을 진행한다.
X = pd.concat([X_train, X_test])
print(X.shape)

(333, 6)


> One Hot Encoding

In [97]:
from sklearn.preprocessing import OneHotEncoder

# 1. 인코더 생성
ohe = OneHotEncoder()
ohe.fit(X[COL_CAT])   # 범주형 변수 인코딩

X_train_res = ohe.transform(X_train[COL_CAT])
X_test_res = ohe.transform(X_test[COL_CAT])

print(X_train.shape, X_train_res.shape, X_test_res.shape)

(232, 6) (232, 8) (101, 8)


In [98]:
# 2. 인코딩 데이터 결합
X_train_ohe = pd.DataFrame(X_train_res.toarray(), columns=ohe.get_feature_names_out())
X_test_ohe = pd.DataFrame(X_test_res.toarray(), columns=ohe.get_feature_names_out())

print(X_train_ohe.shape)
print(X_train_ohe.head())

print("*" * 77)

print(X_test_ohe.shape)
print(X_test_ohe.head())

(232, 8)
   species_Adelie  species_Chinstrap  species_Gentoo  island_Biscoe  \
0             0.0                0.0             1.0            1.0   
1             1.0                0.0             0.0            0.0   
2             0.0                1.0             0.0            0.0   
3             0.0                0.0             1.0            1.0   
4             1.0                0.0             0.0            0.0   

   island_Dream  island_Torgersen  sex_FEMALE  sex_MALE  
0           0.0               0.0         1.0       0.0  
1           0.0               1.0         0.0       1.0  
2           1.0               0.0         0.0       1.0  
3           0.0               0.0         0.0       1.0  
4           1.0               0.0         1.0       0.0  
*****************************************************************************
(101, 8)
   species_Adelie  species_Chinstrap  species_Gentoo  island_Biscoe  \
0             1.0                0.0             0.0      

In [99]:
# 3. 인코딩 데이터와 수치형 데이터 연결
X_train_fin = pd.concat([X_train[COL_NUM], X_train_ohe], axis=1)
X_test_fin = pd.concat([X_test[COL_NUM], X_test_ohe], axis=1)


print(X_train_fin.info())
print(X_test_fin.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bill_length_mm     232 non-null    float64
 1   bill_depth_mm      232 non-null    float64
 2   flipper_length_mm  232 non-null    float64
 3   species_Adelie     232 non-null    float64
 4   species_Chinstrap  232 non-null    float64
 5   species_Gentoo     232 non-null    float64
 6   island_Biscoe      232 non-null    float64
 7   island_Dream       232 non-null    float64
 8   island_Torgersen   232 non-null    float64
 9   sex_FEMALE         232 non-null    float64
 10  sex_MALE           232 non-null    float64
dtypes: float64(11)
memory usage: 20.1 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bill_length_mm    

### (5) 데이터 모형 구축

In [100]:
# 데이터 분리
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train_fin, y_train, test_size=0.2, random_state=100)
print(X_tr.shape, X_val.shape, y_tr.shape)

# 스케일링
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_tr[COL_NUM])   # 수치형 변수

X_tr[COL_NUM] = scaler.transform(X_tr[COL_NUM])
X_val[COL_NUM] = scaler.transform(X_val[COL_NUM])
X_test_fin[COL_NUM] = scaler.transform(X_test_fin[COL_NUM])

print(X_tr[:5])
print("*" * 100)
print(X_val[:5])
print("*" * 100)
print(X_test_fin[:5])


(185, 11) (47, 11) (185, 1)
    bill_length_mm  bill_depth_mm  flipper_length_mm  species_Adelie  \
23        0.756757       0.855422           0.372881             0.0   
22        0.274131       0.650602           0.305085             1.0   
62        0.366795       0.578313           0.338983             1.0   
96        0.220077       0.819277           0.305085             1.0   
64        0.339768       0.409639           0.254237             0.0   

    species_Chinstrap  species_Gentoo  island_Biscoe  island_Dream  \
23                1.0             0.0            0.0           1.0   
22                0.0             0.0            0.0           1.0   
62                0.0             0.0            1.0           0.0   
96                0.0             0.0            1.0           0.0   
64                1.0             0.0            0.0           1.0   

    island_Torgersen  sex_FEMALE  sex_MALE  
23               0.0         0.0       1.0  
22               0.0        

In [101]:
# 선형 회귀
from sklearn.linear_model import LinearRegression

model_lr = LinearRegression()
model_lr.fit(X_tr, y_tr)

y_val_pred = model_lr.predict(X_val)
print(y_val_pred)

[[5651.44624447]
 [4085.30634454]
 [3310.47964971]
 [3337.33330811]
 [5525.50020499]
 [3382.48601681]
 [5366.78042648]
 [5406.05885837]
 [4022.32697125]
 [3292.25515467]
 [3424.13065267]
 [3944.41147668]
 [5658.89265934]
 [4026.09038061]
 [3533.76962506]
 [3980.68233539]
 [4273.49966842]
 [3450.37087712]
 [5409.3269011 ]
 [3428.85375379]
 [4574.80328694]
 [5333.02167391]
 [4960.1175087 ]
 [3468.14570894]
 [3633.49745479]
 [4140.93518486]
 [4223.38273249]
 [3899.22819232]
 [3478.01099796]
 [4235.55908562]
 [4168.2834231 ]
 [4104.85171529]
 [3829.67793272]
 [4736.24134662]
 [4137.16318441]
 [4611.96157315]
 [3810.62622447]
 [3618.78537788]
 [3460.48216587]
 [5384.54933849]
 [4757.06790988]
 [4087.63096112]
 [4701.25158001]
 [3537.19879735]
 [4167.09564203]
 [3288.96163916]
 [4635.71851319]]


In [102]:
# y = ax + b (a : coef(회귀계수), b: intercept(절편))
print(model_lr.coef_[0])
print(model_lr.intercept_[0])


coef = pd.Series(model_lr.coef_[0], index=X_tr.columns)
print(coef.sort_values())

[ 588.0134175   517.07440279  844.48940437 -237.87174116 -476.13934612
  714.01108728   30.25151452  -59.25996583   29.00845132 -203.85322113
  203.85322113]
3247.034154636649
species_Chinstrap   -476.139346
species_Adelie      -237.871741
sex_FEMALE          -203.853221
island_Dream         -59.259966
island_Torgersen      29.008451
island_Biscoe         30.251515
sex_MALE             203.853221
bill_depth_mm        517.074403
bill_length_mm       588.013418
species_Gentoo       714.011087
flipper_length_mm    844.489404
dtype: float64


In [103]:
# 랜덤 포레스트
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor()
model_rf.fit(X_tr, y_tr)

y_val_pred_rf = model_rf.predict(X_val)
print(y_val_pred_rf)

[5737.   3898.75 3453.5  3249.   5533.5  3522.75 5524.25 5350.5  3749.
 3221.25 3483.25 3802.75 5831.   4010.25 3444.   3793.25 4146.5  3434.
 5378.5  3409.5  4406.   5514.25 5014.25 3696.   3706.75 3721.   4338.
 4041.   3580.5  4286.75 4062.25 4167.25 4062.75 4613.75 4103.75 4211.
 4018.75 3551.   3491.25 5541.   4821.75 3939.25 4731.5  3764.5  4266.5
 3429.5  4463.25]


  return fit_method(estimator, *args, **kwargs)


### (6) 모델 평가

In [104]:
from sklearn.metrics import mean_squared_error

# 선형회귀
mse = mean_squared_error(y_val, y_val_pred)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(mse, rmse)

# 랜덤 포레스트
mse_rf = mean_squared_error(y_val, y_val_pred_rf)
rmse_rf = mean_squared_error(y_val, y_val_pred_rf, squared=False)
print(mse_rf, rmse_rf)

# rmse는 error이므로, 값이 작을수록 성능이 더 좋다.

117089.24010380742 342.18305057937545
106928.19414893616 326.99876780950746




### (7) 답안 제출

In [105]:
pred = model_rf.predict(X_test_fin)
print(pred)

pd.DataFrame({'body_mass_g': pred}).to_csv('./outputs/regression01-1.csv', index=False)

[4217.75 5261.5  4054.5  3425.75 4533.75 3879.25 3384.5  3435.75 3511.75
 3470.   3391.25 5459.5  3440.5  3909.75 3894.75 3931.25 4437.5  3744.5
 3491.   3401.25 5288.75 4127.   5748.   4676.   5640.5  5471.5  5493.
 4326.5  3473.75 4046.75 4108.75 3375.25 3519.75 4174.   3308.5  3430.
 3088.   3581.5  3877.5  4966.5  3344.75 5683.5  5025.25 4820.5  5554.
 4109.75 4631.   3258.5  5153.5  4235.   4672.75 4758.5  4212.25 3812.5
 3161.   4524.25 4704.   4415.5  3826.75 4169.   3821.75 5028.5  3494.5
 5042.25 3732.   4017.75 5603.5  3067.5  4748.5  3733.   4046.5  3437.5
 4727.75 3331.25 4694.   4250.5  5599.   3544.5  3438.75 4821.25 3462.
 5716.   3477.25 4905.25 4389.5  3298.25 5791.   4046.75 3993.75 4691.
 4152.5  3979.25 4268.   4143.25 5389.   3977.5  5157.   3515.25 4878.75
 5661.   3546.5 ]


### 단순화하여 문제 풀기

In [106]:
import pandas as pd

X_train = pd.read_csv('./datasets/penguin_X_train.csv')
X_test = pd.read_csv('./datasets/penguin_X_test.csv')
y_train = pd.read_csv('./datasets/penguin_y_train.csv')

# 1. 데이터 확인
# print(X_train.info())
# print("*" * 100)
# print(y_train.info())

# 2. 결측치 처리 (X, Y 훈련 데이터 합친 후)
train = pd.concat([X_train, y_train], axis=1)   
train = train.dropna()
train.reset_index(drop=True, inplace=True)  # 인덱스 초기화

X_train = train[X_train.columns]
y_train = train[y_train.columns]

# print(X_train.info())
# print(y_train.info())

# 3. 기술 통계 확인
# print(X_train.describe())

# 4. 자주 사용하는 컬럼 유형별 분리
COL_DEL = []  # 삭제할 컬럼
COL_NUM = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']   # 수치형 컬럼
COL_CAT = ['species', 'island', 'sex']   # 범주형 컬럼
COL_Y = ['body_mass_g']   # 종속변수

# 5. 인코딩 (Label Encoding)
X = pd.concat([X_train, X_test])   # 데이터와 데이터를 연결하기 대문에 axis=1을 하지 않는다.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in COL_CAT:
    le.fit(X[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

# print(X_train.head())

# 6. 데이터 모형 구축
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=100)
print(X_tr.shape, X_val.shape, y_tr.shape)

# 7. 스케일링 -> 생략

# 8. Random Forest
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor()
model_rf.fit(X_tr, y_tr)

x_val_pred_rf = model_rf.predict(X_val)

# 9. 평가
from sklearn.metrics import mean_squared_error

rmse_rf = mean_squared_error(y_val, y_val_pred_rf, squared=False)
print(rmse_rf)

# 10. 제출
pred = model_rf.predict(X_test)
print(pred.shape)

pd.DataFrame({'body_mass_g': pred}).to_csv('./outputs/regression01-2.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.transform(X_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.transform(X_train[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.transform(X_train[col])
  return fit_method(estimator, *args, **kwargs)


(185, 6) (47, 6) (185, 1)
326.99876780950746
(101,)


