### 지도학습 - 회귀모델 학습 추가 문제2
- Red Wine Quality Dataset을 이용한 와인 품질 예측 모델 구현

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [15]:
# Load the red-wine quality dataset from a CSV file (relative path).
df = pd.read_csv('wine_quality_red.csv')

# Count missing values per column; as the cell's last expression the
# notebook displays the resulting Series.
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [8]:
# Standardization
# Columns to standardize.
columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides']

# Create the scaler, then fit it on the chosen columns and transform them
# in a single step.
scaler = StandardScaler()
df_scaling = scaler.fit_transform(df.loc[:, columns])

# Wrap the scaled values in a DataFrame that reuses the column names.
df_scaled = pd.DataFrame(data=df_scaling, columns=columns)

print(df_scaled.head(n=10))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides
0      -0.528360          0.961877    -1.391472       -0.453218  -0.243707
1      -0.298547          1.967442    -1.391472        0.043416   0.223875
2      -0.298547          1.297065    -1.186070       -0.169427   0.096353
3       1.654856         -1.384443     1.484154       -0.453218  -0.264960
4      -0.528360          0.961877    -1.391472       -0.453218  -0.243707
5      -0.528360          0.738418    -1.391472       -0.524166  -0.264960
6      -0.241094          0.403229    -1.083370       -0.666062  -0.392483
7      -0.585813          0.682553    -1.391472       -0.949853  -0.477498
8      -0.298547          0.291499    -1.288771       -0.382271  -0.307468
9      -0.470907         -0.155419     0.457144        2.526589  -0.349975


In [9]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [16]:
# Inspect each column's correlation with quality, strongest positive first.
print(
    df.corr()
    .loc[:, 'quality']
    .sort_values(ascending=False)
)

quality                 1.000000
alcohol                 0.476166
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64


In [20]:
# Keep only the columns whose absolute correlation with quality exceeds 0.2
# (strongly negative correlations count as well as positive ones).
# 'quality' correlates 1.0 with itself, so it passes the filter and stays
# in the selection.
corr = df.corr()['quality']
selected_columns = corr[corr.abs() > 0.2].index.tolist()
print("선택된 열:", selected_columns)

# Build a new DataFrame restricted to the selected columns.
df_selected = df[selected_columns]
print(df_selected.head())


선택된 열: ['volatile acidity', 'citric acid', 'sulphates', 'alcohol', 'quality']
   volatile acidity  citric acid  sulphates  alcohol  quality
0              0.70         0.00       0.56      9.4        5
1              0.88         0.00       0.68      9.8        5
2              0.76         0.04       0.65      9.8        5
3              0.28         0.56       0.58      9.8        6
4              0.70         0.00       0.56      9.4        5


In [21]:
# Target: the quality column; features: every other selected column.
y = df_selected['quality']
X = df_selected.drop('quality', axis=1)

In [22]:
# Split the data: 20% of the rows go to the test set, seeded with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Regression models to compare, keyed by the display name used in the results.
# The tree-based estimators are randomized, so they get a fixed random_state
# (the same seed value the train/test split uses) to make their scores
# reproducible from run to run.
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "Decision Tree": DecisionTreeRegressor(max_depth=3, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(),
}

In [24]:
# Fit every model on the training split, score it on the test split, and
# collect one metrics record per model.
results = []

for name, model in models.items():
    # fit() returns the fitted estimator, so the prediction call can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results.append(dict(Model=name, MSE=mse, R2=r2))

print(results)


[{'Model': 'Linear', 'MSE': 0.40021634161714986, 'R2': 0.3875856874490632}, {'Model': 'Ridge', 'MSE': 0.4011968832554363, 'R2': 0.38608525463087207}, {'Model': 'Lasso', 'MSE': 0.5082232320546025, 'R2': 0.22231266213793854}, {'Model': 'Decision Tree', 'MSE': 0.46367250178134983, 'R2': 0.29048455322987166}, {'Model': 'Random Forest', 'MSE': 0.3571888151041667, 'R2': 0.45342675971448065}, {'Model': 'SVR', 'MSE': 0.4475974056196298, 'R2': 0.3150827965831813}]


In [25]:
# Compare model performance: tabulate the metrics, highest R2 first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values('R2', ascending=False)
print(df_results)

           Model       MSE        R2
4  Random Forest  0.357189  0.453427
0         Linear  0.400216  0.387586
1          Ridge  0.401197  0.386085
5            SVR  0.447597  0.315083
3  Decision Tree  0.463673  0.290485
2          Lasso  0.508223  0.222313
