### 제2유형 연습하기. 당뇨 진척 정도 (회귀)

In [173]:
import pandas as pd
import numpy as np

# 표준화, 정규화
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# 변수 분리
from sklearn.model_selection import train_test_split

# 모델 선정 (RandomForest)
from sklearn.ensemble import RandomForestRegressor

# 모델 성능 평가 관련
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, r2_score


from sklearn.datasets import load_diabetes

# diabetes 데이터셋 로드

diabetes = load_diabetes()

x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target)

# print(x.head())
# print(y.head())

# 실기 시험 데이터셋 셋팅

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2023)
# y_test는 실제로 주어지지 않으므로 무시!

x_train = pd.DataFrame(x_train.reset_index())
x_test = pd.DataFrame(x_test.reset_index())
y_train = pd.DataFrame(y_train.reset_index())

x_train.rename(columns={'index':'cust_id'}, inplace=True)
x_test.rename(columns={'index':'cust_id'}, inplace=True)
y_train.columns = ({'cust_id', 'target'})

# print(x_train.head())
# print(x_test.head())
# print(y_train.head())

In [174]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

#### 01. 데이터 탐색(EDA)

In [175]:
print(x_train.describe().T)
print(x_test.describe().T)
print(y_train.describe().T)

         count        mean         std       min         25%         50%  \
cust_id  353.0  212.634561  126.668903  0.000000  105.000000  210.000000   
age      353.0    0.000804    0.047617 -0.107226   -0.038207    0.005383   
sex      353.0    0.000724    0.047673 -0.044642   -0.044642   -0.044642   
bmi      353.0    0.000640    0.048141 -0.084886   -0.035307   -0.006206   
bp       353.0   -0.000326    0.046585 -0.112399   -0.033213   -0.005670   
s1       353.0    0.001179    0.047891 -0.126781   -0.033216   -0.002945   
s2       353.0    0.001110    0.048248 -0.115613   -0.029184   -0.001314   
s3       353.0   -0.000452    0.048600 -0.102307   -0.039719   -0.006584   
s4       353.0    0.000901    0.048045 -0.076395   -0.039493   -0.002592   
s5       353.0    0.001446    0.047160 -0.126097   -0.033246    0.000272   
s6       353.0    0.000589    0.048122 -0.137767   -0.034215    0.003064   

                75%         max  
cust_id  322.000000  441.000000  
age        0.038076

In [176]:
# 결측치 확인

print(x_train.isnull().sum())
print(x_test.isnull().sum())
print(y_train.isnull().sum())

cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
target     0
cust_id    0
dtype: int64


In [177]:
print(x_train.info())
print(x_test.info())
print(y_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  353 non-null    int64  
 1   age      353 non-null    float64
 2   sex      353 non-null    float64
 3   bmi      353 non-null    float64
 4   bp       353 non-null    float64
 5   s1       353 non-null    float64
 6   s2       353 non-null    float64
 7   s3       353 non-null    float64
 8   s4       353 non-null    float64
 9   s5       353 non-null    float64
 10  s6       353 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 30.5 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  89 non-null     int64  
 1   age      89 non-null     float64
 2   sex      89 non-null     float64
 3   bmi      89 non-null     float64
 4   bp      

#### 당뇨병 환자의 질병 진행 정도 예측
#### - 데이터의 결측치, 이상치, 변수들에 대해 전처리
#### - 회귀모델을 사용하여 Rsq, MSE 값 산출
#### - 제출은 cust_id, target 변수를 가진 DataFrame 형태로 제출

In [178]:
# 변수 처리
# cust_id는 모델에는 반영이 되면 안 되는 데이터!

cust_id = x_test['cust_id'].copy()

x_train = x_train.drop(columns=['cust_id'], axis=1)
x_test = x_test.drop(columns=['cust_id'], axis=1)
# y_train = y_train.drop(columns=['cust_id'], axis=1)

In [179]:
# 모델 적용

model = RandomForestRegressor(random_state=2023)

model.fit(x_train, y_train['target'])

In [180]:
# 훈련용 데이터, 검증용 데이터 분리

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train['target'], test_size=0.2, random_state=23)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(282, 10)
(89, 10)
(282,)
(89, 1)


In [181]:
# 모델을 이용한 검증용 데이터 예측

y_pred = model.predict(x_val)

In [182]:
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("MSE : ", mse)
print("r2(rsq) : ", r2)

MSE :  2031.0410535211272
r2(rsq) :  0.841090674401238
