In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torchsummary import summary
import numpy as np
from sklearn.metrics import mean_squared_error

# 데이터 로드 및 탐색

In [3]:
data = pd.read_csv('C:/Users/송이두/Desktop/가천대/2-1/인공지능개론/과제/data/diabetes.csv')
data = data.drop('Outcome', axis=1)

In [4]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [5]:
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 48.1 KB


In [7]:
data.nunique()

Pregnancies                  17
Glucose                     136
BloodPressure                47
SkinThickness                51
Insulin                     186
BMI                         248
DiabetesPedigreeFunction    517
Age                          52
dtype: int64

# 특성 조합 

In [8]:
data_corr=data.corr()
data_corr

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0


In [9]:
data_added = data.copy()
data_added["SkinPerAge"] = data_added['SkinThickness'] / (data_added['Age'] + 1e-7)
data_added.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,SkinPerAge
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,-0.306126
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,-0.03251
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.129852
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.906152
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.392042
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.350229
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.154193
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,-0.404176
SkinPerAge,-0.306126,-0.03251,0.129852,0.906152,0.392042,0.350229,0.154193,-0.404176,1.0


# 데이터 스케일링

In [10]:
X = data_added.drop('BMI', axis=1).values
y = data_added['BMI'].values

X.shape, y.shape

((768, 8), (768,))

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y.reshape(-1, 1))

# train, test 분리

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614, 1), (154, 1))

In [13]:
#pytorch 텐서로 변환
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [14]:
#텐서 데이터셋 설정 및 데이터 로더 설정
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([614, 8]),
 torch.Size([154, 8]),
 torch.Size([614, 1]),
 torch.Size([154, 1]))

# 모델 정의

In [29]:
class DiabetesReg(nn.Module):
    def __init__(self):
        super(DiabetesReg, self).__init__()
        self.fc1 = nn.Linear(8, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)  # binary clf

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        return x
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize the model, loss function, and optimizer
model = DiabetesReg()

# 손실 함수 및 최적화 기법 정의

In [30]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# 모델 학습

In [31]:
model.train()
for epoch in range(30):
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        # y_batch의 데이터 타입을 Float으로 변환 (회귀 문제의 경우)
        y_batch = y_batch.float()
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

Epoch 1, Loss: 0.9441
Epoch 2, Loss: 0.8544
Epoch 3, Loss: 0.8223
Epoch 4, Loss: 0.7971
Epoch 5, Loss: 0.7889
Epoch 6, Loss: 0.7813
Epoch 7, Loss: 0.7568
Epoch 8, Loss: 0.7603
Epoch 9, Loss: 0.8777
Epoch 10, Loss: 0.7724
Epoch 11, Loss: 0.7516
Epoch 12, Loss: 0.7651
Epoch 13, Loss: 0.7632
Epoch 14, Loss: 0.7507
Epoch 15, Loss: 0.7401
Epoch 16, Loss: 0.7332
Epoch 17, Loss: 0.8349
Epoch 18, Loss: 0.7509
Epoch 19, Loss: 0.7252
Epoch 20, Loss: 0.7618
Epoch 21, Loss: 0.7109
Epoch 22, Loss: 0.7274
Epoch 23, Loss: 0.7264
Epoch 24, Loss: 0.7324
Epoch 25, Loss: 0.8416
Epoch 26, Loss: 0.7078
Epoch 27, Loss: 0.7009
Epoch 28, Loss: 0.6958
Epoch 29, Loss: 0.6905
Epoch 30, Loss: 0.6866


# 모델 평가

In [32]:
model.eval()
preds, actuals = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).cpu().numpy()
        preds.extend(outputs)
        actuals.extend(y_batch.numpy())

mse = mean_squared_error(actuals, preds)
print(f"Test MSE: {mse:.4f}")

Test MSE: 0.9531
