In [None]:
import pandas as pd

# 1. 데이터 로드
file_path = '/content/drive/MyDrive/Halitosis/241121_Halitosis_data preprocessing.xlsx'
data = pd.read_excel(file_path)

# 2. 입력 Feature와 타겟 변수 설정
input_features = [
    'Sex', 'Age', 'Elderly', 'Super_Elderly', 'Xerostomia_subjective',
    'UFR', 'SFR', 'pH', 'BufferCapacity', 'VAS', 'StickySaliva', 'Oralhygiene',
    'Calculus', 'O_Stomatitis', 'O_RAU', 'O_Candidiasis', 'O_Periodontitis',
    'O_LichenPlanus', 'O_Sialodochitis', 'O_Glossodynia', 'O_BMS',
    'S_Hypertension', 'S_DM', 'S_Hyperlipidemia', 'S_RenalDiseases',
    'S_LiverDiseases', 'S_Rheumatism', 'S_Osteoporosis', 'S_CVD', 'S_TD',
    'S_MentalDisorders', 'S_UrinaryDiseases', 'S_Asthma', 'S_CancerOp',
    'NumberofSystmicDiseases', 'M_Hypertension', 'M_DM', 'M_Osteoporosis',
    'M_Hyperlipidemia', 'M_CV', 'M_TD', 'M_GI', 'M_UrinaryDiseases',
    'M_Arthritis', 'M_Rheumatism', 'M_Anxiolytic', 'M_SleepingPills',
    'M_Aspirin'
]
target_feature = 'Halitosis_subjective'

# Feature와 Target 분리
X = data[input_features]
y = data[target_feature]

# 데이터 크기 확인
print(f"Input Features Shape: {X.shape}")
print(f"Target Variable Shape: {y.shape}")

# 데이터 예시 출력
print("Input Features Sample:")
print(X.head())
print("\nTarget Variable Sample:")
print(y.head())

Input Features Shape: (821, 48)
Target Variable Shape: (821,)
Input Features Sample:
   Sex   Age  Elderly  Super_Elderly  Xerostomia_subjective   UFR  SFR   pH  \
0    0  13.5        0              0                      0  0.75  2.1  7.8   
1    0  12.0        0              0                      0  0.50  1.3  7.4   
2    0  15.2        0              0                      0  0.40  1.0  7.2   
3    0  10.0        0              0                      0  0.40  0.6  7.0   
4    0  16.0        0              0                      0  0.30  1.0  7.2   

   BufferCapacity  VAS  ...  M_Hyperlipidemia  M_CV  M_TD  M_GI  \
0            12.0    6  ...                 0     0     0     0   
1            12.0    5  ...                 0     0     0     0   
2            12.0    0  ...                 0     0     0     0   
3            12.0    2  ...                 0     0     0     0   
4            12.0    2  ...                 0     0     0     0   

   M_UrinaryDiseases  M_Arthritis  M_

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# 1. 클래스 불균형 해결 (SMOTE)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 클래스 분포 확인
print("Original class distribution:")
print(y.value_counts())
print("\nResampled class distribution:")
print(pd.Series(y_resampled).value_counts())

# 2. 데이터 정규화 (StandardScaler)
scaler = StandardScaler()
X_resampled_normalized = scaler.fit_transform(X_resampled)

# 데이터 형태 확인
print("\nResampled and Normalized Data Shape:")
print(f"Features Shape: {X_resampled_normalized.shape}")
print(f"Target Shape: {y_resampled.shape}")

# 정규화된 데이터 샘플 출력
print("\nNormalized Data Sample:")
print(X_resampled_normalized[:5])

Original class distribution:
Halitosis_subjective
0    687
1    134
Name: count, dtype: int64

Resampled class distribution:
Halitosis_subjective
0    687
1    687
Name: count, dtype: int64

Resampled and Normalized Data Shape:
Features Shape: (1374, 48)
Target Shape: (1374,)

Normalized Data Sample:
[[-1.22029932 -2.41470021 -0.80220987 -0.76452233 -0.48997163  1.65034332
   1.40329025  1.71775386  0.66942231  0.17661126 -0.5127062   0.2992409
   0.46004628  2.15201012 -0.17974019 -0.33279396 -0.13888065 -0.1731618
  -0.06622662 -0.11841512 -0.56558353 -0.60145183 -0.30651108 -0.14940358
  -0.10854507 -0.1468378  -0.12756249 -0.31913074 -0.21554232 -0.19832887
  -0.15441829 -0.16635473 -0.1215362  -0.15687097 -0.7617197  -0.27407548
  -0.15687097 -0.13047769 -0.26158983 -0.16402997 -0.14423029 -0.1468378
  -0.13888065 -0.11521431 -0.05403432 -0.1731618  -0.17091952 -0.14157875]
 [-1.22029932 -2.49781521 -0.80220987 -0.76452233 -0.48997163  0.58433793
   0.10170979  1.06742671  0.66942



In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 데이터 분리 (Train/Test Split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# SMOTE 적용 (Train 데이터에만)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 클래스 분포 확인
print("Original class distribution in Train set:")
print(y_train.value_counts())
print("\nResampled class distribution in Train set:")
print(pd.Series(y_train_resampled).value_counts())

print("\nOriginal class distribution in Test set:")
print(y_test.value_counts())

Original class distribution in Train set:
Halitosis_subjective
0    549
1    107
Name: count, dtype: int64

Resampled class distribution in Train set:
Halitosis_subjective
0    549
1    549
Name: count, dtype: int64

Original class distribution in Test set:
Halitosis_subjective
0    138
1     27
Name: count, dtype: int64




In [None]:
import torch
import numpy as np

class PositionalEncoding(nn.Module):
    def __init__(self, input_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, input_dim)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, input_dim, 2).float() * (-np.log(10000.0) / input_dim))
        self.encoding[:, 0::2] = torch.sin(position * div_term)  # 짝수 인덱스에 Sine 적용
        self.encoding[:, 1::2] = torch.cos(position * div_term)  # 홀수 인덱스에 Cosine 적용
        self.encoding = self.encoding.unsqueeze(0)  # Batch 차원 추가

    def forward(self, x):
        # 입력 데이터에 Positional Encoding 추가
        seq_len = x.size(1)
        return x + self.encoding[:, :seq_len, :].to(x.device)

# Transformer 모델 수정
class TransformerModel(nn.Module):
    def __init__(self, input_dim, num_heads, num_layers):
        super(TransformerModel, self).__init__()
        self.input_layer = nn.Linear(input_dim, 64)  # 입력 Feature 차원 -> 모델 차원 변환
        self.positional_encoding = PositionalEncoding(input_dim=64)  # Positional Encoding 추가
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=64, nhead=num_heads, batch_first=True, dropout=0.1)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)  # Transformer 인코더
        self.output_layer = nn.Linear(64, 1)  # 출력 레이어 (1차원: Binary Classification)

    def forward(self, x):
        x = self.input_layer(x).unsqueeze(1)  # Linear 변환 후 차원 추가
        x = self.positional_encoding(x)  # Positional Encoding 추가
        x = self.transformer(x).squeeze(1)  # Transformer Encoder 통과
        x = self.output_layer(x)  # 출력 계산
        return torch.sigmoid(x)  # 확률 값 반환


# Positional Encoding 테스트
input_dim = 48  # 입력 Feature 개수
seq_len = 48    # Sequence 길이 (Feature 수와 동일)

# 임의의 입력 데이터
dummy_input = torch.zeros(1, seq_len, input_dim)  # [Batch, Sequence Length, Feature Dimension]

# Positional Encoding 적용
pos_encoder = PositionalEncoding(input_dim=input_dim)
encoded_input = pos_encoder(dummy_input)

print("Original Input Shape:", dummy_input.shape)
print("Encoded Input Shape:", encoded_input.shape)

Original Input Shape: torch.Size([1, 48, 48])
Encoded Input Shape: torch.Size([1, 48, 48])


In [None]:
import torch
import torch.nn as nn

# Custom Transformer 모델 수정
class CustomTransformerModel(nn.Module):
    def __init__(self, input_dim, num_heads, num_layers):
        super(CustomTransformerModel, self).__init__()
        self.input_layer = nn.Linear(input_dim, 64)  # 입력 차원을 임베딩 차원으로 변환
        self.positional_encoding = PositionalEncoding(input_dim=64)  # Positional Encoding 추가
        self.multihead_attn = nn.MultiheadAttention(embed_dim=64, num_heads=num_heads, batch_first=True)  # Multi-Head Attention
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=64, nhead=num_heads, batch_first=True, dropout=0.1)
            for _ in range(num_layers)
        ])
        self.feedforward = nn.Linear(64, 32)  # Feedforward Layer 추가
        self.output_layer = nn.Linear(32, 1)  # 최종 출력 레이어
        self.attention_weights = None  # Attention Weights 저장 변수

    def forward(self, x):
        x = self.input_layer(x).unsqueeze(1)  # [Batch, Sequence Length, Feature Dimension]
        x = self.positional_encoding(x)

        # Multi-Head Attention
        attn_output, attn_weights = self.multihead_attn(x, x, x)
        self.attention_weights = attn_weights  # Attention Weights 저장

        # Transformer Encoder Layers
        for layer in self.encoder_layers:
            x = layer(x)

        # Feedforward Layers
        x = self.feedforward(x[:, 0, :])  # 첫 번째 시퀀스 출력 사용
        x = self.output_layer(x)  # 최종 출력 계산
        return torch.sigmoid(x)  # 이진 분류를 위한 Sigmoid

In [None]:


# 모델 초기화
model = CustomTransformerModel(input_dim=X_train.shape[1], num_heads=4, num_layers=2)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 1. Tensor로 변환
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

# 2. 학습 및 평가
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Forward Pass
    outputs = model(X_train).squeeze()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # 평가
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test).squeeze()
        predictions = (test_outputs >= 0.5).float()
        accuracy = (predictions == y_test).sum().item() / len(y_test)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/10, Loss: 0.6839, Accuracy: 0.8364
Epoch 2/10, Loss: 0.4466, Accuracy: 0.8364
Epoch 3/10, Loss: 0.4443, Accuracy: 0.8364
Epoch 4/10, Loss: 0.4446, Accuracy: 0.8364
Epoch 5/10, Loss: 0.4446, Accuracy: 0.8364
Epoch 6/10, Loss: 0.4432, Accuracy: 0.8364
Epoch 7/10, Loss: 0.4445, Accuracy: 0.8364
Epoch 8/10, Loss: 0.4447, Accuracy: 0.8364
Epoch 9/10, Loss: 0.4441, Accuracy: 0.8364
Epoch 10/10, Loss: 0.4430, Accuracy: 0.8364


In [None]:
# 고차원 학습 시작

# Custom Transformer 모델 수정
class CustomTransformerModel(nn.Module):
    def __init__(self, input_dim, num_heads, num_layers):
        super(CustomTransformerModel, self).__init__()
        self.input_layer = nn.Linear(input_dim, 64)  # 입력 차원을 64로 변환
        self.positional_encoding = PositionalEncoding(input_dim=64)  # Positional Encoding
        self.multihead_attn = nn.MultiheadAttention(embed_dim=64, num_heads=num_heads, batch_first=True)  # Multi-Head Attention
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=64, nhead=num_heads, batch_first=True, dropout=0.1)
            for _ in range(num_layers)
        ])
        self.feedforward1 = nn.Linear(64, 32)  # 첫 번째 Feedforward Layer
        self.activation = nn.ReLU()  # 활성화 함수
        self.feedforward2 = nn.Linear(32, 16)  # 두 번째 Feedforward Layer
        self.output_layer = nn.Linear(16, 1)  # 최종 출력 레이어
        self.attention_weights = None  # Attention Weights 저장 변수

    def forward(self, x):
        x = self.input_layer(x).unsqueeze(1)  # [Batch, Sequence Length, Feature Dimension]
        x = self.positional_encoding(x)

        # Multi-Head Attention
        attn_output, attn_weights = self.multihead_attn(x, x, x)
        self.attention_weights = attn_weights  # Attention Weights 저장

        # Transformer Encoder Layers
        for layer in self.encoder_layers:
            x = layer(x)

        # Feedforward Layers
        x = self.feedforward1(x[:, 0, :])  # 첫 번째 Feedforward
        x = self.activation(x)  # 활성화 함수
        x = self.feedforward2(x)  # 두 번째 Feedforward
        x = self.output_layer(x)  # 최종 출력 계산
        return torch.sigmoid(x)  # 이진 분류를 위한 Sigmoid

In [None]:
# 모델 초기화
model = CustomTransformerModel(input_dim=X_train.shape[1], num_heads=4, num_layers=2)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 학습 및 평가
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Forward Pass
    outputs = model(X_train).squeeze()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # 평가
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test).squeeze()
        predictions = (test_outputs >= 0.5).float()
        accuracy = (predictions == y_test).sum().item() / len(y_test)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/10, Loss: 0.7426, Accuracy: 0.8364
Epoch 2/10, Loss: 0.6393, Accuracy: 0.8364
Epoch 3/10, Loss: 0.5970, Accuracy: 0.8364
Epoch 4/10, Loss: 0.5713, Accuracy: 0.8364
Epoch 5/10, Loss: 0.5522, Accuracy: 0.8364
Epoch 6/10, Loss: 0.5361, Accuracy: 0.8364
Epoch 7/10, Loss: 0.5228, Accuracy: 0.8364
Epoch 8/10, Loss: 0.5113, Accuracy: 0.8364
Epoch 9/10, Loss: 0.5012, Accuracy: 0.8364
Epoch 10/10, Loss: 0.4905, Accuracy: 0.8364


In [None]:
# Output Layer 모델 정의

class CustomTransformerModel(nn.Module):
    def __init__(self, input_dim, num_heads, num_layers):
        super(CustomTransformerModel, self).__init__()
        self.input_layer = nn.Linear(input_dim, 64)  # 입력 차원을 64로 변환
        self.positional_encoding = PositionalEncoding(input_dim=64)  # Positional Encoding
        self.multihead_attn = nn.MultiheadAttention(embed_dim=64, num_heads=num_heads, batch_first=True)  # Multi-Head Attention
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=64, nhead=num_heads, batch_first=True, dropout=0.1)
            for _ in range(num_layers)
        ]의
        self.feedforward1 = nn.Linear(64, 32)  # 첫 번째 Feedforward Layer
        self.activation = nn.ReLU()  # 활성화 함수
        self.feedforward2 = nn.Linear(32, 16)  # 두 번째 Feedforward Layer
        self.output_layer = nn.Linear(16, 1)  # 최종 출력 레이어
        self.attention_weights = None  # Attention Weights 저장 변수

    def forward(self, x):
        x = self.input_layer(x).unsqueeze(1)  # [Batch, Sequence Length, Feature Dimension]
        x = self.positional_encoding(x)

        # Multi-Head Attention
        attn_output, attn_weights = self.multihead_attn(x, x, x)
        self.attention_weights = attn_weights  # Attention Weights 저장

        # Transformer Encoder Layers
        for layer in self.encoder_layers:
            x = layer(x)

        # Feedforward Layers
        x = self.feedforward1(x[:, 0, :])  # 첫 번째 Feedforward
        x = self.activation(x)  # 활성화 함수
        x = self.feedforward2(x)  # 두 번째 Feedforward

        # Output Layer
        x = self.output_layer(x)  # 최종 출력 계산
        return torch.sigmoid(x)  # 이진 분류를 위한 Sigmoid

In [None]:
# 모델 초기화
model = CustomTransformerModel(input_dim=X_train.shape[1], num_heads=4, num_layers=2)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 학습 및 평가
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Forward Pass
    outputs = model(X_train).squeeze()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # 평가
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test).squeeze()
        predictions = (test_outputs >= 0.5).float()
        accuracy = (predictions == y_test).sum().item() / len(y_test)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/100, Loss: 0.7896, Accuracy: 0.8364
Epoch 2/100, Loss: 0.5614, Accuracy: 0.8364
Epoch 3/100, Loss: 0.5087, Accuracy: 0.8364
Epoch 4/100, Loss: 0.4834, Accuracy: 0.8364
Epoch 5/100, Loss: 0.4692, Accuracy: 0.8364
Epoch 6/100, Loss: 0.4585, Accuracy: 0.8364
Epoch 7/100, Loss: 0.4514, Accuracy: 0.8364
Epoch 8/100, Loss: 0.4482, Accuracy: 0.8364
Epoch 9/100, Loss: 0.4470, Accuracy: 0.8364
Epoch 10/100, Loss: 0.4460, Accuracy: 0.8364
Epoch 11/100, Loss: 0.4449, Accuracy: 0.8364
Epoch 12/100, Loss: 0.4456, Accuracy: 0.8364
Epoch 13/100, Loss: 0.4449, Accuracy: 0.8364
Epoch 14/100, Loss: 0.4461, Accuracy: 0.8364
Epoch 15/100, Loss: 0.4472, Accuracy: 0.8364
Epoch 16/100, Loss: 0.4475, Accuracy: 0.8364
Epoch 17/100, Loss: 0.4468, Accuracy: 0.8364
Epoch 18/100, Loss: 0.4456, Accuracy: 0.8364
Epoch 19/100, Loss: 0.4458, Accuracy: 0.8364
Epoch 20/100, Loss: 0.4443, Accuracy: 0.8364
Epoch 21/100, Loss: 0.4467, Accuracy: 0.8364
Epoch 22/100, Loss: 0.4458, Accuracy: 0.8364
Epoch 23/100, Loss: