## 캐글 자전거 수요 예측 : https://www.kaggle.com/competitions/bike-sharing-demand/overview
### 목표 : 전처리 방법 변경 및 모델을 TensorFlow 딥러닝 모델로 변경하여 제출 후 스코어 0.8 이하 도달하기

### baseline

In [None]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Input columns fed to the model: the first group is read straight from the
# CSV files, the second group is derived from 'datetime' during preprocessing.
features = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
] + [
    'year', 'month', 'day', 'hour', 'dayofweek',
]

# Preprocessing and standardisation helper
def preprocess_and_scale(train_data, test_data, features):
    """Add datetime-derived columns to both frames and standardise ``features``.

    Both data frames are modified in place: their ``'datetime'`` column is
    converted with ``pd.to_datetime`` and the columns ``'year'``, ``'month'``,
    ``'day'``, ``'hour'`` and ``'dayofweek'`` are added.

    Args:
        train_data: Training frame containing a ``'datetime'`` column.
        test_data: Test frame containing a ``'datetime'`` column.
        features: Column names to select and scale.

    Returns:
        A tuple ``(X_train, X_test, scaler)`` holding the scaled training
        features, the scaled test features and the fitted ``StandardScaler``.
    """
    datetime_parts = ('year', 'month', 'day', 'hour', 'dayofweek')

    for frame in (train_data, test_data):
        frame['datetime'] = pd.to_datetime(frame['datetime'])
        stamps = frame['datetime'].dt
        for part in datetime_parts:
            frame[part] = getattr(stamps, part)

    # The scaler statistics come from the training frame only; the test frame
    # is transformed with those same statistics.
    scaler = StandardScaler()
    scaler.fit(train_data[features])
    X_train = scaler.transform(train_data[features])
    X_test = scaler.transform(test_data[features])

    return X_train, X_test, scaler

# RMSLE loss (predictions are clamped at 0 so log1p never produces NaN)
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error between predictions and targets.

    Computes ``sqrt(mean((log1p(p + eps) - log1p(y_true + eps)) ** 2))`` where
    ``p`` is ``y_pred`` clamped to be non-negative.

    Args:
        epsilon: Constant added to both predictions and targets before
            ``log1p`` is applied.
    """

    def __init__(self, epsilon=1e-6):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, y_pred, y_true):
        """Return the scalar RMSLE of ``y_pred`` against ``y_true``."""
        # log1p(x) is NaN for x < -1 and -inf at x == -1, and the small
        # epsilon alone cannot prevent that.  Clamping negative predictions
        # to 0 keeps the loss finite; clamped entries receive no gradient.
        y_pred = torch.clamp(y_pred, min=0)
        log_pred = torch.log1p(y_pred + self.epsilon)
        log_true = torch.log1p(y_true + self.epsilon)
        return torch.sqrt(torch.mean((log_pred - log_true) ** 2))

# Load the competition files; the regression target is the 'count' column
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
y = train['count'].values

# Add the datetime features and standardise the inputs
X_train, X_test, scaler = preprocess_and_scale(train, test, features)

# Hold out 20% of the training rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y,
    test_size=0.2,
    random_state=42,
)

# Convert the arrays to float32 tensors; targets become column vectors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)

# Wrap the tensors in datasets and loaders; only training batches are shuffled
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Model definition
class BikeDemandModel(nn.Module):
    """Small fully connected regressor with a non-negative scalar output.

    Layers: ``input_dim -> 32 -> 4 -> 1`` with ReLU after every layer,
    including the output layer.

    Args:
        input_dim: Number of input features.  When omitted, the length of the
            module-level ``features`` list is used, which is the original
            behaviour.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Fall back to the global feature list so that the existing
            # ``BikeDemandModel()`` call keeps working unchanged.
            input_dim = len(features)
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 4)
        self.fc3 = nn.Linear(4, 1)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` predictions."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        # ReLU on the output keeps the prediction from going below zero.
        return torch.relu(x)

# Instantiate the network, the loss and the optimizer
model = BikeDemandModel()
# epsilon=0 adds nothing inside log1p; alternative setting: RMSLELoss(epsilon=1e-6)
criterion = RMSLELoss(epsilon=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training function
def train_model(model, train_loader, criterion, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Puts ``model`` in training mode and, for every batch, performs a forward
    pass, back-propagates the loss and takes one optimizer step.

    Returns:
        The sum of the per-batch loss values divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []
    for inputs, targets in train_loader:
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(train_loader)

# Validation function
def evaluate_model(model, val_loader, criterion):
    """Score ``model`` on all of ``val_loader`` with a single criterion call.

    Predictions are rounded to the nearest integer and clamped at 0 before
    scoring.  The criterion is applied once to the concatenated predictions
    and targets instead of being averaged over batches: for a root-mean
    metric such as RMSLE the mean of per-batch values is not the dataset-level
    value, and a short final batch would otherwise be over-weighted.

    Args:
        model: Module called on each input batch.
        val_loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar tensor.

    Returns:
        The criterion value over the whole validation set as a float.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    model.eval()
    predictions = []
    targets = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            # Post-process predictions: round to integers, negatives become 0
            predictions.append(torch.round(outputs).clamp(min=0))
            targets.append(y_batch)
        if not predictions:
            raise ValueError("val_loader yielded no batches")
        loss = criterion(torch.cat(predictions), torch.cat(targets))
    return loss.item()

# Where the best checkpoint is written
best_model_path = "best_model.pth"

# Train for a fixed number of epochs, keeping the weights with the lowest
# validation loss seen so far
n_epochs = 1000
best_val_loss = float('inf')  # any finite validation loss beats this
for epoch in range(n_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer)
    val_loss = evaluate_model(model, val_loader, criterion)
    print(f'Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    if not (val_loss < best_val_loss):
        continue
    # New best: remember the loss and save the weights
    best_val_loss = val_loss
    torch.save(model.state_dict(), best_model_path)

# Restore the best checkpoint
model.load_state_dict(torch.load(best_model_path))
print("Best model loaded for evaluation.")

# Predict on the validation and test inputs without tracking gradients
model.eval()
with torch.no_grad():
    val_pred = model(X_val_tensor).numpy()
    # Test predictions are rounded to integers and clamped at 0
    test_pred = model(X_test_tensor).round().clamp(min=0).numpy()

# Report RMSE / MAE of the raw (unrounded) validation predictions
rmse = np.sqrt(mean_squared_error(y_val, val_pred))
mae = mean_absolute_error(y_val, val_pred)
print('검증 데이터 RMSE:', rmse)
print('검증 데이터 MAE:', mae)

# Build the submission file: one row per test datetime with its prediction
submission = pd.DataFrame({
    'datetime': test['datetime'],
    'count': test_pred.flatten(),
})
submission.to_csv('submission.csv', index=False)

print('제출 파일이 생성되었습니다.')


Epoch 1/1000, Train Loss: 4.0886, Val Loss: 3.4837
Epoch 2/1000, Train Loss: 2.9277, Val Loss: 2.4314
Epoch 3/1000, Train Loss: 2.1199, Val Loss: 1.8543
Epoch 4/1000, Train Loss: 1.6721, Val Loss: 1.5286
Epoch 5/1000, Train Loss: 1.4084, Val Loss: 1.3366
Epoch 6/1000, Train Loss: 1.2542, Val Loss: 1.2212
Epoch 7/1000, Train Loss: 1.1596, Val Loss: 1.1489
Epoch 8/1000, Train Loss: 1.1022, Val Loss: 1.1006
Epoch 9/1000, Train Loss: 1.0665, Val Loss: 1.0678
Epoch 10/1000, Train Loss: 1.0302, Val Loss: 1.0445
Epoch 11/1000, Train Loss: 1.0097, Val Loss: 1.0274
Epoch 12/1000, Train Loss: 0.9977, Val Loss: 1.0143
Epoch 13/1000, Train Loss: 0.9855, Val Loss: 1.0050
Epoch 14/1000, Train Loss: 0.9759, Val Loss: 0.9971
Epoch 15/1000, Train Loss: 0.9685, Val Loss: 0.9912
Epoch 16/1000, Train Loss: 0.9654, Val Loss: 0.9860
Epoch 17/1000, Train Loss: 0.9575, Val Loss: 0.9815
Epoch 18/1000, Train Loss: 0.9537, Val Loss: 0.9773
Epoch 19/1000, Train Loss: 0.9536, Val Loss: 0.9750
Epoch 20/1000, Train 