In [1]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.utils.prune as prune
import torch.quantization as quantization

from tqdm import tqdm
from datetime import timedelta
from sklearn.pipeline import Pipeline
from skorch import NeuralNetRegressor
from sklearn.model_selection import GridSearchCV
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import LongformerModel, LongformerTokenizer

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
# Load the raw tables from Excel.
stock_df = pd.read_excel('../../data/tb_stock.xlsx')
main_economic_df = pd.read_excel('../../data/tb_main_economic_index.xlsx')
korea_economic_df = pd.read_excel('../../data/tb_korea_economic_indicator.xlsx')

# Work on a small sample: only the first 100 rows of each table.
# NOTE(review): the tables are truncated *before* the inner joins below, so the
# merged frame only contains dates present in all three 100-row samples —
# confirm this is intended.
stock_df = stock_df[:100]
main_economic_df = main_economic_df[:100]
korea_economic_df = korea_economic_df[:100]

# Keep only the columns that are used and give every table the same 'date' key.
# The rename is chained (not in-place) so it never mutates a frame that was
# just produced by column selection, which can trigger SettingWithCopyWarning.
stock_df = stock_df[['sc_date', 'sc_ss_stock']].rename(columns={'sc_date': 'date'})
main_economic_df = (
    main_economic_df[['mei_date', 'mei_gold', 'mei_sp500', 'mei_kospi']]
    .rename(columns={'mei_date': 'date'})
)
korea_economic_df = (
    korea_economic_df[['kei_date', 'kei_m2_avg', 'kei_fr']]
    .rename(columns={'kei_date': 'date'})
)

# Inner-join the three tables on the shared 'date' column.
merged_df = pd.merge(stock_df, main_economic_df, on='date', how='inner')
merged_df = pd.merge(merged_df, korea_economic_df, on='date', how='inner')

In [3]:
# Build the model input text and the regression target.
def _describe_row(row):
    """Render one merged row as the English sentence fed to the tokenizer."""
    return f"On {row['date']}, gold price was {row['mei_gold']}, S&P 500 index was {row['mei_sp500']}, KOSPI index was {row['mei_kospi']}, M2 average was {row['kei_m2_avg']}, and FR was {row['kei_fr']}."


merged_df['text'] = merged_df.apply(_describe_row, axis=1)

# The value to predict is the 'sc_ss_stock' column.
merged_df['target'] = merged_df['sc_ss_stock']

# Convert 'date' to datetime unless it already has a datetime dtype.
# This happens after the text was built, so the sentences keep the original
# date representation.
date_is_datetime = pd.api.types.is_datetime64_any_dtype(merged_df['date'])
if not date_is_datetime:
    merged_df['date'] = pd.to_datetime(merged_df['date'])

print(merged_df[['date', 'text', 'target']].head())

        date                                               text  target
0 2014-09-17  On 2014-09-17, gold price was 1234.40002441406...   24520
1 2014-09-18  On 2014-09-18, gold price was 1225.69995117187...   24200
2 2014-09-19  On 2014-09-19, gold price was 1215.30004882812...   24200
3 2014-09-20  On 2014-09-20, gold price was 1215.30004882812...   24200
4 2014-09-21  On 2014-09-21, gold price was 1215.30004882812...   24200


In [4]:
# 1. Dataset definition
class StockDataset(Dataset):
    """Dataset that tokenizes the 'text' column and pairs it with 'target'.

    Args:
        data: table indexed positionally via ``.iloc``; must expose 'text'
            and 'target' entries per row.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors="pt")`` and
            returning a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a dict with flattened 'input_ids', 'attention_mask' and a float 'target'."""
        row = self.data.iloc[idx]
        text, target = row['text'], row['target']

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Flatten each returned tensor to 1-D so a DataLoader can stack samples.
        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['target'] = torch.tensor(target, dtype=torch.float)
        return sample

In [5]:
# 2. Model definition: Longformer encoder with a prunable linear regression head
class PrunedStockPricePredictor(nn.Module):
    """Longformer encoder followed by a single-output linear head.

    Args:
        longformer_model_name: name or path passed to
            ``LongformerModel.from_pretrained``.
    """

    def __init__(self, longformer_model_name):
        super().__init__()
        self.longformer = LongformerModel.from_pretrained(longformer_model_name)
        self.fc = nn.Linear(self.longformer.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's prediction, shape ``(batch, 1)``.

        The head is applied to position 0 along the sequence axis of the first
        element of the encoder output.
        """
        outputs = self.longformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs[0][:, 0, :]
        return self.fc(cls_output)

    def apply_pruning(self, pruning_amount=0.4, permanent=True):
        """Apply L1 unstructured pruning to the weight of the linear head.

        Args:
            pruning_amount: amount forwarded to ``prune.l1_unstructured``
                (a float is the fraction of weights to zero out).
            permanent: when True (the default, matching the previous
                behaviour) the pruning re-parametrization is removed right
                away: the selected weights are zeroed but the mask is
                discarded, so later optimizer steps can make them non-zero
                again. Pass False to keep the mask attached while training,
                then call ``finalize_pruning()`` before saving.
        """
        prune.l1_unstructured(self.fc, name="weight", amount=pruning_amount)
        if permanent:
            prune.remove(self.fc, 'weight')

    def finalize_pruning(self):
        """Make an attached pruning mask permanent; no-op if none is attached."""
        if prune.is_pruned(self.fc):
            prune.remove(self.fc, 'weight')


In [6]:
# 3. Data preparation: tokenizer, train/test split, datasets and loaders
max_length = 512
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

# Random 80/20 split with a fixed seed.
train_df, test_df = train_test_split(
    merged_df,
    random_state=42,
    test_size=0.2,
)

# Wrap each split in a StockDataset sharing the same tokenizer settings.
train_dataset, test_dataset = (
    StockDataset(split_df, tokenizer, max_length) for split_df in (train_df, test_df)
)

# Only the training loader shuffles; both use batches of 4.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=4)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=4)



In [7]:
# 4. Baseline training with pruning applied to the regression head
model = PrunedStockPricePredictor('allenai/longformer-base-4096')
# NOTE(review): apply_pruning() removes the pruning mask right after applying
# it, so the optimizer steps below are free to make the zeroed weights
# non-zero again — confirm this is the intended workflow.
model.apply_pruning(pruning_amount=0.4)
model.train()

optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.MSELoss()

for epoch in tqdm(range(5)):
    epoch_loss_sum = 0.0
    epoch_samples = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        target = batch['target']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # squeeze(-1) drops only the trailing size-1 output dimension. A bare
        # squeeze() would also collapse a batch of one sample to a 0-d tensor
        # that no longer matches the (1,) target.
        loss = criterion(outputs.squeeze(-1), target)
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so a short final batch does not
        # skew the epoch mean.
        batch_size = target.size(0)
        epoch_loss_sum += loss.item() * batch_size
        epoch_samples += batch_size

    # Report the mean over the whole epoch rather than the last batch's loss;
    # this is also well-defined when the loader yields no batches.
    epoch_loss = epoch_loss_sum / epoch_samples if epoch_samples else float('nan')
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

  return torch.load(checkpoint_file, map_location=map_location)
  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Persist the model weights once the baseline training run has finished.
initial_state_dict = model.state_dict()
torch.save(initial_state_dict, 'initial_qat_pruned_longformer.pt')

In [None]:
# 5. Hyperparameter-tuning configuration
# Candidate pretrained checkpoints and learning rates to try.
model_names = [
    'allenai/longformer-base-4096',
    'allenai/longformer-large-4096',
]
learning_rates = [1e-5, 3e-5, 5e-5]

# Trackers for the best result; the score starts at +inf so any finite
# validation loss compares as lower.
best_score = float('inf')
best_params = dict()
best_model = None

# Collect every batch of a loader into flat NumPy arrays.
def prepare_data(loader):
    """Concatenate all batches yielded by ``loader`` along axis 0.

    Args:
        loader: iterable of dict batches holding 'input_ids',
            'attention_mask' and 'target' tensors. It is iterated exactly
            once.

    Returns:
        Tuple ``(input_ids, attention_mask, targets)`` of NumPy arrays, each
        the axis-0 concatenation of the corresponding per-batch arrays.
    """
    fields = ('input_ids', 'attention_mask', 'target')
    collected = {field: [] for field in fields}

    for batch in loader:
        for field in fields:
            collected[field].append(batch[field].numpy())

    return tuple(np.concatenate(collected[field], axis=0) for field in fields)

In [None]:
# Materialise the training loader as arrays and place token ids and attention
# mask side by side, so one row carries both for a sample.
train_input_ids, train_attention_mask, train_targets = prepare_data(train_loader)
train_data = np.hstack([train_input_ids, train_attention_mask])

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    train_targets,
    random_state=42,
    test_size=0.2,
)

# Directory and file name for the best checkpoint.
save_directory = "./saved_models"
os.makedirs(save_directory, exist_ok=True)
best_model_path = os.path.join(save_directory, "best_model.pt")

# Reset the best-score tracker before the search starts.
best_score = float('inf')

In [None]:
# NOTE(review): leftover debugging cell — it only prints the literal 1; looks safe to delete.
print(1)

In [None]:
# 5. Hyperparameter search
# Every (learning rate, checkpoint) pair is trained for 3 epochs on X_train and
# scored by its mean MSE on X_val; the state_dict with the lowest validation
# loss is written to best_model_path.
search_batch_size = 4

# Each row of X_train / X_val holds the token ids followed by the attention
# mask (they were stacked side by side), so half the row width is the
# sequence length. Deriving it here avoids a hard-coded 512.
seq_len = X_train.shape[1] // 2

n_val = len(X_val)
if n_val == 0:
    raise ValueError("X_val is empty; cannot compute a validation loss")

for lr in tqdm(learning_rates, desc='최고의 학습률'):
    for model_name in tqdm(model_names, desc='최고의 모델'):
        print(f"Training with lr={lr}, model_name={model_name}")

        # Fresh model for every configuration.
        model = PrunedStockPricePredictor(model_name)
        model.apply_pruning(pruning_amount=0.4)
        device = model.longformer.device

        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.MSELoss()

        for epoch in range(3):
            model.train()
            for start in range(0, len(X_train), search_batch_size):
                stop = start + search_batch_size
                input_ids = torch.tensor(X_train[start:stop, :seq_len]).to(device)
                attention_mask = torch.tensor(X_train[start:stop, seq_len:]).to(device)
                targets = torch.tensor(y_train[start:stop]).to(device)

                optimizer.zero_grad()
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # squeeze(-1) keeps the batch dimension even when the final
                # slice holds a single sample.
                loss = criterion(outputs.squeeze(-1), targets)
                loss.backward()
                optimizer.step()

        # Validation
        model.eval()
        val_loss_sum = 0.0
        with torch.no_grad():
            for start in range(0, n_val, search_batch_size):
                stop = start + search_batch_size
                input_ids = torch.tensor(X_val[start:stop, :seq_len]).to(device)
                attention_mask = torch.tensor(X_val[start:stop, seq_len:]).to(device)
                targets = torch.tensor(y_val[start:stop]).to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.squeeze(-1), targets)
                # Weight by the number of samples in the slice so a short last
                # slice counts proportionally.
                val_loss_sum += loss.item() * len(targets)

        # Per-sample mean. Dividing by len(X_val) / 4 was only correct when
        # the validation size was an exact multiple of the batch size.
        val_loss = val_loss_sum / n_val

        print(f"Validation Loss: {val_loss}")

        # Keep the best checkpoint and remember which configuration produced it.
        if val_loss < best_score:
            best_score = val_loss
            best_params = {'lr': lr, 'model_name': model_name}
            torch.save(model.state_dict(), best_model_path)

print(f"Best Validation Loss: {best_score}")
print(f"Best hyperparameters: {best_params}")
print(f"Best model saved to {best_model_path}")

In [None]:
# NOTE(review): leftover debugging cell — it only prints the literal 1; looks safe to delete.
print(1)