# 데이터 살펴보기

In [1]:
import pandas as pd
import numpy as np
import os

data_path = '/home/restful3/workspaces/study/ds4th_study/source/텐초의 파이토치 딥러닝 특강/datasets/CH10/'

df = pd.read_csv(data_path + 'ArticlesApril2017.csv')
df.columns

Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')

# 학습용 데이터 만들기

In [2]:
import numpy as np
import glob
import string

from torch.utils.data.dataset import Dataset

class TextGeneration(Dataset):
    def clean_text(self, txt):
        txt = "".join(v for v in txt if v not in string.punctuation).lower()
        return txt
    
    def __init__(self):
        all_headlines = []
        
        for filename in glob.glob(data_path+'*.csv'):
            if 'Articles' in filename:
                article_df = pd.read_csv(filename)
                all_headlines.extend(list(article_df.headline.values))
#                 break
                
        all_headlines = [h for h in all_headlines if h!= 'Unknown']
        
        self.corpus = [self.clean_text(x) for x in all_headlines]
        
        self.BOW = {}
        
        for line in self.corpus:
            for word in line.split():
                if word not in self.BOW.keys():
                    self.BOW[word] = len(self.BOW.keys())
                    
        self.data = self.generate_sequence(self.corpus)
        
    def generate_sequence(self, txt):
        seq = []
        
        for line in txt:
            line = line.split()
            line_bow = [self.BOW[word] for word in line]
            
            data = [([line_bow[i], line_bow[i+1]], line_bow[i+2]) for i in range(len(line_bow)-2)]
            
            seq.extend(data)
            
        return seq
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,i):
        data = np.array(self.data[i][0])
        label = np.array(self.data[i][1]).astype(np.float32)
        
        return data, label

In [3]:
txt = TextGeneration()

In [4]:
len(txt.BOW)

12148

# LSTM 모델 정의하기
- num_embeddings: 이 매개변수는 임베딩 층에 총 몇 개의 임베딩 벡터가 있는지를 지정합니다. 일반적으로 이 값은 단어나 토큰의 총 개수와 동일합니다. 예를 들어, 텍스트 데이터에서 고유한 단어의 수가 10,000개라면 num_embeddings은 10,000이 될 것입니다.

- embedding_dim: 이 매개변수는 임베딩 벡터의 차원을 지정합니다. 임베딩 벡터는 각각의 단어나 토큰을 나타내는 고정된 길이의 실수 벡터입니다. embedding_dim을 설정하면 임베딩 벡터의 차원이 결정됩니다. 일반적으로 이 값은 사전에 정의된 임베딩 차원의 크기로 설정됩니다. 예를 들어, embedding_dim을 300으로 설정하면 각각의 임베딩 벡터는 300차원의 실수 벡터가 됩니다.

In [5]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, num_embeddings):
        super(LSTM, self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=16)
        
        self.lstm = nn.LSTM(
            input_size=16,
            hidden_size=64,
            num_layers=5,
            batch_first=True
        )
        
        self.fc1 = nn.Linear(128, num_embeddings)
        self.fc2 = nn.Linear(num_embeddings, num_embeddings)
        
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.embed(x)
        
        x, _ = self.lstm(x)
        x = torch.reshape(x, (x.shape[0], -1))
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        
        return x

# 학습하기

In [6]:
import tqdm

import torch
from torch.utils.data.dataloader import DataLoader
from torch.optim.adam import Adam

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [7]:
dataset = TextGeneration()
model = LSTM(num_embeddings=len(dataset.BOW)).to(device)
loader = DataLoader(dataset, batch_size=64)
optim = Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [10]:
for epoch in range(200):
    iterator = tqdm.tqdm(loader)
    for data, label in iterator:
        data, label = data.to(device), label.to(device)
        
        optim.zero_grad()
        pred = model(torch.tensor(data, dtype=torch.long).to(device))
        loss = loss_fn(pred, torch.tensor(label, dtype=torch.long).to(device))
        
        loss.backward()
        optim.step()
        
        iterator.set_description(f'epoch {epoch}, loss : {loss.item()}')
        
torch.save(model.state_dict(), './models/LSTM.pth')        

  pred = model(torch.tensor(data, dtype=torch.long).to(device))
  loss = loss_fn(pred, torch.tensor(label, dtype=torch.long).to(device))
epoch 0, loss : 7.570168495178223: 100%|██████| 677/677 [00:15<00:00, 43.26it/s]
epoch 1, loss : 7.290179252624512: 100%|██████| 677/677 [00:15<00:00, 43.70it/s]
epoch 2, loss : 7.001972198486328: 100%|██████| 677/677 [00:15<00:00, 43.70it/s]
epoch 3, loss : 6.691782474517822: 100%|██████| 677/677 [00:15<00:00, 43.63it/s]
epoch 4, loss : 6.396479606628418: 100%|██████| 677/677 [00:15<00:00, 43.84it/s]
epoch 5, loss : 6.211499214172363: 100%|██████| 677/677 [00:15<00:00, 43.99it/s]
epoch 6, loss : 5.9702534675598145: 100%|█████| 677/677 [00:15<00:00, 43.83it/s]
epoch 7, loss : 5.700667858123779: 100%|██████| 677/677 [00:15<00:00, 43.69it/s]
epoch 8, loss : 5.5047807693481445: 100%|█████| 677/677 [00:15<00:00, 43.76it/s]
epoch 9, loss : 5.298403263092041: 100%|██████| 677/677 [00:15<00:00, 43.82it/s]
epoch 10, loss : 5.163243293762207: 100%|█████| 677/6

epoch 193, loss : 0.4252092242240906: 100%|███| 677/677 [00:15<00:00, 44.06it/s]
epoch 194, loss : 0.4041644036769867: 100%|███| 677/677 [00:15<00:00, 44.13it/s]
epoch 195, loss : 0.8233386278152466: 100%|███| 677/677 [00:15<00:00, 43.91it/s]
epoch 196, loss : 0.668724000453949: 100%|████| 677/677 [00:15<00:00, 43.92it/s]
epoch 197, loss : 0.4234485626220703: 100%|███| 677/677 [00:15<00:00, 44.04it/s]
epoch 198, loss : 0.5207594037055969: 100%|███| 677/677 [00:15<00:00, 44.09it/s]
epoch 199, loss : 0.32682323455810547: 100%|██| 677/677 [00:15<00:00, 44.04it/s]


# 모델 성능 평가하기

In [20]:
def generate(model, BOW, string='finding an ', strlen=10, device=device):    
    
    print(f'Input word : {string}')
    
    with torch.no_grad():
        for p in range(strlen):
            words = torch.tensor([BOW[w] for w in string.split()], dtype=torch.long).to(device)
            
            input_tensor = torch.unsqueeze(words[-2: ], dim=0)
            output = model(input_tensor)
            output_word = (torch.argmax(output).cpu().numpy())
            string += list(BOW.keys())[output_word]
            string += " "
            
    print(f'predicted sentence: {string}')

In [22]:

model.load_state_dict(torch.load("./models/LSTM.pth", map_location=device))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
pred = generate(model, dataset.BOW, string='finding an ', strlen=10, device=device)

Input word : finding an 
predicted sentence: finding an workers sheep subject with laughing if on whisperer the — 
