<a href="https://colab.research.google.com/github/restful3/ds4th_study/blob/main/source/%ED%85%90%EC%B4%88%EC%9D%98%20%ED%8C%8C%EC%9D%B4%ED%86%A0%EC%B9%98%20%EB%94%A5%EB%9F%AC%EB%8B%9D%20%ED%8A%B9%EA%B0%95/LSTM%ED%85%8D%EC%8A%A4%ED%8A%B8%EC%83%9D%EC%84%B1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import os
import string
df= pd.read_csv('/content/drive/MyDrive/SelfStudy/딥러닝기초/data/CH10/ArticlesApril2017.csv')
print(df.columns)
print(df.shape)

Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')
(886, 16)


헤드라인만 가져와서 텍스트분석을 진행한다.

![](https://drive.google.com/uc?id=15kGb1FM8HLEiLeypYdOGnu4sF35A2GsU)

- BOW (Bag of Words)
> 모든 단어를 겹치지 않도록 고유번호로 인식한다.
- [corpus ](https://ko.wikipedia.org/wiki/%EB%A7%90%EB%AD%89%EC%B9%98)
> 말뭉치라고도 하며, 자연어처리를 위해 구성되는 기본집단

![](https://drive.google.com/uc?id=1uqEw3liB1LOmktYxf-T7C613xskD196j)


In [2]:
help(string)

Help on module string:

NAME
    string - A collection of string constants.

MODULE REFERENCE
    https://docs.python.org/3.10/library/string.html
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    Public module variables:
    
    whitespace -- a string containing all ASCII whitespace
    ascii_lowercase -- a string containing all ASCII lowercase letters
    ascii_uppercase -- a string containing all ASCII uppercase letters
    ascii_letters -- a string containing all ASCII letters
    digits -- a string containing all ASCII decimal digits
    hexdigits -- a string containing all ASCII hexadecimal digits
    octdigits -- a string containing all ASCII octal digits
    punctuation -- a string containi

In [3]:
import numpy as np
import glob
from torch.utils.data.dataset import Dataset

class TextGeneration(Dataset):
    def clean_text(self, txt):
        # 모든 단어를 소문자로 바꾸고 특수문자를 제거
        txt = "".join(v for v in txt if v not in string.punctuation).lower()
        return txt

    def __init__(self):
        all_headlines = []

        # 모든 헤드라인의 텍스트를 불러옴
        for filename in glob.glob("/content/drive/MyDrive/SelfStudy/딥러닝기초/data/CH10/*.csv"):
            if 'Articles' in filename:
                article_df = pd.read_csv(filename)

                # 데이터셋의 headline의 값을 all_headlines에 추가
                all_headlines.extend(list(article_df.headline.values))
                break

        # headline 중 unknown 값은 제거
        all_headlines = [h for h in all_headlines if h != "Unknown"]

        # 구두점 제거 및 전처리가 된 문장들을 리스트로 반환
        self.corpus = [self.clean_text(x) for x in all_headlines]
        self.BOW = {}

        # 모든 문장의 단어를 추출해 고유번호 지정
        for line in self.corpus: # 전처리된 문장 하나씩 불러와서
            for word in line.split(): # 쪼갠 word를
                if word not in self.BOW.keys(): # 아직 키로 저장 안된것만 골라서
                    self.BOW[word] = len(self.BOW.keys()) # 새로운 키로 저장함

        # 모델의 입력으로 사용할 데이터
        self.data = self.generate_sequence(self.corpus)


    def generate_sequence(self, txt):
        seq = []
        for line in txt:
            line = line.split() # line 분해리스트
            line_bow = [self.BOW[word] for word in line] #BOW[word]의 고유번호 리스트
            # 단어 2개를 입력으로, 그다음 단어를 정답으로
            data = [([line_bow[i], line_bow[i+1]], line_bow[i+2])
            for i in range(len(line_bow)-2)]
            seq.extend(data)
        return seq

    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        data = np.array(self.data[i][0])  # ❶ 입력 데이터
        label = np.array(self.data[i][1]).astype(np.float32)  # ❷ 출력 데이터

        return data, label

LSTM 모델 정의<br>
![](https://drive.google.com/uc?id=1CHLvEcJBBaxhvw08oRyHaPuZZJ50IhUA)


In [4]:
import torch.nn as nn
class LSTM(nn.Module):
  def __init__(self, num_embeddings):
    super(LSTM, self).__init__()

    # 밀집표현을 위한 임베딩 층
    self.embed = nn.Embedding(
        num_embeddings = num_embeddings, embedding_dim = 16
    )

    # LSTM을 5개 층으로 쌓음
    self.lstm = nn.LSTM(
        input_size = 16,
        hidden_size = 64,
        num_layers = 5,
        batch_first = True
    )

    # 분류를 위한 MLP층
    self.fc1 = nn.Linear(128, num_embeddings)
    self.fc2 = nn.Linear(num_embeddings,num_embeddings)
    # 활성화함수
    self.relu = nn.ReLU()

  def forward(self, x):
      x = self.embed(x)

      # LSTM 모델의 예측값
      x, _ = self.lstm(x)
      x = torch.reshape(x, (x.shape[0], -1))
      x = self.fc1(x)
      x = self.relu(x)
      x = self.fc2(x)

      return x

In [5]:
import tqdm
from torch.utils.data.dataloader import DataLoader
from torch.optim.adam import Adam
import torch

# 학습을 진행할 프로세서 정의
device = 'cuda' if torch.cuda.is_available() else 'cpu'

dataset = TextGeneration() # 데이터셋 정의
model = LSTM(num_embeddings=len(dataset.BOW)).to(device) # 모델정의
loader = DataLoader(dataset, batch_size=64)
optim = Adam(model.parameters(), lr=0.001)

for epoch in range(200):
  iterator = tqdm.tqdm(loader)
  for data, label in iterator:
    # 기울기 초기화
    optim.zero_grad()
    # 모델의 예측값
    pred = model(torch.tensor(data, dtype=torch.long).to(device))
    # 정답레이블은 long텐서로반환필요ㅕ
    loss = nn.CrossEntropyLoss()(
        pred, torch.tensor(label,dtype=torch.long).to(device))

    # 오차역전파
    loss.backward()
    optim.step()
    iterator.set_description(f'epoch{epoch} loss: {loss.item()}')
torch.save(model.state_dict(), 'lstm.pth')

  pred =model(torch.tensor(data, dtype=torch.long).to(device))
  pred, torch.tensor(label,dtype=torch.long).to(device))
epoch0 loss: 7.4215617179870605: 100%|██████████| 104/104 [00:02<00:00, 49.21it/s]
epoch1 loss: 6.937582492828369: 100%|██████████| 104/104 [00:00<00:00, 119.61it/s]
epoch2 loss: 6.2258195877075195: 100%|██████████| 104/104 [00:00<00:00, 122.64it/s]
epoch3 loss: 5.8591108322143555: 100%|██████████| 104/104 [00:00<00:00, 122.64it/s]
epoch4 loss: 5.5691680908203125: 100%|██████████| 104/104 [00:00<00:00, 117.60it/s]
epoch5 loss: 5.846491813659668: 100%|██████████| 104/104 [00:00<00:00, 122.71it/s]
epoch6 loss: 5.572712421417236: 100%|██████████| 104/104 [00:00<00:00, 117.64it/s]
epoch7 loss: 5.594508647918701: 100%|██████████| 104/104 [00:00<00:00, 113.33it/s]
epoch8 loss: 5.73337984085083: 100%|██████████| 104/104 [00:00<00:00, 113.97it/s]
epoch9 loss: 5.402045726776123: 100%|██████████| 104/104 [00:01<00:00, 86.38it/s]
epoch10 loss: 5.920941352844238: 100%|██████████|

In [19]:
def generate(model, BOW, string="finding an ", strlen=10):
   device = "cuda" if torch.cuda.is_available() else "cpu"

   print(f"input word: {string}")

   with torch.no_grad():
       for p in range(strlen):
           # 입력 문장을 텐서로 변경
           words = torch.tensor(
               [BOW[w] for w in string.split()], dtype=torch.long).to(device)

           #
           input_tensor = torch.unsqueeze(words[-2:], dim=0)
           output = model(input_tensor)  # 모델을 이용해 예측
           output_word = (torch.argmax(output).cpu().numpy())
           string += list(BOW.keys())[output_word]  # 문장에 예측된 단어를 추가
           string += " "

   print(f"predicted sentence: {string}")

model.load_state_dict(torch.load("lstm.pth", map_location=device))
pred = generate(model, dataset.BOW)

input word: finding an 
predicted sentence: finding an york france at award on gets webs be attacker ready 


In [20]:
generate(model, dataset.BOW, string = "a church ")

input word: a church 
predicted sentence: a church immigrants sprint a new spider family tree tries to untangle 


In [24]:
generate(model, dataset.BOW, string = "girl school ")

input word: girl school 
predicted sentence: girl school says he was too rude’ more can trump’s lawman ledge 
