In [1]:
import os
import pandas as pd
import torch
import pickle
import re
import numpy as np
## 장르 분류 - 현종 민하
# 현종 - 드라마 코미디 애니 뮤직컬 다큐 공연 액션 멜로 가족 판타지 ( 나머지는 기타로 분류 >> 뒤 이어서 진행)
# 민하 - 스릴러 미스터리 공포 범죄 어드벤처 SF 서부극 사극 전쟁 성인물 기타

#### 데이터 토큰화 및 단어사전 구축

In [2]:
from konlpy.tag import *
from collections import Counter
from torchinfo import summary
import torch.nn as nn

In [3]:
class SentenceClassifier(nn.Module):
    def __init__(self,
                 n_vocab,
                 hidden_dim,
                 embedding_dim,
                 n_layers,
                 n_classes, 
                 dropout = 0.5,
                 bidirectional = True,
                 model_type = 'lstm'):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim = embedding_dim,
            padding_idx= 0)
        
        if model_type =='rnn':
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size = hidden_dim,
                num_layers= n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True)
    
        elif model_type == "lstm":
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size = hidden_dim,
                num_layers= n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True )
            
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, n_classes)
            
        else:
            self.classifier = nn.Linear(hidden_dim, n_classes)
            
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        embeddings =self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self. dropout(last_output)
        logits = self. classifier(last_output)
        return logits

In [40]:
### models 폴더 아래 프로젝트 폴더 아래 모델 파일 저장
import os

# 모델 저장 경로
SAVE_PATH = './model/'

# 모델 파일명
SAVE_MODEL = SAVE_PATH + 'best_model97.pth'

# 단어사전
VOCAB_FILE = './models/vocab.txt'


In [5]:
def load_model(MODEL_FILE):
   if os.path.exists(MODEL_FILE):
      model = torch.load(MODEL_FILE)
      return model

   else:
      result = '파일이 없습니다.'
      return result

In [6]:
stopword_path = 'basic_ko_stopwords.txt'
with open(stopword_path, 'r', encoding='utf-8') as file:
    word = file.readlines()

In [7]:
words = list(map(lambda x:x.replace('\n',''), word))

In [8]:
import re
tokenizer = Okt()

# 토큰화
def getToken(textlist,tokenizer):
    text_to_token=[]
    for idx,text in enumerate(textlist):
        # 한글빼고 다지우기
        text=re.sub('[^ㄱ-ㅎ가-힣]+',' ',text)

        # 토큰 추출
        tokens=tokenizer.morphs(text,norm=True)
        for token in tokens:
            # stop word 체크
            if token in words:
                tokens.remove(token)
        text_to_token.append(tokens)

    return text_to_token


In [34]:
# 패딩화화
def pad_sequences(sequences, max_length, pad_value):
    result = list()
    
    sequences = sequences[:max_length]
    pad_length = max_length - len(sequences)
    padded_sequence = sequences +[pad_value] * pad_length
    result.append(padded_sequence)
    return np.asarray(result)

In [24]:
ex_text ="제법 볼만한 웃긴 영화, 다신 보고 싶지 않은 영화가 있을까? "

In [25]:
text_tokens = [ token for token in tokenizer.morphs(ex_text, norm=False, stem=False)]
print(text_tokens)

['제법', '볼', '만', '한', '웃긴', '영화']


In [12]:
with open(VOCAB_FILE, 'r', encoding='utf-8') as f:
    vocab = f.readlines()
    id_to_token ={idx: token.replace('\n','') for idx, token in enumerate(vocab)}
    token_to_id ={token.replace('\n',''):  idx for idx, token in enumerate(vocab)}
    print(id_to_token)

{0: '<pad>', 1: '<unk>', 2: '관람', 3: '한다', 4: '된다', 5: '하는', 6: '불가', 7: '의', 8: '청소년', 9: '자신', 10: '한국', 11: '시작', 12: '된', 13: '는', 14: '집', 15: '을', 16: '적', 17: '날', 18: '이상', 19: '그', 20: '이', 21: '사랑', 22: '남편', 23: '남자', 24: '친구', 25: '사람', 26: '있는', 27: '하게', 28: '일본', 29: '되고', 30: '여자', 31: '들', 32: '를', 33: '함께', 34: '알', 35: '일', 36: '다', 37: '년', 38: '미국', 39: '할', 40: '은', 41: '그녀', 42: '에', 43: '없는', 44: '하는데', 45: '이다', 46: '아내', 47: '되는데', 48: '관계', 49: '한', 50: '섹스', 51: '중', 52: '에게', 53: '가', 54: '속', 55: '마음', 56: '말', 57: '곳', 58: '하며', 59: '아버지', 60: '살', 61: '하기', 62: '아들', 63: '전', 64: '다시', 65: '사실', 66: '있다', 67: '결국', 68: '결혼', 69: '서로', 70: '딸', 71: '하고', 72: '영화', 73: '몸', 74: '하던', 75: '엄마', 76: '둘', 77: '사이', 78: '와', 79: '이야기', 80: '사건', 81: '모든', 82: '되어', 83: '아이', 84: '성', 85: '라는', 86: '온', 87: '시간', 88: '위해', 89: '생활', 90: '다른', 91: '돈', 92: '가족', 93: '모습', 94: '한편', 95: '부부', 96: '하지', 97: '때문', 98: '에는', 99: '같은', 100: '우연히', 101: '생각', 102: '마을

In [13]:
max_length = 200

In [14]:
key_unk = [key for key, value in id_to_token.items() if value == '<unk>']
unk_id = key_unk[0]
unk_id

1

In [29]:
text_ids = [token_to_id.get(token, unk_id) for token in text_tokens]
text_ids

[1, 904, 473, 49, 1, 72]

In [32]:
key_pad = [key for key, value in id_to_token.items() if value == '<pad>']
pad_id = key_pad[0]
pad_id

0

In [35]:
text_ids = pad_sequences(text_ids, max_length, pad_id)
text_ids

array([[  1, 904, 473,  49,   1,  72,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [36]:
text_ids = torch.tensor(text_ids)

In [37]:
cnt_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers =2
n_classes = 11

device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

classifier = SentenceClassifier(n_vocab=cnt_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim,
                                 n_layers=n_layers, n_classes=n_classes).to(device)
criterion = nn.CrossEntropyLoss().to(device)

In [38]:
def test(model, dataset, criterion, device):
    model.eval()
    losses_V = list()
    corrects_V = list()

    for step, (input_ids, labels) in enumerate(dataset):
        input_ids = input_ids.to(device)
        labels = labels.to(device).long()  # Long 타입으로 변환

        logits = model(input_ids)
        loss = criterion(logits, labels)  # 이제 labels는 Long 타입
        losses_V.append(loss.item())

        yhat = torch.argmax(logits, dim=1)  # 다중 클래스 예측
        corrects_V.extend(torch.eq(yhat, labels).cpu().tolist())

    val_loss = np.mean(losses_V)
    val_accuracy = np.mean(corrects_V)

    print(f'Val Loss : {val_loss:.4f}, Val Accuracy : {val_accuracy:.4f}')

    return val_loss, val_accuracy 

In [45]:
genre = ['가족', '공연', '기타', '드라마', '멜로/로맨스', '뮤지컬', '서부극(웨스턴)', '애니메이션', '액션', '코미디', '판타지']

In [46]:
# 예측
mymodel = load_model(SAVE_MODEL)
pre_y = mymodel(text_ids)
genre[torch.argmax(pre_y).item()]

'드라마'