In [1]:
# torch install 방법
# torch 사이트에서 긁어서 사용 
# !pip3 install torch torchvision

# Padding

- 자연어 처리에서 각 문장(문서)의 길이는 다를 수 있음 
- 그러나 언어모델은 고정된 길이의 데이터를 효율적으로 처리함
- 따라서 모든 문장의 길이를 동일하게 맞춰주는 작업이 필요함 == 패딩 


**패딩 이점**
1. 일관된 입력 형식
2. 병렬 연산 최적화
3. 유연한 데이터 처리 

In [2]:
# Sample corpus that is already preprocessed: one list of tokens per sentence.
preprocessed_sentences = [['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'],
                          ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'],
                          ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'],
                          ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
                          ['barber', 'went', 'huge', 'mountain']]

### 직접 구현

In [3]:
import torch
from collections import Counter

class TokenizerForPadding:
    """Minimal word-level tokenizer that maps tokens to integer ids.

    Id layout produced by ``fit_on_texts``:
      * 0 is never assigned to a word, so it stays free for the padding value.
      * 1 is always the out-of-vocabulary (OOV) token.
      * 2, 3, ... are handed out in order of decreasing word frequency.

    Args:
        num_words: Upper bound on the number of distinct ids, counting the
            reserved padding id 0 and the OOV id 1. ``None`` (or 0) keeps
            every word seen while fitting.
        oov_token: Placeholder that stands in for words missing from the
            vocabulary.
    """

    def __init__(self, num_words=None, oov_token='<OOV>'):
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index = {}          # token -> id
        self.index_word = {}          # id -> token
        self.word_counts = Counter()  # token -> frequency, accumulated over fits

    def fit_on_texts(self, texts):
        """Count word frequencies in ``texts`` and rebuild the vocabulary.

        Args:
            texts: Iterable of tokenized sentences (each an iterable of words).
                Falsy tokens such as ``''`` or ``None`` are not counted.
        """
        for sentence in texts:
            self.word_counts.update(word for word in sentence if word)

        # Most frequent first. The OOV placeholder is filtered out so that a
        # corpus which literally contains it cannot move it off id 1.
        ranked = [word for word, _ in self.word_counts.most_common()
                  if word != self.oov_token]

        if self.num_words:
            # "- 2" because two of the num_words ids are already taken:
            # id 0 (padding) and id 1 (OOV). Only num_words - 2 real words fit.
            ranked = ranked[:max(self.num_words - 2, 0)]

        vocab = [self.oov_token] + ranked

        # Ids start at 1 so that 0 remains available as the padding value.
        self.word_index = {word: idx for idx, word in enumerate(vocab, start=1)}
        self.index_word = {idx: word for word, idx in self.word_index.items()}

    def texts_to_sequences(self, texts):
        """Integer-encode tokenized sentences.

        Args:
            texts: Iterable of tokenized sentences.

        Returns:
            A list with one list of ids per sentence; words that are not in
            the vocabulary are encoded with the OOV id.

        Raises:
            KeyError: If a non-empty sentence is encoded before
                ``fit_on_texts`` has been called (the OOV id does not exist yet).
        """
        return [
            [self.word_index.get(word, self.word_index[self.oov_token]) for word in sentence]
            for sentence in texts
        ]

In [None]:
def pad_sequences(sequences, maxlen=None, padding='pre', truncating='pre', value=0):
    """Pad or truncate id sequences to one common length and stack them.

    Args:
        sequences: Iterable of sequences (lists, tuples, ...) of token ids.
        maxlen: Target length of every row. ``None`` uses the length of the
            longest sequence (0 when ``sequences`` is empty).
        padding: ``'pre'`` puts ``value`` in front of the tokens; any other
            setting (conventionally ``'post'``) appends it.
        truncating: ``'pre'`` drops tokens from the front of an over-long
            sequence; any other setting (conventionally ``'post'``) drops
            them from the end.
        value: Fill value used for padding.

    Returns:
        A ``torch.Tensor`` built from the padded rows, ready to be used as
        model input.
    """
    # Materialise once: the input is traversed twice (length scan + padding)
    # and each row must be a list so it can be concatenated with the filler.
    rows = [list(seq) for seq in sequences]

    if maxlen is None:
        maxlen = max((len(row) for row in rows), default=0)

    padded_sequences = []
    for row in rows:
        overflow = len(row) - maxlen
        if overflow > 0:
            if truncating == 'pre':
                # Slice from an explicit start index; row[-maxlen:] would
                # return the whole row when maxlen == 0.
                row = row[overflow:]
            else:
                row = row[:maxlen]
        else:
            filler = [value] * (-overflow)
            row = filler + row if padding == 'pre' else row + filler
        padded_sequences.append(row)

    return torch.tensor(padded_sequences)

In [None]:
# Build the vocabulary with the hand-written tokenizer (num_words=15),
# then replace every token of the sample corpus by its integer id.
tokenizer = TokenizerForPadding(num_words=15)
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[2, 6],
 [2, 9, 6],
 [2, 4, 6],
 [10, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 7],
 [2, 5, 7],
 [2, 5, 3],
 [8, 8, 4, 3, 11, 2, 12],
 [2, 13, 4, 14]]

In [None]:
# maxlen=None pads every row to the longest sequence (7 tokens for this corpus);
# padding='pre' puts the fill value in front of the tokens.
padded = pad_sequences(sequences, padding='pre', truncating='pre', maxlen=None)
padded

tensor([[ 0,  0,  0,  0,  0,  2,  6],
        [ 0,  0,  0,  0,  2,  9,  6],
        [ 0,  0,  0,  0,  2,  4,  6],
        [ 0,  0,  0,  0,  0, 10,  3],
        [ 0,  0,  0,  3,  5,  4,  3],
        [ 0,  0,  0,  0,  0,  4,  3],
        [ 0,  0,  0,  0,  2,  5,  7],
        [ 0,  0,  0,  0,  2,  5,  7],
        [ 0,  0,  0,  0,  2,  5,  3],
        [ 8,  8,  4,  3, 11,  2, 12],
        [ 0,  0,  0,  2, 13,  4, 14]])

In [None]:
# maxlen=3 with padding='post' appends the fill value after the tokens;
# truncating='post' cuts over-long rows at the end (keeps the first 3 ids).
padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=3)
padded

tensor([[ 2,  6,  0],
        [ 2,  9,  6],
        [ 2,  4,  6],
        [10,  3,  0],
        [ 3,  5,  4],
        [ 4,  3,  0],
        [ 2,  5,  7],
        [ 2,  5,  7],
        [ 2,  5,  3],
        [ 8,  8,  4],
        [ 2, 13,  4]])

In [None]:
# maxlen=3 with padding='post' appends the fill value after the tokens;
# truncating='pre' cuts over-long rows at the front (keeps the last 3 ids).
padded = pad_sequences(sequences, padding='post', truncating='pre', maxlen=3)
padded

tensor([[ 2,  6,  0],
        [ 2,  9,  6],
        [ 2,  4,  6],
        [10,  3,  0],
        [ 5,  4,  3],
        [ 4,  3,  0],
        [ 2,  5,  7],
        [ 2,  5,  7],
        [ 2,  5,  3],
        [11,  2, 12],
        [13,  4, 14]])

### keras Tokenizer 이용 

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Same encoding step with the Keras Tokenizer, constructed with no arguments
# (so no num_words limit and no OOV token are passed).
# NOTE: this rebinds `tokenizer` and `sequences` from the hand-written example.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [12]:
# NOTE: this import rebinds the name `pad_sequences`, shadowing the hand-written
# function defined earlier in this notebook. Re-running an earlier padding cell
# after this one will call the Keras version instead.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad at the end and truncate at the end to a fixed length of 3.
padded = pad_sequences(sequences, padding='post', maxlen=3, truncating='post')
padded

array([[ 1,  5,  0],
       [ 1,  8,  5],
       [ 1,  3,  5],
       [ 9,  2,  0],
       [ 2,  4,  3],
       [ 3,  2,  0],
       [ 1,  4,  6],
       [ 1,  4,  6],
       [ 1,  4,  2],
       [ 7,  7,  3],
       [ 1, 12,  3]], dtype=int32)

### 어린왕자 데이터 샘플 패딩 처리 
1. 텍스트 전처리 (토큰화/불용어처리/정제/정규화)
2. 정수 인코딩 by Tokenizer (tensorflow.keras)
3. 패딩처리 by pad_sequences (tensorflow.keras)

In [13]:
# Sample corpus: an English plot summary of "The Little Prince" (five paragraphs),
# used as raw input for the preprocessing / encoding / padding steps below.
raw_text = """The Little Prince, written by Antoine de Saint-Exupéry, is a poetic tale about a young prince who travels from his home planet to Earth. The story begins with a pilot stranded in the Sahara Desert after his plane crashes. While trying to fix his plane, he meets a mysterious young boy, the Little Prince.

The Little Prince comes from a small asteroid called B-612, where he lives alone with a rose that he loves deeply. He recounts his journey to the pilot, describing his visits to several other planets. Each planet is inhabited by a different character, such as a king, a vain man, a drunkard, a businessman, a geographer, and a fox. Through these encounters, the Prince learns valuable lessons about love, responsibility, and the nature of adult behavior.

On Earth, the Little Prince meets various creatures, including a fox, who teaches him about relationships and the importance of taming, which means building ties with others. The fox's famous line, "You become responsible, forever, for what you have tamed," resonates with the Prince's feelings for his rose.

Ultimately, the Little Prince realizes that the essence of life is often invisible and can only be seen with the heart. After sharing his wisdom with the pilot, he prepares to return to his asteroid and his beloved rose. The story concludes with the pilot reflecting on the lessons learned from the Little Prince and the enduring impact of their friendship.

The narrative is a beautifully simple yet profound exploration of love, loss, and the importance of seeing beyond the surface of things."""

In [14]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# NOTE(review): the tokenizers and the stop-word list rely on NLTK data packages
# being installed (presumably fetched beforehand with nltk.download) — confirm
# on a fresh environment.

# Split the raw text into sentences.
sentences = sent_tokenize(raw_text)

# English stop-word list, plus a set copy for O(1) membership tests in the loop.
en_stopwords = stopwords.words('english')
stopword_set = set(en_stopwords)

# Vocabulary: key = token, value = frequency.
vocab = {}

# Result of tokenization / cleaning / normalization: one token list per sentence.
preprocessed_sentences = []

# Process one sentence at a time.
for sentence in sentences:
    sentence = sentence.lower()  # case normalization

    # Word-tokenize, then drop stop words and tokens of length 2 or less.
    tokens = [
        token
        for token in word_tokenize(sentence)
        if token not in stopword_set and len(token) > 2
    ]

    for token in tokens:
        vocab[token] = vocab.get(token, 0) + 1

    preprocessed_sentences.append(tokens)

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Keras Tokenizer limited to num_words=15 with an explicit OOV token.
tokenizer = Tokenizer(num_words=15, oov_token='<OOV>')
tokenizer.fit_on_texts(preprocessed_sentences)
# NOTE: despite its name, `sequences_t` holds the word -> index dictionary, not
# sequences. Per the original author's note it is built from every word in the
# corpus (i.e. it is not cut down to num_words).
sequences_t = tokenizer.word_index
# Integer-encode each sentence; in the displayed result many tokens appear as id 1.
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[3, 2, 1, 1, 1, 1, 1, 7, 2, 1, 1, 8, 9],
 [10, 1, 4, 1, 1, 1, 11, 1],
 [1, 1, 11, 12, 1, 7, 1, 3, 2],
 [3, 2, 1, 1, 13, 1, 1, 1, 1, 5, 1, 1],
 [1, 1, 4, 1, 1, 1, 1],
 [8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6],
 [1, 2, 1, 1, 14, 1, 1, 1, 1, 1],
 [9, 3, 2, 12, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1],
 [6, 1, 1, 1, 1, 1, 1, 1, 2, 1, 5],
 [1, 3, 2, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 4, 1, 1, 13, 1, 5],
 [10, 1, 4, 1, 14, 1, 3, 2, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

In [28]:
# NOTE: (re)binds `pad_sequences` to the Keras implementation, not the
# hand-written function defined earlier in this notebook.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# maxlen=None lets the longest sentence decide the row length (16 columns in the
# displayed result); shorter rows are filled at the end (padding='post').
padded = pad_sequences(sequences, padding='post', maxlen=None, truncating='post')
padded

array([[ 3,  2,  1,  1,  1,  1,  1,  7,  2,  1,  1,  8,  9,  0,  0,  0],
       [10,  1,  4,  1,  1,  1, 11,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  1, 11, 12,  1,  7,  1,  3,  2,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  2,  1,  1, 13,  1,  1,  1,  1,  5,  1,  1,  0,  0,  0,  0],
       [ 1,  1,  4,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  6,  0,  0,  0,  0,  0],
       [ 1,  2,  1,  1, 14,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0],
       [ 9,  3,  2, 12,  1,  1,  1,  6,  1,  1,  1,  1,  1,  1,  1,  1],
       [ 6,  1,  1,  1,  1,  1,  1,  1,  2,  1,  5,  0,  0,  0,  0,  0],
       [ 1,  3,  2,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0],
       [ 1,  1,  4,  1,  1, 13,  1,  5,  0,  0,  0,  0,  0,  0,  0,  0],
       [10,  1,  4,  1, 14,  1,  3,  2,  1,  1,  1,  0,  0,  0,  0,  0],
       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0]],
      dtype=int32)