# 문제1 Tokenizer

In [10]:
import re

In [77]:
class Tokenizer():
  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False
  
  def preprocessing(self, sequences):
    result = []
    # 문제 1-1
    for sentence in sequences:
      pword = []
      for word in sentence.lower().split():
        pword.append(''.join(re.sub(r'[^a-z]', '', word)))
      result.append(pword)
    return result
  
  def fit(self, sequences):
    self.fit_checker = False
    # 문제 1-2
    tokens = self.preprocessing(sequences)
    token_set = set(word for token in tokens for word in token)
    for token in token_set:
      self.word_dict[token] = len(self.word_dict)
    self.fit_checker = True
  
  def transform(self, sequences):
    result = []
    tokens = self.preprocessing(sequences)
    if self.fit_checker:
      # 문제 1-3
      for token in tokens:
        ptoken = []
        for word in token:
          if word in self.word_dict:
            ptoken.append(self.word_dict[word])
          else:
            ptoken.append(self.word_dict['oov'])
        result.append(ptoken)
      return result
    else:
      raise Exception("Tokenizer instance is not fitted yet.")
      
  def fit_transform(self, sequences):
    self.fit(sequences)
    result = self.transform(sequences)
    return result

**1-1. preprocessing()** 텍스트 전처리

In [78]:
TOK1 = Tokenizer()
input = ['I go to school.', 'I LIKE pizza!']
output = TOK1.preprocessing(input)
print(output)

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]


**1-2. fit()** 어휘사전 구축

In [79]:
TOK1.fit(input)
print(TOK1.word_dict)

{'oov': 0, 'school': 1, 'pizza': 2, 'to': 3, 'like': 4, 'go': 5, 'i': 6}


**1-3. transform()** 문장 인덱싱

In [80]:
sentence = ['I go to school.', 'I LIKE pizza!', 'i like chicken']
print(TOK1.transform(sentence))

[[6, 5, 3, 1], [6, 4, 2], [6, 4, 0]]

# 문제2 TfidfVectorizer 생성하기

In [166]:
from math import log
import numpy as np

In [180]:
class TfidfVectorizer:
  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
  
  def fit(self, sequences):
    tokenized = self.tokenizer.fit_transform(sequences)
    # 문제 2-1
    n = len(tokenized)
    self.fit_checker = True
    size = len(self.tokenizer.word_dict)
    df = [0] * size
    for tokens in tokenized:
      token_set = set(tokens)
      for i in token_set:
        df[i] += 1
    idf_matrix = list(log(n / (1 + df[i])) for i in range(1, size))
    self.idf_matrix = idf_matrix

  def transform(self, sequences):
    if self.fit_checker:
      tokenized = self.tokenizer.transform(sequences)
      # 문제 2-2
      t = len(self.tokenizer.word_dict) - 1
      d = len(tokenized)
      tf = [[0 for _ in range(t)] for _ in range(d)]
      tf = np.zeros((d, t))
      for i in range(d):
        for j in range(t):
          tf[i, j] = tokenized[i].count(j + 1)
      tfidf_matrix = tf * np.array(self.idf_matrix)
      self.tfidf_matrix = tfidf_matrix
      return self.tfidf_matrix
    else:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

  
  def fit_transform(self, sequences):
    self.fit(sequences)
    return self.transform(sequences)

In [181]:
TOK2 = Tokenizer()
TIV = TfidfVectorizer(TOK2)

**2-1. fit()** IDF 행렬


$idf(d,t)=log_e(\frac{n}{1+df(d, t)})$

$df(d,t)$: 단어 t가 포함된 문장 d의 개수

$n$: 입력된 전체 문장 개수

In [182]:
sentence = ['I go to school.', 'I LIKE pizza!', 'I HAVE A APPLE', 'apple pie like', 'pine apple pizza']
TIV.fit(sentence)
print(TIV.idf_matrix)
print(TOK2.word_dict)

[0.9162907318741551, 0.5108256237659907, 0.9162907318741551, 0.9162907318741551, 0.9162907318741551, 0.5108256237659907, 0.22314355131420976, 0.9162907318741551, 0.9162907318741551, 0.9162907318741551, 0.22314355131420976]
{'oov': 0, 'school': 1, 'pizza': 2, 'a': 3, 'to': 4, 'have': 5, 'like': 6, 'apple': 7, 'pie': 8, 'go': 9, 'pine': 10, 'i': 11}


**2-2. transform()** TF-IDF 행렬

$tf(d, t)$ : 문장 d에 단어 t가 나타난 횟수

$tf-idf(d,t) = tf(d,t) \times idf(d,t)$

In [183]:
sentence = ['I go to school.', 'I LIKE pizza!', 'I HAVE A APPLE']
TIV.transform(sentence)

array([[0.91629073, 0.        , 0.        , 0.91629073, 0.        ,
        0.        , 0.        , 0.        , 0.91629073, 0.        ,
        0.22314355],
       [0.        , 0.51082562, 0.        , 0.        , 0.        ,
        0.51082562, 0.        , 0.        , 0.        , 0.        ,
        0.22314355],
       [0.        , 0.        , 0.91629073, 0.        , 0.91629073,
        0.        , 0.22314355, 0.        , 0.        , 0.        ,
        0.22314355]])