<a href="https://colab.research.google.com/github/pxpper/wanted_pre_onboarding/blob/main/%EC%9B%90%ED%8B%B0%EB%93%9C_%ED%94%84%EB%A6%AC%EC%98%A8%EB%B3%B4%EB%94%A9_%EC%BD%94%EC%8A%A4_%EC%95%88%EC%A0%95%EC%9D%B4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
from math import log
import numpy as np

In [2]:
class Tokenizer:
    """Whitespace word tokenizer that maps each word to an integer index.

    Attributes:
        word_dict: dict mapping word -> index. The key 'oov' is created with
            index 0; `transform` emits 0 for every word not in the dictionary.
        fit_checker: True once `fit` has run to completion; `transform`
            refuses to run while it is False.
    """

    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Clean and split raw sentences into word tokens.

        Each sentence is stripped, lower-cased, has every character that is
        neither a word character nor whitespace removed, and is then split on
        whitespace.

        Args:
            sequences: iterable of str.

        Returns:
            list of list of str, one token list per input sentence.
        """
        result = []
        for seq in sequences:
            # Raw string: '\w' and '\s' are regex classes, not Python escapes.
            cleaned = re.sub(r'[^\w\s]', '', seq.strip().lower())
            result.append(cleaned.split())
        return result

    def fit(self, sequences):
        """Add every unseen word in `sequences` to `word_dict`.

        Existing entries are never renumbered. New words receive indices that
        continue after the largest index already in use, so calling `fit`
        again with new text cannot hand out an index that is already taken.

        Args:
            sequences: iterable of str.
        """
        self.fit_checker = False
        # Continue numbering from the current vocabulary instead of restarting
        # at 1, which would collide with indices assigned by an earlier fit.
        next_index = max(self.word_dict.values(), default=-1) + 1
        for tokens in self.preprocessing(sequences):
            for token in tokens:
                if token not in self.word_dict:
                    self.word_dict[token] = next_index
                    next_index += 1
        self.fit_checker = True

    def transform(self, sequences):
        """Convert sentences to lists of word indices.

        Args:
            sequences: iterable of str.

        Returns:
            list of list of int; words missing from `word_dict` become 0.

        Raises:
            Exception: if `fit` has not completed yet.
        """
        # Fail fast before doing any preprocessing work.
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")
        return [
            [self.word_dict.get(token, 0) for token in tokens]
            for tokens in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Run `fit` on `sequences`, then return `transform(sequences)`."""
        self.fit(sequences)
        return self.transform(sequences)

In [3]:
class TfidfVectorizer:
    """TF-IDF vectorizer built on top of an index-producing tokenizer.

    The tokenizer must provide `fit_transform(sequences)` and
    `transform(sequences)`, each returning one list of integer indices per
    input sentence.

    Attributes:
        tokenizer: the tokenizer used to turn sentences into index lists.
        fit_checker: True once `fit` has completed.
        index_list: sorted indices seen during `fit`; defines the column
            order of `idf` and of every row returned by `transform`.
        idf: list of float, one weight per entry of `index_list`.
    """

    def __init__(self, tokenizer):
        # Use the tokenizer the caller supplied; only build a fresh one when
        # none is given, so passing None keeps working.
        self.tokenizer = tokenizer if tokenizer is not None else Tokenizer()
        self.fit_checker = False
        self.index_list = []
        self.idf = []

    def fit(self, sequences):
        """Learn the column layout and IDF weights from `sequences`.

        The weight of an index is log(N / (1 + df)), where N is the number of
        sentences and df is the number of sentences containing the index.
        With this formula an index present in every sentence gets a negative
        weight.

        Args:
            sequences: iterable of str.
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        n_docs = len(tokenized)

        # Document frequency: each sentence counts at most once per index.
        doc_freq = {}
        for doc in tokenized:
            for index in set(doc):
                doc_freq[index] = doc_freq.get(index, 0) + 1

        # Sort so the column order is deterministic, and keep it so that
        # transform() lines its columns up with self.idf.
        self.index_list = sorted(doc_freq)
        self.idf = [log(n_docs / (1 + doc_freq[index])) for index in self.index_list]

        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF rows for `sequences`.

        Every row has one entry per index learned in `fit`, in `index_list`
        order: (occurrences of the index in the sentence) * (its IDF weight).
        Indices that were not seen during `fit` contribute nothing.

        Args:
            sequences: iterable of str.

        Returns:
            list of list of float, one row per input sentence.

        Raises:
            Exception: if `fit` has not completed yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        result = []
        for doc in self.tokenizer.transform(sequences):
            term_freq = {}
            for index in doc:
                term_freq[index] = term_freq.get(index, 0) + 1
            result.append([
                term_freq.get(index, 0) * weight
                for index, weight in zip(self.index_list, self.idf)
            ])
        return result

    def fit_transform(self, sequences):
        """Run `fit` on `sequences`, then return `transform(sequences)`."""
        self.fit(sequences)
        return self.transform(sequences)

P1-1. Tokenized result

In [4]:
# Demo P1-1: show the cleaned, lower-cased word tokens of two sample sentences.
tokenizer = Tokenizer()
seq_list = tokenizer.preprocessing(['I go to school.', 'I LIKE pizza!'])
print(seq_list)

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]


P1-2. Word-to-index dictionary (word_dict)

In [5]:
# Demo P1-2: build the vocabulary from the sample sentences and print the word -> index mapping.
tokenizer.fit(['I go to school.', 'I LIKE pizza!'])
print(tokenizer.word_dict)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}


 P1-3. indexing

In [6]:
# Demo P1-3: convert each sentence to vocabulary indices with the fitted tokenizer.
index_tensor = tokenizer.transform(['I go to school.', 'I LIKE pizza!'])
print(index_tensor)

# Condition 1: words that are not in word_dict ('and', 'chips') come back as index 0.
print("조건 1: 어휘 사전(self.word_dict)에 없는 단어는 'oov'의 index로 변환합니다.")
index_tensor = tokenizer.transform(['I go to school.', 'I LIKE pizza and chips!'])
print(index_tensor)

[[1, 2, 3, 4], [1, 5, 6]]
조건 1: 어휘 사전(self.word_dict)에 없는 단어는 'oov'의 index로 변환합니다.
[[1, 2, 3, 4], [1, 5, 6, 0, 0]]


P2-1. IDF_array

In [7]:
# Demo P2-1: fit the vectorizer on the sample sentences and print the learned IDF weights.
vectorizer = TfidfVectorizer(tokenizer)
vectorizer.fit(['I go to school.', 'I LIKE pizza!'])
print(vectorizer.idf)

[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]


P2-2. TF-IDF_array

In [8]:
# Demo P2-2: print the TF-IDF row of each sample sentence using the fitted vectorizer.
tfidf_array = vectorizer.transform(['I go to school.', 'I LIKE pizza!'])
print(tfidf_array)

[[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0], [-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]
