<a href="https://colab.research.google.com/github/seawavve/Wanted-Free-Pre-Onboarding-AI/blob/main/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Tokenizer 생성

In [None]:
import re
import itertools

class Tokenizer():
  """Whitespace tokenizer mapping lower-cased alphanumeric words to integer ids.

  ``word_dict`` maps each known word to its id. Id 0 is reserved under the
  ``'oov'`` key and is returned for any word that was not seen by ``fit``.
  """

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Lower-case, strip special characters and split each sentence into words.

    Args:
      sequences: Iterable of strings.

    Returns:
      A list holding one list of word strings per input string.
    """
    preprocessed_sequences = []
    for sequence in sequences:
      cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', sequence.lower())
      # split() without a separator collapses runs of spaces and ignores
      # leading/trailing ones, so no empty-string "words" are produced.
      preprocessed_sequences.append(cleaned.split())
    return preprocessed_sequences

  def fit(self, sequences):
    """Build the vocabulary from ``sequences``, replacing any previous one.

    Words are numbered from 1 in sorted order; id 0 stays reserved for OOV.

    Args:
      sequences: Iterable of strings.
    """
    self.fit_checker = False
    tokenized_sequences = self.preprocessing(sequences)
    vocabulary = set(itertools.chain.from_iterable(tokenized_sequences))
    # The literal word "oov" must not take over the reserved key; it is
    # simply encoded as out-of-vocabulary (id 0).
    vocabulary.discard('oov')

    # Start from a fresh dict so that a refit does not keep stale words
    # whose ids would collide with the newly assigned ones.
    word_dict = {'oov': 0}
    for idx, word in enumerate(sorted(vocabulary), start=1):
      word_dict[word] = idx
    self.word_dict = word_dict

    self.fit_checker = True

  def transform(self, sequences):
    """Encode each sentence as a list of word ids.

    Args:
      sequences: Iterable of strings.

    Returns:
      A list holding one list of integer ids per input string; unknown words
      are encoded with the OOV id.

    Raises:
      RuntimeError: If ``fit`` has not been called yet.
    """
    if not self.fit_checker:
      raise RuntimeError("Tokenizer instance is not fitted yet.")

    oov_index = self.word_dict['oov']
    return [
      [self.word_dict.get(word, oov_index) for word in words]
      for words in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on ``sequences`` and return their encoding."""
    self.fit(sequences)
    return self.transform(sequences)

In [None]:
# Demo: fit the tokenizer on two short sentences and print their id sequences.
sentences = ['I go to school.', 'I LIKE pizza!']
tokenizer = Tokenizer()
encoded_sentences = tokenizer.fit_transform(sentences)
print(encoded_sentences)

[[2, 1, 6, 5], [2, 3, 4]]


### 2. TfidfVectorizer 생성하기

In [None]:
from math import log

class TfidfVectorizer:
  """TF-IDF vectorizer layered on a tokenizer that encodes words as ids.

  The tokenizer must provide ``fit_transform``, ``transform`` and a
  ``word_dict`` mapping whose values are the vocabulary ids. ``idf_matrix`` is
  indexed by those ids, so they are expected to be 0..V-1 in dict order.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
    self.tfidf_matrix = list()
    self.idf_matrix = list()

  def fit(self, sequences):
    """Fit the tokenizer and compute one IDF value per vocabulary id.

    IDF is ``log(N / (df + 1))`` where ``N`` is the number of sequences and
    ``df`` the number of sequences containing the id.

    Args:
      sequences: Sized collection of strings.

    Raises:
      ValueError: If ``sequences`` is empty (IDF is undefined for N == 0).
    """
    self.fit_checker = False
    tokenized = self.tokenizer.fit_transform(sequences)
    N = len(sequences)
    if N == 0:
      raise ValueError("Cannot fit TfidfVectorizer on an empty corpus.")

    # One set per document makes each membership test O(1).
    document_ids = [set(tokenized_sequence) for tokenized_sequence in tokenized]

    # Rebuild from scratch: appending to the previous list would leave stale
    # IDF values at the front after a refit.
    idf_matrix = []
    for idx in self.tokenizer.word_dict.values():
      df = sum(1 for ids in document_ids if idx in ids)
      idf_matrix.append(log(N/(df+1)))
    self.idf_matrix = idf_matrix
    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF rows for ``sequences``.

    Args:
      sequences: Iterable of strings.

    Returns:
      A list with one row per sequence; each row holds ``tf * idf`` for every
      vocabulary id in ``word_dict`` order. The same list is also stored in
      ``tfidf_matrix``.

    Raises:
      RuntimeError: If ``fit`` has not been called yet.
    """
    if not self.fit_checker:
      raise RuntimeError("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)
    vocabulary_ids = list(self.tokenizer.word_dict.values())

    # Build a fresh matrix so rows from earlier calls are not returned again.
    tfidf_matrix = []
    for tokenized_sequence in tokenized:
      term_counts = {}
      for idx in tokenized_sequence:
        term_counts[idx] = term_counts.get(idx, 0) + 1
      tfidf_matrix.append(
        [term_counts.get(idx, 0) * self.idf_matrix[idx] for idx in vocabulary_ids]
      )
    self.tfidf_matrix = tfidf_matrix
    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their TF-IDF rows."""
    self.fit(sequences)
    return self.transform(sequences)

In [None]:
# Demo: build a TF-IDF vectorizer on three sentences and print the matrix.
test_sentences = ['I like to PARTY', 'Miss You', 'I Love you baby!']
tokenizer = Tokenizer()
tfidf_tokenizer = TfidfVectorizer(tokenizer)
tfidf_rows = tfidf_tokenizer.fit_transform(test_sentences)
print(tfidf_rows)

[[0.0, 0.0, 0.0, 0.4054651081081644, 0.0, 0.0, 0.4054651081081644, 0.4054651081081644, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.4054651081081644, 0.0, 0.0, 0.0], [0.0, 0.4054651081081644, 0.0, 0.0, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0]]
