<a href="https://colab.research.google.com/github/sdding/wanted_pre_onboarding/blob/main/%EC%9B%90%ED%8B%B0%EB%93%9C_%ED%94%84%EB%A6%AC%EC%98%A8%EB%B3%B4%EB%94%A9_%EC%BD%94%EC%8A%A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 원티드 프리온보딩 AI/ML 코스 사전과제

In [1]:
import re
import numpy as np

In [26]:
class Tokenizer():
  """Word-level tokenizer that maps lowercase alphanumeric words to integer ids.

  The vocabulary is kept in ``word_dict``. Id 0 is reserved for the ``'oov'``
  entry, which stands in for any word not seen during ``fit``; real words are
  numbered from 1 in the order ``fit`` first registers them.
  """

  # Matches every character that is not an ASCII letter, a digit or a space.
  _NON_ALNUM = re.compile(r'[^0-9a-zA-Z ]')

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Lowercase each text, drop special characters and split it into words.

    Args:
      sequences: iterable of strings (one sentence/document each).

    Returns:
      A list with one list of word strings per input text. A text with no
      letters or digits yields an empty list.
    """
    # Problem 1-1
    # split() without an argument collapses runs of spaces, so the gap left by
    # a removed character (e.g. "a - b") never produces an empty-string token.
    return [self._NON_ALNUM.sub('', text.lower()).split() for text in sequences]

  def fit(self, sequences):
    """Add every new word found in ``sequences`` to the vocabulary.

    Within each sentence, unseen words are registered in sorted order, so the
    assigned ids depend on sentence order and on alphabetical order inside a
    sentence. Calling ``fit`` again extends the existing vocabulary.
    """
    self.fit_checker = False
    # Problem 1-2
    for words in self.preprocessing(sequences):
      for word in sorted(words):
        if word not in self.word_dict:
          # 'oov' already occupies id 0, so the first real word gets id 1.
          self.word_dict[word] = len(self.word_dict)

    self.fit_checker = True

  def transform(self, sequences):
    """Convert texts to lists of vocabulary ids.

    Words missing from the vocabulary are mapped to the ``'oov'`` id (0).

    Raises:
      Exception: if ``fit`` has not completed on this instance.
    """
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    # Problem 1-3
    oov_id = self.word_dict['oov']
    return [
        [self.word_dict.get(word, oov_id) for word in words]
        for words in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on ``sequences`` and return their id encoding."""
    self.fit(sequences)
    result = self.transform(sequences)
    return result

In [28]:
# Demo: fit a fresh Tokenizer on two sentences and display their id encodings.
Tokenizer().fit_transform(['I go to school.', 'I LIKE pizza!'])

[[2, 1, 4, 3], [2, 5, 6]]

In [40]:
class TfidfVectorizer:
  """TF-IDF vectorizer built on top of an injected tokenizer.

  The tokenizer must provide ``fit_transform(sequences)`` and
  ``transform(sequences)``, each returning one list of integer token ids per
  input. Id 0 is treated as out-of-vocabulary and gets no column; ids
  ``1..V`` map to columns ``0..V-1``, where ``V`` is the largest id seen
  during ``fit``.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False

  def fit(self, sequences):
    """Fit the tokenizer and compute one IDF weight per vocabulary id.

    The weight of a token is ``log(n / (1 + df))``, where ``n`` is the number
    of documents and ``df`` the number of documents containing the token. The
    weights are stored in ``self.IDF`` (index ``i`` holds token id ``i + 1``).
    """
    tokenized = self.tokenizer.fit_transform(sequences)
    # Problem 2-1
    # Largest token id in the corpus; default=0 keeps empty documents and an
    # empty corpus from raising.
    vocab_size = max((max(tokens, default=0) for tokens in tokenized), default=0)
    n_docs = len(tokenized)

    doc_freq = [0] * vocab_size
    for tokens in tokenized:
      # set(): a token counts once per document regardless of repetitions.
      for token_id in set(tokens):
        if 1 <= token_id <= vocab_size:
          doc_freq[token_id - 1] += 1

    self.IDF = [np.log(n_docs / (1 + df)) for df in doc_freq]

    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF matrix of ``sequences`` as a nested list.

    Every row has one column per vocabulary id learned in ``fit``, whatever
    ids happen to occur in ``sequences``. Ids outside ``1..V`` (including the
    out-of-vocabulary id 0) are ignored. The result is also stored in
    ``self.tfidf_matrix``.

    Raises:
      Exception: if ``fit`` has not completed on this instance.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)
    # Problem 2-2
    # The column count comes from the fitted IDF vector, not from the data
    # being transformed, so TF and IDF always line up.
    vocab_size = len(self.IDF)
    term_freq = []
    for tokens in tokenized:
      counts = [0] * vocab_size
      for token_id in tokens:
        if 1 <= token_id <= vocab_size:
          counts[token_id - 1] += 1
      term_freq.append(counts)

    # reshape keeps the array 2-D even when there are no rows or no columns.
    tf = np.array(term_freq, dtype=float).reshape(len(term_freq), vocab_size)
    self.tfidf_matrix = (tf * np.array(self.IDF, dtype=float)).tolist()

    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their TF-IDF matrix."""
    self.fit(sequences)
    return self.transform(sequences)

In [41]:
# Demo: fit a TfidfVectorizer on two sentences and display their TF-IDF rows.
TfidfVectorizer(Tokenizer()).fit_transform(['I go to school.', 'I LIKE pizza!'])

[[0.0, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0],
 [0.0, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0]]