<a href="https://colab.research.google.com/github/seawavve/NLP_wavve/blob/main/Onboarding_test7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Tokenizer 생성

제출전 확인할 사항
* 클린코드 유의
* = 사이 띄어쓰기로 가독성 높이기
* word_dict, dict_word 이런 변수명 순서 통일성

In [58]:
import re
# fit이 일어난 후에 transform이 일어난다면 반드시 모든 단어가 word_dict에 포함되게 되어있는데 왜 oov를 쓰지?

class Tokenizer():
  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False
  
  # 텍스트 전처리
  def preprocessing(self, sequences):
    result = []
    for sequence in sequences:
      sequence=sequence.lower()  # 소문자로 변환
      sequence=re.sub('[^a-zA-Z0-9 ]', '', sequence) # 특수문자 제거
      splited_sequence = sequence.split(' ') # white space 단위 자르기
      result.append(splited_sequence)
    return result
  
  # 어휘 사전을 구축
  def fit(self, sequences):
    self.fit_checker = False
    tokenized_sequences=self.preprocessing(sequences)
    words_list=[]

    # 어휘사전 생성
    for tokenized_sequence in tokenized_sequences:
      words_list.extend(tokenized_sequence)
    words_set=sorted(list(set(words_list)))
    for idx in range(len(words_set)):
      word = words_set[idx]
      self.word_dict[word]=idx+1
    self.fit_checker = True
    print(self.word_dict)
  
  # 어휘 사전을 활용하여 입력 문장을 정수 인덱싱
  def transform(self, sequences):
    tokens = self.preprocessing(sequences)
    transformed_sentences=[]

    if self.fit_checker:
      for token in tokens:
        transformed_sentence=[]
        for idx in range(len(token)):
          word = token[idx]
          if word in self.word_dict:
            transformed_sentence.append(self.word_dict[word])
          else:
            transformed_sentence.append(self.word_dict['oov'])
        transformed_sentences.append(transformed_sentence)
      return transformed_sentences
    else:
      raise Exception("Tokenizer instance is not fitted yet.")
      
  def fit_transform(self, sequences):
    self.fit(sequences)
    result = self.transform(sequences)
    return result

In [59]:
tokenizer=Tokenizer()
sentences = ['I go to school.', 'I LIKE pizza!']
test_sentences = ['I like to PARTY','miss you']

print(tokenizer.fit_transform(sentences))

{'oov': 0, 'go': 1, 'i': 2, 'like': 3, 'pizza': 4, 'school': 5, 'to': 6}
[[2, 1, 6, 5], [2, 3, 4]]


### 2.TfidfVectorizer 생성하기

참고자료
https://wikidocs.net/31698

In [66]:
from math import log

class TfidfVectorizer:
  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
    self.tfidf_matrix = list()
    self.idf_matrix = list()
  
  #대문자는 df 처리가 안된다. 노린문제인가? 이런경우 어떻게 처리하는가?
  #idf부터 모두 토큰화된걸 기준으로 계산하면 된다.
  def fit(self, sequences):
    tokenized = self.tokenizer.fit_transform(sequences)
    N=len(sequences)

    #IDF 행렬 만들기 (idx기준으로 바꾸기)---------------------------
    for idx in range(len(self.tokenizer.word_dict)):
      print(idx)
      df = 0
      for tokenized_sequence in tokenized:
        if idx in tokenized_sequence:
          df += 1
      print(idx, df)
      self.idf_matrix.append( log(N/(df+1)) )
    self.fit_checker = True
    

  def transform(self, sequences):
    if self.fit_checker:
      tokenized = self.tokenizer.transform(sequences)
      print(tokenized)

      #TF-iDF 행렬 만들기
      for tokenized_sequence in tokenized :
        tf_list=[]
        for idx in range(len(self.idf_matrix)):
          tf = tokenized_sequence.count(idx)
          idf = self.idf_matrix[idx]
          tf_idf = tf*idf
          tf_list.append( tf_idf )
        self.tfidf_matrix.append(tf_list)
      return self.tfidf_matrix
    else:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

  
  def fit_transform(self, sequences):
    self.fit(sequences)
    return self.transform(sequences)

In [67]:
tfidf_tokenizer = TfidfVectorizer(tokenizer)
sentences = ['I go to school.', 'I LIKE pizza!']
test_sentences = ['I like to PARTY','miss you', 'I Love you baby!']
print(tfidf_tokenizer.fit(test_sentences))
print(tfidf_tokenizer.transform(test_sentences))

{'oov': 0, 'go': 1, 'i': 2, 'like': 3, 'pizza': 4, 'school': 5, 'to': 7, 'baby': 1, 'love': 4, 'miss': 5, 'party': 6, 'you': 8}
0
0 0
1
1 1
2
2 2
3
3 1
4
4 1
5
5 1
6
6 1
7
7 1
8
8 2
9
9 0
10
10 0
11
11 0
None
[[2, 3, 7, 6], [5, 8], [2, 4, 8, 1]]
[[0.0, 0.0, 0.0, 0.4054651081081644, 0.0, 0.0, 0.4054651081081644, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.4054651081081644, 0.0, 0.0, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
