<a href="https://colab.research.google.com/github/seyeonjungGit/NLP_Algorithm/blob/main/TfidfVectorizer_%EC%8B%A4%ED%96%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://codestates.notion.site/_AIB-8aaa720522d0496bb80a707f32dc7411

외부 라이브러리 사용은 자유이나, output의 type은 문제에 명시된 조건을 따라야 합니다.

# Tokenizer 생성하기

In [1]:
class Tokenizer():
  """Whitespace tokenizer that maps words to integer indices.

  Index 0 is reserved for the out-of-vocabulary placeholder ``'oov'``;
  every word seen during ``fit`` receives an index starting at 1, with more
  frequent words getting smaller indices.
  """

  # Characters replaced by a space before splitting. This is the default
  # filter set of Keras' text_to_word_sequence, so apostrophes are preserved.
  _FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
  _STRIP_TABLE = str.maketrans(_FILTERS, ' ' * len(_FILTERS))

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  # Text preprocessing
  def preprocessing(self, sequences):
    """Lowercase, strip special characters and split on whitespace.

    Args:
      sequences: iterable of sentences (``str``).

    Returns:
      A nested list with one list of tokens per input sentence.
    """
    # Implemented with the standard library only, so the class no longer
    # needs TensorFlow's deprecated keras.preprocessing.text module.
    return [
        sentence.lower().translate(self._STRIP_TABLE).split()
        for sentence in sequences
    ]

  # Vocabulary construction
  def fit(self, sequences):
    """Build ``self.word_dict`` from the given sentences.

    The vocabulary is rebuilt from scratch on every call, so refitting on a
    different corpus does not keep stale words from an earlier fit.

    Args:
      sequences: iterable of sentences (``str``).
    """
    from collections import Counter
    from itertools import chain

    self.fit_checker = False
    tokens = self.preprocessing(sequences)

    # Count every token without building an intermediate flattened list.
    counts = Counter(chain.from_iterable(tokens))
    # 'oov' is the reserved placeholder; a literal "oov" token in the corpus
    # must not overwrite index 0, so it is simply treated as out-of-vocabulary.
    counts.pop('oov', None)

    self.word_dict = {'oov': 0}
    # most_common() orders by descending frequency and keeps first-seen
    # order for ties, so frequent words get the smallest indices.
    for index, (word, _) in enumerate(counts.most_common(), start=1):
      self.word_dict[word] = index
    self.fit_checker = True

  # Vocabulary lookup -> integer indexing
  def transform(self, sequences):
    """Convert sentences into lists of vocabulary indices.

    Words missing from ``self.word_dict`` are mapped to the ``'oov'`` index.

    Args:
      sequences: iterable of sentences (``str``).

    Returns:
      A nested list with one list of integer indices per input sentence.

    Raises:
      RuntimeError: if ``fit`` has not been called yet.
    """
    if not self.fit_checker:
      raise RuntimeError("Tokenizer instance is not fitted yet.")

    oov_index = self.word_dict['oov']
    return [
        [self.word_dict.get(token, oov_index) for token in sentence]
        for sentence in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on ``sequences`` and return their index encoding."""
    self.fit(sequences)
    return self.transform(sequences)

In [2]:
# Sanity check: fit the tokenizer on two sentences and encode them.
tokenizer = Tokenizer()
sentences = ['I go to school.', 'I LIKE pizza!']  # named to avoid shadowing the builtin input()
tokenizer.fit_transform(sentences)

[[1, 2, 3, 4], [1, 5, 6]]

In [3]:
# Check OOV handling: words missing from the fitted vocabulary are encoded as index 0.
input2 = ['something like to.', 'I LIKE cola!']
tokenizer.transform(input2)

[[0, 5, 3], [1, 5, 0]]

In [4]:
# Check preprocessing: show the lowercased, punctuation-free tokens of each sentence.
tokenizer.preprocessing(input2)

[['something', 'like', 'to'], ['i', 'like', 'cola']]

# TfidfVectorizer 생성하기
reference : https://wikidocs.net/31698

In [5]:
class TfidfVectorizer:
  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
  
  def fit(self, sequences):
    tokenized = self.tokenizer.fit_transform(sequences)  # 이중리스트
    '''
    문제 2-1.
    조건 1: IDF 행렬은 list 형태입니다.
    조건 2: IDF 값은 아래 식을 이용해 구합니다.
    - IDF = (특정단어 t가 등장한 문서의 수) 에 반비례하는 수
    조건 3: 입력된 문장의 토큰화에는 문제 1에서 만든 Tokenizer를 사용합니다.
    '''

    from math import log

    # 정수인덱싱된 문장을 어휘사전으로 만들기
    vocab = list(set(sum(tokenized, [])))  # [1,2,3,4,...]
    vocab.sort()  # 정렬

    # 총 문서의 수
    N = len(tokenized)

    # 특정 단어 t가 등장한 문서의 수(IDF)
    self.result_idf = []
    for t in vocab :      # 어휘사전에서 한단어씩 꺼내기 e.g) 1 or 2 or 3....
      df = 0
      for one in tokenized:  # 전체에서 한 문장씩 꺼낸다. e.g) [1,2,3,4]
        df += (t in one)   # 한 문장에 해당 단어가 들어가 있다. -> True | False   # 전체문서에서 특정단어 t가 얼만큼 등장?
      
      self.result_idf.append(log(N/(1+df)))  # 해당 단어(t)에 대한 idf 값을 리스트에 추가

    self.fit_checker = True
    

  def transform(self, sequences):
    if self.fit_checker:
      tokenized = self.tokenizer.transform(sequences)
      '''
      문제 2-2.
      조건1 : 입력 문장을 이용해 TF 행렬을 만드세요.
      - tf(d, t) : 문장 d에 단어 t가 나타난 횟수
      조건2 : 문제 2-1( fit())에서 만든 IDF 행렬과 아래 식을 이용해 TF-IDF 행렬을 만드세요
      - tf-idf(d,t) = tf(d,t) * idf(d,t)
      '''
      # 정수인덱싱된 문장을 어휘사전으로 만들기
      vocab = list(set(sum(tokenized, [])))
      vocab.sort()  # 정렬    

      # 총 문서의 수
      N = len(tokenized)
      
      # 한 문서안에 특정단어 t가 몇번 들어가는지 (TF)
      self.result_tf = []
      for d in tokenized:  # 문장 한개를 꺼낸다.
        self.result_tf.append([])
        for t in vocab:  # 어휘사전에서 단어 한개 꺼낸다.
          self.result_tf[-1].append(d.count(t))  # 중첩리스트

      # tfidf 구하기
      self.tfidf_matrix = []
      for tf in self.result_tf :  # TF행렬 하나씩 꺼내기 (문서 1개씩 꺼내기)
        matrix = []
        for i in range(len(vocab)):
          matrix.append(tf[i]*self.result_idf[i])
        self.tfidf_matrix.append(matrix)
      return self.tfidf_matrix

    else:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

  
  def fit_transform(self, sequences):
    self.fit(sequences)
    return self.transform(sequences)  # 중첩리스트

In [6]:
# Check 1) small Korean corpus of four documents; the third repeats one word twice.
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]

In [7]:
# Distinct whitespace-separated words across all documents (set order is arbitrary).
vocab = list({word for doc in docs for word in doc.split()})
vocab

['사과', '과일이', '좋아요', '노란', '길고', '싶은', '바나나', '저는', '먹고']

In [8]:
# Build a fresh tokenizer and vectorizer, then compute the TF-IDF matrix of the corpus.
tokenizer = Tokenizer()
tf_idf = TfidfVectorizer(tokenizer)
result = tf_idf.fit_transform(docs)
result

[[0.0,
  0.28768207245178085,
  0.28768207245178085,
  0.6931471805599453,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.28768207245178085,
  0.28768207245178085,
  0.28768207245178085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.5753641449035617,
  0.0,
  0.0,
  0.0,
  0.6931471805599453,
  0.6931471805599453,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.6931471805599453,
  0.6931471805599453,
  0.6931471805599453]]

In [9]:
# Re-encode the corpus and collect the distinct word indices to use as column labels.
li = tokenizer.fit_transform(docs)
print(li)
# sorted() guarantees ascending order (the vectorizer's column order) instead of
# relying on the iteration order of a set.
li2 = sorted({index for row in li for index in row})
print(li2)

[[2, 3, 4], [2, 3, 1], [5, 6, 1, 1], [7, 8, 9]]
[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [10]:
# Invert the tokenizer vocabulary: integer index -> word.
dic = {index: word for word, index in tokenizer.word_dict.items()}
dic

{0: 'oov',
 1: '바나나',
 2: '먹고',
 3: '싶은',
 4: '사과',
 5: '길고',
 6: '노란',
 7: '저는',
 8: '과일이',
 9: '좋아요'}

In [11]:
# Words in the order they were inserted into dic, beginning with the 'oov' placeholder.
dic.values()

dict_values(['oov', '바나나', '먹고', '싶은', '사과', '길고', '노란', '저는', '과일이', '좋아요'])

In [12]:
import pandas as pd 

# Wrap the TF-IDF matrix in a DataFrame whose columns are labelled with the word indices.
tfidf_ = pd.DataFrame(result, columns = li2)
tfidf_

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.0,0.287682,0.287682,0.693147,0.0,0.0,0.0,0.0,0.0
1,0.287682,0.287682,0.287682,0.0,0.0,0.0,0.0,0.0,0.0
2,0.575364,0.0,0.0,0.0,0.693147,0.693147,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147,0.693147


In [13]:
# Rename the columns from word indices to the words themselves;
# [1:] skips the first entry of dic ('oov'), which has no column.
tfidf_.columns =list(dic.values())[1:]

In [14]:
# Reorder the columns alphabetically by word; computing the order avoids a
# hard-coded list that would break as soon as the corpus changes.
tfidf_ = tfidf_[sorted(tfidf_.columns)]

In [15]:
# Display the reordered TF-IDF table.
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [16]:
# Check 2) English corpus of five documents.
# NOTE(review): `input` shadows the builtin input(); the name is kept because a
# later cell reuses it.
input = [
    'this is the first document',
    'the first document is this',
    'this is the second document',
    'and this is the third document',
    'is this the first document'
]

# Build a fresh tokenizer and vectorizer, then compute the TF-IDF matrix.
tokenizer = Tokenizer()
tf_idf = TfidfVectorizer(tokenizer)
result = tf_idf.fit_transform(input)
result

[[-0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  0.22314355131420976,
  0.0,
  0.0,
  0.0],
 [-0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  0.22314355131420976,
  0.0,
  0.0,
  0.0],
 [-0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  0.0,
  0.9162907318741551,
  0.0,
  0.0],
 [-0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  0.0,
  0.0,
  0.9162907318741551,
  0.9162907318741551],
 [-0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  -0.1823215567939546,
  0.22314355131420976,
  0.0,
  0.0,
  0.0]]

In [17]:
# Re-encode the corpus and collect the distinct word indices to use as column labels.
li = tokenizer.fit_transform(input)
print(li)
# sorted() guarantees ascending order (the vectorizer's column order) instead of
# relying on the iteration order of a set.
li2 = sorted({index for row in li for index in row})
print(li2)

[[1, 2, 3, 5, 4], [3, 5, 4, 2, 1], [1, 2, 3, 6, 4], [7, 1, 2, 3, 8, 4], [2, 1, 3, 5, 4]]
[1, 2, 3, 4, 5, 6, 7, 8]


In [18]:
# Invert the tokenizer vocabulary: integer index -> word.
dic = {index: word for word, index in tokenizer.word_dict.items()}
dic

{0: 'oov',
 1: 'this',
 2: 'is',
 3: 'the',
 4: 'document',
 5: 'first',
 6: 'second',
 7: 'and',
 8: 'third'}

In [19]:
# Words in the order they were inserted into dic, beginning with the 'oov' placeholder.
dic.values()

dict_values(['oov', 'this', 'is', 'the', 'document', 'first', 'second', 'and', 'third'])

In [20]:
import pandas as pd 

# Wrap the TF-IDF matrix in a DataFrame whose columns are labelled with the word indices.
tfidf_ = pd.DataFrame(result, columns = li2)
tfidf_

Unnamed: 0,1,2,3,4,5,6,7,8
0,-0.182322,-0.182322,-0.182322,-0.182322,0.223144,0.0,0.0,0.0
1,-0.182322,-0.182322,-0.182322,-0.182322,0.223144,0.0,0.0,0.0
2,-0.182322,-0.182322,-0.182322,-0.182322,0.0,0.916291,0.0,0.0
3,-0.182322,-0.182322,-0.182322,-0.182322,0.0,0.0,0.916291,0.916291
4,-0.182322,-0.182322,-0.182322,-0.182322,0.223144,0.0,0.0,0.0


In [22]:
# Rename the columns from word indices to the words themselves;
# [1:] skips the first entry of dic ('oov'), which has no column.
tfidf_.columns =list(dic.values())[1:]
tfidf_

Unnamed: 0,this,is,the,document,first,second,and,third
0,-0.182322,-0.182322,-0.182322,-0.182322,0.223144,0.0,0.0,0.0
1,-0.182322,-0.182322,-0.182322,-0.182322,0.223144,0.0,0.0,0.0
2,-0.182322,-0.182322,-0.182322,-0.182322,0.0,0.916291,0.0,0.0
3,-0.182322,-0.182322,-0.182322,-0.182322,0.0,0.0,0.916291,0.916291
4,-0.182322,-0.182322,-0.182322,-0.182322,0.223144,0.0,0.0,0.0
