## 문제 1) Tokenizer 생성하기

In [None]:
import re

In [None]:
class Tokenizer():
    """Word-level tokenizer that maps lower-cased words to integer indices.

    Attributes:
        word_dict: Mapping from word to index. Index 0 is stored under the
            key 'oov' and is used for words that are not in the mapping.
        fit_checker: True once ``fit`` has completed.
    """

    def __init__(self):
        self.fit_checker = False
        # The 'oov' entry always occupies index 0.
        self.word_dict = {'oov': 0}

    def preprocessing(self, sequences):
        """Clean and split every sentence (task 1-1).

        Every character other than ASCII letters, digits and the space
        character is removed; the remainder is lower-cased and split on
        whitespace.

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A list with one list of word strings per sentence.
        """
        return [
            re.sub(r"[^a-zA-Z0-9 ]", "", sentence).lower().split()
            for sentence in sequences
        ]

    def fit(self, sequences):
        """Add the words of ``sequences`` to the vocabulary (task 1-2).

        Existing entries keep their index; each unseen word receives the
        next free index, in order of first appearance.

        Args:
            sequences: Iterable of sentence strings.
        """
        self.fit_checker = False

        vocabulary = self.word_dict
        for words in self.preprocessing(sequences):
            for word in words:
                # len(vocabulary) is evaluated before a possible insertion,
                # so a new word gets the next unused index.
                vocabulary.setdefault(word, len(vocabulary))

        self.fit_checker = True

    def transform(self, sequences):
        """Encode sentences as lists of word indices (task 1-3).

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A list with one list of indices per sentence; words missing
            from the vocabulary are encoded with the 'oov' index.

        Raises:
            Exception: If the tokenizer has not been fitted.
        """
        tokenized = self.preprocessing(sequences)
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        lookup = self.word_dict
        return [
            [lookup[word] if word in lookup else lookup["oov"] for word in words]
            for words in tokenized
        ]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their encoded form."""
        self.fit(sequences)
        return self.transform(sequences)

In [None]:
# Sample corpora: docs1 is used to fit the tokenizer below; docs2 contains
# several words that do not occur in docs1.
docs1 = ['I go to school.', 'I LIKE pizza!']
docs2 = ["She will go to home home home", "He will also go to school home"]

In [None]:
# Fresh, unfitted tokenizer; its vocabulary holds only the 'oov' entry.
token = Tokenizer()

In [None]:
# Calling transform() before fit() raises an Exception (not fitted yet).
token.transform(docs1)

Exception: ignored

In [None]:
# Build the vocabulary from docs1.
token.fit(docs1)

In [None]:
# Inspect the learned word -> index mapping.
token.word_dict

{'go': 2, 'i': 1, 'like': 5, 'oov': 0, 'pizza': 6, 'school': 4, 'to': 3}

In [None]:
# Every word of docs1 is in the vocabulary, so no 0 ('oov') index appears.
token.transform(docs1)

[[1, 2, 3, 4], [1, 5, 6]]

In [None]:
# Words that were not seen during fit() are encoded as 0 ('oov').
token.transform(docs2)

[[0, 0, 2, 3, 0, 0, 0], [0, 0, 0, 2, 3, 4, 0]]

In [None]:
# fit_transform() first adds docs2's new words to the existing vocabulary,
# then encodes docs2 with the extended vocabulary.
token.fit_transform(docs2)

[[7, 8, 2, 3, 9, 9, 9], [10, 8, 11, 2, 3, 4, 9]]

In [None]:
# The vocabulary now also contains the words introduced by docs2.
token.word_dict

{'also': 11,
 'go': 2,
 'he': 10,
 'home': 9,
 'i': 1,
 'like': 5,
 'oov': 0,
 'pizza': 6,
 'school': 4,
 'she': 7,
 'to': 3,
 'will': 8}

## 문제 2) TfidfVectorizer 생성하기

In [None]:
from collections import Counter
from math import log

In [None]:
class TfidfVectorizer:
    """TF-IDF vectorizer built on top of a tokenizer object.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)`` returning one list of token ids per sentence,
    and a ``word_dict`` mapping whose values are the token ids. Token id 0
    (the tokenizer's "oov" slot) never gets a column.

    Attributes:
        tokenizer: The wrapped tokenizer.
        fit_checker: True once ``fit`` has completed successfully.
        vocab: Sorted token ids, one per matrix column (set by ``fit``).
        idf: ``log(N / (df + 1))`` for each id in ``vocab`` (set by ``fit``).
        tfidf_matrix: Result of the most recent ``transform`` call.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute the IDF of every token (task 2-1).

        ``idf(t) = log(N / (df(t) + 1))`` where N is the number of sentences
        in ``sequences`` and df(t) is the number of those sentences that
        contain token t. The value is negative when df(t) + 1 > N.

        State is only replaced after the whole computation has succeeded, so
        a failing call leaves a previously fitted vectorizer usable.

        Args:
            sequences: Sentences accepted by the tokenizer.

        Raises:
            ValueError: If ``sequences`` is empty while the tokenizer's
                vocabulary is not (the IDF would be ``log(0)``).
        """
        tokenized = self.tokenizer.fit_transform(sequences)

        # Token id 0 is the tokenizer's "oov" entry and is excluded.
        vocab = sorted(
            {token for token in self.tokenizer.word_dict.values() if token != 0}
        )

        n_docs = len(tokenized)  # total number of input sentences
        if vocab and n_docs == 0:
            raise ValueError("Cannot fit TfidfVectorizer on an empty corpus.")

        # One set per sentence makes each membership test O(1) instead of a
        # scan over the sentence's token list.
        doc_token_sets = [set(doc) for doc in tokenized]
        idf = []
        for token in vocab:
            # Number of sentences that contain the token.
            df = sum(token in tokens for tokens in doc_token_sets)
            idf.append(log(n_docs / (df + 1)))

        self.vocab = vocab
        self.idf = idf
        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences`` (task 2-2).

        Args:
            sequences: Sentences accepted by the tokenizer.

        Returns:
            A list with one row per sentence; each row holds
            ``tf(t, sentence) * idf(t)`` for every token id in ``vocab``,
            where tf is the number of occurrences of t in the sentence. The
            same list is stored in ``self.tfidf_matrix``.

        Raises:
            Exception: If the vectorizer has not been fitted.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)

        matrix = []
        for doc in tokenized:
            # Count every token once per sentence rather than calling
            # list.count for each vocabulary entry.
            term_counts = Counter(doc)
            matrix.append(
                [term_counts[token] * weight
                 for token, weight in zip(self.vocab, self.idf)]
            )

        self.tfidf_matrix = matrix
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)

In [None]:
# Sample corpora for the TF-IDF example: docs1 is fitted first, docs2 later.
docs1 = ['I go to school.', 'I LIKE pizza!']
docs2 = ["She will go to home home home", "He will also go to school home"]

In [None]:
# Separate, unfitted tokenizer instance for the TF-IDF example.
new_token = Tokenizer()

In [None]:
# The vectorizer keeps a reference to new_token and fits it inside fit().
tfidf = TfidfVectorizer(new_token)

In [None]:
# Calling transform() before fit() raises an Exception (not fitted yet).
tfidf.transform(docs1)

Exception: ignored

In [None]:
# Fit the tokenizer on docs1 and compute one IDF value per vocabulary word.
tfidf.fit(docs1)

In [None]:
# 'i' occurs in both sentences, so its idf is log(2 / (2 + 1)) < 0; every
# other word occurs in one sentence, so its idf is log(2 / (1 + 1)) = 0.
tfidf.transform(docs1)

[[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0],
 [-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [None]:
# IDF values, ordered by ascending token index.
tfidf.idf

[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]

In [None]:
# The only docs2 words known from docs1 ('go', 'to', 'school') have idf 0,
# so every score is zero; -0.0 is 0 occurrences times the negative idf of 'i'.
tfidf.transform(docs2)

[[-0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [-0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [None]:
# Refit on docs2 (this also extends new_token's vocabulary) and compute the
# TF-IDF rows for docs2.
data = tfidf.fit_transform(docs2)
# Column labels: vocabulary words without "oov". word_dict's insertion order
# equals ascending index order, which is the column order of the matrix.
columns = [c for c in new_token.word_dict.keys() if c != "oov"]

In [None]:
import pandas as pd
# Show the TF-IDF matrix as a table: one row per sentence, one column per word.
pd.DataFrame(data, columns=columns, index=docs2)

Unnamed: 0,i,go,to,school,like,pizza,she,will,home,he,also
She will go to home home home,0.0,-0.405465,-0.405465,0.0,0.0,0.0,0.0,-0.405465,-1.216395,0.0,0.0
He will also go to school home,0.0,-0.405465,-0.405465,0.0,0.0,0.0,0.0,-0.405465,-0.405465,0.0,0.0
