# **원티드 프리온보딩 AI/ML 코스 사전과제**

### **문제 1) Tokenizer 생성하기**

In [1]:
class Tokenizer():
  """Whitespace tokenizer that maps cleaned, lower-cased words to integer ids.

  Instance fields:
    word_dict: word -> id mapping. 'oov' is pre-registered with id 0 and is the
      id returned by transform() for any word that is not in the mapping.
    fit_checker: True once fit() has completed; transform() refuses to run
      while it is False.
  """

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Clean and split each sentence into a list of word tokens (problem 1-1).

    Every character other than ASCII letters, digits and the space character
    is removed, the remainder is lower-cased and then split on whitespace.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A list with one list of tokens per input sentence.
    """
    # Local import: the file has no top-level import section to rely on.
    import re

    # Compile once per call instead of handing the pattern to re.sub per sentence.
    disallowed = re.compile('[^0-9a-zA-Z ]')
    return [disallowed.sub('', sent).lower().split() for sent in sequences]

  def fit(self, sequences):
    """Add every unseen word in `sequences` to `word_dict` (problem 1-2).

    Ids are handed out in order of first appearance. Calling fit() again keeps
    the existing entries and continues numbering after them.

    Args:
      sequences: iterable of sentence strings.
    """
    self.fit_checker = False
    for tokens in self.preprocessing(sequences):
      for word in tokens:
        if word not in self.word_dict:
          # The next id is the current vocabulary size. Restarting a counter
          # at 0 on every call would give new words ids that are already
          # taken when fit() is called more than once.
          self.word_dict[word] = len(self.word_dict)
    self.fit_checker = True

  def transform(self, sequences):
    """Convert sentences to lists of word ids (problem 1-3).

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A list with one list of ids per sentence; words missing from
      `word_dict` are mapped to the id stored under 'oov'.

    Raises:
      Exception: if fit() has not completed on this instance.
    """
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    oov_id = self.word_dict['oov']
    # dict.get gives the OOV fallback without a bare `except`, which would
    # also hide unrelated errors.
    return [
      [self.word_dict.get(word, oov_id) for word in tokens]
      for tokens in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Run fit() on `sequences` and return transform() of the same input."""
    self.fit(sequences)
    return self.transform(sequences)

In [2]:
# Smoke test for Tokenizer: run each stage on a tiny corpus and print the result.
tokenizer = Tokenizer()
# NOTE(review): `input` shadows the builtin of the same name; the name is kept
# because the TfidfVectorizer test further down reads it.
input = ['I go to school.', 'I LIKE pizza!']
# The fitted sentences plus one sentence containing words fit() never sees.
input2 = [*input, 'I want to be a DS!!']

# 1-1: cleaning and splitting only; nothing is learned here.
print("===1-1. preprocessing()===")
print(tokenizer.preprocessing(input))

print("\n")

# 1-2: build the vocabulary and show the resulting word -> id mapping.
print("===1-2. fit()===")
tokenizer.fit(input)
print(tokenizer.word_dict)

print("\n")

# 1-3: encode the sentences; unseen words are expected to come out as id 0.
print("===1-3. transform()===")
print(tokenizer.transform(input2))

===1-1. preprocessing()===
[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]


===1-2. fit()===
{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}


===1-3. transform()===
[[1, 2, 3, 4], [1, 5, 6], [1, 0, 3, 0, 0, 0]]


### **문제 2) TfidfVectorizer 생성하기**

In [3]:
class TfidfVectorizer:
  """TF-IDF vectorizer built on top of a tokenizer that yields token-id lists.

  The tokenizer must provide `fit_transform(sequences)` and
  `transform(sequences)`, each returning one list of hashable tokens per
  sentence.

  Instance fields:
    tokenizer: the tokenizer passed to the constructor.
    fit_checker: True once fit() has completed.
    vocab: tokens seen during fit(), in order of first appearance; this is the
      column order of every matrix below (set by fit()).
    idf_matrix: list with one IDF value per entry of `vocab` (set by fit()).
    tf_matrix: raw term counts, one row per sentence (set by transform()).
    tfidf_matrix: tf * idf, one row per sentence (set by transform()).
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False

  def fit(self, sequences):
    """Fit the tokenizer and compute the IDF of every token (problem 2-1).

    idf(t) = log(n / (1 + df(t))), where n is the number of sentences and
    df(t) is the number of sentences that contain token t.

    Args:
      sequences: iterable of sentence strings, passed to the tokenizer.
    """
    from math import log

    tokenized = self.tokenizer.fit_transform(sequences)
    n = len(tokenized)  # total number of sentences

    # One set per sentence makes each document-frequency membership test O(1).
    doc_token_sets = [set(tokens) for tokens in tokenized]

    # Store the vocabulary so transform() uses the same columns. First-seen
    # order is deterministic and does not require tokens to be sortable.
    self.vocab = list(dict.fromkeys(
      token for tokens in tokenized for token in tokens
    ))

    self.idf_matrix = [
      log(n / (1 + sum(term in doc for doc in doc_token_sets)))
      for term in self.vocab
    ]

    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF matrix of `sequences` (problem 2-2).

    Columns follow `vocab` as stored by fit(), so each term count is
    multiplied by the IDF of the same token. Tokens that are not in `vocab`
    get no column and do not contribute.

    Args:
      sequences: iterable of sentence strings, passed to the tokenizer.

    Returns:
      A list of rows (one per sentence), each a list of len(vocab) floats.
      The same list is also stored in `self.tfidf_matrix`.

    Raises:
      Exception: if fit() has not completed on this instance.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    from collections import Counter

    tokenized = self.tokenizer.transform(sequences)
    self.tf_matrix = []
    self.tfidf_matrix = []

    for tokens in tokenized:
      # Count each sentence once instead of calling list.count per term.
      counts = Counter(tokens)
      tf_row = [counts[term] for term in self.vocab]
      self.tf_matrix.append(tf_row)
      self.tfidf_matrix.append(
        [tf * idf for tf, idf in zip(tf_row, self.idf_matrix)]
      )

    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Run fit() on `sequences` and return transform() of the same input."""
    self.fit(sequences)
    return self.transform(sequences)

In [4]:
# Smoke test for TfidfVectorizer; reuses `tokenizer` and `input` from the Tokenizer test above.
tfidf = TfidfVectorizer(tokenizer)

# 2-1: fit() stores one IDF value per vocabulary entry in `idf_matrix`.
print("===2-1. fit()===")
tfidf.fit(input)
print(tfidf.idf_matrix)
print("\n")

# 2-2: transform() also stores its result on the instance; the return value is not used here.
print("===2-2. transform()===")
tfidf.transform(input)
print(tfidf.tfidf_matrix)


===2-1. fit()===
[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]


===2-2. transform()===
[[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0], [-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]
