<a href="https://colab.research.google.com/github/rhapis97/Practice_AI/blob/main/210708_NLP_BoW%2BTF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bag of Words(BoW)
단어의 등장 순서를 고려하지 않는 **빈도수 기반**의 단어 표현 방법

1. 각 단어에 고유한 정수 인덱스 부여
2. 각 인덱스 위치에 단어 토큰의 등장 횟수를 기록한 벡터를 만든다.

doc1 = 'John likes to watch movies. Mary likes movies too.'  
Bow1 = {"John":1, 'likes':2, 'to':1, 'watch':1, 'movies':2, 'Mary':1, 'too':1}

In [1]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.5MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/98/88/f817ef1af6f794e8f11313dcd1549de833f4599abcec82746ab5ed086686/JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 45.3MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 10.8MB/s 
Installing collected packag

In [2]:
from konlpy.tag import Okt
import re
okt = Okt()

In [3]:
# token = re.sub("(\.)", "", "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.")
token = re.sub("(\.)", "", "소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.")
# 정규표현식을 통해 온점을 제거하는 정제 작업이다.

token = okt.morphs(token)
# OKT형태소 분석기를 통해 토큰화 작업을 수행한 뒤에 token에다가 넣는다.

word2index = {}
bow = []
for voca in token:
  if voca not in word2index.keys():
    word2index[voca] = len(word2index)
    # token을 읽으면서, word2index에 없는(not in) 단어는 새로 추가하고, 이미 있는 단어는 넘긴다~
    bow.insert(len(word2index)-1, 1)
    # bow전체에 전부 기본값 1을 넣어준다. 단어의 개수는 최소 1개 이상이기 때문
  else:
    index = word2index.get(voca)
    # 재등장하는 단어의 인덱스를 받아오기
    bow[index] = bow[index]+1
    # 재등장하는 단어는 해당하는 인덱스의 위치에 1을 더해줌(단어 개수 세는 것)
    
print(word2index)    # 단어의 인덱스 반환

{'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}


In [4]:
bow    # bow 빈도수 반환

[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]

## Tensorflow의 Keras Tokenizer를 활용한 BoW

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

sentence = ["John likes to watch movies. Mary likes movies too! Mary also likes to watch football games."]

In [6]:
def print_bow(sentence):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(sentence)    # 단어장 생성
  bow = dict(tokenizer.word_counts)   # 각 단어와 각 단어의 빈도를 bow에 저장

  print('Bag of words:', bow)    # bow 출력
  print('단어장(vocabulary)의 크기:', len(tokenizer.word_counts))    # 중복을 제거한 단어들의 개수

In [7]:
print_bow(sentence)

Bag of words: {'john': 1, 'likes': 3, 'to': 2, 'watch': 2, 'movies': 2, 'mary': 2, 'too': 1, 'also': 1, 'football': 1, 'games': 1}
단어장(vocabulary)의 크기: 10


## scikit-learn CountVectorizer를 활용한 BoW

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
sentence = ["John likes to watch movies. Mary likes movies too! Mary also likes to watch football games."]

vector = CountVectorizer()

print('Bag of words:', vector.fit_transform(sentence).toarray())    # 코퍼스로부터 각 단어의 빈도수를 기록
print('각 단어의 인덱스:', vector.vocabulary_)    # 각 단어의 인덱스가 어떻게 부여되는지를 보여줌

Bag of words: [[1 1 1 1 3 2 2 2 1 2]]
각 단어의 인덱스: {'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


## 불용어를 제거한 BoW 만들기

### 사용자가 직접 정의한 불용어 사용

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=['the', 'a', 'an' ,'is', 'not'])
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


### CountVectorizer에서 제공하는 자체 불용어 사용

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words='english')
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


### NLTK에서 지원하는 불용어 사용

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

text = ["Family is not an important thing. It's everything."]
sw = stopwords.words('english')
vect = CountVectorizer(stop_words=sw)
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


## DTW(Document-Term Matrix)
다수의 문서에서 등장하는 각 단어들의 빈도를 행렬로 표현한 것  
다수의 문서에 대해서 BoW를 하나의 행렬로 표현하고 부르는 용어

문서 1: I like dog  
문서 2: I like cat  
문서 3: I like cat I like cat

In [13]:
import pandas as pd
content = [[0, 1, 1, 1], [1, 0, 1, 1], [2, 0, 2, 2]]
df = pd.DataFrame(content)
df.index = ['(문서1) I like dog','(문서2) I like cat','(문서3) I like cat I like cat']
df.columns = ['cat','dog','I','like']
df

Unnamed: 0,cat,dog,I,like
(문서1) I like dog,0,1,1,1
(문서2) I like cat,1,0,1,1
(문서3) I like cat I like cat,2,0,2,2


In [14]:
import numpy as np
from numpy import dot
from numpy.linalg import norm    # norm: 벡터 크기 또는 길이를 측정하는 방법

doc1 = np.array([0,1,1,1])
doc2 = np.array([1,0,1,1])
doc3 = np.array([2,0,2,2])

def cos_sim(A,B):
  return dot(A,B)/(norm(A)*norm(B))

In [15]:
print(cos_sim(doc1, doc2))
print(cos_sim(doc1, doc3))
print(cos_sim(doc2, doc3))    # 코사인 유사도는 0~1사이의 값을 가지며, 1에 가까울수록 유사도가 높다고 판단

0.6666666666666667
0.6666666666666667
1.0000000000000002


### scikit-learn CountVectorizer 활용한 DTM 구현

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
          'John likes to watch movies',
          'Mary likes movies too',
          'Mary also likes to watch football games',
]

vector = CountVectorizer()
print(vector.fit_transform(corpus).toarray())    # 각 인덱스에 해당하는 단어가 몇 개 들어 있는지
print(vector.vocabulary_)    # 단어 사전

[[0 0 0 1 1 0 1 1 0 1]
 [0 0 0 0 1 1 1 0 1 0]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


한계점
1. 희소표현(sparse representation)이다.
2. 단순 빈도수 기반 접근이다.

## TF-IDF(Term Frequency - Inverse Document Frequency)

모든 문서에서 자주 등장하는 단어는 중요도가 낮다고 판단하고, 특정 문서에서만 자주 등장하는 단어는 중요도가 높다고 판단하는 것

In [17]:
from math import log
import pandas as pd

docs = [
        'John likes to watch movies and Mary likes movies too',
        'James likes to watch TV',
        'Mary also likes to watch football gaes',
]

In [18]:
vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()
print('단어장의 크기:', len(vocab))
print(vocab)

단어장의 크기: 13
['James', 'John', 'Mary', 'TV', 'also', 'and', 'football', 'gaes', 'likes', 'movies', 'to', 'too', 'watch']


In [19]:
N = len(docs)
N

3

In [20]:
def tf(t, d):
  return d.count(t)

def idf(t):
  df = 0
  for doc in docs:
    df += t in doc
  return log(N/(df + 1)) + 1

def tfidf(t, d):
  return tf(t, d) * idf(t)

In [21]:
result = []
for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)):
    t = vocab[j]

    result[-1].append(tf(t,d))

tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,James,John,Mary,TV,also,and,football,gaes,likes,movies,to,too,watch
0,0,1,1,0,0,1,0,0,2,2,2,1,1
1,1,0,0,1,0,0,0,0,1,0,1,0,1
2,0,0,1,0,1,0,1,1,1,0,1,0,1


In [22]:
result = []
for j in range(len(vocab)):
  t = vocab[j]
  result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=['IDF'])
idf_

Unnamed: 0,IDF
James,1.405465
John,1.405465
Mary,1.0
TV,1.405465
also,1.405465
and,1.405465
football,1.405465
gaes,1.405465
likes,0.712318
movies,1.405465


In [23]:
result = []
for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)):
    t = vocab[j]

    result[-1].append(tfidf(t, d))

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,James,John,Mary,TV,also,and,football,gaes,likes,movies,to,too,watch
0,0.0,1.405465,1.0,0.0,0.0,1.405465,0.0,0.0,1.424636,2.81093,1.424636,1.405465,0.712318
1,1.405465,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,0.712318,0.0,0.712318,0.0,0.712318
2,0.0,0.0,1.0,0.0,1.405465,0.0,1.405465,1.405465,0.712318,0.0,0.712318,0.0,0.712318


0: 'John likes to watch movies and Mary likes movies too'  
1: 'James likes to watch TV'  
2: 'Mary also likes to watch football gaes'

### scikit-learn 활용한 TF-IDF 구현

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
          'you know I want your love',
          'I like you',
          'what should I do'
]

vector = CountVectorizer()
print(vector.fit_transform(corpus).toarray())

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]


In [25]:
print(vector.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
          'you know I want your love',
          'I like you',
          'what should I do'
]

tfidfv = TfidfVectorizer().fit(corpus)

In [27]:
print(tfidfv.transform(corpus).toarray())

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]


In [28]:
print(tfidfv.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


## abc뉴스데이터로 TF-IDF

In [29]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [30]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv",
                           filename="/content/abcnews-data-text.csv")

('/content/abcnews-data-text.csv', <http.client.HTTPMessage at 0x7ff13b174290>)

In [32]:
data = pd.read_csv('/content/abcnews-data-text.csv', error_bad_lines=False)

In [33]:
data

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1082163,20170630,when is it ok to compliment a womans smile a g...
1082164,20170630,white house defends trumps tweet
1082165,20170630,winter closes in on tasmania as snow ice falls
1082166,20170630,womens world cup australia wins despite atapat...


In [34]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [35]:
text = data[['headline_text']]

In [36]:
text.nunique()    # 고유값 개수

headline_text    1054983
dtype: int64

In [37]:
text.drop_duplicates(inplace=True)
text = text.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
print(len(text))    # 중복이 제거됨

1054983


## 데이터를 정제 및 정규화

In [39]:
# NLTK 토크나이저를 이용해 토큰화
text['headline_text'] = text.apply(lambda row:nltk.word_tokenize(row['headline_text']), axis=1)

In [40]:
stop_words = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])

In [41]:
text.head()

Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [42]:
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])

In [43]:
# 길이가 1~2인 것 제거
text = text['headline_text'].apply(lambda x:[word for word in x if len(word)>2])

In [44]:
print(text[:5])

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [45]:
detokenized_doc = []
for i in range(len(text)):
  t = ' '.join(text[i])
  detokenized_doc.append(t)

train_data = detokenized_doc

In [46]:
train_data[:5]

['aba decide community broadcast licence',
 'act fire witness must aware defamation',
 'call infrastructure protection summit',
 'air staff aust strike pay rise',
 'air strike affect australian travellers']

In [47]:
c_vectorizer = CountVectorizer(stop_words='english', max_features=5000)    # 상위 5000개만 사용
document_term_matrix = c_vectorizer.fit_transform(train_data)

In [48]:
print('행렬의 크기:', document_term_matrix.shape)

행렬의 크기: (1054983, 5000)


In [49]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tf_idf_matrix = tfidf_vectorizer.fit_transform(train_data)

In [50]:
print('행렬의 크기:', tf_idf_matrix.shape)

행렬의 크기: (1054983, 5000)
