In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer # frequency based DTM
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf based DTM

In [2]:
def display_features(features, feature_names):
    """Print a feature matrix as a labelled table.

    Parameters
    ----------
    features : 2-D array-like
        One row per document and one column per feature.
    feature_names : sequence of str
        Column labels, in the same order as the columns of ``features``.

    Returns
    -------
    None
        The table is written to standard output only.
    """
    table = pd.DataFrame(features, columns=feature_names)
    print(table)

In [3]:
def tf_extractor(corpus, min_df=1, ngram_range=(1, 1)):
    """Build a frequency-based (raw count) document-term matrix (DTM).

    Parameters
    ----------
    corpus : iterable of str
        The documents to vectorize, one string per document.
    min_df : int or float, default=1
        Minimum document-frequency threshold handed to ``CountVectorizer``.
        With ``min_df=1`` every word used in at least one document is kept.
    ngram_range : tuple of (int, int), default=(1, 1)
        Smallest and largest n-gram size to extract. ``(1, 1)`` keeps
        unigrams only; ``(1, 3)`` also considers bigrams and trigrams.

    Returns
    -------
    vectorizer : CountVectorizer
        The vectorizer after fitting on ``corpus``.
    features : sparse matrix
        Term counts with one row per document and one column per term.
    """
    vectorizer = CountVectorizer(min_df=min_df, ngram_range=ngram_range)
    # Learn the vocabulary and turn the texts into a frequency matrix in one pass.
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [4]:
def tfidf_extractor(corpus, min_df=1, norm='l2', smooth_idf=True,
                    use_idf=True, ngram_range=(1, 1)):
    """Build a tf-idf based document-term matrix (DTM).

    Every keyword argument is forwarded unchanged to ``TfidfVectorizer``;
    the defaults reproduce the configuration this notebook has always used.

    Parameters
    ----------
    corpus : iterable of str
        The documents to vectorize, one string per document.
    min_df : int or float, default=1
        Minimum document-frequency threshold; ``1`` keeps every word used
        in at least one document.
    norm : str or None, default='l2'
        Normalization applied to each document vector.
    smooth_idf : bool, default=True
        Whether to smooth the idf weights.
    use_idf : bool, default=True
        Whether to apply inverse-document-frequency reweighting.
    ngram_range : tuple of (int, int), default=(1, 1)
        Smallest and largest n-gram size to extract; ``(1, 1)`` keeps
        unigrams only.

    Returns
    -------
    vectorizer : TfidfVectorizer
        The vectorizer after fitting on ``corpus``.
    features : sparse matrix
        tf-idf weights with one row per document and one column per term.
    """
    vectorizer = TfidfVectorizer(min_df=min_df,
                                 norm=norm,
                                 smooth_idf=smooth_idf,
                                 use_idf=use_idf,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [5]:
# Toy corpus of four short documents; each list element is one document.
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is so blue',
    'i love blue cheese',
]

In [6]:
# Fit the count vectorizer on CORPUS and keep both the fitted vectorizer
# and the resulting frequency-based document-term matrix.
tf_vectorizer, tf_features = tf_extractor(CORPUS)
# Remember what type of input data is provided:
# CORPUS is a list of sentences (one string per document).

# Printing the sparse matrix lists only its non-zero entries, one per line,
# in the form "(document index, term index)  count" -- indices, not the words.
print(tf_features)

  (0, 2)	1
  (0, 4)	1
  (0, 6)	1
  (0, 8)	1
  (1, 1)	1
  (1, 0)	1
  (1, 2)	1
  (1, 4)	2
  (1, 6)	2
  (2, 7)	1
  (2, 1)	1
  (2, 2)	1
  (2, 4)	1
  (2, 6)	1
  (2, 8)	1
  (3, 3)	1
  (3, 5)	1
  (3, 2)	1


 (0,2)   1 -> index 2에 해당하는 단어가 0번 문서에서 1번 사용되었다는 뜻

In [12]:
# Expand the sparse DTM into a dense one so the zero counts are visible too.
features = tf_features.todense()
features  # NOTE: todense() yields a numpy `matrix` (see the output below), not a plain ndarray

matrix([[0, 0, 1, 0, 1, 0, 1, 0, 1],
        [1, 1, 1, 0, 2, 0, 2, 0, 0],
        [0, 1, 1, 0, 1, 0, 1, 1, 1],
        [0, 0, 1, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version provides it and
# convert its array result to a plain list so the displayed value keeps the
# same form; fall back to the legacy method on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tf_vectorizer.get_feature_names()
feature_names

['and', 'beautiful', 'blue', 'cheese', 'is', 'love', 'sky', 'so', 'the']

'and' -> 0
'beautiful' -> 1

In [15]:
# Show the count-based DTM as a table: one row per document, one column per term.
display_features(features, feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   1     0    1   0    1
1    1          1     1       0   2     0    2   0    0
2    0          1     1       0   1     0    1   1    1
3    0          0     1       1   0     1    0   0    0


In [18]:
# Fit the tf-idf vectorizer on the same corpus.
# NOTE(review): `tdidf_features` is a misspelling of "tfidf"; the name is kept
# as-is here in case other cells refer to it -- rename everywhere in one go.
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available, else fall back to the old API.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
# Round the weights to two decimals purely to keep the printed table readable.
display_features(np.round(tdidf_features.todense(), 2), feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
