### TF-IDF

In [1]:
import utils

In [2]:
fu = utils.FileUtil()

In [3]:
corpus = [
    'Natural language processing (NLP) is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages.',
    'Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation.',
    'The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.'
]

In [4]:
def pre_processing(doc):
    """
    return tokens
    """
    words = fu.get_words(doc, lower_case=True, remove_stop_words=True)
    return words

#### Use Counter

In [5]:
import math
import pandas as pd
from collections import Counter, defaultdict

In [6]:
def compute_tf(word, word_count):    
    return word_count[word]/sum(word_count.values())

In [7]:
def compute_idf(word, all_docs):
    df = 0
    for doc in all_docs:
        if word in doc:
            df += 1
    idf = math.log(len(all_docs) / df) + 1
    return idf

In [8]:
tf_idf = defaultdict(lambda: defaultdict(float))
processed_corpus = [pre_processing(doc) for doc in corpus]
all_words = set([word for doc in processed_corpus for word in doc])

for i, doc in enumerate(processed_corpus):
    for word in all_words:
        tf_idf_value = compute_tf(word, Counter(doc)) * compute_idf(word, processed_corpus)
        tf_idf[word][i] = tf_idf_value

In [9]:
result = pd.DataFrame.from_dict(tf_idf)
result

Unnamed: 0,language,work,history,information,concerned,computer,subfield,science,found,speech,...,computers,recognition,engineering,artificial,processing,challenges,human,frequently,intelligence,generally
0,0.058824,0.0,0.0,0.123448,0.123448,0.123448,0.123448,0.123448,0.0,0.0,...,0.123448,0.0,0.123448,0.123448,0.058824,0.0,0.123448,0.0,0.123448,0.0
1,0.214286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149901,...,0.0,0.149901,0.0,0.0,0.071429,0.149901,0.0,0.149901,0.0,0.0
2,0.083333,0.174884,0.174884,0.0,0.0,0.0,0.0,0.0,0.174884,0.0,...,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.174884


#### Use TfidfVectorizer from scikit-learn

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
vectorizer = TfidfVectorizer(smooth_idf=False)
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['1950s', 'although', 'and', 'artificial', 'be', 'between', 'can', 'challenges', 'computer', 'computers', 'concerned', 'earlier', 'engineering', 'found', 'frequently', 'from', 'generally', 'generation', 'history', 'human', 'in', 'information', 'intelligence', 'interactions', 'involve', 'is', 'language', 'languages', 'natural', 'nlp', 'of', 'periods', 'processing', 'recognition', 'science', 'speech', 'started', 'subfield', 'the', 'understanding', 'with', 'work']


In [12]:
X.shape

(3, 42)

In [13]:
vectorizer.fit_transform(corpus).toarray()

array([[0.        , 0.        , 0.29910492, 0.22330873, 0.        ,
        0.22330873, 0.        , 0.        , 0.22330873, 0.22330873,
        0.22330873, 0.        , 0.22330873, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.22330873,
        0.        , 0.22330873, 0.22330873, 0.22330873, 0.        ,
        0.22330873, 0.10640781, 0.22330873, 0.21281562, 0.22330873,
        0.14955246, 0.        , 0.10640781, 0.        , 0.22330873,
        0.        , 0.        , 0.22330873, 0.14955246, 0.        ,
        0.22330873, 0.        ],
       [0.        , 0.        , 0.19165059, 0.        , 0.        ,
        0.        , 0.        , 0.28616881, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.28616881,
        0.        , 0.        , 0.28616881, 0.        , 0.        ,
        0.19165059, 0.        , 0.        , 0.        , 0.28616881,
        0.        , 0.40908292, 0.        , 0.40908292, 0.        ,
        0.     