# Text Feature Extraction : TF-IDF Model

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
corpus = "She is a wanderlust", "She is lovely"

## Sklearn TfidfVectorizer

In [3]:
vectorizer = TfidfVectorizer()

In [4]:
# fit: learns the vocabulary and the per-term IDF weights from the corpus
# transform: computes the TF-IDF feature matrix for that corpus
# todense: converts the sparse result into a dense matrix so it can be displayed
vectors = vectorizer.fit_transform(corpus).todense()

In [5]:
vectorizer.idf_

array([1.        , 1.40546511, 1.        , 1.40546511])

In [6]:
vectorizer.get_feature_names()

['is', 'lovely', 'she', 'wanderlust']

In [7]:
vectors

matrix([[0.50154891, 0.        , 0.50154891, 0.70490949],
        [0.50154891, 0.70490949, 0.50154891, 0.        ]])

In [8]:
df = pd.DataFrame(data = vectors, columns=vectorizer.get_feature_names())
df

Unnamed: 0,is,lovely,she,wanderlust
0,0.501549,0.0,0.501549,0.704909
1,0.501549,0.704909,0.501549,0.0


## Standard TFIDF

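#### Term Frequency (TF): occurrences of a word in a doc / total number of words in that doc
The per-document word counts are built with CountVectorizer. Note that its default tokenizer ignores single-character tokens such as "a", so "a" counts toward the document length but does not appear in the vocabulary.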

In [24]:
# Lower-case every document so the manual counts are case-insensitive.
corpus = [sentence.lower() for sentence in corpus]

In [26]:
cv = CountVectorizer()
count_occurrences = cv.fit_transform(corpus)

In [27]:
count_occurrences.toarray()

array([[1, 0, 1, 1],
       [1, 1, 1, 0]], dtype=int64)

In [12]:
# Bag of words for the first document: vocabulary term -> raw count (row 0).
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
bagOfWords_1 = dict(zip(cv.get_feature_names(), count_occurrences.toarray()[0]))
bagOfWords_1

{'is': 1, 'lovely': 0, 'she': 1, 'wanderlust': 1}

In [13]:
# Bag of words for the second document: vocabulary term -> raw count (row 1).
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
bagOfWords_2 = dict(zip(cv.get_feature_names(), count_occurrences.toarray()[1]))
bagOfWords_2

{'is': 1, 'lovely': 1, 'she': 1, 'wanderlust': 0}

In [14]:
pd.DataFrame(data = count_occurrences.toarray(), columns= cv.get_feature_names())

Unnamed: 0,is,lovely,she,wanderlust
0,1,0,1,1
1,1,1,1,0


In [15]:
def compute_tf(bow, doc):
    """Compute the term frequency of every vocabulary word in one document.

    Args:
        bow: Mapping of word -> raw count of that word in the document.
        doc: Sequence of the document's tokens. Only its length is used,
            as the denominator of the frequency.

    Returns:
        dict mapping each word in ``bow`` to ``count / len(doc)``. For an
        empty ``doc`` every word gets 0.0 instead of raising
        ZeroDivisionError.
    """
    doc_count = len(doc)
    if doc_count == 0:
        # An empty document cannot contain any occurrence of any word.
        return {word: 0.0 for word in bow}
    return {word: count / doc_count for word, count in bow.items()}

In [16]:
tf_doc1 = compute_tf(bagOfWords_1, corpus[0].split(' '))
tf_doc2 = compute_tf(bagOfWords_2, corpus[1].split(' '))

In [17]:
tf_doc1

{'is': 0.25, 'lovely': 0.0, 'she': 0.25, 'wanderlust': 0.25}

In [18]:
tf = pd.DataFrame([tf_doc1, tf_doc2])
tf

Unnamed: 0,is,lovely,she,wanderlust
0,0.25,0.0,0.25,0.25
1,0.333333,0.333333,0.333333,0.0


#### Inverse Document Frequency (IDF): log (Total docs / Docs with the word present)

In [19]:
def compute_idf(docs, smooth=False):
    """Compute the inverse document frequency of every word across ``docs``.

    Args:
        docs: List of bag-of-words dicts (word -> count), one per document.
        smooth: When False (default) use the standard formula
            ``log(N / df)``. When True use the smoothed variant
            ``log((N + 1) / (df + 1)) + 1``, which is the notation used by
            sklearn's TfidfVectorizer.

    Returns:
        dict mapping each word seen in any document to its IDF, where ``N``
        is the number of documents and ``df`` the number of documents in
        which the word has a non-zero count. Returns an empty dict when
        ``docs`` is empty. In the unsmoothed form a word that occurs in no
        document gets 0.0 (its TF is 0 everywhere, so it carries no weight)
        instead of raising ZeroDivisionError.
    """
    N = len(docs)
    if N == 0:
        return {}

    # Document frequency per word. Words are collected from every document,
    # not only the first one, so differing vocabularies do not raise KeyError.
    doc_freq = {}
    for doc in docs:
        for word, val in doc.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, df in doc_freq.items():
        if smooth:
            # sklearn notation: acts as if one extra document contained
            # every word, which also avoids division by zero.
            idfDict[word] = math.log(float(N + 1) / (df + 1)) + 1
        elif df == 0:
            idfDict[word] = 0.0
        else:
            # standard notation
            idfDict[word] = math.log(N / float(df))
    return idfDict

In [20]:
idfs = compute_idf([bagOfWords_1,bagOfWords_2])
idfs

{'is': 0.0,
 'lovely': 0.6931471805599453,
 'she': 0.0,
 'wanderlust': 0.6931471805599453}

#### Element-wise product of both: TF * IDF

In [21]:
def compute_tfidf(tf,idf):
    """Combine per-word TF and IDF values into TF-IDF scores.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> inverse document frequency; it must contain
            every word present in ``tf``.

    Returns:
        dict mapping each word in ``tf`` to ``tf[word] * idf[word]``.
    """
    return {term: weight * idf[term] for term, weight in tf.items()}

In [22]:
tfidf_doc1 = compute_tfidf(tf_doc1, idfs)
tfidf_doc2 = compute_tfidf(tf_doc2, idfs)

In [28]:
tfidf_doc1

{'is': 0.0, 'lovely': 0.0, 'she': 0.0, 'wanderlust': 0.17328679513998632}

In [23]:
pd.DataFrame([tfidf_doc1, tfidf_doc2])

Unnamed: 0,is,lovely,she,wanderlust
0,0.0,0.0,0.0,0.173287
1,0.0,0.231049,0.0,0.0


- Clearly, the standard TF-IDF result differs from sklearn's.
- The TF term is essentially the same (any per-document scaling is cancelled by the L2 normalization), while the IDF term differs.
- Sklearn adds smoothing to the IDF term and L2-normalizes the result.
- Thus, after smoothing the IDF term, the product of the TF and IDF values is L2-normalized (Euclidean norm).

# END