In [105]:
import pandas as pd
import nltk,string
import numpy as np

In [None]:
df = pd.read_csv('/home/snehil/work/DataVault/02_lazy_programmer_nlp/tmdb_5000_movies.csv', index_col=False)

In [107]:
df = df[['tagline','title']]
df.fillna('',inplace=True)

In [108]:
df['doc'] = df.tagline+' '+df.title
df.doc = df.doc.apply(lambda x:x.lower())

In [109]:
df

Unnamed: 0,tagline,title,doc
0,Enter the World of Pandora.,Avatar,enter the world of pandora. avatar
1,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,"at the end of the world, the adventure begins...."
2,A Plan No One Escapes,Spectre,a plan no one escapes spectre
3,The Legend Ends,The Dark Knight Rises,the legend ends the dark knight rises
4,"Lost in our world, found in another.",John Carter,"lost in our world, found in another. john carter"
...,...,...,...
4798,"He didn't come looking for trouble, but troubl...",El Mariachi,"he didn't come looking for trouble, but troubl..."
4799,A newlywed couple's honeymoon is upended by th...,Newlyweds,a newlywed couple's honeymoon is upended by th...
4800,,"Signed, Sealed, Delivered","signed, sealed, delivered"
4801,A New Yorker in Shanghai,Shanghai Calling,a new yorker in shanghai shanghai calling


### `preprocessing`

In [110]:
# tokenize
def tokenize(sent):
    sent = nltk.word_tokenize(sent)
    sent_mod = []
    for i in sent:
        if i.isalnum():
            sent_mod.append(i)

    return sent_mod

### Removing Stopwords,Punctuation 

stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(sent):
    sent = [i for i in sent if i not in stopword and i not in string.punctuation]
    return sent

df.doc = df.doc.apply(tokenize)
df.doc = df.doc.apply(remove_stopwords)

In [111]:
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else :
        return wordnet.NOUN

In [112]:
wnl = nltk.stem.WordNetLemmatizer()

def wnl_lemma(sent):
    words_and_tags = nltk.pos_tag(sent)
    sent_mod = []
    for words,tags in words_and_tags:
        tag = get_wordnet_pos(tags)
        word = wnl.lemmatize(words, pos=tag)
        sent_mod.append(word)
    return sent_mod

In [113]:
df.doc = df.doc.apply(wnl_lemma)

### `Word to index`

In [114]:
unique_words = []
for i,row in df.iterrows():
    for j in row['doc']:
        if j not in unique_words:
            unique_words.append(j)

In [115]:
idx = [i for i in range(len(unique_words))]

In [116]:
word_index_dict = dict(zip(idx,unique_words))

In [117]:
index_word_dict = dict(zip(unique_words,idx))

In [118]:
len(word_index_dict)

5809

### `Count Vectorizer`

In [119]:
tf = np.zeros((4803,5809))

In [120]:
tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [121]:
from collections import Counter
df['count_element'] = df.doc.apply(lambda x:Counter(x))

In [122]:
for i,row in df.iterrows():
    for key,value in row['count_element'].items():
        idx = unique_words.index(key)
        tf[i,idx] = value

In [123]:
tf

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [126]:
# total count of each word in whole corpus
document_frequency = np.sum(tf>0,axis=0)

In [127]:
document_frequency

array([ 11, 177,   3, ...,   1,   1,   1])

In [131]:
idf = np.log(len(df)/document_frequency)

In [133]:
# compute tfidf
tf_idf = tf*idf

In [142]:
tf_idf_normalized = tf_idf/np.linalg.norm(tf_idf, axis=1).reshape(4803,1)

  tf_idf_normalized = tf_idf/np.linalg.norm(tf_idf, axis=1).reshape(4803,1)


In [143]:
tf_idf_normalized

array([[0.46065552, 0.25012796, 0.55911118, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.42624673, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.48577495],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

#### checking some examples

In [167]:
df.iloc[201].doc

['seek', 'truth', 'da', 'vinci', 'code']

In [168]:
for j in (-tf_idf[201]).argsort()[:6]:
    print(word_index_dict[j])

vinci
da
seek
code
truth
pointe
