# stop words, tf-idf


In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the critic reviews CSV into the DataFrame `df`.
try:
    df = pd.read_csv('../data/nlp_data/rt_critics.csv')
except IOError:
    # Report the problem, then re-raise: the following cells all read `df`,
    # so carrying on without it would only resurface later as a NameError.
    print('cannot find file')
    raise

In [8]:
# Preview the first rows of the loaded reviews table.
df.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
1,Richard Corliss,fresh,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
2,David Ansen,fresh,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
3,Leonard Klady,fresh,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story
4,Jonathan Rosenbaum,fresh,114709,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [9]:

# Take the review text column ('quote') as a plain Python list,
# then show the first five entries as a sanity check.
documents = df['quote'].tolist()
documents[:5]

['So ingenious in concept, design and execution that you could watch it on a postage stamp-sized screen and still be engulfed by its charm.',
 "The year's most inventive comedy.",
 'A winning animated feature that has something for everyone on the age spectrum.',
 "The film sports a provocative and appealing story that's every bit the equal of this technical achievement.",
 "An entertaining computer-generated, hyperrealist animation feature (1995) that's also in effect a toy catalog."]

## Document Frequency


In [10]:
from nltk.tokenize import wordpunct_tokenize  # for tokenizing our text
import string  # helps with removing punctuation
from collections import Counter  # great dict-like datastructure for counting things

In [11]:
# The punctuation characters that the cleanup loop in the next cell removes.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# Text cleanup: turn every review into a list of lowercase word tokens.
word_bag_list = []
for doc in documents:
    # lowercase; hyphens become spaces so a hyphenated word yields two tokens
    cleaned = doc.lower().replace('-', ' ')
    # remove every remaining punctuation character
    for mark in string.punctuation:
        cleaned = cleaned.replace(mark, '')
    word_bag_list.append(wordpunct_tokenize(cleaned))

print('a few tokens: %s' % (word_bag_list[:3],))

# Concatenate the per-document token lists into one flat list for corpus stats.
token_list = []
for bag in word_bag_list:
    token_list += bag

print('number of tokens: %d' % len(token_list))
print('number of unique tokens: %d' % len(set(token_list)))
print('number of documents: %d' % len(word_bag_list))

a few tokens: [['so', 'ingenious', 'in', 'concept', 'design', 'and', 'execution', 'that', 'you', 'could', 'watch', 'it', 'on', 'a', 'postage', 'stamp', 'sized', 'screen', 'and', 'still', 'be', 'engulfed', 'by', 'its', 'charm'], ['the', 'years', 'most', 'inventive', 'comedy'], ['a', 'winning', 'animated', 'feature', 'that', 'has', 'something', 'for', 'everyone', 'on', 'the', 'age', 'spectrum']]
number of tokens: 280092
number of unique tokens: 22424
number of documents: 14072


In [20]:
# Inspect the token list produced for the first document.
word_bag_list[0]

['so',
 'ingenious',
 'in',
 'concept',
 'design',
 'and',
 'execution',
 'that',
 'you',
 'could',
 'watch',
 'it',
 'on',
 'a',
 'postage',
 'stamp',
 'sized',
 'screen',
 'and',
 'still',
 'be',
 'engulfed',
 'by',
 'its',
 'charm']

In [21]:
# Converting a document's token list to a set keeps each distinct token once
# (e.g. 'and', which occurs twice in this document, appears a single time).
doc = word_bag_list[0]
set(doc)

{'a',
 'and',
 'be',
 'by',
 'charm',
 'concept',
 'could',
 'design',
 'engulfed',
 'execution',
 'in',
 'ingenious',
 'it',
 'its',
 'on',
 'postage',
 'screen',
 'sized',
 'so',
 'stamp',
 'still',
 'that',
 'watch',
 'you'}

In [30]:
# Toy example: a set keeps only one copy of a repeated element.
set(['one','one','two'])

{'one', 'two'}

In [29]:
# Document frequency: for every unique token, the fraction of documents
# that contain it at least once.
# NOTE: this rebinds `df`, which until now held the reviews DataFrame.

df = Counter()

# Each document contributes at most 1 per token, hence the set().
for doc in word_bag_list:
    df.update(set(doc))

# Turn raw document counts into fractions of the corpus size.
n_docs = float(len(documents))
for token in df:
    df[token] = df[token] / n_docs

# the 20 tokens that occur in the largest share of documents, with their scores
df.most_common(20)

[('the', 0.6140562819783968),
 ('a', 0.5035531552018192),
 ('and', 0.48969584991472426),
 ('of', 0.4640420693575895),
 ('is', 0.3320068220579875),
 ('to', 0.32106310403638433),
 ('in', 0.23848777714610575),
 ('that', 0.20082433200682206),
 ('its', 0.1991898806139852),
 ('it', 0.1960631040363843),
 ('with', 0.15513075611142696),
 ('but', 0.15157760090960773),
 ('this', 0.1467453098351336),
 ('movie', 0.12933484934621944),
 ('film', 0.12926378624218307),
 ('for', 0.1286242183058556),
 ('as', 0.12784252416145536),
 ('an', 0.10993462194428652),
 ('be', 0.08484934621944286),
 ('on', 0.08449403069926094)]

## Stop Words


In [49]:
# Total number of tokens counted in `tf`.
# NOTE: `tf` is assigned in the term-frequency cell further down (In [33]);
# on a fresh kernel that cell has to run before this one.
sum(tf.values())

32

In [43]:
# Number of distinct tokens held in `tf` (`tf` is assigned in the
# term-frequency cell further down, In [33]).
print len(tf)
# print len(tfidf)

28


In [44]:
# Raw text of document 49, the last document visited by the
# `word_bag_list[:50]` loop in the tf-idf cell (In [33]).
documents[49]

'Michael Mann and a superlative cast have taken a classic heist movie rife with familiar genre elements and turned it into a sleek, accomplished piece of work, meticulously controlled and completely involving.'

In [50]:
# Document frequency of the token 'michael'. Here `df` is the Counter built in
# the document-frequency cell (In [29]), not the DataFrame loaded at the top.
df['michael']

0.00412166003411029

In [33]:
# Score the tokens of each of the first 50 documents with tf-idf and print
# every document's five highest-scoring tokens.

for doc in word_bag_list[:50]:
    # raw count of each token in this document
    tf = Counter(doc)
    total = float(sum(tf.values()))

    # tf-idf = (count / document length) * -log(document frequency)
    tfidf = Counter()
    for token in tf:
        idf = -np.log(df[token])
        tfidf[token] = (tf[token] / total) * idf

    # most significant words in the document
    print(tfidf.most_common(5))

[('engulfed', 0.38207769145669573), ('postage', 0.35435180423429802), ('sized', 0.32662591701190019), ('stamp', 0.30424128549448326), ('ingenious', 0.26874915769444713)]
[('inventive', 1.1776761280575496), ('years', 0.8588893828779226), ('comedy', 0.65543605303509112), ('most', 0.59453821488145864), ('the', 0.097533738115198984)]
[('spectrum', 0.65025615367302192), ('winning', 0.47574203511007074), ('everyone', 0.43231666566869759), ('age', 0.39485397527852278), ('animated', 0.39393272981339084)]
[('equal', 0.39185708991301349), ('sports', 0.37253332126759958), ('provocative', 0.34790330157005933), ('technical', 0.34201603930200153), ('achievement', 0.34201603930200153)]
[('catalog', 0.63679615242782617), ('hyperrealist', 0.63679615242782617), ('1995', 0.49031451393874498), ('toy', 0.45195690427850749), ('generated', 0.41464918508281268)]
[('ushered', 0.23879855716043485), ('revived', 0.23879855716043485), ('lion', 0.19400457042973349), ('repetition', 0.19400457042973349), ('landmark',

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
# Number of columns (features) in the tf-idf matrix. Read it from the matrix's
# shape instead of calling .toarray(), which would allocate the full dense
# documents-by-features array just to measure the length of one row.
# NOTE: `output` is assigned in the TfidfVectorizer cell further down (In [60]).
output.shape[1]

21254

In [60]:
# Create a tf-idf vectorizer configured with stop_words='english', then fit it
# on the raw review strings and transform them into `output` in one step.
tfidf_vec = TfidfVectorizer(stop_words='english')
output = tfidf_vec.fit_transform(documents)
# print output.toarray()[20:30, :10]

In [61]:
# print tfidf_vec.get_stop_words()

In [59]:
# tfidf_vec.get_feature_names()