In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Scoring the List of Search Results

When searching we receive a huge list of results, then we have to rank the results and return the most informative!

## Number of overlapping words:
- not normalized by length of document

## Jaccard Coefficient
- $ |\space X \cap Y \space|\space  /\space  |\space X \cup Y \space | $ 

In [2]:
# NLTK supports access to different datasets https://www.nltk.org/book/ch02.html
import nltk
nltk.download('reuters')

from nltk.corpus import reuters
print("\nCategories: ", reuters.categories())

housing_articles = reuters.fileids('housing')
print("\nHousing articles:", housing_articles)

print("\nWords in an arbitrary article:", reuters.words('training/6067')[:10])

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/pgencheva/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True


Categories:  ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']

Housing articles: ['test/18911', 'test/19875', 'test/20106', 'test/20116', 

In [3]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

def word_overlap(doc_tokens, query_tokens):
    return sum([1 for _tok in query_tokens if _tok in doc_tokens])

def jaccard_coeff(doc_tokens, query_tokens):
    # naive intersection of sets
    return len(set(doc_tokens).intersection(set(query_tokens))) / len(set(doc_tokens).union(set(query_tokens)))

query_tokens = tokenizer.tokenize('housing growth next month')
print("Query: ", query_tokens)
print("Long Document words number:", len(reuters.words('training/6067')))
print("Long Document overlap:", word_overlap(query_tokens, reuters.words('training/6067')))
print("Long Document JC:", jaccard_coeff(query_tokens, reuters.words('training/6067')))

short_similar_document = tokenizer.tokenize('Baldrige predicts housing growth next week.')
print("\nShort Document words number:", len(short_similar_document))
print("Short Document overlap:", word_overlap(query_tokens, short_similar_document))
print("Short Document JC:", jaccard_coeff(query_tokens, short_similar_document))

Query:  ['housing', 'growth', 'next', 'month']
Long Document words number: 131
Long Document overlap: 3
Long Document JC: 0.03571428571428571

Short Document words number: 7
Short Document overlap: 3
Short Document JC: 0.375


__But we also want to__:
- Give _more weight_ to _less frequent words_ in the documents - __Balridge, prices__
- Give _less weight_ to _more frequent words_ in the documents - __how, much, housing, to__

In [4]:
query_tokens = tokenizer.tokenize('how much will the housing go up in the next month according to Balridge?')
print("Document word overlap:", word_overlap(query_tokens, reuters.words('training/6067')))
print("Document JC:", jaccard_coeff(query_tokens, reuters.words('training/6067')))
print("Document Content:", reuters.words('training/6067')[:10])

Document word overlap: 13
Document JC: 0.06593406593406594
Document Content: ['BALDRIGE', 'PREDICTS', 'SOLID', 'U', '.', 'S', '.', 'HOUSING', 'GROWTH', 'Commerce']


## TF-IDF - Term Frequency- Inverted Document Frequency
- View documents as __Bags Of Words__
- Mary lent John some money. = John lent Mary some money.
- Formula: 

$$TF * IDF (word, document) = (1+log(tf(word, document)) * log(\frac{n}{df(word)})$$
- n - total number of documents

### Term Frequency
- __Frequency of word in a document (here, raw count)__
- __0 if the term is not met in the document!!!__
- Relevance does not increase proportionally with frequency -> __log (base of 10)__
- Makes TF-IDF __increase with the number of occurrences__ within a doc

In [5]:
# !pip3 install pandas
import pandas as pd
from collections import Counter
import numpy as np

df = pd.DataFrame(Counter(reuters.words('test/20116')).most_common(), columns=['token', 'freq'])
df['tf'] = 1 + np.log10(df['freq'])
df.head()
df.tail()

Unnamed: 0,token,freq,tf
0,.,34,2.531479
1,",",31,2.491362
2,in,21,2.322219
3,pct,15,2.176091
4,1,13,2.113943


Unnamed: 0,token,freq,tf
110,689,1,1.0
111,11,1,1.0
112,below,1,1.0
113,level,1,1.0
114,687,1,1.0


### Document Frequency
- __Number of documents containing the word__ - an inversed measure of significance
- Logarithm with base 10 dampens the effect of IDF
- Affects ranking of queries with __at least 2 terms__
- Makes TFIDF __increase with the rarity of the term in the collection__

In [6]:
from collections import defaultdict
document_frequency = defaultdict(lambda: 0)
for fileid in housing_articles:
    for _word in set(reuters.words(fileid)):
        document_frequency[_word] += 1

idf_df = pd.DataFrame(list(document_frequency.items()), columns=['word', 'doc_freq'])
idf_df['idf'] = np.log10(len(housing_articles)/idf_df['doc_freq'])
idf_df.sort_values(by=['idf'], inplace=True)
idf_df.head()
idf_df.tail()

Unnamed: 0,word,doc_freq,idf
84,.,20,0.0
164,",",19,0.022276
185,the,17,0.070581
160,to,17,0.070581
186,a,16,0.09691


Unnamed: 0,word,doc_freq,idf
463,spurred,1,1.30103
464,26,1,1.30103
465,56th,1,1.30103
457,10th,1,1.30103
832,SAY,1,1.30103


Finally, we estimate score for a document D w.r.t. a query Q, __summing over tfidf scores of the word in both D and Q__

In [7]:
def tfidf_score(query_tokens, document_tokens):
    # naive implementation
    def tfidf(word):
        return (1 + np.log10(document_tokens.count(word))) * idf_df[idf_df['word']==word].iloc[0]['idf']
    
    overlapping_tokens = set(query_tokens).intersection(set(document_tokens))
    return sum([tfidf(_word) for _word in overlapping_tokens])

query_tokens = tokenizer.tokenize('housing growth next month')
print("Query: ", query_tokens)
print("\nLong Document TF-IDF:", tfidf_score(query_tokens, reuters.words('training/6067')))

short_similar_document = tokenizer.tokenize('Baldrige predicts housing growth next week.')
print("\nShort Document TF-IDF:", tfidf_score(query_tokens, short_similar_document))

Query:  ['housing', 'growth', 'next', 'month']

Long Document TF-IDF: 1.2596373105057561

Short Document TF-IDF: 2.1627272974976997


## Exercise: Now compute the TF-IDF score of the query to the documents:
- Query: 'Who was the first man ever to swim around Britain?'
- Doc1: 'Ross Edgley, at 33 - first man to swim around Britain'
- Doc2: 'Ross Edgley to Circumnavigate Britain Spent 5 Months at Sea'
- Doc3: 'Get Set 4 Swimming - H2OMG! Can this man swim around Britain?'
- Doc4: 'Welcome to the world of strongman swimming | British GQ'

# Vector Space
- Each document can be represented by a vector, where the terms are the axes of the space!

In [8]:
documents = [
    'Ross Edgley, at 33 - first man to swim around Britain',
    'Ross Edgley to Circumnavigate Britain Spent 5 Months at Sea',
    'Get Set 4 Swimming - H2OMG! Can this man swim around Britain?',
    'Welcome to the world of strongman swimming | British GQ'
]
query = 'Who was the first man ever to swim around Britain?'

## sklearn.feature_extraction.text:
- __CountVectorizer__ - Convert a collection of text documents to a matrix of token counts.
- __TfidfVectorizer__ - Convert a collection of raw documents to a matrix of TF-IDF features.
- Two main methods __fit__ and __transform__ :
    - __fit__ goes through the provided documents and __collects the vocabulary__
    - __transform__ transforms __documents in text representation to a vector representation__ according to the vocabulary

In [9]:
# ! pip3 install sklearn
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
count_vectorizer.fit(documents)
print(count_vectorizer.vocabulary_) # word to id

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

{'ross': 15, 'edgley': 7, 'at': 2, '33': 0, 'first': 8, 'man': 12, 'to': 24, 'swim': 20, 'around': 1, 'britain': 3, 'circumnavigate': 6, 'spent': 18, 'months': 13, 'sea': 16, 'get': 9, 'set': 17, 'swimming': 21, 'h2omg': 11, 'can': 5, 'this': 23, 'welcome': 25, 'the': 22, 'world': 26, 'of': 14, 'strongman': 19, 'british': 4, 'gq': 10}


In [10]:
# transform produces a sparse representations of documents - only values != 0
# we need toarray() to preview the whole lists
count_vectorizer.transform(documents).toarray()

array([[1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
        0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 1, 1]])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=2)
tfidf_vectorizer.fit_transform(documents).toarray()
tfidf_vectorizer.vocabulary_

array([[0.36984162, 0.36984162, 0.29941866, 0.36984162, 0.36984162,
        0.36984162, 0.36984162, 0.        , 0.29941866],
       [0.        , 0.48163503, 0.38992506, 0.48163503, 0.        ,
        0.48163503, 0.        , 0.        , 0.38992506],
       [0.46346838, 0.        , 0.3752176 , 0.        , 0.46346838,
        0.        , 0.46346838, 0.46346838, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77722116, 0.62922751]])

{'ross': 5,
 'edgley': 3,
 'at': 1,
 'man': 4,
 'to': 8,
 'swim': 6,
 'around': 0,
 'britain': 2,
 'swimming': 7}

## Computing vector similarities
- We would like to find documents close to a given document or the closest documents to a query
- __Euclidean distance__? - shorter documents will be closer to each other rather than documents talking about same topic
- __Cosine similarity__ of the angle between two documents
    - divide each vector by its norm to achieve __unit length vectors__
    - cosine similarity is simply the __dot product__ of two unit length vectors

![Cosine SImilarity](img/cosine.png)

In [12]:
vector1 = np.array([1, 0, 0, 1, 2])
vector2 = np.array([0, 0, 1, 1, 1])

In [13]:
from sklearn import preprocessing

preprocessing.normalize([vector1], norm='l2')
vector1 / np.sqrt(sum(vector1**2))

unit_vector1 = preprocessing.normalize([vector1], norm='l2')[0]
unit_vector2 = preprocessing.normalize([vector2], norm='l2')[0]

array([[0.40824829, 0.        , 0.        , 0.40824829, 0.81649658]])

array([0.40824829, 0.        , 0.        , 0.40824829, 0.81649658])

In [14]:
np.dot(unit_vector1, unit_vector2)
sum([unit_vector1[i]*unit_vector2[i] for i in range(len(unit_vector1))])

0.7071067811865477

0.7071067811865477

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([vector1, vector2])

array([[1.        , 0.70710678],
       [0.70710678, 1.        ]])

## Exercise : calculate the closes document to the query from the previous exercise

In [16]:
def get_closest_documents(query, vectorizer, train_corpus_vectors, top_n=2):
    """Vectorizer should be fit on the documents beforehand.
        Returns tuples of (similarity, indexes) of closest documents"""
    # compute similarity to all sentences in the training corpus
    similarities = cosine_similarity(vectorizer.transform([query]), train_corpus_vectors).flatten()
    # get indexes of top n closest sentences
    related_docs_indices = similarities.argsort()[:-top_n-1:-1]
    # return tuples of (similarity score, document id)
    return [(similarities[idx], idx)  for idx in related_docs_indices]

train_corpus_vectors = tfidf_vectorizer.transform(documents)
closest_documents = get_closest_documents(query, tfidf_vectorizer, train_corpus_vectors)
print('Query:', query)
print('1st Closest document: {} Score: {}'.format(documents[closest_documents[0][1]], closest_documents[0][0]))
print('2nd Closest document: {} Score: {}'.format(documents[closest_documents[1][1]], closest_documents[1][0]))

Query: Who was the first man ever to swim around Britain?
1st Closest document: Get Set 4 Swimming - H2OMG! Can this man swim around Britain? Score: 0.8159745583466792
2nd Closest document: Ross Edgley, at 33 - first man to swim around Britain Score: 0.7678877104085525


## Exercise: using the friends corpus try to create an IR chatbot:
- User writes a sentences and we find the __closest sentence__ from the transcript
- We need to take __the answer__ to that sentence to make a dialogue! 
- Provide the bot with a __personality__, selecting only the tuple cues, where the answer is by a specific person.

In [17]:
pd.set_option('display.max_colwidth', -1)

friends_corpus = pd.read_csv("data/friends-final.txt", sep='\t')
friends_corpus.sample(5)

Unnamed: 0,id,scene_id,person,gender,original_line,line,metadata,filename
39776,3974501,2058,CHANDLER,M,"Chandler: Yeah! No! No! No! Don't fall asleep! Okay, I am going to make you some coffee. (Monica doesn't move as he gets out of bed and as he's heading for the door.) And I probably won't spill coffee grounds all over the kitchen floor.","Yeah! No! No! No! Don't fall asleep! Okay, I am going to make you some coffee. And I probably won't spill coffee grounds all over the kitchen floor.",Yeah_UH !_! No_UH !_! No_UH !_! No_UH !_! Do_VD0 n't_XX fall_VVI asleep_JJ !_! Okay_RR I_PPIS1 am_VBM going_VVGK to_TO make_VVI you_PPY some_DD coffee_NN1 ._. And_CC I_PPIS1 probably_RR wo_VM n't_XX spill_VVI coffee_NN1 grounds_NN2 all_RR over_II the_AT kitchen_NN1 floor_NN1 ._.,0712.txt
54331,5430001,2762,MONICA,F,Monica: You just wanna stay home so you can make a move on Joey!,You just wanna stay home so you can make a move on Joey!,You_PPY just_RR wan_VVI na_TO stay_VVI home_RL so_CS you_PPY can_VM make_VVI a_AT1 move_NN1 on_II Joey_NP1 !_!,0920.txt
22751,2272001,1165,CHANDLER,M,Chandler: Are you serious?,Are you serious?,Are_VBR you_PPY serious_JJ ?_?,0419.txt
22384,2235301,1147,PHOEBE,F,Phoebe: I'm telling it! I'm telling it!,I'm telling it! I'm telling it!,I_PPIS1 'm_VBM telling_VVG it_PPH1 !_! I_PPIS1 'm_VBM telling_VVG it_PPH1 !_!,0417.txt
60024,5999301,3003,RACHEL,F,"Rachel: What? Maybe I put it in here . Oh, oh, it's not in there! Oh, no! I must have packed it in one of these boxes!","What? Maybe I put it in here . Oh, oh, it's not in there! Oh, no! I must have packed it in one of these boxes!",What_DDQ ?_? Maybe_RR I_PPIS1 put_VV0 it_PPH1 in_II here_RL ._. Oh_UH oh_UH it_PPH1 's_VBZ not_XX in_II there_RL !_! Oh_UH no_UH !_! I_PPIS1 must_VM have_VHI packed_VVN it_PPH1 in_II one_MC1 of_IO these_DD2 boxes_NN2 !_!,1016.txt


In [18]:
# example conversation
friends_corpus[friends_corpus['scene_id']=='1'][['person', 'line']][:10]

Unnamed: 0,person,line
0,MONICA,There's nothing to tell! He's just some guy I work with!
1,JOEY,"C'mon, you're going out with the guy! There's gotta be something wrong with him!"
2,CHANDLER,"Alright Joey, be nice. So does he have a hump? A hump and a hairpiece?"
3,PHOEBE,"Wait, does he eat chalk?"
4,PHOEBE,"Just, 'cause, I don't want her to go through what I went through with Carl- oh!"
5,MONICA,"Okay, everybody relax. This is not even a date. It's just two people going out to dinner and not having sex."
6,CHANDLER,Sounds like a date to me.
7,CHANDLER,"Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked."
8,ALL,"Oh, yeah. Had that dream."
9,CHANDLER,"Then I look down, and I realize there's a phone... there."


In [19]:
vectorizer = TfidfVectorizer().fit(friends_corpus.line)
train_corpus = vectorizer.transform(friends_corpus.line)
train_corpus.shape

(60849, 15032)

In [20]:
print("First sentence: ", friends_corpus.line.values[1])
print("Its vector representation: ", train_corpus[1].toarray())
print("An id of a word from the sentence: ", vectorizer.vocabulary_['with'])
print("The word tf-idf score: ", train_corpus[1].toarray()[0][vectorizer.vocabulary_['with']])

First sentence:  C'mon, you're going out with the guy! There's gotta be something wrong with him!
Its vector representation:  [[0. 0. 0. ... 0. 0. 0.]]
An id of a word from the sentence:  14756
The word tf-idf score:  0.41270625402865396


In [21]:
# add the previous line, which the cue follows in the dialogue
friends_corpus['previous_line'] = ['DUMMY PREVIOUS LINE'] + friends_corpus['line'].values[:-1].tolist()
# select only the cues which are made by JOEY
joey_line_tuples = friends_corpus[friends_corpus.person == 'JOEY']
# create a vectorizer and training space of the documents in the vector space
joey_vectorizer = TfidfVectorizer().fit(joey_line_tuples.previous_line)
joey_train_corpus = joey_vectorizer.transform(joey_line_tuples.previous_line)
joey_train_corpus.shape

(8125, 5411)

In [22]:
def get_closest_utterance(cue, vectorizer,  train_corpus, top_n=5):
    # compute similarity to all sentences in the training corpus
    similarities = cosine_similarity(vectorizer.transform([cue]), train_corpus).flatten()
    # get indexes of top 5 clocest sentences
    related_docs_indices = similarities.argsort()[:-top_n:-1]
    # return tuples of (similarity score, sentence)
    return [(similarities[idx], joey_line_tuples['line'].values[idx]) 
                for idx in related_docs_indices]

In [23]:
get_closest_utterance('who are you?', joey_vectorizer, joey_train_corpus)

[(1.0,
  "Joey Tribbiani! From the wall! Okay, maybe this will jog your memory, huh? Huh? Okay eh-ah-anyway, I'm ready to go back up on the wall I'm the star of a new TV show."),
 (1.0,
  "Oh, hi, I'm Joey. My stupid friends are buying this house. Who are you?"),
 (0.7527478944848099, 'Me.'),
 (0.7527478944848099, "Alright. I'll give you one hint. Warren Beatty.")]