In [1]:
import graphlab

# Load some text data - from Wikipedia, pages on people

In [2]:
# Read the saved SFrame stored in the local 'people_wiki.gl/' directory.
people = graphlab.SFrame('people_wiki.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to shavkat.riyatov@gmail.com and will expire on November 27, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\SHAVKA~1\AppData\Local\Temp\graphlab_server_1512912501.log.0


In [3]:
#people.head()

In [4]:
# Number of rows in the dataset.
len(people)

59071

# Explore the dataset and check out the text it contains

In [5]:
# Keep only the row(s) whose 'name' is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [6]:
#obama

In [7]:
#obama['text']

In [8]:
# Same name-based filter, this time for George Clooney.
clooney = people[people['name'] == 'George Clooney']
# Uncomment to display the article text:
#clooney['text']

# Get the word counts for the Obama article

In [9]:
# Store the output of count_words on the article text in a new 'word_count' column.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])

In [10]:
#print obama['word_count']

## Sort the word_count for the Obama article

In [11]:
# Unpack the 'word_count' column into a two-column table: each entry's key
# goes to 'word' and its value to 'count'.
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)

In [12]:
# Point GraphLab Canvas at the 'ipynb' target.
# NOTE(review): presumably this makes visualizations render inline in the notebook -- confirm against the Canvas docs.
graphlab.canvas.set_target('ipynb')

In [13]:
#obama_word_count_table.head()

In [14]:
#obama_word_count_table.sort('count', ascending=False)

# Compute TF-IDF for the corpus

In [15]:
# Run count_words over every article's text and keep the result in a
# 'word_count' column on the full dataset.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
#people.head()

In [16]:
# Compute TF-IDF from the per-article word counts.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])

In [17]:
# Check the type of the value returned by tf_idf.
type(tfidf)

graphlab.data_structures.sarray.SArray

In [18]:
# The course material assumes tf_idf returns a table with a single 'docs' column
# and instructs: people['tfidf'] = tfidf['docs']
# Here tf_idf returned an SArray (see the type check above), so that line fails
# with 'IndexError: Invalid type to use for indexing'.
# After some investigation, assigning the SArray directly works instead.
people['tfidf'] = tfidf

## Examine TF-IDF for the Obama article

In [19]:
# Re-select the Obama row so that it includes the 'tfidf' column that was
# added to `people` after the first selection.
obama = people[people['name'] == 'Barack Obama']

In [20]:
#obama[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf']).sort('tfidf', ascending=False)

# Manually compute distances between a few people

In [21]:
# Row(s) for the 'Bill Clinton' article.
clinton = people[people['name'] == 'Bill Clinton']

In [22]:
# Row(s) for the 'David Beckham' article.
beckham = people[people['name'] == 'David Beckham']

# Is Obama closer to Clinton than to Beckham?

In [23]:
# Cosine distance between the TF-IDF values of the first Obama row and the
# first Clinton row.
graphlab.distances.cosine(obama['tfidf'][0], clinton['tfidf'][0])

0.8339854936884276

In [24]:
# Cosine distance between the TF-IDF values of the first Obama row and the
# first Beckham row.
graphlab.distances.cosine(obama['tfidf'][0], beckham['tfidf'][0])

0.9791305844747478

# Build a nearest neighbour model for document retrieval

In [25]:
# Build a nearest-neighbors model over the 'tfidf' column, labelling each row
# by its 'name'. No `distance` argument is passed in this call.
knn_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
)

# Applying the nearest-neighbours model for retrieval

## Who is closest to Obama?

In [26]:
#knn_model.query(obama)

## Other examples of document retrieval

In [27]:
#swift = people[people['name'] == 'Taylor Swift']

In [28]:
#knn_model.query(swift)

In [29]:
#jolie = people[people['name'] == 'Angelina Jolie']

In [30]:
#knn_model.query(jolie)

In [31]:
#arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [32]:
#knn_model.query(arnold)

# Assignment
## Task 1
Compare top words according to word counts to TF-IDF: 
In the notebook we covered in the module, we explored two document representations: word counts and TF-IDF. Now, take a particular famous person, 'Elton John'. 

- What are the 3 words in his articles with highest word counts? (the:27, in:18, and:15)

- What are the 3 words in his articles with highest TF-IDF? (furnish:18.38, elton:17.48, billboard:17.30)

These results illustrate why TF-IDF is useful for finding important words. Save these results to answer the quiz at the end.

In [33]:
# Row(s) for the 'Elton John' article, used throughout the assignment tasks.
elton_john = people[people['name'] == 'Elton John']

In [48]:
#elton_john[['word_count']].stack('word_count', new_column_name=['word', 'count']).sort('count', ascending=False).head()

In [49]:
#elton_john[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf']).sort('tfidf', ascending=False)

## Task 2

Measuring distance: Elton John is a famous singer; let’s compute the distance between his article and those of two other famous singers. In this assignment, you will use the cosine distance, which is one measure of similarity between vectors (a smaller distance means the articles are more similar), similar to the one discussed in the lectures. You can compute this distance using the graphlab.distances.cosine function. 

- What’s the cosine distance between the articles on ‘Elton John’ and ‘Victoria Beckham’? (0.9567006376655429)

- What’s the cosine distance between the articles on ‘Elton John’ and ‘Paul McCartney’? (0.8250310029221779)

- Which one of the two is closest to Elton John? (Paul McCartney)

- Does this result make sense to you? (Yes)

Save these results to answer the quiz at the end.

In [52]:
# Task 2: cosine distance between the TF-IDF values of the Elton John article
# and the Victoria Beckham article (first matching row of each).
graphlab.distances.cosine(
    elton_john['tfidf'][0],
    people[people['name'] == 'Victoria Beckham']['tfidf'][0],
)

0.9567006376655429

In [53]:
# Task 2: cosine distance between the TF-IDF values of the Elton John article
# and the Paul McCartney article (first matching row of each).
graphlab.distances.cosine(
    elton_john['tfidf'][0],
    people[people['name'] == 'Paul McCartney']['tfidf'][0],
)

0.8250310029221779

## Task 3
Building nearest neighbors models with different input features and setting the distance metric: In the sample notebook, we built a nearest neighbors model for retrieving articles using TF-IDF as features and using the default setting in the construction of the nearest neighbors model. Now, you will build two nearest neighbors models:
- Using word counts as features
- Using TF-IDF as features

In both of these models, we will use cosine distance as the distance function. Here is how: when you call the function

<i>graphlab.nearest_neighbors.create</i>

add the parameter:


<i>distance='cosine'</i>

Now we are ready to use our model to retrieve documents. Use these two models to collect the following results:

- What’s the most similar article, other than itself, to the one on ‘Elton John’ using word count features? (Cliff Richard)
- What’s the most similar article, other than itself, to the one on ‘Elton John’ using TF-IDF features? (Rod Stewart)
- What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using word count features? (Mary Fitzgerald (artist))
- What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using TF-IDF features? (David Beckham)

Save these results to answer the quiz at the end.

In [56]:
# Task 3: nearest-neighbors model on the raw 'word_count' column, with the
# distance explicitly set to 'cosine'.
knn_model_wc = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine',
)

In [57]:
# Task 3: nearest-neighbors model on the 'tfidf' column, with the distance
# explicitly set to 'cosine'.
knn_model_tfidf = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

In [58]:
# Row(s) for the 'Victoria Beckham' article, used as a query below.
victoria_beckham = people[people['name'] == 'Victoria Beckham']

In [64]:
# Nearest articles to Elton John under the word-count model; rank 1 is the
# query article itself.
knn_model_wc.query(elton_john)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [66]:
# Nearest articles to Elton John under the TF-IDF model; rank 1 is the query
# article itself.
knn_model_tfidf.query(elton_john)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [67]:
# Nearest articles to Victoria Beckham under the word-count model; rank 1 is
# the query article itself.
knn_model_wc.query(victoria_beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


In [68]:
# Nearest articles to Victoria Beckham under the TF-IDF model; rank 1 is the
# query article itself.
knn_model_tfidf.query(victoria_beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
