In [2]:
import graphlab as gl
# Cap the number of pylambda worker processes. Fewer workers means less
# system memory in use, which keeps hosted notebooks from crashing.
NUM_PYLAMBDA_WORKERS = 4
gl.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', NUM_PYLAMBDA_WORKERS)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1470597612.log


This non-commercial license of GraphLab Create for academic use is assigned to prabal.s.tiwaree@gmail.com and will expire on February 23, 2017.


## Load data

In [6]:
# Load the SFrame stored at people_wiki.gl/ and preview its first two rows.
PEOPLE_WIKI_PATH = 'people_wiki.gl/'
people = gl.SFrame(PEOPLE_WIKI_PATH)
people.head(2)

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...


## 1. Top word count words for Elton John

In [7]:
# Select the row(s) whose name column equals 'Elton John' and preview them.
is_elton_john = people['name'] == 'Elton John'
elton_john = people[is_elton_john]
elton_john.head()

URI,name,text
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...


In [8]:
# Count the words in the article text and attach the result as a
# 'word_count' column (displayed below as a word -> count dictionary).
elton_word_counts = gl.text_analytics.count_words(elton_john['text'])
elton_john['word_count'] = elton_word_counts
elton_john.head()

URI,name,text,word_count
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...,"{'all': 1, 'least': 1, 'producer': 1, 'heavi ..."


In [9]:
# Unpack the word_count dictionary into one (word, count) row per word,
# then order the rows from the highest count to the lowest.
elton_wc_long = elton_john[['word_count']].stack('word_count', new_column_name=['word', 'count'])
elton_john_wc = elton_wc_long.sort('count', ascending=False)
elton_john_wc

word,count
the,27
in,18
and,15
of,13
a,10
has,9
john,7
he,7
on,6
award,5


## 2. Top TF-IDF words for Elton John

In [10]:
# Compute word counts for every article and store them on `people`.
all_word_counts = gl.text_analytics.count_words(people['text'])
people['word_count'] = all_word_counts
people.head(1)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'selection': 1, 'carltons': 1, 'being': ..."


In [14]:
# Derive TF-IDF scores from the precomputed word_count dictionaries.
# NOTE(review): the original author observed that passing people['text']
# directly to tf_idf "seems to work too" -- not verified here.
all_tfidf = gl.text_analytics.tf_idf(people['word_count'])
people['tfidf'] = all_tfidf
people.head(1)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'selection': 1, 'carltons': 1, 'being': ..."

tfidf
"{'selection': 3.836578553093086, ..."


In [17]:
# Re-select Elton John from `people` so the row includes the tfidf column
# that was added after the first selection.
is_elton_john = people['name'] == 'Elton John'
elton_john = people[is_elton_john]

# One (word, tfidf) row per word, ordered from highest to lowest score.
elton_tfidf_long = elton_john[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf'])
elton_john_tfidf = elton_tfidf_long.sort('tfidf', ascending=False)
elton_john_tfidf

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
tonightcandle,10.9864953892
overallelton,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


## 3. The cosine distance between the 'Elton John' and 'Victoria Beckham' articles (represented with TF-IDF) falls within which range?
## 4. The cosine distance between the 'Elton John' and 'Paul McCartney' articles (represented with TF-IDF) falls within which range?

In [19]:
# Select Victoria Beckham's row, then compare the first (index 0) TF-IDF
# entry of each selection using cosine distance.
is_victoria = people['name'] == 'Victoria Beckham'
victoria = people[is_victoria]
elton_tfidf_vec = elton_john['tfidf'][0]
victoria_tfidf_vec = victoria['tfidf'][0]
gl.distances.cosine(elton_tfidf_vec, victoria_tfidf_vec)

0.9567006376655429

In [20]:
# Select Paul McCartney's row, then compare the first (index 0) TF-IDF
# entry of each selection using cosine distance.
is_mccartney = people['name'] == 'Paul McCartney'
mccartney = people[is_mccartney]
elton_vec = elton_john['tfidf'][0]
mccartney_vec = mccartney['tfidf'][0]
gl.distances.cosine(elton_vec, mccartney_vec)

0.8250310029221779

## 6. Who is the nearest neighbor to 'Elton John' using raw word counts?

In [23]:
# Nearest-neighbors model over raw word counts, labelled by article name
# and compared with cosine distance.
knn_model_wc = gl.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine',
)

In [24]:
# Nearest articles to Elton John under the raw word-count model.
elton_neighbors_wc = knn_model_wc.query(elton_john)
elton_neighbors_wc

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


## 7. Who is the nearest neighbor to 'Elton John' using TF-IDF?

In [25]:
# Nearest-neighbors model over TF-IDF features, labelled by article name
# and compared with cosine distance.
knn_model_tfidf = gl.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

# Nearest articles to Elton John under the TF-IDF model.
elton_neighbors_tfidf = knn_model_tfidf.query(elton_john)
elton_neighbors_tfidf

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


## 8. Who is the nearest neighbor to 'Victoria Beckham' using raw word counts?
## 9. Who is the nearest neighbor to 'Victoria Beckham' using TF-IDF?

In [26]:
# Nearest articles to Victoria Beckham under the raw word-count model.
victoria_neighbors_wc = knn_model_wc.query(victoria)
victoria_neighbors_wc

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


In [27]:
# Nearest articles to Victoria Beckham under the TF-IDF model.
victoria_neighbors_tfidf = knn_model_tfidf.query(victoria)
victoria_neighbors_tfidf

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
