In [1]:
import turicreate as tc

### Load the people data

In [2]:
# Load the Wikipedia people dataset into an SFrame. The path is relative,
# so the notebook must be run from the directory that contains `datasets/`.
people_df = tc.SFrame('datasets/people_wiki.sframe')

In [3]:
# Preview the first five rows to confirm the load worked.
people_df.head(5)

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...


### Add a word_count column

In [4]:
# Add a `word_count` column computed from each row's `text`
# (displayed below as a {word: count} dictionary per article).
people_df['word_count'] = tc.text_analytics.count_words(people_df['text'])

In [5]:
# Preview the frame again to check the new `word_count` column.
people_df.head(5)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 1.0, 'parade': 1.0, ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'time': 1.0, 'each': 1.0, 'hour': 1.0, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'society': 1.0, 'hamilton': 1.0, 'to': ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'kurdlawitzpreis': 1.0, 'awarded': 1.0, '2004': ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'curtis': 1.0, '2007': 1.0, 'cent': 1.0, ..."


### Add a TF-IDF column

In [6]:
# Derive TF-IDF scores from the per-article word counts, store them in a
# new `tf_idf` column, then preview the result.
tf_idf_scores = tc.text_analytics.tf_idf(people_df['word_count'])
people_df['tf_idf'] = tf_idf_scores
people_df.head(5)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 1.0, 'parade': 1.0, ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'time': 1.0, 'each': 1.0, 'hour': 1.0, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'society': 1.0, 'hamilton': 1.0, 'to': ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'kurdlawitzpreis': 1.0, 'awarded': 1.0, '2004': ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'curtis': 1.0, '2007': 1.0, 'cent': 1.0, ..."

tf_idf
"{'melbourne': 3.8914310119380633, ..."
"{'time': 1.3253342074200498, ..."
"{'society': 2.4448047262085693, ..."
"{'kurdlawitzpreis': 10.986495389225194, ..."
"{'curtis': 5.299520032885375, ..."


### Exploring the word counts and TF-IDF of Elton John

#### Elton John word count exploration

In [13]:
# Select the row(s) whose name is exactly 'Elton John'.
is_elton = people_df['name'] == 'Elton John'
elton = people_df[is_elton]

# Unpack the word_count dictionary into (word, count) rows and show the
# most frequent words first.
elton_word_counts = elton[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)
elton_word_counts.sort('count', ascending=False)

word,count
the,27.0
in,18.0
and,15.0
of,13.0
a,10.0
has,9.0
he,7.0
john,7.0
on,6.0
award,5.0


#### Elton John TF-IDF exploration

In [14]:
# Unpack the tf_idf dictionary into (word, tf_idf) rows and show the
# highest-scoring words first.
elton_tf_idf = elton[['tf_idf']].stack(
    'tf_idf',
    new_column_name=['word', 'tf_idf'],
)
elton_tf_idf.sort('tf_idf', ascending=False)

word,tf_idf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203
john,13.93931279239831
songwriters,11.25040644703154
overallelton,10.986495389225194
tonightcandle,10.986495389225194
fivedecade,10.293348208665249
19702000,10.293348208665249
aids,10.262846934045534


### Measuring distance from Elton John to Victoria Beckham and Paul McCartney

In [15]:
# Select the row(s) whose name is exactly 'Victoria Beckham' and display them.
is_vbeck = people_df['name'] == 'Victoria Beckham'
vbeck = people_df[is_vbeck]
vbeck

URI,name,text,word_count
<http://dbpedia.org/resou rce/Victoria_Beckham> ...,Victoria Beckham,victoria caroline beckham ne adams born 17 april ...,"{'week': 1.0, 'york': 1.0, 'show': 1.0, ..."

tf_idf
"{'week': 3.57252509903475, 'yo ..."


In [17]:
# Select the row(s) whose name is exactly 'Paul McCartney' and display them.
is_paul_mccartney = people_df['name'] == 'Paul McCartney'
paulmcCart = people_df[is_paul_mccartney]
paulmcCart

URI,name,text,word_count
<http://dbpedia.org/resou rce/Paul_McCartney> ...,Paul McCartney,sir james paul mccartney mbe born 18 june 1942 is ...,"{'children': 1.0, 'five': 1.0, 'married': 1.0, ..."

tf_idf
"{'children': 2.4252853123921825, ..."


#### Calculating the cosine distance from Elton John to Victoria Beckham and to Paul McCartney

In [16]:
# Cosine distance between the TF-IDF dictionaries of the first (index 0)
# row of each selection.
elton_tf_idf_vec = elton['tf_idf'][0]
vbeck_tf_idf_vec = vbeck['tf_idf'][0]
tc.distances.cosine(elton_tf_idf_vec, vbeck_tf_idf_vec)

0.9567006376655429

In [18]:
# Cosine distance between the TF-IDF dictionaries of the first (index 0)
# row of each selection.
elton_tf_idf_row = elton['tf_idf'][0]
paul_tf_idf_row = paulmcCart['tf_idf'][0]
tc.distances.cosine(elton_tf_idf_row, paul_tf_idf_row)

0.8250310029221779

### Building KNN models based on TF-IDF and word count for comparison

#### KNN using TF-IDF feature

In [19]:
# Nearest-neighbours model over the `tf_idf` column, labelled by `name`
# and compared with cosine distance.
knn_tfidf = tc.nearest_neighbors.create(
    people_df,
    features=['tf_idf'],
    label='name',
    distance='cosine',
)

#### KNN using word count feature

In [21]:
# Nearest-neighbours model over the raw `word_count` column, labelled by
# `name` and compared with cosine distance (baseline for the TF-IDF model).
knn_word_count = tc.nearest_neighbors.create(
    people_df,
    features=['word_count'],
    label='name',
    distance='cosine',
)

### Using both models for comparison (on Elton John)

#### Elton John KNN using word count

In [22]:
# Query the word-count model with Elton John's row. In the output below,
# rank 1 is the query article itself (distance ~0, up to floating-point error).
knn_word_count.query(elton)

query_label,reference_label,distance,rank
0,Elton John,2.220446049250313e-16,1
0,Cliff Richard,0.1614241525896703,2
0,Sandro Petrone,0.1682254275104111,3
0,Rod Stewart,0.168327165587061,4
0,Malachi O'Doherty,0.177315545978884,5


#### Elton John KNN using TF-IDF

In [24]:
# Query the TF-IDF model with Elton John's row. In the output below,
# rank 1 is the query article itself (distance ~0, up to floating-point error).
knn_tfidf.query(elton)

query_label,reference_label,distance,rank
0,Elton John,-2.220446049250313e-16,1
0,Rod Stewart,0.7172196678927374,2
0,George Michael,0.7476009989692848,3
0,Sting (musician),0.7476719544306141,4
0,Phil Collins,0.7511932487904706,5


### Using both models for comparison (on Victoria Beckham)

#### Victoria Beckham KNN using word count

In [25]:
# Query the word-count model with Victoria Beckham's row. In the output below,
# rank 1 is the query article itself (distance ~0, up to floating-point error).
knn_word_count.query(vbeck)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.220446049250313e-16,1
0,Mary Fitzgerald (artist),0.2073070361150499,2
0,Adrienne Corri,0.2145097827875479,3
0,Beverly Jane Fry,0.2174664687407927,4
0,Raman Mundair,0.2176954749915048,5


#### Victoria Beckham KNN using TF-IDF

In [26]:
# Query the TF-IDF model with Victoria Beckham's row. In the output below,
# rank 1 is the query article itself (distance ~0, up to floating-point error).
knn_tfidf.query(vbeck)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.1102230246251563e-16,1
0,David Beckham,0.5481696102632145,2
0,Stephen Dow Beckham,0.7849867068283364,3
0,Mel B,0.8095855234085036,4
0,Caroline Rush,0.81982642291868,5
