## Load some text data - from wikipedia, pages on people

In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the Wikipedia people dataset from a CSV in the working directory
# and preview its first rows.
people = pd.read_csv('people_wiki.csv')
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


## Explore the dataset and check out the text it contains
### Exploring the entry for president Obama

In [2]:
# Keep only the row(s) whose name is exactly 'Barack Obama' and display them.
obama = people.loc[people['name'].eq('Barack Obama')]
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [10]:
# Full article text of the first matching row.
obama['text'].iloc[0]

'barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and afte

### Exploring the entry for actor George Clooney

In [12]:
# Keep only the row(s) whose name is exactly 'George Clooney' and display them.
clooney = people.loc[people['name'].eq('George Clooney')]
clooney

Unnamed: 0,URI,name,text
38514,<http://dbpedia.org/resource/George_Clooney>,George Clooney,george timothy clooney born may 6 1961 is an a...


In [18]:
# Full article text of the first matching row.
clooney['text'].iloc[0]

'george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film with frank sinatra as d

### Get the word counts for Obama article

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model fitted on the Obama selection only: `features` is the
# document-term count matrix for that selection (one row per selected article).
vect = CountVectorizer()
features = vect.fit_transform(obama['text'])

In [31]:
# Terms learned from the Obama article, ordered by their column in `features`.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
# list(...) keeps `vocabulary` a plain Python list on every version.
if hasattr(vect, 'get_feature_names_out'):
    vocabulary = list(vect.get_feature_names_out())
else:
    vocabulary = list(vect.get_feature_names())
vocabulary[0:20]

['13th',
 '1961',
 '1992',
 '1996',
 '1997',
 '20',
 '2000in',
 '2004',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2012obama',
 '2013',
 '44th',
 '63',
 'act',
 'address']

In [30]:
# Ask the vectorizer for its effective stop-word list.
# NOTE(review): `vect` was created without a stop_words argument, so there is
# presumably nothing to show here (no output was recorded for this cell).
stopwords = vect.get_stop_words()
stopwords

In [34]:
# Total count of every vocabulary term: sum the document-term matrix over its
# rows. Summing the sparse matrix directly avoids densifying it and looping
# over columns in Python; the result is a 1-D array aligned with `vocabulary`.
word_frequency = np.asarray(features.sum(axis=0)).ravel()
word_frequency

array([ 1,  1,  1,  1,  1,  2,  1,  3,  1,  1,  3,  2,  3,  1,  1,  1,  1,
        1,  8,  1,  1,  1,  2,  1,  4,  1,  3,  1, 21,  1,  6,  1,  2,  1,
        1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  3,  1,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  4,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  4,  1,  1,  2,  2,  1,  2,  1,  1,  3,  1,
        1,  1,  1,  1,  1,  3,  4,  2,  1,  3,  1,  1,  1,  1,  1,  1,  2,
        4,  1,  7,  1, 11,  1,  1,  1,  2,  1,  1,  1,  2, 30,  1,  1,  1,
        1,  1,  1,  3,  4,  2,  1,  3,  1,  1,  1,  1,  1,  1,  6,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  4,  1,  1,  1,  2,  1,  1,
        1,  1,  1,  2,  1,  1,  2,  9,  1, 18,  2,  1,  2,  1,  1,  1,  3,
        1,  1,  1,  1,  3,  1,  1,  1,  2,  4,  2,  1,  2,  1,  1,  1,  1,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,
        2,  1,  2,  1,  3,  1,  1,  1,  1,  1,  1,  1,  1,  3,  1,  2,  3,
        2,  1,  3,  1,  1

### Sort the word counts for the Obama article

In [42]:
# Pair each term with its count, then show the ten most frequent terms.
word_count_dict = dict(zip(vocabulary, word_frequency))
obama_word_count = pd.Series(word_count_dict)
obama_word_count.sort_values(ascending=False).head(10)

the      40
in       30
and      21
of       18
to       14
his      11
obama     9
act       8
he        7
law       6
dtype: int64

In [58]:
# Same ranking as a two-column table (word, count) instead of a Series.
word_count_table = pd.DataFrame(
    {'word': list(word_count_dict.keys()), 'count': list(word_count_dict.values())},
    columns=['word', 'count'],
)
word_count_table.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,word,count
242,the,40
115,in,30
28,and,21
162,of,18
245,to,14
106,his,11
160,obama,9
18,act,8
104,he,7
30,as,6


## Compute TF-IDF for the corpus

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings (no stop-word list configured).
vect_tfidf = TfidfVectorizer()

In [45]:
# Fit on every article's text and build the corpus-wide document-term matrix;
# its shape is (number of articles, number of distinct terms).
dtm = vect_tfidf.fit_transform(people['text'])
dtm.shape

(59071, 548429)

In [46]:
# Terms of the corpus-wide TF-IDF model, ordered by their column in `dtm`.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
# list(...) keeps `vocabulary_tfidf` a plain Python list on every version.
if hasattr(vect_tfidf, 'get_feature_names_out'):
    vocabulary_tfidf = list(vect_tfidf.get_feature_names_out())
else:
    vocabulary_tfidf = list(vect_tfidf.get_feature_names())
vocabulary_tfidf[0:20]

['00',
 '000',
 '0000',
 '00000',
 '00000van',
 '0001',
 '00014338',
 '0001sec',
 '0002',
 '00026',
 '0003',
 '0005',
 '000577',
 '0005sec',
 '0006',
 '0007',
 '0007105916',
 '0007200374',
 '0007207328',
 '0007213506']

### Examine the TF-IDF for the Obama article

In [47]:
# Re-display the Obama row to see its index label.
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [49]:
# TF-IDF weight of every word of the Obama article, read from the corpus-wide
# matrix `dtm`.
# Row: taken from the selection itself rather than a hard-coded number; `people`
# was loaded with the default 0..n-1 index, so the label is also the row of dtm.
obama_row = obama.index[0]
# Column: the fitted vectorizer's `vocabulary_` dict maps term -> column, which
# replaces a linear scan of the whole feature-name list for every word.
term_to_column = vect_tfidf.vocabulary_
word_scores = {word: dtm[obama_row, term_to_column[word]] for word in vocabulary}

In [56]:
# Tabulate the per-word scores and list the highest-weighted words first.
obama_word_scores = pd.DataFrame(
    {'word': list(word_scores.keys()), 'tfidf': list(word_scores.values())},
    columns=['word', 'tfidf'],
)

obama_word_scores.sort_values(by='tfidf', ascending=False).head(15)

Unnamed: 0,word,tfidf
160,obama,0.365018
242,the,0.279323
18,act,0.249089
115,in,0.209673
123,iraq,0.151809
28,and,0.146739
133,law,0.144687
60,control,0.131857
162,of,0.126205
256,us,0.122834


### Try again without stop words

In [60]:
# Second corpus-wide TF-IDF model, this time dropping English stop words.
vect_tfidf_stopwords = TfidfVectorizer(stop_words = 'english')
dtm_stopwords = vect_tfidf_stopwords.fit_transform(people['text'])
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
# list(...) keeps the vocabulary a plain Python list on every version.
if hasattr(vect_tfidf_stopwords, 'get_feature_names_out'):
    vocabulary_tfidf_stopwords = list(vect_tfidf_stopwords.get_feature_names_out())
else:
    vocabulary_tfidf_stopwords = list(vect_tfidf_stopwords.get_feature_names())

In [61]:
# Shape of the stop-word-filtered matrix: (articles, remaining distinct terms).
dtm_stopwords.shape

(59071, 548115)

In [62]:
# Peek at the first 20 terms of the stop-word-filtered vocabulary.
vocabulary_tfidf_stopwords[0:20]

['00',
 '000',
 '0000',
 '00000',
 '00000van',
 '0001',
 '00014338',
 '0001sec',
 '0002',
 '00026',
 '0003',
 '0005',
 '000577',
 '0005sec',
 '0006',
 '0007',
 '0007105916',
 '0007200374',
 '0007207328',
 '0007213506']

In [64]:
# TF-IDF weight of each Obama-article word that survives stop-word removal.
# The fitted vectorizer's `vocabulary_` dict (term -> column of dtm_stopwords)
# gives constant-time membership tests and column lookups, replacing two
# linear scans of the full feature-name list per word.
stopword_free_columns = vect_tfidf_stopwords.vocabulary_
# Row taken from the selection instead of a hard-coded number; `people` has the
# default 0..n-1 index, so the label is also the row of dtm_stopwords.
obama_row = obama.index[0]
word_scores_stopwords = {
    word: dtm_stopwords[obama_row, stopword_free_columns[word]]
    for word in vocabulary
    if word in stopword_free_columns  # stop words are absent from this model
}

In [65]:
# Tabulate the stop-word-free scores and list the highest-weighted words first.
obama_word_scores_stopwords = pd.DataFrame(
    {
        'word': list(word_scores_stopwords.keys()),
        'tfidf': list(word_scores_stopwords.values()),
    },
    columns=['word', 'tfidf'],
)

obama_word_scores_stopwords.sort_values(by='tfidf', ascending=False).head(15)

Unnamed: 0,word,tfidf
138,obama,0.413495
18,act,0.28217
105,iraq,0.17197
114,law,0.163903
53,control,0.149369
143,ordered,0.138633
126,military,0.135368
67,democratic,0.129792
180,response,0.124821
104,involvement,0.124821


### An alternative approach

In [72]:
# Fit a separate TF-IDF model on the Obama selection alone (English stop words
# removed).
# NOTE: when only one document is fitted, every term receives the same IDF, so
# the scores below are effectively normalised term counts rather than
# corpus-aware TF-IDF weights.
tfidf_obama_only = TfidfVectorizer(stop_words = 'english')
vectorized_obama_only = tfidf_obama_only.fit_transform(obama['text'])

# Per-term total over the fitted rows, summed on the sparse matrix directly
# instead of densifying it and looping over columns in Python.
obama_word_frequency = np.asarray(vectorized_obama_only.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back on older releases.
if hasattr(tfidf_obama_only, 'get_feature_names_out'):
    obama_only_terms = list(tfidf_obama_only.get_feature_names_out())
else:
    obama_only_terms = list(tfidf_obama_only.get_feature_names())
tfidf_obama_dict = dict(zip(obama_only_terms, obama_word_frequency))

tfidf_obama = pd.DataFrame(list(tfidf_obama_dict.items()), columns=['word', 'tfidf'])
tfidf_obama.sort_values('tfidf', ascending=False).head(15)

Unnamed: 0,word,tfidf
138,obama,0.341882
18,act,0.303895
114,law,0.227921
67,democratic,0.151947
126,military,0.151947
151,president,0.151947
105,iraq,0.151947
53,control,0.151947
189,school,0.113961
211,term,0.113961


## Manually compute similarity and distances between a few people

In [74]:
from sklearn.metrics.pairwise import cosine_similarity
# Select the comparison articles by exact name match.
clinton = people.loc[people['name'].eq('Bill Clinton')]
beckham = people.loc[people['name'].eq('David Beckham')]

### Is Obama closer to Clinton than to Beckham?

Note that we're computing the cosine similarity (which, for non-negative vectors such as TF-IDF weights, ranges from 0 to 1, where 1 is most similar), not the cosine distance, which we can use as well.

cosine_distance = (1-cosine_similarity)

We should find that the article about president Obama is closer to the one about former president Clinton than to the one about footballer David Beckham.

In [76]:
# Cosine similarity between the TF-IDF rows of the Obama and Clinton selections;
# [0][0] picks the first entry of the resulting matrix.
cosine_similarity(dtm[obama.index], dtm[clinton.index])[0][0]

0.32502224734753526

In [77]:
# Cosine similarity between the TF-IDF rows of the Obama and Beckham selections;
# [0][0] picks the first entry of the resulting matrix.
cosine_similarity(dtm[obama.index], dtm[beckham.index])[0][0]

0.15795460246896714

But we can also compute the cosine distance as we did in GraphLab Create

In [79]:
from sklearn.metrics.pairwise import pairwise_distances
# Cosine distance between the Obama and Clinton rows; [0][0] picks the first
# entry of the resulting matrix.
pairwise_distances(dtm[obama.index], dtm[clinton.index], metric='cosine')[0][0]

0.67497775265246474

In [80]:
# Cosine distance between the Obama and Beckham rows; [0][0] picks the first
# entry of the resulting matrix.
pairwise_distances(dtm[obama.index], dtm[beckham.index], metric='cosine')[0][0]

0.84204539753103291

## Build a nearest neighbor model for document retrieval

In [81]:
from sklearn.neighbors import NearestNeighbors
# Nearest-neighbour index returning 10 neighbours per query.
# NOTE(review): no metric is passed, so the estimator's default distance is used
# (the fitted repr in this notebook reports metric='minkowski', p=2, i.e.
# Euclidean), not the cosine distance computed earlier. Assuming the TF-IDF rows
# are L2-normalised (TfidfVectorizer's default — confirm), the neighbour ranking
# is the same under both; only the reported distance values differ.
neighbors = NearestNeighbors(n_neighbors=10)

In [82]:
# Index every article's TF-IDF row for neighbour queries.
neighbors.fit(dtm)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

### Who is closest to Obama?

In [83]:
# Query the neighbours of the Obama row; returns two parallel arrays, the
# distances and the row positions (into dtm) of the neighbours found.
distances, indices = neighbors.kneighbors(dtm[obama.index], return_distance=True)

In [84]:
# Distances from the query to each neighbour, one row per query article.
distances

array([[ 0.        ,  1.06843875,  1.1098956 ,  1.11802815,  1.13996938,
         1.14776932,  1.1503744 ,  1.15493924,  1.15776052,  1.15967833]])

In [85]:
# Row positions of the neighbours, aligned element-wise with `distances`.
indices

array([[35817, 24478, 57108, 38376, 38714, 28447, 39357, 48693, 18827,
        46811]])

In [90]:
# Pair each neighbour's name with its distance from the Obama article.
# `indices` holds row positions, hence the positional .iloc lookup.
# The ranking is built from the list of pairs rather than from a dict keyed by
# name, so two neighbours that happen to share a name are both listed instead
# of one silently overwriting the other.
obama_neighbor_pairs = list(zip(people.iloc[indices[0]]['name'], distances[0]))
# Name -> distance mapping, kept under its original name for later use.
obama_closest = dict(obama_neighbor_pairs)

import operator

# Closest first.
sorted(obama_neighbor_pairs, key=operator.itemgetter(1), reverse=False)

[('Barack Obama', 0.0),
 ('Joe Biden', 1.0684387489755509),
 ('Hillary Rodham Clinton', 1.1098955951464644),
 ('Samantha Power', 1.1180281484430084),
 ('Eric Stern (politician)', 1.139969375675925),
 ('George W. Bush', 1.1477693200355308),
 ('John McCain', 1.1503743970400016),
 ('Artur Davis', 1.154939244962413),
 ('Henry Waxman', 1.157760523580238),
 ('Jeff Sessions', 1.1596783267790989)]

## Other examples of document retrieval

In [91]:
# Nearest neighbours of the Taylor Swift article.
swift = people[people['name'] == 'Taylor Swift']

distances_swift, indices_swift = neighbors.kneighbors(dtm[swift.index], return_distance=True)

# Rank from the list of (name, distance) pairs rather than from a dict keyed by
# name, so neighbours sharing a name are not collapsed into a single entry.
swift_neighbor_pairs = list(zip(people.iloc[indices_swift[0]]['name'], distances_swift[0]))
# Name -> distance mapping, kept under its original name for later use.
swift_closest = dict(swift_neighbor_pairs)

# Closest first.
sorted(swift_neighbor_pairs, key=operator.itemgetter(1), reverse=False)

[('Taylor Swift', 0.0),
 ('Carrie Underwood', 1.1100799052480037),
 ('Adele', 1.1178055034883319),
 ('Kelly Clarkson', 1.1291984376933892),
 ('Dolly Parton', 1.1390377987371103),
 ('Joss Stone', 1.1399174335958189),
 ('Ed Sheeran', 1.1487480903595233),
 ('Rihanna', 1.1516023588424591),
 ('JoJo (singer)', 1.1528987198034819),
 ('Miranda Lambert', 1.1549131811918949)]

In [92]:
# Nearest neighbours of the Angelina Jolie article.
jolie = people[people['name'] == 'Angelina Jolie']

distances_jolie, indices_jolie = neighbors.kneighbors(dtm[jolie.index], return_distance=True)

# Rank from the list of (name, distance) pairs rather than from a dict keyed by
# name, so neighbours sharing a name are not collapsed into a single entry.
jolie_neighbor_pairs = list(zip(people.iloc[indices_jolie[0]]['name'], distances_jolie[0]))
# Name -> distance mapping, kept under its original name for later use.
jolie_closest = dict(jolie_neighbor_pairs)

# Closest first.
sorted(jolie_neighbor_pairs, key=operator.itemgetter(1), reverse=False)

[('Angelina Jolie', 0.0),
 ('Barbara Hershey', 1.1206292555062576),
 ('Glenn Close', 1.1258511704084893),
 ('Maggie Smith', 1.1347558539845199),
 ('Julianne Moore', 1.1401370672352029),
 ('Konkona Sen Sharma', 1.1450078336622631),
 ('Meryl Streep', 1.1458973492549014),
 ('Candice Bergen', 1.1464160850410523),
 ('Jodie Foster', 1.1465882426060314),
 ('Kate Winslet', 1.1475371175633065)]

In [93]:
# Nearest neighbours of the Arnold Schwarzenegger article.
arnold = people[people['name'] == 'Arnold Schwarzenegger']

distances_arnold, indices_arnold = neighbors.kneighbors(dtm[arnold.index], return_distance=True)

# Rank from the list of (name, distance) pairs rather than from a dict keyed by
# name, so neighbours sharing a name are not collapsed into a single entry.
arnold_neighbor_pairs = list(zip(people.iloc[indices_arnold[0]]['name'], distances_arnold[0]))
# Name -> distance mapping, kept under its original name for later use.
arnold_closest = dict(arnold_neighbor_pairs)

# Closest first.
sorted(arnold_neighbor_pairs, key=operator.itemgetter(1), reverse=False)

[('Arnold Schwarzenegger', 0.0),
 ('Paul Grant (bodybuilder)', 1.216373900392606),
 ('Bonnie Garcia', 1.2219353032328455),
 ('Abel Maldonado', 1.2327233063300338),
 ('David Israel', 1.2391098751511127),
 ('John Garamendi', 1.2404854214576317),
 ('James Tramel', 1.2464936685404691),
 ('Gray Davis', 1.2504671090936266),
 ('Russell Gould', 1.250487164162347),
 ('Jerry Brown', 1.2505900474630125)]