# Document retrieval from Wikipedia data

In [1]:
import turicreate

# Load some text data from Wikipedia

In [2]:
# Load the Wikipedia people dataset from the relative path below into an SFrame.
people = turicreate.SFrame("./people_wiki.sframe")

In [3]:
# Display the loaded SFrame (columns shown in the output: URI, name, text).
people

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


# Compute TF-IDF for the entire corpus of articles

In [4]:
# Compute TF-IDF from every article's text and store it as a new 'tfidf' column.
people["tfidf"] = turicreate.text_analytics.tf_idf(
    people["text"]
)

## Examine the word count and TF-IDF of Jimmy Carter

In [5]:
# Select the row(s) whose 'name' is exactly 'Jimmy Carter'.
carter = people[people["name"] == "Jimmy Carter"]

In [6]:
# Show the article text of the selected row.
carter["text"]

dtype: str
Rows: ?
['james earl jimmy carter jr born october 1 1924 is an american politician and member of the democratic party who served as the 39th president of the united states from 1977 to 1981 he was awarded the 2002 nobel peace prizecarter raised in rural georgia was a peanut farmer who served two terms as a georgia state senator and one as the governor of georgia from 1971 to 1975 he was elected president in 1976 defeating incumbent president gerald ford in a relatively close election running as an outsider who promised truth in government in the wake of the watergate scandal he is the second oldest after george h w bush of americas four living former presidentsduring carters term as president he created two new cabinetlevel departments the department of energy and the department of education he established a national energy policy that included conservation price control and new technology in foreign affairs carter pursued the camp david accords the panama canal treaties the

In [7]:
# Count the words in the selected article and keep the result in a 'word_count' column.
carter["word_count"] = turicreate.text_analytics.count_words(
    carter["text"]
)

In [8]:
# Stack the 'word_count' column into a two-column table: one 'word' and its 'count' per row.
word_count_table = carter[["word_count"]].stack(
    "word_count",
    new_column_name=["word", "count"],
)

In [9]:
# List the words from highest to lowest count.
word_count_table.sort("count", ascending=False)

word,count
the,31.0
in,13.0
of,13.0
and,11.0
he,10.0
a,6.0
to,6.0
as,6.0
was,5.0
for,4.0


In [10]:
# Stack the 'tfidf' column into (word, tfidf) rows and list them from highest to lowest score.
(
    carter[["tfidf"]]
    .stack("tfidf", new_column_name=["word", "tfidf"])
    .sort("tfidf", ascending=False)
)

word,tfidf
carter,19.43850482044404
panama,17.218413951194123
carters,14.070503341287534
canal,12.104042912189003
georgia,12.038857109115163
energy,11.423849716384485
prizecarter,10.986495389225194
reagancarter,10.986495389225194
presidentsduring,10.293348208665249
stagflation,10.293348208665249


# Apply nearest neighbors for retrieval of Wikipedia articles

## Build the NN model

In [11]:
# Build a nearest-neighbour model over all articles, using the 'tfidf' column as the
# feature and 'name' as the label reported in query results.
# NOTE(review): no `distance` argument is given, so the library chooses the metric;
# confirm that the chosen metric actually makes use of the TF-IDF weights.
knn_model = turicreate.nearest_neighbors.create(
    people,
    features=["tfidf"],
    label="name",
)

In [12]:
# Retrieve the articles closest to the Jimmy Carter row; the query article itself
# comes back first with distance 0.
knn_model.query(dataset=carter)

query_label,reference_label,distance,rank
0,Jimmy Carter,0.0,1
0,George H. W. Bush,0.8286445012787724,2
0,Walter Mondale,0.8328981723237598,3
0,Betty Wilson (New Jersey politician) ...,0.8337874659400545,4
0,Bill Clinton,0.8352941176470589,5


In [13]:
# Select the row(s) whose 'name' is exactly 'Leonardo DiCaprio'.
leonardo = people[people["name"] == "Leonardo DiCaprio"]

In [14]:
# Retrieve the articles closest to the Leonardo DiCaprio row; the query article
# itself comes back first with distance 0.
knn_model.query(dataset=leonardo)

query_label,reference_label,distance,rank
0,Leonardo DiCaprio,0.0,1
0,Nicole Kidman,0.7750865051903114,2
0,Alec Baldwin,0.7773722627737226,3
0,Robert De Niro,0.7905405405405406,4
0,Catherine Zeta-Jones,0.7938931297709924,5
