In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

In [2]:
# Load wiki_people.csv, use its 'name' column as the index and keep only the
# 'text' column, giving a Series of article text keyed by name.
data_articles = pd.read_csv(
    'wiki_people.csv',
    index_col='name',
)['text']

In [3]:
# Number of articles that were loaded
data_articles.shape[0]

53164

In [4]:
# Build a nearest-neighbour model for a given feature matrix
def create_NN(feature, n_neighbors=10, **nn_kwargs):
    """Fit and return a ``NearestNeighbors`` model on ``feature``.

    Parameters
    ----------
    feature : array-like or sparse matrix
        Feature matrix handed straight to ``NearestNeighbors.fit``.
    n_neighbors : int, optional
        Number of neighbours the model returns per query by default.
        Defaults to 10, the value that used to be hard-coded here.
    **nn_kwargs
        Any further keyword arguments are forwarded unchanged to the
        ``NearestNeighbors`` constructor (for example ``metric``).

    Returns
    -------
    The value returned by ``NearestNeighbors(...).fit(feature)``.
    """
    return NearestNeighbors(n_neighbors=n_neighbors, **nn_kwargs).fit(feature)

In [5]:
# Retrieve the articles most similar to a query article
def get_closest_neighs(name, model, feature, data_set):
    """Return the nearest neighbours of the article labelled ``name``.

    Parameters
    ----------
    name : hashable
        Label of the query article in ``data_set.index``.
    model : object
        Fitted model exposing ``kneighbors(X)`` that returns a
        ``(distances, indices)`` pair of arrays.
    feature : sparse matrix or 2-D array
        Feature matrix; the row at position ``i`` is used as the features of
        the ``i``-th entry of ``data_set``.
    data_set : pandas.Series or pandas.DataFrame
        Data whose index holds the article names.

    Returns
    -------
    pandas.DataFrame
        Columns ``'distance'`` and ``'name'``, one row per neighbour, in the
        order produced by ``model.kneighbors``.

    Raises
    ------
    KeyError
        If ``name`` is not present in ``data_set.index`` (from ``get_loc``).
    ValueError
        If ``name`` occurs more than once in ``data_set.index``.
    """
    row = data_set.index.get_loc(name)
    # For a duplicated label get_loc returns a slice or boolean mask instead
    # of a single position; there is no single query row in that case.
    if not isinstance(row, (int, np.integer)):
        raise ValueError(
            "name %r matches more than one row of data_set" % (name,))

    if hasattr(feature, 'getrow'):
        query = feature.getrow(row)
    else:
        # Dense inputs have no getrow(); a one-row slice keeps the query 2-D.
        query = feature[row:row + 1]

    distances, indices = model.kneighbors(query)

    # Translate row positions back to labels directly from the index. This
    # avoids rebuilding the whole data set with reset_index() on every call
    # and does not depend on the index being called 'name'.
    names_similar = np.asarray(data_set.index[indices.ravel()])
    return pd.DataFrame({'distance': distances.ravel(), 'name': names_similar})

In [6]:
# Feature extraction (1): fit a CountVectorizer on the articles and keep both
# the fitted vectorizer and the resulting document-term matrix.
countvec = CountVectorizer()
wordvec = countvec.fit_transform(data_articles)

In [7]:
# Feature extraction (2): fit a TfidfVectorizer on the same articles and keep
# both the fitted vectorizer and the resulting tf-idf matrix.
tfidfvec = TfidfVectorizer()
tfidf_matrix = tfidfvec.fit_transform(data_articles)

In [8]:
# One nearest-neighbour model per feature representation
# (the count-based model is fitted first, then the tf-idf one).
model_wordvec, model_tfidf = create_NN(wordvec), create_NN(tfidf_matrix)

In [9]:
# Compare both models on the 'Barack Obama' article.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Retrieved articles with Word Vector model for 'Barack Obama' Article: ")
print(get_closest_neighs('Barack Obama', model_wordvec, wordvec, data_articles))
print("\n\n Retrieved articles with tf-idf model for 'Barack Obama' Article: ")
print(get_closest_neighs('Barack Obama', model_tfidf, tfidf_matrix, data_articles))

Retrieved articles with Word Vector model for 'Barack Obama' Article: 
    distance                        name
0   0.000000                Barack Obama
1  33.015148                   Joe Biden
2  34.307434              George W. Bush
3  35.791060                 Mitt Romney
4  36.069378            Lawrence Summers
5  36.249138              Walter Mondale
6  36.276714            Francisco Barrio
7  36.400549                  Don Bonker
8  36.441734  Wynn Normington Hugh-Jones
9  36.837481    Refael (Rafi) Benvenisti


 Retrieved articles with tf-idf model for 'Barack Obama' Article: 
   distance                     name
0  0.000000             Barack Obama
1  1.067974                Joe Biden
2  1.109491   Hillary Rodham Clinton
3  1.117104           Samantha Power
4  1.139620  Eric Stern (politician)
5  1.147363           George W. Bush
6  1.149957              John McCain
7  1.154632              Artur Davis
8  1.157366             Henry Waxman
9  1.159638            Jeff Sessions


In [10]:
# Compare both models on the 'Joe Biden' article.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Retrieved articles with Word Vector model for 'Joe Biden' Article: ")
print(get_closest_neighs('Joe Biden', model_wordvec, wordvec, data_articles))
print("\n\nRetrieved articles with tf-idf model for 'Joe Biden' Article: ")
print(get_closest_neighs('Joe Biden', model_tfidf, tfidf_matrix, data_articles))

Retrieved articles with Word Vector model for 'Joe Biden' Article: 
    distance                        name
0   0.000000                   Joe Biden
1  28.740216          William R. Hawkins
2  29.000000              David Whissell
3  29.086079  Michael Henderson (author)
4  29.308702                Jimmy Carter
5  29.308702        Giancarlo Pagliarini
6  29.410882                  John Kerry
7  29.529646                 John McCain
8  29.916551               Matthew Steen
9  30.066593  Peter Fenwick (politician)


Retrieved articles with tf-idf model for 'Joe Biden' Article: 
   distance                    name
0  0.000000               Joe Biden
1  1.007736              Jill Biden
2  1.059652           Cynthia Hogan
3  1.067974            Barack Obama
4  1.133607             Chris Coons
5  1.187922          Michael Castle
6  1.194682           Jeff Sessions
7  1.196413             John McCain
8  1.203451     Kenneth D. Thompson
9  1.204226  Hillary Rodham Clinton
