# Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk

# Load dataset

In [2]:
# Read the first 2000 rows of the people dataset into a DataFrame.
people = pd.read_csv(filepath_or_buffer='people_data.csv', nrows=2000)

In [3]:
# Preview the first five rows of the loaded frame.
people.head(n=5)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Bill_Clinton>,Bill Clinton,william jefferson bill clinton born william je...
1,<http://dbpedia.org/resource/Polyana_L%C3%B3pez>,Polyana L%C3%B3pez,polyana lpez born circa 1985 is an argentine a...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


# Count Vectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vocabulary on the article text and turn every article
# into a row of raw term counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(people['text'])

# Tf-Idf Transform

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn inverse-document-frequency weights from the count matrix, then
# re-weight those same counts into TF-IDF scores.
tfidf_transform = TfidfTransformer()
X_train_tfidf = tfidf_transform.fit(X_train_counts).transform(X_train_counts)

In [6]:
# Attach each article's TF-IDF vector as a feature column: convert the
# matrix to a dense array and store one 1-D row array per article.
people['tfidf'] = [row for row in X_train_tfidf.toarray()]

In [7]:
# Preview the first five rows, now including the 'tfidf' column.
people.head(n=5)

Unnamed: 0,URI,name,text,tfidf
0,<http://dbpedia.org/resource/Bill_Clinton>,Bill Clinton,william jefferson bill clinton born william je...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,<http://dbpedia.org/resource/Polyana_L%C3%B3pez>,Polyana L%C3%B3pez,polyana lpez born circa 1985 is an argentine a...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# KD-Tree

In [8]:
from sklearn.neighbors import KDTree

# Build a KD-tree over the per-article TF-IDF vectors (leaf_size=3).
kdt = KDTree(list(people['tfidf']), leaf_size=3)

In [15]:
# Use the KD-tree to find the 5 articles nearest to Barack Obama's TF-IDF
# vector. The query article is itself stored in the tree, so it is expected
# to come back as its own closest match.
# `.loc` picks the matching row(s) and the 'tfidf' column in one step rather
# than chaining a column lookup with a boolean mask.
dist, idx = kdt.query(
    people.loc[people['name'] == 'Barack Obama', 'tfidf'].tolist(),
    k=5,
)

In [16]:
# Indices of the 5 nearest articles (the query used k=5); one row per query point
idx

array([[  32,    0, 1177, 1342, 1789]])

In [17]:
# Neighbour indices for the single query row.
idx[0, :]

array([  32,    0, 1177, 1342, 1789])

In [18]:
# Nearest neighbour 1 (the closest match, expected to be the query article
# itself). The row is taken from the query result by position instead of
# hard-coding an index copied from a previous run.
people['name'].iloc[idx[0][0]]

'Barack Obama'

In [19]:
# Nearest neighbour 2. The row is taken from the query result by position
# instead of hard-coding an index copied from a previous run.
people['name'].iloc[idx[0][1]]

'Bill Clinton'

In [20]:
# Nearest neighbour 3. The row is taken from the query result by position
# instead of hard-coding an index copied from a previous run.
people['name'].iloc[idx[0][2]]

'Donald Fowler'

In [21]:
# Nearest neighbour 4. The row is taken from the query result by position
# instead of hard-coding an index copied from a previous run.
people['name'].iloc[idx[0][3]]

'Grier Martin'

In [22]:
# Nearest neighbour 5. The row is taken from the query result by position
# instead of hard-coding an index copied from a previous run.
people['name'].iloc[idx[0][4]]

'Jonathan Steele'