In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the Wikipedia people dataset from the CSV file next to the notebook.
people = pd.read_csv(filepath_or_buffer='people_wiki.csv')
# Preview the first five rows.
people.head(n=5)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


# Question 1
Top words by raw word count for Elton John

In [2]:
# Keep only the row(s) whose name is exactly 'Elton John'.
is_elton_john = people['name'].eq('Elton John')
john = people.loc[is_elton_john]
# Show the article text of the first matching row.
john['text'].iloc[0]

'sir elton hercules john cbe born reginald kenneth dwight 25 march 1947 is an english singer songwriter composer pianist record producer and occasional actor he has worked with lyricist bernie taupin as his songwriter partner since 1967 they have collaborated on more than 30 albums to datein his fivedecade career elton john has sold more than 300 million records making him one of the bestselling music artists in the world he has more than fifty top 40 hits including seven consecutive no 1 us albums 58 billboard top 40 singles 27 top 10 four no 2 and nine no 1 for 31 consecutive years 19702000 he had at least one song in the billboard hot 100 his single something about the way you look tonightcandle in the wind 1997 sold over 33 million copies worldwide and is the bestselling single of all time he has received six grammy awards five brit awards winning two awards for outstanding contribution to music and the first brits icon in 2013 for his lasting impact on british culture an academy a

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words model on the Elton John article only (no stop-word
# filtering) and get its sparse word-count matrix.
vect_john = CountVectorizer()
features_john = vect_john.fit_transform(john['text'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back on older releases.
# list(...) keeps `vocabulary_john` a plain list of words, as it was before.
if hasattr(vect_john, 'get_feature_names_out'):
    vocabulary_john = list(vect_john.get_feature_names_out())
else:
    vocabulary_john = list(vect_john.get_feature_names())

In [10]:
# Sum each column of the dense count matrix: one total per vocabulary word.
john_word_frequency = [sum(column) for column in features_john.toarray().T]
# Pair every vocabulary word with its total count.
john_word_count_dict = dict(zip(vocabulary_john, john_word_frequency))

# Two-column table (word, count), most frequent words first.
john_word_count = (
    pd.DataFrame(list(john_word_count_dict.items()), columns=['word', 'count'])
    .sort_values(by='count', ascending=False)
)
john_word_count.head(10)

Unnamed: 0,word,count
227,the,27
135,in,18
42,and,15
176,of,13
118,has,9
121,he,7
146,john,7
177,on,6
209,since,5
106,for,5


# Question 2
Top TF-IDF words for Elton John

In [11]:
# Display the filtered Elton John row (including its row label in `people`).
john

Unnamed: 0,URI,name,text
19923,<http://dbpedia.org/resource/Elton_John>,Elton John,sir elton hercules john cbe born reginald kenn...


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF over every article in the corpus, dropping English stop words.
# `dtm` is the resulting document-term matrix (one row per article).
vect_tfidf = TfidfVectorizer(stop_words='english')
dtm = vect_tfidf.fit_transform(people['text'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back on older releases.
# list(...) keeps `vocabulary_tfidf` a plain list (supporting `in` and
# `.index()`), as it was before.
if hasattr(vect_tfidf, 'get_feature_names_out'):
    vocabulary_tfidf = list(vect_tfidf.get_feature_names_out())
else:
    vocabulary_tfidf = list(vect_tfidf.get_feature_names())


In [13]:
# Collect Elton John's TF-IDF weight for every word of his article that is
# also part of the corpus-wide TF-IDF vocabulary (stop words are absent there).

# The fitted vectorizer's `vocabulary_` maps each term to its column in `dtm`,
# so membership tests and column lookups are O(1) instead of scanning the
# full vocabulary list with `in` / `.index()` for every word.
tfidf_column = vect_tfidf.vocabulary_

# Row of the Elton John article, derived from the filtered frame instead of
# the hard-coded 19923.
# NOTE(review): assumes `people` keeps its default 0..n-1 index, so the row
# label equals the row position in `dtm` -- confirm if the index ever changes.
john_row = john.index[0]

word_scores_john = {
    word: dtm[john_row, tfidf_column[word]]
    for word in vocabulary_john
    if word in tfidf_column
}

In [14]:
# Turn the word -> TF-IDF mapping into a two-column table.
john_word_scores = pd.DataFrame(
    list(word_scores_john.items()),
    columns=['word', 'tfidf'],
)

# Highest-weighted words first; show the top 15.
john_word_scores.sort_values(by='tfidf', ascending=False).iloc[:15]

Unnamed: 0,word,tfidf
47,billboard,0.220815
118,john,0.217082
78,elton,0.212174
93,furnish,0.208194
181,songwriters,0.137278
42,award,0.136446
34,aids,0.127077
136,million,0.124955
1,100,0.123607
149,palace,0.122427


# Question 3
The cosine distance between the 'Elton John' and 'Victoria Beckham' articles (represented with TF-IDF) falls within which range?

In [15]:
from sklearn.metrics.pairwise import pairwise_distances

# Row(s) of the corpus whose name is exactly 'Victoria Beckham'.
beckham = people.loc[people['name'] == 'Victoria Beckham']

# Cosine distance between the two articles' TF-IDF rows; the result is a
# 2-D array, so read its first (row 0, column 0) entry.
pairwise_distances(
    dtm[john.index],
    dtm[beckham.index],
    metric='cosine',
)[0, 0]

0.96592976924167284

# Question 4
The cosine distance between the 'Elton John' and 'Paul McCartney' articles (represented with TF-IDF) falls within which range?

In [16]:
# Row(s) of the corpus whose name is exactly 'Paul McCartney'.
mccartney = people.loc[people['name'] == 'Paul McCartney']

# Cosine distance between the two articles' TF-IDF rows; the result is a
# 2-D array, so read its first (row 0, column 0) entry.
pairwise_distances(
    dtm[john.index],
    dtm[mccartney.index],
    metric='cosine',
)[0, 0]

0.81008627039643599

# Question 5
Who is closer to 'Elton John', 'Victoria Beckham' or 'Paul McCartney'?

* Paul McCartney (cosine distance ≈ 0.810, versus ≈ 0.966 for Victoria Beckham)

# Question 6
Who is the nearest neighbor to 'Elton John' using raw word counts?

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

# Raw word counts for every article, with English stop words removed.
vect = CountVectorizer(stop_words='english')
features = vect.fit_transform(people['text'])

# Nearest-neighbour index over the raw counts, returning 10 neighbours per
# query and otherwise using the estimator's default settings.
neighbors_raw = NearestNeighbors(n_neighbors=10)
neighbors_raw.fit(features)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

In [31]:
import operator

# Query the raw-count index with the Elton John row.
distances_raw_john, indices_raw_john = neighbors_raw.kneighbors(
    features[john.index],
    return_distance=True,
)

# Map neighbour name -> distance for the single query row.
# Names are dict keys, so neighbours sharing a name would collapse to one entry.
john_closest_raw = dict(
    zip(people['name'].iloc[indices_raw_john[0]], distances_raw_john[0])
)

# (name, distance) pairs, closest first.
sorted(john_closest_raw.items(), key=operator.itemgetter(1))

[('Elton John', 0.0),
 ('Barry Gibb', 20.420577856662138),
 ('Olivia Newton-John', 20.808652046684813),
 ('Scott Cutler', 21.377558326431949),
 ('Sting (musician)', 21.42428528562855),
 ('George Michael', 21.42428528562855),
 ('Roger Daltrey', 21.494185260204677),
 ('Jimmy Webb', 21.61018278497431),
 ('Anne Preven', 21.61018278497431),
 ('Scott Mathews', 21.679483388678801)]

# Question 7
Who is the nearest neighbor to 'Elton John' using TF-IDF?

In [18]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the TF-IDF matrix, 10 neighbours per query.
# `fit` returns the estimator itself, so construction and fitting are chained.
neighbors_tfidf = NearestNeighbors(n_neighbors=10).fit(dtm)
neighbors_tfidf

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

In [20]:
import operator

# Query the TF-IDF index with the Elton John row.
distances_john, indices_john = neighbors_tfidf.kneighbors(
    dtm[john.index],
    return_distance=True,
)

# Map neighbour name -> distance for the single query row.
# Names are dict keys, so neighbours sharing a name would collapse to one entry.
john_closest = dict(
    zip(people['name'].iloc[indices_john[0]], distances_john[0])
)

# (name, distance) pairs, closest first.
sorted(john_closest.items(), key=operator.itemgetter(1))

[('Elton John', 0.0),
 ('Rod Stewart', 1.1867420727453364),
 ('Sting (musician)', 1.1965095841300906),
 ('George Michael', 1.2038876011656052),
 ('Phil Collins', 1.2040186326374407),
 ('Kelly Clarkson', 1.2183370781896707),
 ('Usher (entertainer)', 1.2215036579599907),
 ('Adele', 1.2238839311499266),
 ('Rihanna', 1.2296774398874877),
 ('Bryan Adams', 1.2324149213938365)]

# Question 8
Who is the nearest neighbor to 'Victoria Beckham' using raw word counts?

In [32]:
# Query the raw-count index with the Victoria Beckham row.
distances_raw_beckham, indices_raw_beckham = neighbors_raw.kneighbors(
    features[beckham.index],
    return_distance=True,
)

# Map neighbour name -> distance for the single query row.
# Names are dict keys, so neighbours sharing a name would collapse to one entry.
beckham_closest_raw = dict(
    zip(people['name'].iloc[indices_raw_beckham[0]], distances_raw_beckham[0])
)

# (name, distance) pairs, closest first.
sorted(beckham_closest_raw.items(), key=lambda name_and_distance: name_and_distance[1])

[('Victoria Beckham', 0.0),
 ('Silvia Tcherassi', 19.235384061671343),
 ('Lela Rose', 19.416487838947599),
 ('Louis Molloy', 19.544820285692065),
 ('Kate Moss', 19.595917942265423),
 ('Namoli Brennet', 19.646882704388499),
 ('Dave Cartwright', 19.672315572906001),
 ('Hilary Alexander', 19.672315572906001),
 ('Karen Maruyama', 19.697715603592208),
 ('Charles Black (counterfeiter)', 19.748417658131498)]

# Question 9
Who is the nearest neighbor to 'Victoria Beckham' using TF-IDF?

In [29]:
# Query the TF-IDF index with the Victoria Beckham row.
distances_tfidf_beckham, indices_tfidf_beckham = neighbors_tfidf.kneighbors(
    dtm[beckham.index],
    return_distance=True,
)

# Map neighbour name -> distance for the single query row.
# Names are dict keys, so neighbours sharing a name would collapse to one entry.
beckham_closest_tfidf = dict(
    zip(people['name'].iloc[indices_tfidf_beckham[0]], distances_tfidf_beckham[0])
)

# (name, distance) pairs, closest first.
sorted(beckham_closest_tfidf.items(), key=lambda name_and_distance: name_and_distance[1])

[('Victoria Beckham', 0.0),
 ('David Beckham', 1.0727721445393383),
 ('Stephen Dow Beckham', 1.2642770409366204),
 ('Caroline Rush', 1.2745985436463616),
 ('Angelique Westerhof', 1.2761121836156752),
 ('Wal%C3%A9 Adeyemi', 1.2800181981682079),
 ('Colin McDowell', 1.2806572846455802),
 ('Zurain Imam', 1.2808955667963822),
 ('Mel B', 1.2813653221401371),
 ('Yuliya Polishchuk', 1.2819006119162419)]