In [161]:
# Extract a few sentences from the wikipedia article
# since wikipedia articles on Hollywood actors are all many pages long
# I initially decided to use the Article extracts from the MediaWiki API as the dataset

In [162]:
import requests

def get_wiki_extract(article):    
    response = requests.get('https://en.wikipedia.org/w/api.php',
                            params={'action': 'query','format': 'json',
                                    'titles': article,
                                    'prop': 'extracts',
                                    'exintro': False,
                                    'explaintext': True,
                                    'exsectionformat': 'plain'
                                   }).json()
    text = next(iter(response['query']['pages'].values()))['extract']
    return text

In [163]:
# Use the wikipedia library to get the full content of the article
# to build a more complex model
import wikipedia
def get_wiki_content(article):
    return wikipedia.page(article).content

In [164]:
# Store all the raw wikipedia article extracts in a list
#raw_wiki_extracts = []
#for actor in actors:
    #raw_wiki_extracts.append(get_wiki_extract(actor))
#    raw_wiki_extracts.append(get_wiki_content(actor))
#print(raw_wiki_extracts[-1])

In [189]:
# Generate a random subset of 50 actors
# with shuf -n 150 actors.txt > actors50.txt

# this will fetch 150 wikipedia articles to build the corpus 
raw_wiki_extracts = []
with open('actors150.txt') as actors_list:
    for actor in actors_list:
        actor = actor.strip()
        content = get_wiki_content(actor)
        print(actor)
        raw_wiki_extracts.append(content)

Jaclyn_Betham
Desi_Arnaz
Kathy_Brier
Ernie_Anderson
Don_Ameche
Candace_Cameron_Bure
Chloe_Bennet
Maude_Apatow
Tammy_Blanchard
Todd_Armstrong
Devon_Aoki
Fiona_Bishop
Jack_Albertson
Rosanna_Arquette
Geoffrey_Arend
David_Arkin
Rodolfo_Acosta
Zazie_Beetz
Sam_Appel
Paul_America
Talia_Balsam
Edward_Andrews
Joe_Don_Baker
Irving_Bacon
Loanne_Bishop
Tim_Allen
Kevin_Bacon
Lester_Allen
Doris_Belack
Anna_Belknap
Tammy_Barr
Courtney_Taylor_Burness
Julie_Caitlin_Brown
Crystal_Allen
Jordana_Brewster
Jason_Alexander
Oscar_Apfel
Hedy_Burress
Gene_Autry
Dana_Barron
Michelle_Bauer
Humbert_Allen_Astredo
Delta_Burke
Eion_Bailey
Blythe_Auffarth
Melanie_Amaro
Susan_Blakely
Brandy_Burre
Darcy_Rose_Byrnes
Rex_Allen
Daniella_Alonso
Bobby_Alto
Alex_Borstein
Olivia_Burnette
Alma_Beltran
Elizabeth_Alderfer
Kristina_Anapau
Allison_Balson
Angela_Bassett
Maxine_Bahns
Nicole_Axelrod
Shaindel_Antelis
Billie_Joe_Armstrong
Angelica_Bridges
Rowan_Blanchard
Lou_Antonio
Bianca_Allen
Emile_de_Antonio
Edwin_August
Mia_Barron


In [191]:
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.corpus import stopwords

def raw_to_clean_bag_of_words(text):
    # Build a dict to translate TreeBank POS type to WordNet POS
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    
    text = text.lower()
    
    # keeps only whole words with at least four chars 
    tokenizer = RegexpTokenizer('\w{4,}')
    tokens = tokenizer.tokenize(text)
    
    # wordnet lemmatizer reduces to word root forms
    lemmatizer = WordNetLemmatizer()
    bag = []
    for token, tag in pos_tag(tokens):
        lemma = lemmatizer.lemmatize(token, tag_map[tag[0]])
        bag.append(lemma)
    
    # filter stopwords
    bag = filter(lambda word: word not in stopwords.words('english'), bag)
    return bag

# Build a bag of words representation for each article
bag = []
for article in raw_wiki_extracts:
    bag.append(raw_to_clean_bag_of_words(article))
#print(bag[-1])

In [192]:
# remove words that appear only once
frequency = defaultdict(int)
for text in bag:
    for token in text:
        frequency[token] += 1

bag = [[token for token in text if frequency[token] > 1] for text in bag]
#print(bag[0])

In [193]:
# Generate a dictionary from all the articles
import gensim
dictionary = gensim.corpora.Dictionary(bag)
corpus = [dictionary.doc2bow(text) for text in bag]

# Save the dictionary so we don't hammer wikipedia 
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [209]:
# Build a Latent Dirichlet Allocation (LDA) model with gensim
# https://radimrehurek.com/gensim/models/ldamodel.html
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics = 5, 
                                           id2word=dictionary, 
                                           update_every=0,
                                           passes=20)
ldamodel.save('actor_lda_model.gensim')
topics = ldamodel.print_topics(num_words=6)
pprint(topics)

#from the model probabilities, it appears that there are no significant 
#high probabilty, unique words that separate the topics

[(0,
  u'0.017*"arnaz" + 0.015*"autry" + 0.009*"film" + 0.007*"television" + 0.007*"show" + 0.007*"lucy"'),
 (1,
  u'0.013*"armstrong" + 0.009*"bacon" + 0.008*"film" + 0.008*"show" + 0.007*"behar" + 0.006*"release"'),
 (2,
  u'0.016*"allen" + 0.011*"film" + 0.010*"role" + 0.009*"play" + 0.007*"broadway" + 0.007*"star"'),
 (3,
  u'0.020*"film" + 0.010*"role" + 0.010*"star" + 0.010*"television" + 0.008*"series" + 0.008*"appear"'),
 (4,
  u'0.018*"film" + 0.013*"role" + 0.011*"series" + 0.009*"play" + 0.009*"episode" + 0.009*"bassett"')]


In [215]:
test_actors = ['Brie_Larson','Jennifer_Aniston','Brad_Pitt','Angelina_Jolie',
         'Winona_Ryder','Brittany_Murphy','Julia_Roberts', 'Kathy_Bates',
         'Sarah_Paulson','Susan_Sarandon','Dustin_Hoffman','Robin_Williams',
         'Alan_Alda','Judd_Apatow','Charlie_Adler','Kevin_Bacon','James_Arness',
         'Will_Arnett','Steve_Bannos','Jonathan_Banks','Rick_Aviles','Bea_Arthur','Lauren_Bacall']

In [213]:
# Try to classify new actors with LDA model
for actor in test_actors:
    bow = dictionary.doc2bow(raw_to_clean_bag_of_words(get_wiki_content(test_actor)))
    print(actor, ldamodel.get_document_topics(bow, minimum_probability=0.05 ))

('Brie_Larson', [(1, 0.051107556), (3, 0.4656969), (4, 0.47781518)])
('Jennifer_Aniston', [(1, 0.05110852), (3, 0.46568725), (4, 0.47782382)])
('Brad_Pitt', [(1, 0.051108066), (3, 0.46568987), (4, 0.47782165)])
('Angelina_Jolie', [(1, 0.05110684), (3, 0.46570358), (4, 0.47780916)])
('Winona_Ryder', [(1, 0.051106825), (3, 0.46570373), (4, 0.4778091)])
('Brittany_Murphy', [(1, 0.05110762), (3, 0.46569437), (4, 0.4778176)])
('Julia_Roberts', [(1, 0.051109333), (3, 0.46568048), (4, 0.47782984)])
('Kathy_Bates', [(1, 0.051107936), (3, 0.4656901), (4, 0.4778216)])
('Sarah_Paulson', [(1, 0.05110715), (3, 0.46570075), (4, 0.47781166)])
('Susan_Sarandon', [(1, 0.051108398), (3, 0.46569037), (4, 0.4778208)])
('Dustin_Hoffman', [(1, 0.05110834), (3, 0.46568844), (4, 0.47782284)])
('Robin_Williams', [(1, 0.05110863), (3, 0.46568555), (4, 0.47782546)])
('Alan_Alda', [(1, 0.051107652), (3, 0.4656984), (4, 0.47781357)])
('Judd_Apatow', [(1, 0.051106784), (3, 0.46570447), (4, 0.47780836)])
('Charlie_A

In [214]:
# Try the Hierarchical Dirichlet Process HDP model
hdpmodel = gensim.models.HdpModel(corpus, id2word=dictionary)
hdpmodel.optimal_ordering()
hdpmodel.print_topics(num_topics=5, num_words=6)

[(0,
  u'0.010*film + 0.006*play + 0.005*bacon + 0.005*award + 0.005*biel + 0.004*role'),
 (1,
  u'0.007*arnaz + 0.006*ameche + 0.006*albert + 0.006*star + 0.006*film + 0.005*show'),
 (2,
  u'0.006*episode + 0.005*film + 0.004*bilson + 0.003*breslin + 0.003*series + 0.003*play'),
 (3,
  u'0.017*bassett + 0.007*film + 0.005*role + 0.003*play + 0.003*appear + 0.002*year'),
 (4,
  u'0.012*autry + 0.004*gene + 0.004*film + 0.004*ranch + 0.003*allen + 0.003*also')]

In [220]:
# Try to classify new actors with HDP model
# https://radimrehurek.com/gensim/models/hdpmodel.html
for actor in test_actors:
    bow = [dictionary.doc2bow(raw_to_clean_bag_of_words(get_wiki_content(test_actor)))]
    print(actor, hdpmodel.evaluate_test_corpus(bow))

('Brie_Larson', -564.9070615803536)
('Jennifer_Aniston', -564.9070615803536)
('Brad_Pitt', -564.9070615803536)
('Angelina_Jolie', -564.9070615803536)
('Winona_Ryder', -564.9070615803536)
('Brittany_Murphy', -564.9070615803536)
('Julia_Roberts', -564.9070615803536)
('Kathy_Bates', -564.9070615803536)
('Sarah_Paulson', -564.9070615803536)
('Susan_Sarandon', -564.9070615803536)
('Dustin_Hoffman', -564.9070615803536)
('Robin_Williams', -564.9070615803536)
('Alan_Alda', -564.9070615803536)
('Judd_Apatow', -564.9070615803536)
('Charlie_Adler', -564.9070615803536)
('Kevin_Bacon', -564.9070615803536)
('James_Arness', -564.9070615803536)
('Will_Arnett', -564.9070615803536)
('Steve_Bannos', -564.9070615803536)
('Jonathan_Banks', -564.9070615803536)
('Rick_Aviles', -564.9070615803536)
('Bea_Arthur', -564.9070615803536)
('Lauren_Bacall', -564.9070615803536)


It is not clear to me how I can classify random wikipedia articles for hollywood actors by a specific theme. There does not appear to be sufficient distinguishing features from keywords alone