# Introduction to Topic Modeling

In today's lesson, we're going to be working on topic modeling.

In [1]:
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

In [18]:
path_to_mallet = 'mallet-2.0.8/bin/mallet'

In [19]:
directory = "../_datasets/texts/history/NYT-Obituaries/"

In [20]:
files = glob.glob(f"{directory}/*.txt")

In [21]:
files

['../_datasets/texts/history/NYT-Obituaries/1945-Adolf-Hitler.txt',
 '../_datasets/texts/history/NYT-Obituaries/1915-F-W-Taylor.txt',
 '../_datasets/texts/history/NYT-Obituaries/1975-Chiang-Kai-shek.txt',
 '../_datasets/texts/history/NYT-Obituaries/1984-Ethel-Merman.txt',
 '../_datasets/texts/history/NYT-Obituaries/1953-Jim-Thorpe.txt',
 '../_datasets/texts/history/NYT-Obituaries/1964-Nella-Larsen.txt',
 '../_datasets/texts/history/NYT-Obituaries/1955-Margaret-Abbott.txt',
 '../_datasets/texts/history/NYT-Obituaries/1984-Lillian-Hellman.txt',
 '../_datasets/texts/history/NYT-Obituaries/1959-Cecil-De-Mille.txt',
 '../_datasets/texts/history/NYT-Obituaries/1928-Mabel-Craty.txt',
 '../_datasets/texts/history/NYT-Obituaries/1973-Eddie-Rickenbacker.txt',
 '../_datasets/texts/history/NYT-Obituaries/1989-Ferdinand-Marcos.txt',
 '../_datasets/texts/history/NYT-Obituaries/1991-Martha-Graham.txt',
 '../_datasets/texts/history/NYT-Obituaries/1997-Deng-Xiaoping.txt',
 '../_datasets/texts/history/N

In [22]:
little_mallet_wrapper.process_string(text, numbers='remove')

'august obituary stanislavsky dies moscow associated press moscow aug constantin sergeivitch stanislavsky one greatest masters russian drama founder moscow art theatre died today heart ailment year old dramatist ill since june collapsed preparations staging cartouche work seventeenth century french playwright moliere russia celebrated birthday last january stanislavsky began stage career made internationally famous directing performances non professional artists czarist russia received order red labor banner services proletarian stage stanislavsky actor producer teacher philosopher theatre actor appeared roles become part history russian theatre satine gorky lower depths famusov woe reason griboyedov fifty five years stanislavsky strove create bold new forms theatrical expression forty years headed moscow art theatre staged fifty plays enacted twenty seven characterizations recognition work government conferred besides red banner labor order lenin title people artist founder art theatr

In [23]:
training_data = []
for file in files:
    text = open(file, encoding='utf-8').read()
    processed_text = little_mallet_wrapper.process_string(text, numbers='remove')
    training_data.append(processed_text)

In [24]:
original_texts = []
for file in files:
    text = open(file, encoding='utf-8').read()
    original_texts.append(text)

In [25]:
obit_titles = [Path(file).stem for file in files]

In [26]:
obit_titles

['1945-Adolf-Hitler',
 '1915-F-W-Taylor',
 '1975-Chiang-Kai-shek',
 '1984-Ethel-Merman',
 '1953-Jim-Thorpe',
 '1964-Nella-Larsen',
 '1955-Margaret-Abbott',
 '1984-Lillian-Hellman',
 '1959-Cecil-De-Mille',
 '1928-Mabel-Craty',
 '1973-Eddie-Rickenbacker',
 '1989-Ferdinand-Marcos',
 '1991-Martha-Graham',
 '1997-Deng-Xiaoping',
 '1938-George-E-Hale',
 '1885-Ulysses-Grant',
 '1909-Sarah-Orne-Jewett',
 '1957-Christian-Dior',
 '1987-Clare-Boothe-Luce',
 '1976-Jacques-Monod',
 '1954-Getulio-Vargas',
 '1979-Stan-Kenton',
 '1990-Leonard-Bernstein',
 '1972-Jackie-Robinson',
 '1998-Fred-W-Friendly',
 '1991-Leo-Durocher',
 '1915-B-T-Washington',
 '1997-James-Stewart',
 '1981-Joe-Louis',
 '1983-Muddy-Waters',
 '1942-George-M-Cohan',
 '1989-Samuel-Beckett',
 '1962-Marilyn-Monroe',
 '2000-Charles-M-Schulz',
 '1967-Gregory-Pincus',
 '1894-R-L-Stevenson',
 '1978-Bruce-Catton',
 '1982-Arthur-Rubinstein',
 '1875-Andrew-Johnson',
 '1974-Charles-Lindbergh',
 '1964-Rachel-Carson',
 '1953-Marjorie-Rawlings',


In [27]:
little_mallet_wrapper.print_dataset_stats(training_data)

Number of Documents: 379
Mean Number of Words per Document: 1314.6
Vocabulary Size: 35983


In [34]:
num_topics = 20

In [35]:
training_data = training_data

In [36]:
#Change to your desired output directory
output_directory_path = 'topic-model-output/NYT-Obits'

#No need to change anything below here
Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

path_to_training_data           = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model                   = f"{output_directory_path}/mallet.model.{str(num_topics)}"
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions     = f"{output_directory_path}/mallet.topic_distributions.{str(num_topics)}"

In [37]:
little_mallet_wrapper.import_data(path_to_mallet,
                path_to_training_data,
                path_to_formatted_training_data,
                training_data)

Importing data...
Complete


In [38]:
little_mallet_wrapper.train_topic_model(path_to_mallet,
                      path_to_formatted_training_data,
                      path_to_model,
                      path_to_topic_keys,
                      path_to_topic_distributions,
                      num_topics)

Training topic model...
Complete


In [33]:
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

for topic_number, topic in enumerate(topics):
    print(f"✨Topic {topic_number}✨\n\n{topic}\n")

✨Topic 0✨

['world', 'german', 'work', 'life', 'man', 'moses', 'germany', 'century', 'modern', 'child', 'children', 'new', 'study', 'became', 'professor', 'great', 'human', 'philosophy', 'two', 'church']

✨Topic 1✨

['years', 'one', 'time', 'would', 'later', 'many', 'could', 'life', 'made', 'also', 'long', 'much', 'old', 'took', 'well', 'even', 'early', 'another', 'part', 'among']

✨Topic 2✨

['baseball', 'queen', 'louis', 'game', 'prince', 'year', 'league', 'first', 'world', 'team', 'ruth', 'one', 'won', 'england', 'victoria', 'years', 'tennis', 'duke', 'emperor', 'king']

✨Topic 3✨

['mrs', 'years', 'new', 'american', 'first', 'york', 'university', 'died', 'year', 'miss', 'two', 'became', 'school', 'women', 'born', 'national', 'world', 'later', 'death', 'home']

✨Topic 4✨

['president', 'roosevelt', 'house', 'truman', 'united', 'republican', 'party', 'state', 'senator', 'governor', 'war', 'american', 'democratic', 'nixon', 'states', 'election', 'campaign', 'office', 'hoover', 'senate