# Get Topics for Each Word

Here is the script I used to get the topics for each word. I then used the most popular topic for each word to build a simplified topic model on my website: https://sarajaksa.eu/2019/meicogsci-topics/text-modeling/

In [1]:
# import the libraries
import gensim
import collections
import os
import nltk
import json
import pandas
from constants import folder_meicogsci, folder_meicogsci_2019, folder_models, all_topics_names

In [2]:
# Load the trained LDA model stored as "LDA_21" inside folder_models.
model_21 = gensim.models.LdaModel.load(os.path.join(folder_models, "LDA_21"))
# Drop the model's probability cutoff to zero (presumably so that low-weight
# topics are not filtered out of query results -- confirm against gensim docs).
model_21.minimum_probability = 0.0

In [3]:
# For each topic, get its 500 most representative words.
# Per gensim's documentation, num_topics=-1 requests every topic and
# formatted=False returns (word, weight) pairs instead of a formatted string.
all_topics = model_21.show_topics(num_topics=-1, num_words=500, formatted=False)

In [4]:
# Invert the per-topic word lists into a nested mapping:
#   word -> {topic name: weight of the word in that topic}
# The defaultdict creates the inner dict the first time a word is seen.
all_words = collections.defaultdict(dict)
for topic, words in all_topics:
    for word, freq in words:
        # Convert to a plain Python float so the mapping can be dumped as JSON.
        weight = float(freq)
        all_words[word][all_topics_names[topic]] = weight

Below are some examples of the words and their topics.

In [5]:
# Topics (with their weights) recorded for the word "negative emotion".
all_words["negative emotion"]

{'reinforcment learning': 0.0003554156864993274}

In [6]:
# Topics (with their weights) recorded for the word "cognitive scientist".
all_words["cognitive scientist"]

{'verbalization': 0.001178124570287764, 'reasoning': 0.001995381899178028}

In [7]:
# Topics (with their weights) recorded for the word "morality".
all_words["morality"]

{'decision making': 0.0021055189426988363}

In [8]:
# Topics (with their weights) recorded for the word "neurological".
all_words["neurological"]

{'pitch': 0.0037964843213558197,
 'non-typical': 0.0016196452779695392,
 'neuroscience': 0.0010986431734636426,
 'constructivism': 0.0005386153934523463,
 'learning': 0.0007708182674832642,
 'attention': 0.0008897276129573584,
 'TMS': 0.0004845411458518356}

In [9]:
# Serialize the full word -> {topic name: weight} mapping to a JSON string.
json.dumps(all_words)

'{"language": {"pitch": 0.016879143193364143, "categorization": 0.004162071738392115, "modeling": 0.005567881278693676, "non-typical": 0.0004919379134662449, "verbalization": 0.007837525568902493, "language": 0.05771108344197273, "reasoning": 0.0006777359521947801, "system": 0.01090843416750431}, "speech": {"pitch": 0.012216161005198956, "health": 0.006021290086209774, "verbalization": 0.006537938490509987, "language": 0.004639361519366503, "system": 0.002979497192427516, "neural networks": 0.000795051222667098}, "brain": {"pitch": 0.0121036721393466, "movement": 0.007805631961673498, "society": 0.0008395765325985849, "neuroscience": 0.027518950402736664, "health": 0.0014513020869344473, "perception": 0.0014325991505756974, "learning": 0.0010376477148383856, "verbalization": 0.005948040634393692, "reasoning": 0.0023951425682753325, "attention": 0.0017139205010607839, "system": 0.000984728685580194, "tasks": 0.002667863853275776, "TMS": 0.023137787356972694, "neural networks": 0.0066892

In [10]:
# Map each word to its most popular topic, i.e. the topic name carrying the
# highest weight in that word's {topic name: weight} dict.
# max() over the dict replaces building and sorting a throwaway DataFrame per
# word; on an exact tie the topic that was inserted first wins.
# Words without any topic are skipped because max() raises ValueError on an
# empty dict -- all_words is a defaultdict, so merely indexing an unknown word
# leaves such an empty entry behind.
word_with_topic = {
    word: max(topics, key=topics.get)
    for word, topics in all_words.items()
    if topics
}

In [11]:
# Preview the first ten rows of words with their most popular topic.
pandas.DataFrame(
    list(word_with_topic.items()),
    columns=["Word", "Topic"],
).head(10)

Unnamed: 0,Word,Topic
0,language,language
1,speech,pitch
2,brain,neuroscience
3,tone,pitch
4,ability,pitch
5,human,society
6,pitch,pitch
7,native,pitch
8,musical,pitch
9,perception,pitch


In [12]:
# Serialize the word -> most popular topic mapping to a JSON string.
json.dumps(word_with_topic)

'{"language": "language", "speech": "pitch", "brain": "neuroscience", "tone": "pitch", "ability": "pitch", "human": "society", "pitch": "pitch", "native": "pitch", "musical": "pitch", "perception": "pitch", "music": "pitch", "effect": "health", "context": "pitch", "difference": "attention", "feature": "perception", "hypothesis": "pitch", "stroke": "pitch", "test": "health", "second": "pitch", "class": "pitch", "task": "learning", "speaker": "language", "curiosity": "pitch", "english": "language", "stimulus": "perception", "grammar": "pitch", "protein": "pitch", "population": "pitch", "correlation": "pitch", "critical": "pitch", "experiment": "categorization", "specie": "pitch", "imitation": "pitch", "probe": "pitch", "many": "neural networks", "vocal": "pitch", "syndrome": "pitch", "musician": "pitch", "dementia": "pitch", "patient": "non-typical", "adult": "neuroscience", "question": "constructivism", "time": "health", "infant": "pitch", "mechanism": "reinforcment learning", "german":