## Author-Topic Model

### Import data

In [1]:
import os
import json
from gensim.test.utils import datapath

tmp = datapath(os.getcwd() + '\\tmp\\model.atmodel')
dirpath = os.getcwd() + '\\corpus\\'

with open('nouns_adj.json', 'r') as f:
    documents = json.load(f)

### Assign unique author id to each text

In [2]:
authors, chunks = [], []

for file in os.listdir(dirpath):
    if file.startswith('__'):
        with open(dirpath+file, 'r') as f:
            js = json.load(f)
            authors.append(js['Author'][0]['Author'].replace(' ', '_'))
            chunks.append(len(js['Author'])) # number of texts for each author

chunks = [n if n < 100 else 100 for n in chunks] # keep 150 texts max for each author

author2doc = dict()

n = 0
for author, i in zip(authors, chunks):
    n += i
    author2doc[author] = list(range((n)-i, (n)))

### Create dictionary and corpus

In [3]:
from gensim import corpora

# Map each token to a unique ID
dictionary = corpora.Dictionary(documents)
print(f'Number of unique tokens: {len(dictionary)}')

# Filter out tokens by frequency
min_doc, max_doc = 15, .2
dictionary.filter_extremes(no_below=min_doc, no_above=max_doc)
print(f'Number of unique tokens (filtered): {len(dictionary)}')

# Create a BOW corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]

print(f'Number of documents: {len(corpus)}')
print(f'Number of authors: {len(author2doc)}')

Number of unique tokens: 30281
Number of unique tokens (filtered): 3231
Number of documents: 1260
Number of authors: 17


### Run the Author-Topic model

In [4]:
from gensim.models import AuthorTopicModel

# Set training parameters for ATM
num_topics = 12
random_state = 100
chunksize = len(corpus)
passes = 150
alpha = .9099999999999999
eta = .21
iterations = 100
eval_every = 1

In [20]:
at_model = AuthorTopicModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, \
                            author2doc=author2doc, chunksize=chunksize, passes=passes, \
                            gamma_threshold=1e-10, eval_every=eval_every, \
                            iterations=iterations, random_state=i, \
                            alpha=alpha, eta=eta, minimum_probability=0.01)

In [30]:
# Save the model
at_model.save(tmp)

In [5]:
# Load the model
at_model = AuthorTopicModel.load(tmp)

### Print the results

In [6]:
from pprint import pprint

top_topics = at_model.top_topics(corpus)
pprint(at_model.print_topics(num_words=15))

[(0,
  '0.019*"нейтрино" + 0.012*"частота" + 0.012*"измерение" + 0.012*"детектор" + '
  '0.010*"излучение" + 0.009*"распад" + 0.009*"ядро" + 0.008*"событие" + '
  '0.008*"гравитационный_волна" + 0.008*"вселенная" + 0.007*"поток" + '
  '0.007*"сигнал" + 0.007*"нейтрон" + 0.007*"электрон" + 0.007*"фотон"'),
 (1,
  '0.017*"порода" + 0.011*"атмосфера" + 0.011*"зона" + 0.009*"океан" + '
  '0.009*"глубина" + 0.008*"мантия" + 0.007*"кислород" + 0.007*"железо" + '
  '0.007*"содержание" + 0.007*"планета" + 0.007*"морской" + 0.006*"минерал" + '
  '0.006*"граница" + 0.006*"углерод" + 0.005*"отложение"'),
 (2,
  '0.021*"звезда" + 0.009*"ядро" + 0.009*"чёрный_дыра" + 0.009*"галактика" + '
  '0.009*"вселенная" + 0.008*"солнечный" + 0.007*"солнце" + '
  '0.007*"космический" + 0.006*"звёздный" + 0.006*"дыра" + 0.006*"астроном" + '
  '0.006*"излучение" + 0.006*"физика" + 0.006*"белый_карлик" + 0.005*"поле"'),
 (3,
  '0.013*"мышь" + 0.010*"рецептор" + 0.010*"нейрон" + 0.009*"днк" + '
  '0.008*"растение"

### Evaluate topic coherence

In [25]:
from gensim.models import CoherenceModel

# Compute coherence score using CV
coherence_c_v_model = CoherenceModel(model=at_model, texts=documents, 
                                     dictionary=dictionary, coherence='c_v')
coherence_c_v = coherence_c_v_model.get_coherence()
print(f'Coherence score (c_v): {coherence_c_v}') # 0.5 is good, 0.6 is excellent

# Compute coherence score using UMass
coherence_umass_model = CoherenceModel(model=at_model, texts=documents, 
                                       dictionary=dictionary, coherence="u_mass")
coherence_umass = coherence_umass_model.get_coherence()
print(f'Coherence score (UMass): {coherence_umass}') # should be close to 0

Coherence score (c_v): 0.6516016930107807
Coherence score (UMass): -1.6153015119372658


### Show topic distribution for each author

In [26]:
topic_labels = ['Частицы', 'Лито- и гидросфера', 'Космос', 'Эксперимент',
                'Рак', 'Животные', 'Размножение', 'Коллайдер', 'Геном',
                'Затмение', 'Позвоночные', 'Сверхпроводники']

In [27]:
def show_author(name):
    print('\n%s' % name)
    print('Темы:')
    pprint([(topic_labels[topic[0]], topic[1]) for topic in at_model[name]])
    

for author in authors:
    show_author(author)


Александр_Козловский
Темы:
[('Космос', 0.2105464150033727), ('Затмение', 0.7860777810833285)]

Александр_Марков
Темы:
[('Лито- и гидросфера', 0.021053808383658088),
 ('Размножение', 0.8910183610685185),
 ('Геном', 0.03775617855735935),
 ('Позвоночные', 0.04521570680505765)]

Александр_Сергеев
Темы:
[('Частицы', 0.05242328568065768),
 ('Лито- и гидросфера', 0.043854007186273414),
 ('Космос', 0.253814773866717),
 ('Эксперимент', 0.07280322701005486),
 ('Рак', 0.015894118862348604),
 ('Размножение', 0.014791212611308871),
 ('Геном', 0.05707530067872772),
 ('Затмение', 0.463159409602393),
 ('Сверхпроводники', 0.015712023986673037)]

Алексей_Гиляров
Темы:
[('Животные', 0.9533332258713978), ('Размножение', 0.0443966662846007)]

Алексей_Левин
Темы:
[('Космос', 0.6745995815064718),
 ('Эксперимент', 0.04212715409745493),
 ('Рак', 0.0973365954262154),
 ('Коллайдер', 0.13417923303503432),
 ('Сверхпроводники', 0.04390345208132961)]

Алексей_Опаев
Темы:
[('Животные', 0.1773292731109675), ('Размнож