## Author-Topic Model

### Import data

In [36]:
import json

with open('preprocessing/data.json', 'r') as f:
    documents = json.load(f)
    texts = [' '.join([word for word in text]) for text in documents]

### Assign unique author id to each text

In [42]:
authors, chunks = [], []
author2doc = dict()

with open('corpus/elementy_authors.json', 'r') as f:
    js = json.load(f)
    for i in list(js):
        authors.append(i)
        chunks.append(len(js[i])) # number of texts for each author

chunks = [n if n < 100 else 100 for n in chunks] # keep 100 texts max

n = 0
for author, i in zip(authors, chunks):
    n += i
    author2doc[author] = list(range((n)-i, (n)))

### Create dictionary and corpus

In [62]:
from gensim import corpora

# Map each token to a unique ID
dictionary = corpora.Dictionary(documents)
print(f'Number of unique tokens: {len(dictionary)}')

# Filter out tokens by frequency
min_doc, max_doc = 15, .3
dictionary.filter_extremes(no_below=min_doc, no_above=max_doc)
print(f'Number of unique tokens (filtered): {len(dictionary)}')

# Create a BOW corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]
print(f'Number of documents: {len(corpus)}')

print(f'Number of authors: {len(author2doc)}')

Number of unique tokens: 43392
Number of unique tokens (filtered): 5034
Number of documents: 2289
Number of authors: 139


### Run the Author-Topic model

In [68]:
from gensim.models import AuthorTopicModel
from gensim.test.utils import datapath

tmp = datapath('/tmp/model.atmodel')

# Set training parameters
num_topics = 11
random_state = 100
chunksize = len(corpus)
passes = 150
alpha = .9099999999999999
eta = .21
iterations = 100
eval_every = 1

In [69]:
at_model = AuthorTopicModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, \
                            author2doc=author2doc, chunksize=chunksize, passes=passes, \
                            gamma_threshold=1e-10, eval_every=eval_every, \
                            iterations=iterations, random_state=i, \
                            alpha=alpha, eta=eta, minimum_probability=0.01)

In [None]:
# Save the model
at_model.save(tmp)

In [None]:
# Load the model
at_model = AuthorTopicModel.load(tmp)

### Print the results

In [70]:
from pprint import pprint

top_topics = at_model.top_topics(corpus)
pprint(at_model.print_topics(num_words=15))

[(0,
  '0.009*"белок" + 0.009*"геном" + 0.008*"мозг" + 0.008*"бактерия" + '
  '0.007*"нейрон" + 0.005*"популяция" + 0.005*"мутация" + 0.005*"генетический" '
  '+ 0.005*"днк" + 0.005*"эволюционный" + 0.005*"поведение" + 0.004*"мышь" + '
  '0.004*"функция" + 0.004*"действие" + 0.004*"решение"'),
 (1,
  '0.020*"молекула" + 0.015*"реакция" + 0.011*"бактерия" + 0.009*"химический" '
  '+ 0.008*"вода" + 0.008*"фермент" + 0.007*"атом" + 0.007*"материал" + '
  '0.006*"поверхность" + 0.006*"энергия" + 0.006*"соединение" + 0.006*"синтез" '
  '+ 0.006*"свойство" + 0.006*"вирус" + 0.005*"молекулярный"'),
 (2,
  '0.010*"динозавр" + 0.009*"остаток" + 0.009*"древний" + 0.009*"зуб" + '
  '0.009*"кость" + 0.008*"находка" + 0.006*"морской" + 0.006*"палеонтолог" + '
  '0.006*"млекопитающее" + 0.006*"вымирание" + 0.005*"хищник" + 0.005*"птица" '
  '+ 0.005*"ископаемое" + 0.005*"отложение" + 0.004*"возраст"'),
 (3,
  '0.026*"самка" + 0.025*"самец" + 0.012*"растение" + 0.011*"популяция" + '
  '0.010*"насеком

### Evaluate topic coherence

In [71]:
from gensim.models import CoherenceModel

# Compute coherence score using CV
coherence_c_v_model = CoherenceModel(model=at_model, texts=documents, 
                                     dictionary=dictionary, coherence='c_v')
coherence_c_v = coherence_c_v_model.get_coherence()
print(f'Coherence score (c_v): {coherence_c_v}') # 0.5 is good, 0.6 is excellent

# Compute coherence score using UMass
coherence_umass_model = CoherenceModel(model=at_model, texts=documents, 
                                       dictionary=dictionary, coherence="u_mass")
coherence_umass = coherence_umass_model.get_coherence()
print(f'Coherence score (UMass): {coherence_umass}') # should be close to 0

Coherence score (c_v): 0.6101308832568375
Coherence score (UMass): -1.4060572125592259


### Show topic distribution for each author

In [72]:
topic_labels = ['Геном', 'Молекулы', 'Палеонтология', 'Размножение', 'Птицы', 'Космос', \
                'Рак', 'Эволюция', 'Сверхпроводники', 'Частицы', 'Земля']

In [76]:
def show_author(name):
    print(f'\n{name.replace("_", " ")}\nТемы:')
    pprint([(topic_labels[topic[0]], round(topic[1], 3)) for topic in at_model[name]])
    

for author in authors:
    show_author(author)


Айк Акопян
Темы:
[('Космос', 0.877), ('Сверхпроводники', 0.024), ('Частицы', 0.09)]

Валентин Анаников
Темы:
[('Молекулы', 0.986)]

Ольга Баклицкая-Каменева
Темы:
[('Молекулы', 0.525),
 ('Космос', 0.07),
 ('Сверхпроводники', 0.319),
 ('Частицы', 0.062)]

Дарья Баранова
Темы:
[('Геном', 0.039),
 ('Молекулы', 0.177),
 ('Палеонтология', 0.014),
 ('Размножение', 0.014),
 ('Птицы', 0.018),
 ('Космос', 0.012),
 ('Рак', 0.67),
 ('Эволюция', 0.013),
 ('Сверхпроводники', 0.014),
 ('Частицы', 0.015),
 ('Земля', 0.014)]

Вера Башмакова
Темы:
[('Геном', 0.119),
 ('Молекулы', 0.11),
 ('Размножение', 0.038),
 ('Рак', 0.694),
 ('Сверхпроводники', 0.024)]

Арсений Белосохов
Темы:
[('Геном', 0.163),
 ('Молекулы', 0.24),
 ('Палеонтология', 0.322),
 ('Размножение', 0.052),
 ('Рак', 0.183),
 ('Эволюция', 0.016),
 ('Земля', 0.018)]

Александр Бердичевский
Темы:
[('Космос', 0.618), ('Сверхпроводники', 0.023), ('Земля', 0.344)]

Александр Березин
Темы:
[('Космос', 0.955), ('Сверхпроводники', 0.022)]

Антон 