# LDA Model Training

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

## Data

In [2]:
data_file_path='DATA.csv'
data=pd.read_csv(data_file_path)
data

Unnamed: 0,PeakID,Distance to TSS,motif_string
0,seq_40800,-81,"motif_149,motif_162,motif_166,motif_166,motif_..."
1,seq_10314,-228,motif_144
2,seq_18866,9616,"motif_104,motif_126,motif_126,motif_147,motif_..."
3,seq_45348,5770,"motif_120,motif_121,motif_128,motif_128,motif_..."
4,seq_2616,-12540,"motif_100,motif_103,motif_126,motif_126,motif_..."
...,...,...,...
60095,seq_5507,-127293,"motif_137,motif_137,motif_13,motif_174,motif_1..."
60096,seq_11070,37721,"motif_132,motif_132,motif_132,motif_142,motif_..."
60097,seq_17945,88589,"motif_161,motif_162,motif_162,motif_174,motif_..."
60098,seq_29137,93216,"motif_104,motif_105,motif_117,motif_117,motif_..."


In [3]:
print(data[data['motif_string'].isna()])
data.dropna(subset=['motif_string'],inplace=True)
data['motif_list']=data['motif_string'].apply(lambda x:x.split(','))
data

          PeakID  Distance to TSS motif_string
18840  seq_57232             -180          NaN
23892  seq_20216             -147          NaN
29667  seq_31231              483          NaN
46832   seq_8465             -341          NaN
47219  seq_46451             4737          NaN


Unnamed: 0,PeakID,Distance to TSS,motif_string,motif_list
0,seq_40800,-81,"motif_149,motif_162,motif_166,motif_166,motif_...","[motif_149, motif_162, motif_166, motif_166, m..."
1,seq_10314,-228,motif_144,[motif_144]
2,seq_18866,9616,"motif_104,motif_126,motif_126,motif_147,motif_...","[motif_104, motif_126, motif_126, motif_147, m..."
3,seq_45348,5770,"motif_120,motif_121,motif_128,motif_128,motif_...","[motif_120, motif_121, motif_128, motif_128, m..."
4,seq_2616,-12540,"motif_100,motif_103,motif_126,motif_126,motif_...","[motif_100, motif_103, motif_126, motif_126, m..."
...,...,...,...,...
60095,seq_5507,-127293,"motif_137,motif_137,motif_13,motif_174,motif_1...","[motif_137, motif_137, motif_13, motif_174, mo..."
60096,seq_11070,37721,"motif_132,motif_132,motif_132,motif_142,motif_...","[motif_132, motif_132, motif_132, motif_142, m..."
60097,seq_17945,88589,"motif_161,motif_162,motif_162,motif_174,motif_...","[motif_161, motif_162, motif_162, motif_174, m..."
60098,seq_29137,93216,"motif_104,motif_105,motif_117,motif_117,motif_...","[motif_104, motif_105, motif_117, motif_117, m..."


In [4]:
from gensim.corpora import Dictionary

docs=data['motif_list'].values
dictionary=Dictionary(docs)

print('Motif-Index to Motif-Name Mapping:')
for i,v in dictionary.items():
    print(f'{i} - {v}')
    if i==10:
        break
    

Motif-Index to Motif-Name Mapping:
0 - motif_149
1 - motif_162
2 - motif_166
3 - motif_174
4 - motif_199
5 - motif_230
6 - motif_248
7 - motif_295
8 - motif_60
9 - motif_66
10 - motif_98


In [5]:
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('BOW (Sequence-0):')
docs[0],corpus[0]

BOW (Sequence-0):


(['motif_149',
  'motif_162',
  'motif_166',
  'motif_166',
  'motif_174',
  'motif_199',
  'motif_230',
  'motif_248',
  'motif_295',
  'motif_295',
  'motif_60',
  'motif_60',
  'motif_66',
  'motif_98'],
 [(0, 1),
  (1, 1),
  (2, 2),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 2),
  (9, 1),
  (10, 1)])

## Training Test

In [6]:
%%time
import logging
from gensim.models.callbacks import Callback,PerplexityMetric, ConvergenceMetric, CoherenceMetric

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
convergence_logger = ConvergenceMetric(logger='shell')
from gensim.models import LdaModel,LdaMulticore

temp = dictionary[0]
id2word = dictionary.id2token
lda = LdaModel(corpus, id2word=id2word, alpha='auto',chunksize=10000,
               eta='auto',num_topics=3, iterations=1000, passes = 5,
              minimum_probability=0.0,callbacks=[convergence_logger])

lda_predictions=lda.get_document_topics(corpus,minimum_probability=0.0)
lda_pred_topic=[]
for pred in tqdm(lda_predictions):
    top_topic=sorted(pred,key=lambda x:-x[1])[0][0]
    lda_pred_topic.append(top_topic)
lda_pred_data=data.copy()
lda_pred_data['pred_topic']=lda_pred_topic

logging.getLogger().setLevel(logging.CRITICAL)

2021-02-22 07:32:21,039 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2021-02-22 07:32:21,040 : INFO : using serial LDA version on this node
2021-02-22 07:32:21,042 : INFO : running online (multi-pass) LDA training, 3 topics, 5 passes over the supplied corpus of 60095 documents, updating model once every 10000 documents, evaluating perplexity every 60095 documents, iterating 1000x with a convergence threshold of 0.001000
2021-02-22 07:32:21,045 : INFO : PROGRESS: pass 0, at document #10000/60095
2021-02-22 07:32:21,046 : DEBUG : performing inference on a chunk of 10000 documents
2021-02-22 07:32:48,382 : DEBUG : 9881/10000 documents converged within 1000 iterations
2021-02-22 07:32:48,411 : INFO : optimized alpha [0.0064046383, 0.1345146, 0.17912321]
2021-02-22 07:32:48,412 : DEBUG : updating topics
2021-02-22 07:32:48,413 : INFO : merging changes from 10000 documents into a model of 60095 documents
2021-02-22 07:32:48,415 : INFO : topic #0 (0.006):

2021-02-22 07:33:12,759 : INFO : -5.098 per-word bound, 34.3 perplexity estimate based on a held-out corpus of 95 documents with 2924 words
2021-02-22 07:33:12,759 : INFO : PROGRESS: pass 0, at document #60095/60095
2021-02-22 07:33:12,760 : DEBUG : performing inference on a chunk of 95 documents
2021-02-22 07:33:12,789 : DEBUG : 95/95 documents converged within 1000 iterations
2021-02-22 07:33:12,790 : INFO : optimized alpha [0.018261174, 0.04336119, 0.04699353]
2021-02-22 07:33:12,790 : DEBUG : updating topics
2021-02-22 07:33:12,791 : INFO : merging changes from 95 documents into a model of 60095 documents
2021-02-22 07:33:12,793 : INFO : topic #0 (0.018): 0.029*"motif_60" + 0.021*"motif_62" + 0.017*"motif_126" + 0.017*"motif_84" + 0.015*"motif_174" + 0.014*"motif_2" + 0.014*"motif_17" + 0.013*"motif_175" + 0.013*"motif_1" + 0.013*"motif_31"
2021-02-22 07:33:12,794 : INFO : topic #1 (0.043): 0.021*"motif_219" + 0.020*"motif_49" + 0.020*"motif_41" + 0.019*"motif_48" + 0.019*"motif_50

2021-02-22 07:33:33,663 : DEBUG : updating topics
2021-02-22 07:33:33,664 : INFO : merging changes from 10000 documents into a model of 60095 documents
2021-02-22 07:33:33,665 : INFO : topic #0 (0.040): 0.031*"motif_60" + 0.023*"motif_126" + 0.022*"motif_62" + 0.021*"motif_84" + 0.016*"motif_17" + 0.015*"motif_31" + 0.015*"motif_26" + 0.014*"motif_162" + 0.014*"motif_174" + 0.013*"motif_239"
2021-02-22 07:33:33,666 : INFO : topic #1 (0.044): 0.025*"motif_219" + 0.021*"motif_11" + 0.021*"motif_12" + 0.021*"motif_49" + 0.020*"motif_10" + 0.020*"motif_48" + 0.020*"motif_41" + 0.020*"motif_50" + 0.018*"motif_6" + 0.018*"motif_9"
2021-02-22 07:33:33,667 : INFO : topic #2 (0.055): 0.062*"motif_60" + 0.048*"motif_184" + 0.040*"motif_37" + 0.037*"motif_218" + 0.037*"motif_23" + 0.035*"motif_42" + 0.030*"motif_40" + 0.029*"motif_13" + 0.029*"motif_166" + 0.027*"motif_19"
2021-02-22 07:33:33,667 : INFO : topic diff=0.076051, rho=0.353344
2021-02-22 07:33:33,669 : DEBUG : bound: at document #0
20

2021-02-22 07:33:47,768 : INFO : topic #1 (0.054): 0.027*"motif_219" + 0.023*"motif_11" + 0.023*"motif_12" + 0.022*"motif_49" + 0.022*"motif_10" + 0.022*"motif_48" + 0.022*"motif_41" + 0.021*"motif_50" + 0.020*"motif_6" + 0.020*"motif_9"
2021-02-22 07:33:47,769 : INFO : topic #2 (0.074): 0.063*"motif_60" + 0.050*"motif_184" + 0.041*"motif_37" + 0.039*"motif_218" + 0.038*"motif_23" + 0.036*"motif_42" + 0.031*"motif_40" + 0.030*"motif_13" + 0.029*"motif_166" + 0.027*"motif_19"
2021-02-22 07:33:47,770 : INFO : topic diff=0.072808, rho=0.333158
2021-02-22 07:33:47,773 : INFO : PROGRESS: pass 2, at document #60000/60095
2021-02-22 07:33:47,773 : DEBUG : performing inference on a chunk of 10000 documents
2021-02-22 07:33:50,572 : DEBUG : 10000/10000 documents converged within 1000 iterations
2021-02-22 07:33:50,596 : INFO : optimized alpha [0.081709094, 0.055375416, 0.07814823]
2021-02-22 07:33:50,596 : DEBUG : updating topics
2021-02-22 07:33:50,597 : INFO : merging changes from 10000 docum

2021-02-22 07:34:02,015 : INFO : topic diff=0.068323, rho=0.316078
2021-02-22 07:34:02,017 : INFO : PROGRESS: pass 3, at document #50000/60095
2021-02-22 07:34:02,017 : DEBUG : performing inference on a chunk of 10000 documents
2021-02-22 07:34:05,619 : DEBUG : 10000/10000 documents converged within 1000 iterations
2021-02-22 07:34:05,647 : INFO : optimized alpha [0.124389626, 0.06742938, 0.10415623]
2021-02-22 07:34:05,648 : DEBUG : updating topics
2021-02-22 07:34:05,649 : INFO : merging changes from 10000 documents into a model of 60095 documents
2021-02-22 07:34:05,651 : INFO : topic #0 (0.124): 0.030*"motif_60" + 0.024*"motif_62" + 0.024*"motif_126" + 0.022*"motif_84" + 0.016*"motif_17" + 0.015*"motif_31" + 0.015*"motif_26" + 0.014*"motif_162" + 0.013*"motif_174" + 0.013*"motif_35"
2021-02-22 07:34:05,651 : INFO : topic #1 (0.067): 0.029*"motif_219" + 0.025*"motif_11" + 0.025*"motif_12" + 0.024*"motif_49" + 0.024*"motif_10" + 0.024*"motif_48" + 0.024*"motif_41" + 0.022*"motif_50" 

2021-02-22 07:34:19,206 : INFO : optimized alpha [0.16491576, 0.07872451, 0.13053094]
2021-02-22 07:34:19,206 : DEBUG : updating topics
2021-02-22 07:34:19,207 : INFO : merging changes from 10000 documents into a model of 60095 documents
2021-02-22 07:34:19,208 : INFO : topic #0 (0.165): 0.029*"motif_60" + 0.024*"motif_62" + 0.023*"motif_126" + 0.021*"motif_84" + 0.016*"motif_17" + 0.015*"motif_31" + 0.015*"motif_54" + 0.014*"motif_26" + 0.014*"motif_162" + 0.013*"motif_174"
2021-02-22 07:34:19,209 : INFO : topic #1 (0.079): 0.030*"motif_219" + 0.027*"motif_11" + 0.027*"motif_12" + 0.026*"motif_49" + 0.026*"motif_10" + 0.025*"motif_41" + 0.025*"motif_48" + 0.024*"motif_50" + 0.023*"motif_9" + 0.023*"motif_6"
2021-02-22 07:34:19,210 : INFO : topic #2 (0.131): 0.064*"motif_60" + 0.052*"motif_184" + 0.042*"motif_37" + 0.041*"motif_218" + 0.038*"motif_23" + 0.036*"motif_42" + 0.031*"motif_40" + 0.030*"motif_13" + 0.029*"motif_166" + 0.027*"motif_19"
2021-02-22 07:34:19,210 : INFO : topic d

CPU times: user 2min 24s, sys: 2.05 s, total: 2min 26s
Wall time: 2min 24s


## Model Evaluation Metrics

In [27]:
def getTopicDistribution(pred_data,ntopics):
    value_count=pred_data['pred_topic'].value_counts(normalize=True)
    topic_dist_dict=value_count.to_dict()
    return topic_dist_dict

def likelihoodMetric(pred_data,predictions,ntopics):
    likelihood=0
    P_T=getTopicDistribution(pred_data,ntopics)
    for pred in tqdm(predictions):
        P_Xi_M=0
        for topic_no,P_Xi_T in pred:
            P_Xi_M+=P_Xi_T*P_T[topic_no]
        likelihood+=np.log10(P_Xi_M)
    print(likelihood)
    return likelihood
# likelihoodMetric(lda_pred_data,lda_predictions,3)

In [26]:
from gensim.models.coherencemodel import CoherenceModel

def coherenceMetric_cv(model,dictionary,docs):
    cm=CoherenceModel(model=model,dictionary=dictionary ,
                      texts=docs, coherence='c_v',processes=30)
    coherence = cm.get_coherence()
    print(coherence)
    return coherence
# coherenceMetric_cv(lda,dictionary ,docs)

In [25]:
from gensim.models.coherencemodel import CoherenceModel

def coherenceMetric_umass(model,dictionary,corpus):
    cm = CoherenceModel(model=model, corpus=corpus, \
                        coherence='u_mass',processes=30)
    coherence = cm.get_coherence()
    print(coherence)
    return coherence
# coherenceMetric_umass(lda,dictionary ,corpus)

In [24]:
from gensim.models.coherencemodel import CoherenceModel

def coherenceMetric_uci(model,dictionary,docs):
    cm=CoherenceModel(model=model,dictionary=dictionary ,
                      texts=docs, coherence='c_uci',processes=30,
                     window_size =2000)
    coherence = cm.get_coherence()
    print(coherence)
    return coherence
# coherenceMetric_uci(lda,dictionary ,docs)

In [23]:
def perplexityMetric(model,corpus):
    perplexity=model.log_perplexity(corpus)
    print(perplexity)
    return perplexity
# perplexityMetric(lda,corpus)

In [22]:
from collections import Counter

def findTopMotifs(pred_data,ntopics,ntop=5,outdir=None):     
    gb=pred_data[['motif_string','pred_topic']].groupby('pred_topic').\
    agg(lambda x: ','.join(x))
    gb['top_motif']=gb['motif_string'].\
    apply(lambda x:Counter(x.split(',')).most_common(ntop))
    gb.reset_index(inplace=True)
    gb=gb[['pred_topic','top_motif']]
    if outdir is not None:
        gb.to_csv(f'{outdir}/top{ntop}_motifs_topics_{ntopics}.csv',index=False)
    print(gb)
    return gb
# findTopMotifs(lda_pred_data,3)

In [21]:
def getAvgTssDist(pred_data,ntopics,outdir=None):
#     pred_topic=[]
#     for pred in tqdm(predictions):
#         top_topic=sorted(pred,key=lambda x:-x[1])[0][0]
#         pred_topic.append(top_topic)
#     _data=data.copy()
#     _data['pred_topic']=pred_topic
    gb=pred_data[['Distance to TSS','pred_topic']].groupby('pred_topic').mean()
    gb.reset_index(inplace=True)
    if outdir is not None:
        gb.to_csv(f'{outdir}/avg_tss_dist_topics_{ntopics}.csv',index=False)
    print(gb)
    return gb
# getAvgTssDist(lda_pred_data,3)

## Training & Evaluation

In [28]:
%%time

from gensim.models import LdaModel,LdaMulticore

outdir='model_output'
eval_dict={'num_topics':[],'likelihood':[],'coherence_cv':[],\
          'coherence_umass':[],'coherence_uci':[],'perplexity':[]}
temp = dictionary[0]
id2word = dictionary.id2token
for ntopics in range(2,10):
    print('\n'+'='*40)
    print('Num of Topics = '+str(ntopics))
    model = LdaModel(corpus, id2word=id2word, alpha='auto',chunksize=10000,
                   eta='auto',num_topics=ntopics, iterations=1000, passes = 5,
                  minimum_probability=0.0)
    
    predictions=model.get_document_topics(corpus,minimum_probability=0.0)
    pred_topic=[]
    for pred in tqdm(predictions):
        top_topic=sorted(pred,key=lambda x:-x[1])[0][0]
        pred_topic.append(top_topic)
    pred_data=data.copy()
    pred_data['pred_topic']=pred_topic
    print('\nFinding likelihood...')
    likelihood=likelihoodMetric(pred_data,predictions,ntopics)
    print('\nFinding coherence_cv...')
    coherence_cv=coherenceMetric_cv(model,dictionary,docs)
    print('\nFinding coherence_umass...')
    coherence_umass=coherenceMetric_umass(model,dictionary ,corpus)
    print('\nFinding coherence_uci...')
    coherence_uci=coherenceMetric_uci(model,dictionary,docs)
    print('\nFinding perplexity...')
    perplexity=perplexityMetric(model,corpus)
    print('\nFinding Top Motifs...')
    findTopMotifs(pred_data,ntopics,outdir=outdir)
    print('\nFindng avg. distance from TSS per topic...')
    getAvgTssDist(pred_data,ntopics,outdir=outdir)
    eval_dict['num_topics'].append(ntopics)
    eval_dict['likelihood'].append(likelihood)
    eval_dict['coherence_cv'].append(coherence_cv)
    eval_dict['coherence_umass'].append(coherence_umass)
    eval_dict['coherence_uci'].append(coherence_uci)
    eval_dict['perplexity'].append(perplexity)
eval_df=pd.DataFrame(eval_dict)
eval_df.to_csv(f'{outdir}/metrics.csv',index=False)
eval_df


Num of Topics = 2


100%|██████████| 60095/60095 [00:20<00:00, 2940.95it/s]
  1%|          | 343/60095 [00:00<00:17, 3426.52it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:20<00:00, 2930.51it/s]

-18039.164087197336

Finding coherence_cv...





0.536841275443674

Finding coherence_umass...
-1.568913339952001

Finding coherence_uci...
0.4113135185165094

Finding perplexity...
-5.104206422556327

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_60, 21811), (motif_62, 15924), (motif_...
1           1  [(motif_60, 46696), (motif_184, 28656), (motif...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0     14298.025338
1           1      6228.789842

Num of Topics = 3


100%|██████████| 60095/60095 [00:24<00:00, 2459.96it/s]
  0%|          | 294/60095 [00:00<00:20, 2915.80it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:28<00:00, 2098.41it/s]

-25448.78619993081

Finding coherence_cv...





0.5319771271950177

Finding coherence_umass...
-1.681825703371601

Finding coherence_uci...
0.4691770993562088

Finding perplexity...
-5.031001276754723

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_60, 41700), (motif_184, 28036), (motif...
1           1  [(motif_11, 8715), (motif_12, 8485), (motif_10...
2           2  [(motif_60, 19462), (motif_126, 11358), (motif...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0      6870.059759
1           1     11445.936062
2           2     12330.978274

Num of Topics = 4


100%|██████████| 60095/60095 [00:17<00:00, 3475.21it/s]
  1%|          | 386/60095 [00:00<00:15, 3859.60it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:19<00:00, 3093.54it/s]

-33521.91712313656

Finding coherence_cv...





0.6157931540715109

Finding coherence_umass...
-1.561299982070476

Finding coherence_uci...
0.632652443014669

Finding perplexity...
-4.91161652806905

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_60, 32030), (motif_184, 23118), (motif...
1           1  [(motif_219, 7416), (motif_60, 6858), (motif_4...
2           2  [(motif_37, 18448), (motif_23, 17091), (motif_...
3           3  [(motif_60, 20238), (motif_126, 12198), (motif...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0      7884.539415
1           1      5036.355994
2           2      7352.487841
3           3     14610.568292

Num of Topics = 5


100%|██████████| 60095/60095 [00:17<00:00, 3362.73it/s]
  1%|          | 346/60095 [00:00<00:17, 3455.48it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:18<00:00, 3237.88it/s]

-35814.16976854499

Finding coherence_cv...





0.7444441968019083

Finding coherence_umass...
-1.351924232763107

Finding coherence_uci...
0.9173814371839171

Finding perplexity...
-4.804642958117152

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_37, 19194), (motif_23, 17905), (motif_...
1           1  [(motif_60, 19829), (motif_62, 14542), (motif_...
2           2  [(motif_60, 33641), (motif_184, 23861), (motif...
3           3  [(motif_11, 7683), (motif_12, 7526), (motif_10...
4           4  [(motif_48, 5697), (motif_41, 5633), (motif_49...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0      3285.020830
1           1     14491.705743
2           2      9098.656305
3           3      9822.244269
4           4      6175.478491

Num of Topics = 6


100%|██████████| 60095/60095 [00:25<00:00, 2395.60it/s]
  0%|          | 189/60095 [00:00<00:31, 1884.41it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:22<00:00, 2639.68it/s]

-44779.15296452151

Finding coherence_cv...





0.7560262888980295

Finding coherence_umass...
-1.2936051024567823

Finding coherence_uci...
0.9600437247034693

Finding perplexity...
-4.774857782568854

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_54, 8671), (motif_62, 8280), (motif_86...
1           1  [(motif_37, 19305), (motif_23, 17911), (motif_...
2           2  [(motif_60, 11887), (motif_11, 9489), (motif_1...
3           3  [(motif_48, 6504), (motif_49, 6495), (motif_41...
4           4  [(motif_60, 35811), (motif_184, 24715), (motif...
5           5  [(motif_219, 6328), (motif_264, 5133), (motif_...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0     18485.013508
1           1      3684.272510
2           2     13249.341390
3           3      7424.695706
4           4      9176.107754
5           5      5821.993518

Num of Topics = 7


100%|██████████| 60095/60095 [00:21<00:00, 2779.93it/s]
  1%|          | 325/60095 [00:00<00:18, 3247.22it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:22<00:00, 2623.09it/s]

-43638.01373687387

Finding coherence_cv...





0.7026354977523898

Finding coherence_umass...
-1.5363047159551473

Finding coherence_uci...
0.8902269087878859

Finding perplexity...
-4.729814553496755

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_219, 5727), (motif_264, 4862), (motif_...
1           1  [(motif_11, 8266), (motif_12, 8075), (motif_10...
2           2  [(motif_60, 17919), (motif_126, 10294), (motif...
3           3  [(motif_49, 6488), (motif_60, 6437), (motif_41...
4           4  [(motif_60, 30774), (motif_184, 22794), (motif...
5           5  [(motif_54, 6841), (motif_122, 5679), (motif_8...
6           6  [(motif_37, 18217), (motif_23, 16955), (motif_...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0      4569.273919
1           1     11395.484732
2           2     14093.757670
3           3     10844.308157
4           4      7686.014361
5           5     18664.814220
6           6      1909.597542

Num of Top

100%|██████████| 60095/60095 [00:20<00:00, 2923.58it/s]
  1%|          | 348/60095 [00:00<00:17, 3465.85it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:18<00:00, 3327.80it/s]


-47882.11157096832

Finding coherence_cv...
0.7121904343451513

Finding coherence_umass...
-1.5152111919095643

Finding coherence_uci...
0.902591961993945

Finding perplexity...
-4.67598557443039

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_60, 16197), (motif_126, 9026), (motif_...
1           1  [(motif_37, 18382), (motif_23, 17100), (motif_...
2           2  [(motif_11, 7983), (motif_12, 7801), (motif_10...
3           3  [(motif_54, 6781), (motif_122, 5588), (motif_8...
4           4  [(motif_60, 5717), (motif_17, 5331), (motif_31...
5           5  [(motif_48, 5475), (motif_41, 5450), (motif_49...
6           6  [(motif_60, 30423), (motif_184, 22466), (motif...
7           7  [(motif_219, 6176), (motif_264, 5108), (motif_...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0     13530.684242
1           1      2305.111846
2           2     13768.299408
3           3     17367.0180

100%|██████████| 60095/60095 [00:19<00:00, 3151.80it/s]
  1%|          | 307/60095 [00:00<00:19, 3030.22it/s]


Finding likelihood...


100%|██████████| 60095/60095 [00:20<00:00, 2902.64it/s]

-50964.736442599286

Finding coherence_cv...





0.7183785054651171

Finding coherence_umass...
-1.4796840555224637

Finding coherence_uci...
0.9323398864390846

Finding perplexity...
-4.651199893365425

Finding Top Motifs...
   pred_topic                                          top_motif
0           0  [(motif_17, 4905), (motif_60, 4641), (motif_26...
1           1  [(motif_48, 5506), (motif_41, 5498), (motif_49...
2           2  [(motif_219, 5673), (motif_264, 4811), (motif_...
3           3  [(motif_37, 18016), (motif_23, 16763), (motif_...
4           4  [(motif_256, 2396), (motif_60, 2342), (motif_2...
5           5  [(motif_54, 7149), (motif_122, 5906), (motif_8...
6           6  [(motif_60, 15264), (motif_126, 7810), (motif_...
7           7  [(motif_11, 8080), (motif_12, 7890), (motif_10...
8           8  [(motif_60, 30720), (motif_184, 22789), (motif...

Findng avg. distance from TSS per topic...
   pred_topic  Distance to TSS
0           0     14143.938427
1           1      6465.861005
2           2      3202.895297
3    

Unnamed: 0,num_topics,likelihood,coherence_cv,coherence_umass,coherence_uci,perplexity
0,2,-18039.164087,0.536841,-1.568913,0.411314,-5.104206
1,3,-25448.7862,0.531977,-1.681826,0.469177,-5.031001
2,4,-33521.917123,0.615793,-1.5613,0.632652,-4.911617
3,5,-35814.169769,0.744444,-1.351924,0.917381,-4.804643
4,6,-44779.152965,0.756026,-1.293605,0.960044,-4.774858
5,7,-43638.013737,0.702635,-1.536305,0.890227,-4.729815
6,8,-47882.111571,0.71219,-1.515211,0.902592,-4.675986
7,9,-50964.736443,0.718379,-1.479684,0.93234,-4.6512


In [None]:
# #setup logging for trainging metrics 
# import logging
# logging.basicConfig(filename='test_output/model_callbacks.log', filemode='w',
#                     format="%(asctime)s:%(levelname)s:%(message)s",
#                     level=logging.NOTSET)

# from gensim.models.callbacks import Callback,PerplexityMetric, ConvergenceMetric, CoherenceMetric
# perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
# convergence_logger = ConvergenceMetric(logger='shell')
# # coherence_cv_logger = CoherenceMetric(corpus=corpus, coherence = 'c_v', texts = docs)

# %%time
# from gensim.models import LdaModel,LdaMulticore

# #HYPERPARAMETERS
# #passes = epochs
# temp = dictionary[0]
# id2word = dictionary.id2token
# lda = LdaModel(corpus, id2word=id2word, alpha='auto',eval_every = 1,\
#                eta='auto',num_topics=3, iterations=5, passes = 10,
#               callbacks=[perplexity_logger,convergence_logger])

# lda.print_topics()

# %%time
# from gensim.models import LdaModel,LdaMulticore

# id2word = dictionary.id2token
# lda = LdaMulticore(corpus, id2word=id2word,eval_every = 1,\
#                eta='auto',num_topics=3, iterations=500, passes = 100)

# import pyLDAvis.gensim

# pyLDAvis.enable_notebook()
# pyLDAvis.gensim.prepare(lda, corpus, dictionary)