Topic modeling using gensim
[Adapted from this tutorial](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)

In [None]:
!pip install nltk
!pip install gensim
!pip install spacy

In [None]:
import nltk
nltk.download('stopwords')


In [None]:
#restart the kernel after running this (colab Runtime->Restart Runtime)
!pip install --upgrade -q gspread

In [None]:
!pip install --upgrade wandb
!wandb login fcfc2eca6b5d76c9f5532e9ef9d320af69a388ed

In [None]:
!pip install pyLDAvis

In [None]:

import gspread
# The triple-quoted string below is a bare expression that is never executed;
# it only keeps the older Colab / oauth2client login flow around for reference.
'''
from google.colab import auth
auth.authenticate_user()

from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())
'''
# Authorize through gspread's own OAuth helper and open the first worksheet
# of the spreadsheet named 'Gpt Huggingface results'.
gc=gspread.oauth()
worksheet = gc.open('Gpt Huggingface results').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()
print(rows)

# Convert to a DataFrame and render.
# NOTE(review): every row is loaded as data; if the sheet starts with a header
# row it will end up as the first record — confirm against the sheet.
import pandas as pd
df=pd.DataFrame.from_records(rows)
df.head()

In [None]:
# Label the two columns of the sheet data.
# assumes the sheet has exactly two columns (prompt, generated text) — TODO confirm
df.columns=["prompt","text"]
df.head()

In [None]:
# Strip the prompt off the front of each text by slicing away len(prompt)
# characters, row by row.
df['text'] = [
    generated[len(prompt):]
    for prompt, generated in zip(df['prompt'], df['text'])
]
df.head()

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

cohere=[]
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [None]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Append a few extra words to the NLTK English list.
# NOTE(review): these look inherited from the source tutorial's corpus —
# confirm they are appropriate for this data.
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [None]:
def sent_to_words(sentences):
    """Lazily turn each sentence into a list of word tokens.

    Every item of ``sentences`` is converted with ``str()`` and tokenized by
    gensim's ``simple_preprocess``. Per the gensim documentation,
    ``deacc=True`` strips accent marks from tokens (punctuation is discarded
    by the tokenizer itself, not by ``deacc``).

    Yields:
        One list of tokens per input sentence, in input order.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens



In [None]:
import nltk
nltk.download("punkt")

In [None]:
# Split every text into sentences, tokenize each sentence, and collect all
# resulting token lists into one flat list (one entry per sentence).
data_words = []
data_words_ret = []
for text in df['text']:
    sentences = nltk.tokenize.sent_tokenize(text)
    data_words_ret = list(sent_to_words(sentences))
    data_words.extend(data_words_ret)
print(data_words[1:5])

In [None]:
# Build the bigram and trigram models
# The trigram model is trained on the bigram-transformed sentences, so it can
# join an already-merged pair with a further token.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
# Applies the bigram phraser first, then the trigram phraser, to the first sentence.
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop the tokens listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()``
            and ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of remaining tokens per input document.
    """
    # Snapshot the module-level list as a set once per call so each membership
    # test is O(1) instead of a scan over the whole stop word list.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

In [None]:
def make_bigrams(texts):
    """Run every document through the module-level ``bigram_mod`` phraser.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document of ``texts``,
        in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged


In [None]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize token lists with spaCy, keeping only selected parts of speech.

    Each token list is joined with single spaces and run through the
    module-level ``nlp`` pipeline; a token's lemma is kept when its
    ``pos_`` tag is one of ``allowed_postags``.

    Args:
        texts: Iterable of token lists.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input document.

    See https://spacy.io/api/annotation for the tag set.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
# Input is the flat list of per-sentence token lists built above.
data_words_nostops = remove_stopwords(data_words)

In [None]:
# Form Bigrams
# Applies the bigram phraser to the stop-word-filtered sentences.
data_words_bigrams = make_bigrams(data_words_nostops)

In [None]:
# Initialize the spaCy English pipeline with the parser and NER components
# disabled (for efficiency).
# Requires the model package: python3 -m spacy download en_core_web_sm
# The 'en' shortcut link was removed in spaCy v3, so the model is loaded by
# its full package name.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Do lemmatization keeping only noun, adj, vb, adv
# (the tag list passed here has the same values as lemmatization()'s default)
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Peek at the first lemmatized document.
print(data_lemmatized[:1])

In [None]:
# Number of lemmatized documents (one per tokenized sentence).
print(len(data_lemmatized))

In [None]:
# Dumps every lemmatized document to the cell output (can be very long).
print(data_lemmatized)

In [None]:
# Create Dictionary
# Built from the lemmatized documents; used below to convert tokens to ids
# (doc2bow) and ids back to tokens (id2word[...]).
id2word = corpora.Dictionary(data_lemmatized)

In [None]:
# Create Corpus
# `texts` is just a second name for data_lemmatized; no copy is made.
texts = data_lemmatized

In [None]:
# Term Document Frequency
# One doc2bow result per document; the preview cell further down unpacks each
# entry as an (id, frequency) pair.
corpus = [id2word.doc2bow(text) for text in texts]


In [None]:
# View
# Shows the bag-of-words entry of the first document only.
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
# Replaces each token id with its token for the first document. The loop
# variable is named token_id so it does not shadow the builtin `id`.
[[(id2word[token_id], freq) for token_id, freq in cp] for cp in corpus[:1]]

## Build model
[LdaModel parameter descriptions](https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel)

In [None]:
import wandb

# Random-search sweep over the LdaModel hyperparameters that train() reads
# from wandb.config; the metric being maximized is the "coherence" value
# logged by train(). Each key under "parameters" must match the attribute
# name train() accesses on the config.
sweep_config = {
    "method": "random",
    "metric": {
        "name": "coherence",
        "goal": "maximize",
    },
    "parameters": {
        "num_topics": {
            "values": [10, 20, 30, 40, 50],
        },
        "random_state": {
            "values": [50, 52, 54, 56, 58, 60],
        },
        "update_every": {
            "values": [1, 5, 10, 50],
        },
        "chunksize": {
            "values": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        },
        "passes": {
            "values": [10, 20, 30, 40, 50],
        },
        "minimum_probability": {
            "values": [0.01, 0.03, 0.04, 0.05, 0.08],
        },
        # Was spelled "per_word_topic", which did not match the
        # config.per_word_topics attribute that train() reads.
        "per_word_topics": {
            "value": True,
        },
    },
}
sweep_id = wandb.sweep(sweep_config, entity="reedrw", project="gpt2-lda")

In [None]:
def train():
    """Train one gensim LDA model and log its topics and scores to wandb.

    Hyperparameters are read from ``wandb.config`` after ``wandb.init`` is
    called with ``config_defaults``. Those defaults apply when the function is
    called directly; when it is run by a sweep agent the sweep's sampled
    values are expected to take precedence (see the wandb sweep docs).

    Reads the module-level ``corpus``, ``id2word`` and ``data_lemmatized``.

    Returns:
        The fitted ``gensim.models.ldamodel.LdaModel``.
    """
    config_defaults = {
        "num_topics": 10,
        "random_state": 60,
        "update_every": 50,
        "chunksize": 10,
        "passes": 40,
        "minimum_probability": .01,
        "per_word_topics": True,
    }
    wandb.init(config=config_defaults)
    config = wandb.config

    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=config.num_topics,
        random_state=config.random_state,
        update_every=config.update_every,
        chunksize=config.chunksize,
        passes=config.passes,
        alpha="auto",
        minimum_probability=config.minimum_probability,
        per_word_topics=config.per_word_topics,
    )

    # Compute the topic listing once; it is logged now and printed at the end.
    topics = lda_model.print_topics()
    wandb.log({"topics": topics})

    # c_v coherence is the metric the sweep configuration maximizes.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence="c_v")
    coherence_lda = coherence_model_lda.get_coherence()
    print(coherence_lda)
    wandb.log({"coherence": coherence_lda,
               "perplexity": lda_model.log_perplexity(corpus)})
    print(topics)
    return lda_model

In [None]:
# Start a sweep agent that calls train() once per sampled configuration.
# NOTE(review): no `count` is passed; per the wandb docs a random-search sweep
# keeps running until stopped, so this cell may not finish by itself — confirm
# and consider passing count=N.
wandb.agent(sweep_id, train)

In [None]:
# Direct call outside the sweep agent; keeps the returned model for the cells below.
lda_model=train()

In [None]:
# Show each topic returned by print_topics() on its own line.
topic_listing = lda_model.print_topics()
for topic in topic_listing:
    print(topic)

# Model applied to the whole corpus.
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# log_perplexity returns gensim's per-word likelihood bound, not the
# perplexity itself. gensim reports perplexity as 2^(-bound), so a HIGHER
# (less negative) value printed here is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
wandb.log({"coherence":coherence_lda})
print('\nCoherence Score: ', coherence_lda)

# Compare with the best coherence recorded by earlier executions of this
# cell (`cohere` accumulates one score per execution).
if not cohere:
    print('baseline established')
elif max(cohere) < coherence_lda:
    print('model improved')
else:
    print('no improvement')
cohere.append(coherence_lda)

# Save the model to disk, then register the saved file with wandb.
lda_model.save('lda_gpt2_model')
wandb.save("lda_gpt2_model")


In [None]:
# Visualize the topics
# NOTE(review): newer pyLDAvis releases are reported to expose this module as
# `pyLDAvis.gensim_models`; with an unpinned `pip install pyLDAvis` the
# `pyLDAvis.gensim` import may fail — confirm against the installed version.
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
viz = pyLDAvis.display(LDAvis_prepared)

# A bare name as the last expression makes the notebook render the object.
viz

## Mallet LDA ##
[see guide here](https://radimrehurek.com/gensim/models/wrappers/ldamallet.html)

In [None]:
import wandb

# Bayesian-search sweep definition for the Mallet LDA runs; the metric being
# maximized is "coherence".
mallet_sweep_config = {
    "method": "bayes",
    "metric": {"name": "coherence", "goal": "maximize"},
    "parameters": {
        "num_topics": {"values": [10, 20, 30, 40, 50]},
        "iterations": {"min": 1, "max": 20},
        "alpha": {"values": [0.7, 0.8, 0.9, 1, 1.1, 1.2]},
    },
}
mallet_sweep_id = wandb.sweep(mallet_sweep_config, entity="reedrw", project="gpt2-lda")

In [None]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.wrappers import LdaMallet


In [None]:
def mallet_train():
    """Train one Mallet LDA model and log its topics and coherence to wandb.

    Hyperparameters, including the path to the Mallet executable, are read
    from ``wandb.config`` after ``wandb.init`` is called with
    ``default_config``.

    Reads the module-level ``corpus``, ``id2word`` and ``data_lemmatized``.

    Returns:
        The fitted ``gensim.models.wrappers.LdaMallet`` model.
    """
    default_config = {
        'path_to_mallet_binary': '~/Downloads/Mallet/bin/mallet',
        'num_topics': 50,
        'iterations': 1,
        'alpha': 1.2,
    }
    wandb.init(config=default_config)
    config = wandb.config

    # The binary path must come from the config: the bare name
    # `path_to_mallet_binary` was never defined and raised NameError.
    ldamallet = gensim.models.wrappers.LdaMallet(
        config.path_to_mallet_binary,
        corpus=corpus,
        num_topics=config.num_topics,
        iterations=config.iterations,
        alpha=config.alpha,
        id2word=id2word,
        random_seed=1000,
    )

    coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    print('\nCoherence Score: ', coherence_ldamallet)
    wandb.log({"topics": ldamallet.show_topics(),
               "coherence": coherence_ldamallet})
    return ldamallet

In [None]:
# Start a sweep agent that calls mallet_train() once per suggested configuration.
# NOTE(review): no `count` is passed, so the agent may keep running until it is
# stopped manually — confirm and consider passing count=N.
wandb.agent(mallet_sweep_id,mallet_train)

In [None]:
# Direct call outside the sweep agent; keeps the returned model for the cells below.
ldamallet=mallet_train()


In [None]:
# Show Topics
# Pretty-prints the result of show_topics() for the Mallet model.
pprint(ldamallet.show_topics())



In [None]:
# Compute Coherence Score
# Same c_v coherence computation that mallet_train() already performs and
# logs, repeated here for the model kept in `ldamallet`.
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)