# Topic Coherence Score calculation

In [None]:
# Read the pickled data matrix into V. The file is not shipped with the
# submission because it is very large (~18GB).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
import pickle5 as pickle
with open('data_matrix.pkl', mode='rb') as matrix_file:
    V = pickle.load(matrix_file)

In [None]:
# Read the pickled vocabulary of our dataset into vocab.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
import pickle5 as pickle
with open('vocab.pkl', mode='rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [None]:
# Load the word-vector (Word2Vec) model that was pretrained on our dataset.
# It is read from "word2vec.model" in the current working directory and is
# used below to score the similarity between pairs of topic words.
from gensim.models import Word2Vec
w2v_model = Word2Vec.load("word2vec.model")

In [None]:
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np

# Function to return top k words in a topic.
def word(topic, vocabulary, k):
    """Return the ``k`` highest-weighted words of a topic, best first.

    Args:
        topic: Sequence of per-term weights (a list or 1-D array); position
            ``i`` holds the weight of the ``i``-th vocabulary term.
        vocabulary: Container whose element ``0`` is the iterable of
            vocabulary terms, ordered like ``topic``.
        k: Number of words to return.

    Returns:
        A list of at most ``k`` distinct-position words ordered by decreasing
        weight. Equal weights keep their vocabulary order (lowest index
        first). Fewer than ``k`` words are returned when the topic has fewer
        than ``k`` entries.

    Unlike a repeated "take the max, then zero it" scan, this never modifies
    ``topic`` and can never select the same position twice (which used to
    happen as soon as the remaining weights were all <= 0).
    """
    vocabulary = list(vocabulary[0])
    # sorted() is stable even with reverse=True, so ties stay in index order.
    ranked = sorted(range(len(topic)), key=topic.__getitem__, reverse=True)
    return [vocabulary[idx] for idx in ranked[:k]]

# Function to calculate the coherence score for a decomposition, given a topic matrix.
def calculate_coherence(w2v_model, topic_matrix, num_tops, vocabulary):
    """Return the mean word-vector coherence of the topics in ``topic_matrix``.

    For each of the first ``num_tops`` topics, the top 10 words are looked up
    with ``word`` and the similarity of every unordered pair of those words is
    averaged; the result is the average of these per-topic means.

    Args:
        w2v_model: Trained gensim ``Word2Vec`` model, or its ``KeyedVectors``.
        topic_matrix: Indexable where ``topic_matrix[i]`` yields the term
            weights of topic ``i``.
        num_tops: Number of topics (leading rows of ``topic_matrix``) to score.
        vocabulary: Vocabulary container, passed through to ``word``.

    Returns:
        The average coherence over all topics that had at least one scorable
        word pair, or ``0.0`` when no topic could be scored at all.
    """
    # gensim >= 4 only exposes similarity() on the KeyedVectors (model.wv);
    # older releases also accepted it on the model itself. Support both.
    vectors = getattr(w2v_model, "wv", w2v_model)

    overall_topic_coherence = []
    for i in range(num_tops):
        topic = list(topic_matrix[i])
        topic_words = word(topic, vocabulary, 10)

        # Scores are collected per topic; sharing one list across topics would
        # turn every "topic average" into a running average of all pairs so far.
        pair_scores = []
        for j, first in enumerate(topic_words):
            for second in topic_words[j + 1:]:
                try:
                    # For every pair of top words calculate the similarity score
                    pair_scores.append(vectors.similarity(first, second))
                except KeyError:
                    # One of the words is missing from the word-vector model.
                    continue

        # Skip topics without any scorable pair instead of dividing by zero.
        if pair_scores:
            overall_topic_coherence.append(sum(pair_scores) / len(pair_scores))

    if not overall_topic_coherence:
        return 0.0
    # Average score over the topic matrix.
    return sum(overall_topic_coherence) / len(overall_topic_coherence)

# Code to find the best number of topics to decompose into according to the coherence score.

In [None]:
# Candidate topic counts: every k from kmin to kmax (both inclusive).
kmin = 3
kmax = 24
topic_models = []

# Fit one NMF decomposition of V per candidate k and remember its factors.
for k in range(kmin, kmax + 1):
    print("Applying NMF for k=%d ..." % k )
    # Fixed random_state so repeated runs give the same decomposition.
    model = decomposition.NMF(n_components=k, random_state=1)
    W, H = model.fit_transform(V), model.components_
    topic_models += [(k, W, H)]


# Calculating the topic coherence for each topic decomposition in range kmin to kmax
k_values = []
coherences = []
for (k, W, H) in topic_models:
    k_values.append(k)
    # NMF.components_ (H) has shape (n_components, n_features), so each ROW of
    # H is one topic's weights over the features. Transposing it would hand
    # calculate_coherence rows of length k instead of k topic rows.
    # NOTE(review): assumes V is (documents x terms), i.e. the features are the
    # vocabulary terms -- confirm against how data_matrix.pkl was built.
    coherences.append(calculate_coherence(w2v_model, H, k, vocab))
    print("K=%02d: Coherence=%.4f" % (k, coherences[-1]))

%matplotlib inline
plt.style.use("ggplot")
matplotlib.rcParams.update({"font.size": 14})

fig = plt.figure(figsize=(13,7))
ax = plt.plot( k_values, coherences )
plt.xticks(k_values)
plt.xlabel("Number of Topics")
plt.ylabel("Topic matrix Coherence")
plt.scatter( k_values, coherences, s=120)
ymax = max(coherences)
xpos = coherences.index(ymax)
best_k = k_values[xpos]
plt.annotate( "k=%d" % best_k, xy=(best_k, ymax), xytext=(best_k, ymax), textcoords="offset points", fontsize=16)
plt.show()