# In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import numpy as np
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the WordNet corpus that WordNetLemmatizer (used in
# clean_and_normalize_text) relies on. This runs every time the cell/module
# is executed.
# NOTE(review): word_tokenize, called in customize_tfidf_vectorizer, usually
# needs NLTK's 'punkt' tokenizer data as well -- confirm it is installed.
nltk.download('wordnet')

# In [None]:
def clean_and_normalize_text(text):
    """Lowercase *text*, drop punctuation, and lemmatize each word.

    Characters that are neither alphanumeric nor whitespace are removed,
    every remaining character is lowercased individually, and each
    whitespace-separated word is passed through a WordNet lemmatizer.

    Args:
        text: The raw input string.

    Returns:
        The normalized words joined by single spaces.
    """
    kept_chars = []
    for ch in text:
        if ch.isalnum() or ch.isspace():
            kept_chars.append(ch.lower())
    stripped = ''.join(kept_chars)

    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, stripped.split()))

def umass_coherence(doc_word_matrix, word_id_dict, topics):
    """
    Score a set of LDA topics against a document-word matrix.

    For every topic, the weights of all non-zero matrix entries whose column
    (word) has a positive value in that topic are summed and divided by the
    topic's total mass (plus a small epsilon). The per-topic results are
    accumulated and the total is divided by ``vocab_size * (vocab_size - 1)``.

    NOTE(review): despite the name, this is not the standard UMass
    co-occurrence log-ratio; the scoring formula is left as it was so that
    rankings produced by existing callers keep their meaning.

    Args:
        doc_word_matrix: Sparse document-word matrix exposing ``shape`` and
            ``tocsr()``; columns are words.
        word_id_dict: Mapping from word IDs to words. Unused; kept for
            interface compatibility.
        topics: Iterable of topic-word weight vectors, one per topic, each
            indexable by the matrix's column indices.

    Returns:
        The score as a float, or 0.0 when the vocabulary has fewer than two
        words (the normalizer would otherwise be zero).
    """
    print("umass coherence")
    eps = 1e-12
    _, vocab_size = doc_word_matrix.shape
    if vocab_size < 2:
        # vocab_size * (vocab_size - 1) would be 0; avoid dividing by it.
        return 0.0

    # Work on CSR so that `indices` are column (word) ids. Every stored entry
    # is considered; zipping against `indptr` would silently truncate the
    # iteration to the number of rows.
    csr = doc_word_matrix.tocsr()
    word_ids = csr.indices
    weights = csr.data

    coherence = 0.0
    for topic in topics:
        topic = np.asarray(topic)
        topic_mass = np.sum(topic)
        # Keep only entries whose word carries positive weight in this topic.
        in_topic = topic[word_ids] > 0
        coherence += weights[in_topic].sum() / (topic_mass + eps)

    return float(coherence / (vocab_size * (vocab_size - 1)))


def customize_tfidf_vectorizer(documents):
    """Normalize the documents and fit a TF-IDF vectorizer on them.

    Each document is cleaned with ``clean_and_normalize_text`` and tokenized
    with ``word_tokenize``; the vectorizer is fitted on the cleaned strings
    (not on the token lists).

    Args:
        documents: Iterable of raw document strings.

    Returns:
        A tuple ``(document_matrix, vectorizer, tokenized_documents)``: the
        fitted TF-IDF matrix, the fitted ``TfidfVectorizer``, and the list of
        token lists for the cleaned documents.
    """
    print("Cleaning documents...")
    cleaned_documents = list(map(clean_and_normalize_text, documents))
    tokenized_documents = list(map(word_tokenize, cleaned_documents))

    tfidf_options = {
        "max_features": 2000,                    # cap on vocabulary size
        "stop_words": list(ENGLISH_STOP_WORDS),  # sklearn's English stop list
        "ngram_range": (1, 2),                   # unigrams and bigrams
        "min_df": 2,                             # document-frequency lower bound
        "max_df": 0.95,                          # document-frequency upper bound
        "sublinear_tf": True,                    # sublinear term-frequency scaling
    }
    vectorizer = TfidfVectorizer(**tfidf_options)

    document_matrix = vectorizer.fit_transform(cleaned_documents)
    return document_matrix, vectorizer, tokenized_documents



def check_and_merge_topics(topics, threshold=0.9):
    """
    Drop topics that are near-duplicates of an earlier, already kept topic.

    Topics are visited in order. A topic is kept when its highest cosine
    similarity to the previously kept topics is below ``threshold``;
    otherwise it is discarded (no averaging or merging of weights is done).
    Similarities at or below zero are treated as zero.

    Args:
        topics: Iterable of topic vectors (each a word distribution).
        threshold: Similarity threshold (0 to 1); a topic whose best
            similarity reaches it is considered a duplicate.

    Returns:
        List of the kept topics, in their original order and as the original
        objects.
    """
    print("Checking and removing duplicate topics")
    merged_topics = []
    # (vector, norm) for each kept topic, so norms are computed only once.
    kept = []
    for topic in topics:
        vector = np.asarray(topic)
        norm = np.linalg.norm(vector)
        max_similarity = 0
        for kept_vector, kept_norm in kept:
            denominator = norm * kept_norm
            if denominator == 0:
                # Cosine similarity is undefined for a zero vector; treat it
                # as "not similar" instead of dividing 0 by 0.
                continue
            similarity = np.dot(vector, kept_vector) / denominator
            if similarity > max_similarity:
                max_similarity = similarity
        if max_similarity < threshold:
            merged_topics.append(topic)
            kept.append((vector, norm))
    return merged_topics

def find_optimal_num_topics(documents, max_topics=25):
    """Pick the LDA topic count with the highest ``umass_coherence`` score.

    The documents are vectorized once with ``customize_tfidf_vectorizer``;
    then one ``LatentDirichletAllocation`` model (``random_state=0``) is
    fitted for every topic count from 2 up to and including ``max_topics``
    and scored on its ``components_``.

    Args:
        documents: Iterable of raw document strings.
        max_topics: Largest number of topics to try.

    Returns:
        The topic count whose model received the highest score (the first
        one on ties, per ``np.argmax``).
    """
    print("Finding the optimal number of topics")
    document_matrix, vectorizer, _ = customize_tfidf_vectorizer(documents)

    first_candidate = 2

    def score_for(topic_count):
        # Fit one model for this topic count and score its topic-word matrix.
        model = LatentDirichletAllocation(n_components=topic_count, random_state=0)
        model.fit(document_matrix)
        return umass_coherence(
            document_matrix,
            vectorizer.get_feature_names_out(),
            model.components_,
        )

    scores = [score_for(k) for k in range(first_candidate, max_topics + 1)]

    # argmax is an offset into the candidate range, so shift it back.
    return np.argmax(scores) + first_candidate

def refine_similarity_with_word_embeddings(merged_topics, word2vec_model):
    """Turn every topic into a vector, embedding word lists where needed.

    Topics that already are NumPy arrays are passed through untouched. Any
    other topic is iterated as a collection of words; each word is looked up
    in ``word2vec_model.wv`` and the resulting vectors are averaged.

    Args:
        merged_topics: Iterable of topics, each either a ``np.ndarray`` or an
            iterable of words.
        word2vec_model: Object whose ``wv`` attribute maps a word to its
            vector.

    Returns:
        List with one vector per input topic, in the same order.
    """
    print("Refining similarity using word embeddings")

    def as_embedding(entry):
        # Anything that is not already a vector is averaged word by word.
        if not isinstance(entry, np.ndarray):
            return np.mean([word2vec_model.wv[token] for token in entry], axis=0)
        return entry

    return [as_embedding(entry) for entry in merged_topics]