Running the Topic Modelling and Validation

In [None]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel, LsiModel, Nmf, CoherenceModel
import os
import matplotlib.pyplot as plt
import numpy as np

# --- Configuration ---
# NOTE(review): INPUT_FILE is an absolute, machine-specific path; adjust it when
# running on another machine.
INPUT_FILE = '/Users/shantanusharma/Desktop/preprocessed_sentences_final.csv'
OUTPUT_DIR = 'topic_model_outputs'
NUM_REPRESENTATIVE_SENTENCES = 10  # Sentences exported per topic
NUM_TOPIC_WORDS = 15  # Top words exported per topic

# --- Create Output Directories ---
# exist_ok=True makes creation idempotent (no check-then-create race), and
# makedirs creates OUTPUT_DIR itself as the parent of each sub-directory.
for output_subdir in ('lda_results', 'validation_models'):
    os.makedirs(os.path.join(OUTPUT_DIR, output_subdir), exist_ok=True)

# --- Load Data ---
# Failures terminate with a non-zero status via SystemExit instead of the
# interactive-only exit() helper, so callers/shells can detect the error.
try:
    df = pd.read_csv(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: '{INPUT_FILE}' not found. Please check the file path.")
    raise SystemExit(1) from None

if df.empty:
    print("No data in the preprocessed DataFrame. Exiting.")
    raise SystemExit(1)

print("Data loaded successfully.")
# Missing cells are read by pandas as float NaN, which has no .split(); treat
# them as empty sentences so every DataFrame row keeps a (possibly empty)
# token list and row alignment with the corpus is preserved.
token_strings = df['preprocessed_tokens_str'].fillna('').astype(str)
all_preprocessed_sentences_tokenized = [s.split() for s in token_strings]
original_sentence_col = 'original_sentence'

# --- Prepare Gensim Corpus ---
print("Preparing Gensim corpus...")
dictionary = Dictionary(all_preprocessed_sentences_tokenized)
# Drop tokens present in more than 90% of the documents and cap the vocabulary
# at 100,000 entries; no_below=1 keeps even tokens seen in a single document.
dictionary.filter_extremes(no_below=1, no_above=0.9, keep_n=100000)
bow_corpus = [dictionary.doc2bow(text) for text in all_preprocessed_sentences_tokenized]
tfidf_model = TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

# =========================================================================== #
# 1. INTERACTIVE MODEL TUNING
# =========================================================================== #
print("\n--- Starting Topic Model Tuning ---")

def calculate_exclusivity(model, dictionary, top_n_words=10):
    """Return the mean exclusivity of a topic model's top words.

    For each topic, the ``top_n_words`` highest-weighted words are selected and
    every selected word is scored as its weight in that topic divided by its
    summed weight over all topics. A topic's exclusivity is the average of
    those scores, and the function returns the average over all topics.

    Args:
        model: Object exposing ``get_topics()``, which must return a 2-D
            array-like indexed as ``[topic, word]``.
        dictionary: Unused; kept so existing callers keep working.
        top_n_words: Number of top words per topic to score. When the
            vocabulary is smaller, all available words are used and the
            average is taken over that smaller count.

    Returns:
        The mean exclusivity as a float, or NaN when the model has no topics
        or an empty vocabulary.

    Raises:
        ValueError: If ``top_n_words`` is smaller than 1.
    """
    if top_n_words < 1:
        # A zero count would make the [-0:] slice select *every* word and then
        # divide by zero, so reject it up front.
        raise ValueError("top_n_words must be a positive integer")

    topics = np.asarray(model.get_topics(), dtype=float)
    if topics.ndim != 2 or 0 in topics.shape:
        return float("nan")

    # Never average over more words than the vocabulary actually contains.
    n_top = min(top_n_words, topics.shape[1])
    # Total weight of every word across topics; computed once, not per word.
    word_totals = topics.sum(axis=0)

    exclusivity_scores = []
    for topic_weights in topics:
        top_word_indices = np.argsort(topic_weights)[-n_top:]
        top_weights = topic_weights[top_word_indices]
        top_totals = word_totals[top_word_indices]
        # Words with (near-)zero total weight contribute 0 instead of dividing
        # by zero.
        ratios = np.divide(
            top_weights,
            top_totals,
            out=np.zeros_like(top_weights),
            where=top_totals > 1e-9,
        )
        exclusivity_scores.append(ratios.sum() / n_top)
    return float(np.mean(exclusivity_scores))

# Candidate topic counts to evaluate: 5, 10, ..., 30.
topic_range = range(5, 31, 5)
coherence_scores, exclusivity_scores = [], []

for k in topic_range:
    print(f"Training and evaluating model with {k} topics...")
    lda_model_temp = LdaModel(
        corpus=tfidf_corpus,
        id2word=dictionary,
        num_topics=k,
        passes=15,
        random_state=42,
    )
    coherence_model = CoherenceModel(
        model=lda_model_temp,
        texts=all_preprocessed_sentences_tokenized,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_scores.append(coherence_model.get_coherence())
    exclusivity_scores.append(calculate_exclusivity(lda_model_temp, dictionary))

# --- Plotting with Dual-Axis Line Graph ---
# Left axis: coherence (blue, solid). Right axis: exclusivity (red, dashed).
fig, ax1 = plt.subplots(figsize=(12, 7))
coherence_color = 'tab:blue'
ax1.set_xlabel('Number of Topics', fontsize=12)
ax1.set_ylabel('Topic Coherence (c_v)', color=coherence_color, fontsize=12)
ax1.plot(topic_range, coherence_scores, 'o-', color=coherence_color, label='Coherence')
ax1.tick_params(axis='y', labelcolor=coherence_color)
ax1.grid(axis='x')
# Label every point with its score, placed 5 points above the marker.
for n_topics, score in zip(topic_range, coherence_scores):
    ax1.annotate(
        f'{score:.3f}',
        (n_topics, score),
        textcoords="offset points",
        xytext=(0, 5),
        ha='center',
    )

ax2 = ax1.twinx()
exclusivity_color = 'tab:red'
ax2.set_ylabel('Semantic Exclusivity', color=exclusivity_color, fontsize=12)
ax2.plot(topic_range, exclusivity_scores, 'o--', color=exclusivity_color, label='Exclusivity')
ax2.tick_params(axis='y', labelcolor=exclusivity_color)
for n_topics, score in zip(topic_range, exclusivity_scores):
    ax2.annotate(
        f'{score:.3f}',
        (n_topics, score),
        textcoords="offset points",
        xytext=(0, 5),
        ha='center',
    )

fig.suptitle('Model Selection: Coherence vs. Exclusivity', fontsize=16)
# Leave the top 4% of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])
print("\nShowing plot. Please close the plot window to continue...")
plt.show()

# --- User Input for Final Model ---
# Keep asking until the user enters one of the topic counts that was evaluated.
topic_prompt = f"\nBased on the plot, please enter the final number of topics to use (e.g., from {list(topic_range)}): "
while True:
    try:
        NUM_TOPICS = int(input(topic_prompt))
    except ValueError:
        print("Invalid input. Please enter an integer.")
        continue
    if NUM_TOPICS in topic_range:
        break
    print(f"Invalid input. Please choose a number from the tested range: {list(topic_range)}")

print(f"\nProceeding with {NUM_TOPICS} topics for the final models.")

# =========================================================================== #
# 2. FINAL LDA (Latent Dirichlet Allocation)
# =========================================================================== #
print("\n--- Running Final LDA Model ---")

# NOTE(review): the model is fitted on TF-IDF weights rather than raw counts.
# LDA is formulated over word counts, so bow_corpus may be the more principled
# input; left unchanged here so results stay comparable with earlier runs.
final_lda_model = LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=20, random_state=42, alpha='auto', eta='auto')
print("Assigning dominant topics to sentences...")


def _dominant_topic(model, doc):
    """Return ``(topic_id, probability)`` of the most probable topic for ``doc``.

    Returns ``(-1, 0.0)`` for an empty document or when the model reports no
    topic for it, so ``max()`` is never called on an empty sequence.
    """
    if not doc:
        return -1, 0.0
    topic_distribution = model[doc]
    if not topic_distribution:
        return -1, 0.0
    return max(topic_distribution, key=lambda pair: pair[1])


# Infer each document's topic distribution exactly once: this halves the
# inference cost and guarantees that 'Topic' and 'Topic_Probability' describe
# the same inference result (presumably repeated inference calls are not
# guaranteed to be bit-identical -- confirm against the gensim docs).
dominant_topics = [_dominant_topic(final_lda_model, doc) for doc in tfidf_corpus]
df['Topic'] = [topic_id for topic_id, _ in dominant_topics]
df['Topic_Probability'] = [probability for _, probability in dominant_topics]

# --- Consolidating LDA results into single files ---
print("Exporting consolidated LDA results...")
all_topic_words = []
all_rep_sentences = []

for topic_id in range(final_lda_model.num_topics):
    # Top words (with weights) describing this topic.
    topic_words = final_lda_model.show_topic(topic_id, topn=NUM_TOPIC_WORDS)
    words_df = pd.DataFrame(topic_words, columns=['Word', 'Weight'])
    words_df['Topic_ID'] = topic_id
    all_topic_words.append(words_df)

    # Sentences assigned to this topic, most confident assignments first.
    rep_docs_df = df[df['Topic'] == topic_id].sort_values(by='Topic_Probability', ascending=False).head(NUM_REPRESENTATIVE_SENTENCES)
    all_rep_sentences.append(rep_docs_df)

final_words_df = pd.concat(all_topic_words, ignore_index=True)
final_sentences_df = pd.concat(all_rep_sentences, ignore_index=True)

# --- Explicitly select and order columns for the sentences CSV ---
# Only these columns, in this order, are written to the representative
# sentences file: the original sentence, its topic assignment, and the
# 'person_name' and 'Year' metadata columns.
columns_to_save = [
    original_sentence_col,
    'Topic',
    'Topic_Probability',
    'person_name',
    'Year'
]
# Fail with an explicit message when the input CSV lacks an expected column.
missing_columns = [col for col in columns_to_save if col not in final_sentences_df.columns]
if missing_columns:
    raise KeyError(f"Input data is missing expected column(s): {missing_columns}")
# Filter the DataFrame to only include these columns before saving.
final_sentences_df_to_save = final_sentences_df[columns_to_save]

final_words_df.to_csv(os.path.join(OUTPUT_DIR, 'lda_results', 'lda_all_topic_words.csv'), index=False)
final_sentences_df_to_save.to_csv(os.path.join(OUTPUT_DIR, 'lda_results', 'lda_all_representative_sentences.csv'), index=False)
print(f"✅ Final LDA results exported to single files in '{os.path.join(OUTPUT_DIR, 'lda_results')}' directory.")


# =========================================================================== #
# 3. LSI & NMF (For Validation)
# =========================================================================== #
print("\n--- Running Validation Models (LSI & NMF) ---")
# Fit both alternative models on the same TF-IDF corpus and topic count so
# their top words can be compared against the LDA topics.
lsi_model = LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=NUM_TOPICS)
nmf_model = Nmf(corpus=tfidf_corpus, id2word=dictionary, num_topics=NUM_TOPICS, random_state=42)

# One row per (topic, word) pair; key order fixes the CSV column order.
lsi_topics = []
for topic_idx in range(NUM_TOPICS):
    for word, weight in lsi_model.show_topic(topic_idx, topn=NUM_TOPIC_WORDS):
        lsi_topics.append({'Topic_ID': topic_idx, 'Word': word, 'Weight': weight})

nmf_topics = []
for topic_idx in range(NUM_TOPICS):
    for word, weight in nmf_model.show_topic(topic_idx, topn=NUM_TOPIC_WORDS):
        nmf_topics.append({'Topic_ID': topic_idx, 'Word': word, 'Weight': weight})

lsi_csv_path = os.path.join(OUTPUT_DIR, 'validation_models', 'lsi_topics.csv')
nmf_csv_path = os.path.join(OUTPUT_DIR, 'validation_models', 'nmf_topics.csv')
pd.DataFrame(lsi_topics).to_csv(lsi_csv_path, index=False)
pd.DataFrame(nmf_topics).to_csv(nmf_csv_path, index=False)
print("✅ LSI and NMF validation topics exported.")

print("\n🎉 All processes complete.")

Topic Modelling with 5 topics

In [None]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel
import os

# --- Configuration ---
INPUT_FILE = 'preprocessed_sentences_final.csv'
OUTPUT_DIR = 'topic_model_outputs'
NUM_TOPIC_WORDS = 15 # For file exports

# --- Create Output Directories ---
# exist_ok=True makes creation idempotent; makedirs also creates OUTPUT_DIR
# itself as the parent of 'lda_results'.
os.makedirs(os.path.join(OUTPUT_DIR, 'lda_results'), exist_ok=True)

# --- Load Data ---
# Reuse the DataFrame left in the session by an earlier cell when there is one;
# otherwise read it from INPUT_FILE so this cell also works on a fresh kernel
# (the previous `df = df` never read the file and failed with NameError).
if 'df' not in globals():
    try:
        df = pd.read_csv(INPUT_FILE)
    except FileNotFoundError:
        print(f"Error: '{INPUT_FILE}' not found. Please check the file path.")
        raise SystemExit(1) from None

print("Data loaded successfully.")
# Missing cells become empty sentences; str(NaN) would otherwise inject a
# spurious "nan" token into the vocabulary.
token_strings = df['preprocessed_tokens_str'].fillna('').astype(str)
all_preprocessed_sentences_tokenized = [s.split() for s in token_strings]

# --- Prepare Gensim Corpus ---
print("Preparing Gensim corpus...")
dictionary = Dictionary(all_preprocessed_sentences_tokenized)
# Drop tokens present in more than 90% of the documents and cap the vocabulary
# at 100,000 entries; no_below=1 keeps even tokens seen in a single document.
dictionary.filter_extremes(no_below=1, no_above=0.9, keep_n=100000)
bow_corpus = [dictionary.doc2bow(text) for text in all_preprocessed_sentences_tokenized]
tfidf_model = TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

# Topic count chosen during the tuning phase.
NUM_TOPICS = 5
print(f"\nProceeding with {NUM_TOPICS} topics for the final models.")

# =========================================================================== #
# FINAL LDA (Latent Dirichlet Allocation)
# =========================================================================== #
print("\n--- Running Final LDA Model ---")

# Training settings collected in one place; random_state is fixed for
# reproducibility and alpha/eta are set to 'auto'.
lda_settings = {
    'corpus': tfidf_corpus,
    'id2word': dictionary,
    'num_topics': NUM_TOPICS,
    'passes': 20,
    'random_state': 42,
    'alpha': 'auto',
    'eta': 'auto',
}
final_lda_model = LdaModel(**lda_settings)

# --- Assign Topics to DataFrame (optional, currently disabled) ---
# The progress message is disabled together with the step it announces, so the
# log no longer claims work that is not performed. Uncomment both lines to
# re-enable the assignment.
# print("Assigning dominant topics to sentences...")
# df['Topic'] = [max(final_lda_model[doc], key=lambda x: x[1])[0] if doc else -1 for doc in tfidf_corpus]
# ...

# --- Display and Export LDA Results ---
print("\n--- LDA Model Results (Gensim Format) ---")

# --- Display Top Topic Keywords in Gensim Format ---
# You can also use the built-in function for a quick view:
# final_lda_model.print_topics()

# Number of keywords printed per topic (file exports use NUM_TOPIC_WORDS).
NUM_DISPLAY_WORDS = 10

for topic_id in range(final_lda_model.num_topics):
    topic_words = final_lda_model.show_topic(topic_id, topn=NUM_DISPLAY_WORDS)
    # Build the string as: 0.123*"word" + 0.045*"other" + ...
    topic_string = " + ".join(f'{weight:.3f}*"{word}"' for word, weight in topic_words)
    print(f"Topic #{topic_id}: {topic_string}")

# --- Exporting to files (optional, can be commented out) ---
# all_topic_words = []
# for topic_id in range(final_lda_model.num_topics):
#     topic_words_for_export = final_lda_model.show_topic(topic_id, topn=NUM_TOPIC_WORDS)
#     words_df = pd.DataFrame(topic_words_for_export, columns=['Word', 'Weight'])
#     words_df['Topic_ID'] = topic_id
#     all_topic_words.append(words_df)
#
# final_words_df = pd.concat(all_topic_words, ignore_index=True)
# final_words_df.to_csv(os.path.join(OUTPUT_DIR, 'lda_results', 'lda_all_topic_words.csv'), index=False)
# print(f"\n✅ Final LDA topic words also exported to a file.")