In [None]:
from pathlib import Path

import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel

# --- Configuration ---
# Input CSV of preprocessed sentences. The location is resolved from the
# current user's home directory instead of a hard-coded, user-specific
# absolute path, so the notebook also runs for another account that keeps the
# file in the same place. Kept as a plain string for downstream formatting.
INPUT_FILE = str(Path.home() / 'Desktop' / 'preprocessed_sentences_final.csv')
# Output CSV (relative to the working directory) holding every sentence plus
# its assigned topic columns.
OUTPUT_FILE = 'df_with_all_topics.csv'

# IMPORTANT: Use the number of topics you selected from the tuning phase.
FINAL_NUM_TOPICS = 10

# --- 1. Load Preprocessed Data ---
# Read the preprocessed sentences into `df`.
# If the file is missing, print the friendly message and re-raise. Calling
# exit() inside a Jupyter kernel asks the kernel to shut down (losing all
# notebook state), whereas a raised exception simply stops this cell with a
# clear traceback.
try:
    df = pd.read_csv(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: The file '{INPUT_FILE}' was not found.")
    raise
else:
    print(f"Loaded '{INPUT_FILE}' with {len(df)} sentences.")

# --- 2. Prepare Corpus for LDA Model ---
# Build one token list per row from the 'preprocessed_tokens_str' column.
# Empty CSV fields are read back by pandas as NaN; they are mapped to empty
# documents here, because str(NaN).split() would otherwise add a spurious
# 'nan' token to the vocabulary and to those documents.
token_strings = df['preprocessed_tokens_str'].fillna('').astype(str)
all_tokens = [sentence.split() for sentence in token_strings]

# Vocabulary -> bag-of-words counts -> TF-IDF weighted corpus.
dictionary = Dictionary(all_tokens)
bow_corpus = [dictionary.doc2bow(text) for text in all_tokens]
tfidf_model = TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

print("Corpus prepared for LDA.")

# --- 3. Train the Final LDA Model ---
# Fit the final model on the TF-IDF corpus. The random seed is fixed so the
# run is repeatable; alpha/eta are set to 'auto' (see gensim's LdaModel docs
# for how those priors are estimated).
print(f"Training final LDA model with {FINAL_NUM_TOPICS} topics...")
_lda_settings = dict(
    num_topics=FINAL_NUM_TOPICS,
    passes=20,
    random_state=42,
    alpha='auto',
    eta='auto',
)
final_lda_model = LdaModel(corpus=tfidf_corpus, id2word=dictionary, **_lda_settings)

# --- 4. Assign Dominant Topic to EVERY Sentence ---
# For each sentence keep the single most probable topic and its probability.
# A sentence whose TF-IDF vector is empty carries no evidence; the model can
# still return a (prior-driven) distribution for it, so such sentences are
# explicitly given the placeholder topic -1 with probability 0.0 instead of an
# arbitrary topic.
print("Assigning dominant topic to all sentences...")
dominant_topics = []
topic_probabilities = []

# Get the topic distribution for each document in the corpus
all_doc_topic_dist = final_lda_model[tfidf_corpus]

# Walk the input vectors and their inferred distributions in lockstep so that
# empty documents can be recognised.
for doc_vector, doc_dist in zip(tfidf_corpus, all_doc_topic_dist):
    if doc_vector and doc_dist:
        # Find the topic with the highest probability
        dominant_topic, prob = max(doc_dist, key=lambda pair: pair[1])
    else:
        # Placeholder for "no topic": empty document or empty distribution
        dominant_topic, prob = -1, 0.0
    dominant_topics.append(dominant_topic)
    topic_probabilities.append(prob)

# Add the results as new columns to the original DataFrame
df['Topic'] = dominant_topics
df['Topic_Probability'] = topic_probabilities

print("Topic assignment complete.")

# --- 5. Save the Complete Dataset ---
# Write the DataFrame (without its positional index) to OUTPUT_FILE and report
# where it went.
df.to_csv(OUTPUT_FILE, index=False)
for _message in (
    f"\n✅ Complete dataset with all topics saved to '{OUTPUT_FILE}'.",
    "\nYou can now use this file for any analysis.",
):
    print(_message)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator

# --- Configuration ---
INPUT_FILE = 'df_with_all_topics.csv'
PEOPLE_OF_INTEREST = ['Elon Musk', 'Sam Altman', 'Jeff Bezos']
START_YEAR, END_YEAR = 2014, 2024

# --- 1. Define Your Topic Clusters ---
# Clusters are declared once as "cluster name -> member topic numbers" and
# then inverted into the topic -> cluster lookup used by the rest of the cell.
_cluster_members = {
    'Controversial': (0, 9),
    'Maverick': (1, 3),
    'Political': (8, 2, 5),
    'Resilient': (4, 7, 6),
}
topic_to_cluster = {
    topic_id: cluster_name
    for cluster_name, topic_ids in _cluster_members.items()
    for topic_id in topic_ids
}

# --- 2. Load and Prepare the Data ---
# Load the per-sentence topic assignments into `df`.
# On a missing file the friendly message is printed and the error re-raised:
# exit() inside a Jupyter kernel asks the kernel to shut down, while a raised
# exception only stops this cell and keeps the session alive.
try:
    df = pd.read_csv(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: The file '{INPUT_FILE}' was not found.")
    raise
else:
    print("Successfully loaded the complete topic dataset.")

# --- 3. Filter and Find the Dominant Topic (Same as before) ---
# Keep only the selected people and years. Sentences carrying the "no topic"
# placeholder (-1) are dropped so the placeholder can never be reported as a
# person's dominant topic; prevalence is therefore a share of the sentences
# that actually received a topic.
filtered_df = df[
    (df['person_name'].isin(PEOPLE_OF_INTEREST)) &
    (df['Year'].between(START_YEAR, END_YEAR)) &
    (df['Topic'] != -1)
].copy()

if filtered_df.empty:
    # Raise rather than exit(): exit() would ask the Jupyter kernel to shut
    # down, whereas an exception just halts this cell with the same message.
    raise ValueError(
        f"No data found for the specified individuals between {START_YEAR} and {END_YEAR}."
    )

# Sentence counts per (person, year, topic) and per (person, year).
topic_counts = filtered_df.groupby(['person_name', 'Year', 'Topic']).size().reset_index(name='count')
total_counts = filtered_df.groupby(['person_name', 'Year']).size().reset_index(name='total_count')

# Share of each topic within its (person, year) group.
prevalence_df = pd.merge(topic_counts, total_counts, on=['person_name', 'Year'])
prevalence_df['prevalence'] = prevalence_df['count'] / prevalence_df['total_count']

# One row per (person, year): the topic with the highest prevalence.
idx = prevalence_df.groupby(['person_name', 'Year'])['prevalence'].idxmax()
dominant_topics_df = prevalence_df.loc[idx].reset_index(drop=True)

# --- 4. Map Dominant Topics to Cluster Names ---
# Attach a 'Cluster' column by translating every dominant topic number through
# the topic -> cluster lookup table.
dominant_topics_df = dominant_topics_df.assign(
    Cluster=lambda frame: frame['Topic'].map(topic_to_cluster)
)

_preview_columns = ['person_name', 'Year', 'Topic', 'Cluster']
print("\nFound dominant topic and mapped to cluster for each person and year:")
print(dominant_topics_df.loc[:, _preview_columns].head())

# --- 5. Plot the Comparative Graph with Cluster Names ---
# One dashed, marker-decorated line per person; the dominant topic number is
# used as the y position of each yearly point.
print("\nGenerating the comparative graph with cluster annotations...")
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(15, 8))
palette = sns.color_palette("viridis", n_colors=len(PEOPLE_OF_INTEREST))

_line_options = dict(marker='o', markersize=8, linestyle='--')
sns.lineplot(
    data=dominant_topics_df,
    x='Year',
    y='Topic',
    hue='person_name',
    style='person_name',
    palette=palette,
    ax=ax,
    **_line_options,
)

# --- 6. Annotate Each Point with the Cluster Name ---
# Label every plotted (year, topic) point of the selected people with its
# cluster name. Rows are walked column-wise (no per-person nesting, no
# iterrows); the palette colour is not needed because all labels are black.
_annotated = dominant_topics_df[dominant_topics_df['person_name'].isin(PEOPLE_OF_INTEREST)]
for _year, _topic, _cluster in zip(_annotated['Year'], _annotated['Topic'], _annotated['Cluster']):
    if pd.isna(_cluster):
        # Topic without a cluster mapping: skip it, otherwise the label would
        # be drawn as the literal text "nan".
        continue
    ax.text(
        x=_year,
        y=_topic + 0.3,  # Add a slight offset for better positioning
        s=_cluster,      # Display the cluster name
        color='black',   # Use black for all text for readability
        fontweight='bold',
        ha='center',
        va='bottom',
        fontsize=9,
        # Add a semi-transparent background to the text for readability
        bbox=dict(facecolor='white', alpha=0.5, edgecolor='none', pad=1),
    )

# Hide the Y-axis (and its spine): the numeric topic id is only used for
# vertical placement, the cluster labels carry the meaning.
ax.yaxis.set_visible(False)
ax.spines['left'].set_visible(False)

# Title, X-axis (one tick per year, rotated) and legend.
_title_text = f'Trend of Dominant Thematic Clusters for Musk, Altman, and Bezos ({START_YEAR}-{END_YEAR})'
ax.set_title(_title_text, fontsize=16, pad=20)
ax.set_xlabel('Year', fontsize=12)
ax.set_xticks(range(START_YEAR, END_YEAR + 1))
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Person')
plt.tight_layout()

# Save before showing so the written image contains the finished figure.
_plot_path = 'comparative_cluster_trend_graph.png'
plt.savefig(_plot_path, dpi=300)
print("\n✅ Plot saved as 'comparative_cluster_trend_graph.png'")

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator

# --- Configuration ---
# NOTE(review): this configuration repeats that of the preceding analysis
# cell; consider keeping a single copy.
INPUT_FILE = 'df_with_all_topics.csv'
PEOPLE_OF_INTEREST = ['Elon Musk', 'Sam Altman', 'Jeff Bezos']
START_YEAR, END_YEAR = 2014, 2024

# --- 1. Define Your Topic Clusters ---
# Clusters are declared once as "cluster name -> member topic numbers" and
# then inverted into the topic -> cluster lookup used by the rest of the cell.
_cluster_members = {
    'Controversial': (0, 9),
    'Maverick': (1, 3),
    'Political': (8, 2, 5),
    'Resilient': (4, 7, 6),
}
topic_to_cluster = {
    topic_id: cluster_name
    for cluster_name, topic_ids in _cluster_members.items()
    for topic_id in topic_ids
}

# --- 2. Load and Prepare the Data ---
# Load the per-sentence topic assignments into `df`.
# On a missing file the friendly message is printed and the error re-raised:
# exit() inside a Jupyter kernel asks the kernel to shut down, while a raised
# exception only stops this cell and keeps the session alive.
try:
    df = pd.read_csv(INPUT_FILE)
except FileNotFoundError:
    print(f"Error: The file '{INPUT_FILE}' was not found.")
    raise
else:
    print("Successfully loaded the complete topic dataset.")

# --- 3. Filter and Find the Dominant Topic (Same as before) ---
# Keep only the selected people and years. Sentences carrying the "no topic"
# placeholder (-1) are dropped so the placeholder can never be reported as a
# person's dominant topic; prevalence is therefore a share of the sentences
# that actually received a topic.
filtered_df = df[
    (df['person_name'].isin(PEOPLE_OF_INTEREST)) &
    (df['Year'].between(START_YEAR, END_YEAR)) &
    (df['Topic'] != -1)
].copy()

if filtered_df.empty:
    # Raise rather than exit(): exit() would ask the Jupyter kernel to shut
    # down, whereas an exception just halts this cell with the same message.
    raise ValueError(
        f"No data found for the specified individuals between {START_YEAR} and {END_YEAR}."
    )

# Sentence counts per (person, year, topic) and per (person, year).
topic_counts = filtered_df.groupby(['person_name', 'Year', 'Topic']).size().reset_index(name='count')
total_counts = filtered_df.groupby(['person_name', 'Year']).size().reset_index(name='total_count')

# Share of each topic within its (person, year) group.
prevalence_df = pd.merge(topic_counts, total_counts, on=['person_name', 'Year'])
prevalence_df['prevalence'] = prevalence_df['count'] / prevalence_df['total_count']

# One row per (person, year): the topic with the highest prevalence.
idx = prevalence_df.groupby(['person_name', 'Year'])['prevalence'].idxmax()
dominant_topics_df = prevalence_df.loc[idx].reset_index(drop=True)

# --- 4. Map Dominant Topics to Cluster Names ---
# Attach a 'Cluster' column by translating every dominant topic number through
# the topic -> cluster lookup table.
dominant_topics_df = dominant_topics_df.assign(
    Cluster=lambda frame: frame['Topic'].map(topic_to_cluster)
)

_preview_columns = ['person_name', 'Year', 'Topic', 'Cluster']
print("\nFound dominant topic and mapped to cluster for each person and year:")
print(dominant_topics_df.loc[:, _preview_columns].head())

# --- 5. Plot the Comparative Graph with Cluster Names ---
# One dashed, marker-decorated line per person; the dominant topic number is
# used as the y position of each yearly point.
print("\nGenerating the comparative graph with cluster annotations...")
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(15, 8))
palette = sns.color_palette("viridis", n_colors=len(PEOPLE_OF_INTEREST))

_line_options = dict(marker='o', markersize=8, linestyle='--')
sns.lineplot(
    data=dominant_topics_df,
    x='Year',
    y='Topic',
    hue='person_name',
    style='person_name',
    palette=palette,
    ax=ax,
    **_line_options,
)

# --- 6. Annotate Each Point with the Cluster Name ---
# Label every plotted (year, topic) point of the selected people with its
# cluster name. Rows are walked column-wise (no per-person nesting, no
# iterrows); the palette colour is not needed because all labels are black.
_annotated = dominant_topics_df[dominant_topics_df['person_name'].isin(PEOPLE_OF_INTEREST)]
for _year, _topic, _cluster in zip(_annotated['Year'], _annotated['Topic'], _annotated['Cluster']):
    if pd.isna(_cluster):
        # Topic without a cluster mapping: skip it, otherwise the label would
        # be drawn as the literal text "nan".
        continue
    ax.text(
        x=_year,
        y=_topic + 0.3,  # Add a slight offset for better positioning
        s=_cluster,      # Display the cluster name
        color='black',   # Use black for all text for readability
        fontweight='bold',
        ha='center',
        va='bottom',
        fontsize=9,
        # Add a semi-transparent background to the text for readability
        bbox=dict(facecolor='white', alpha=0.5, edgecolor='none', pad=1),
    )

# Hide the Y-axis (and its spine): the numeric topic id is only used for
# vertical placement, the cluster labels carry the meaning.
ax.yaxis.set_visible(False)
ax.spines['left'].set_visible(False)

# Title, X-axis (one tick per year, rotated) and legend.
_title_text = f'Trend of Dominant Thematic Clusters for Musk, Altman, and Bezos ({START_YEAR}-{END_YEAR})'
ax.set_title(_title_text, fontsize=16, pad=20)
ax.set_xlabel('Year', fontsize=12)
ax.set_xticks(range(START_YEAR, END_YEAR + 1))
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Person')
plt.tight_layout()

# Save before showing so the written image contains the finished figure.
_plot_path = 'comparative_cluster_trend_graph.png'
plt.savefig(_plot_path, dpi=300)
print("\n✅ Plot saved as 'comparative_cluster_trend_graph.png'")

plt.show()