In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import os
from bertopic import BERTopic
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the NLTK 'stopwords' corpus that preprocess_text reads its English
# stop-word list from. Returns True; the captured output shows it reporting
# "already up-to-date" when the package is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Holds the default stop-word set so that stopwords.words('english') is
# evaluated once per kernel session instead of once per document.
_STOP_WORDS_CACHE = {}


def preprocess_text(text, stop_words=None):
    """Remove stop words from ``text``.

    The text is split on whitespace and every token whose lowercase form
    appears in ``stop_words`` is dropped. Tokens that survive keep their
    original casing (only the comparison is case-insensitive) and are joined
    back together with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Lowercase words to remove. When omitted, NLTK's English stop-word
        list is used; it is loaded on first use and cached afterwards.

    Returns
    -------
    str
        The remaining tokens separated by single spaces ('' if none remain).
    """
    if stop_words is None:
        if 'english' not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE['english']

    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [4]:
def apply_bertopic(text_data):
    """Fit a BERTopic model on ``text_data`` and summarise the topics found.

    Parameters
    ----------
    text_data : iterable of str
        Documents to model, e.g. a list or a pandas Series of strings.

    Returns
    -------
    tuple
        ``(df_topics, topic_model)`` where ``df_topics`` is a DataFrame built
        from ``topic_model.get_topic_info()`` and ``topic_model`` is the
        fitted ``BERTopic`` instance.
    """
    # BERTopic documents its input as a list of strings. Materialising the
    # input also discards whatever pandas index a filtered Series carries.
    documents = list(text_data)

    topic_model = BERTopic(language="english")

    # Only the fitted model is needed afterwards; the per-document topic
    # assignments and probabilities returned by fit_transform are not used.
    topic_model.fit_transform(documents)

    df_topics = pd.DataFrame(topic_model.get_topic_info())
    return df_topics, topic_model

In [9]:
# Paths are resolved against the kernel's current working directory
# (os.getcwd()), not against the location of this file.
# NOTE(review): this presumably expects the kernel to be started from the
# notebook's own folder — confirm before running from elsewhere.
base_dir = os.getcwd()

# abspath collapses the "../.." segments so the path reads cleanly wherever
# it is printed or embedded in an error message.
output_dir = os.path.abspath(os.path.join(base_dir, "..", "..", "output"))

file_path1 = os.path.join(output_dir, "final_data_after_clustering.xlsx")
df1 = pd.read_excel(file_path1)

In [10]:
# Strip stop words from 'Combined_text' into a new 'processed_text' column.
# Missing values are replaced with '' first: astype(str) on its own turns NaN
# into the literal string 'nan', which would then be modelled as a real word.
df1['processed_text'] = df1['Combined_text'].fillna('').astype(str).apply(preprocess_text)

In [83]:
# Fit one BERTopic model over every preprocessed document in the frame.
df_topics, topic_model = apply_bertopic(df1.loc[:, 'processed_text'])

In [84]:
# Show just the topic id and its generated name for each discovered topic.
print(df_topics.loc[:, ['Topic', 'Name']])

    Topic                                         Name
0      -1                     -1_israel_gaza_hamas_war
1       0                0_yemen_missile_yemeni_houthi
2       1       1_lebanon_hezbollah_ceasefire_lebanese
3       2              2_attack_terrorist_bus_shooting
4       3                     3_get_support_gaza_hamas
5       4                      4_jews_car_think_jewish
6       5              5_rockets_barrage_rocket_impact
7       6              6_hostages_cross_hamas_released
8       7           7_syria_damascus_airstrikes_syrian
9       8                  8_iran_iranian_attack_irans
10      9         9_beirut_israeli_airstrikes_southern
11     10                   10_sirens_red_across_sound
12     11             11_netanyahu_benjamin_arrest_icc
13     12             12_hostages_breaking_israeli_red
14     13                  13_hanukkah_israel_qr_store
15     14    14_prisoners_palestinian_prisoner_release
16     15               15_minister_prime_netanyahu_we
17     16 

In [11]:
# Smallest number of documents a cluster must contain before a topic model is
# fitted on it; smaller clusters are skipped.
MIN_DOCS_PER_CLUSTER = 5

# Maps each processed cluster label to the topic-info DataFrame of its model.
cluster_topics = {}
for cluster in df1['Cluster'].unique():
    print(f"Processing Cluster {cluster}...")

    cluster_df = df1[df1['Cluster'] == cluster]  # Rows belonging to this cluster
    if len(cluster_df) < MIN_DOCS_PER_CLUSTER:
        print(f"Skipping Cluster {cluster} (not enough data).")
        continue

    # Loop-specific names are used so the corpus-wide `df_topics` and
    # `topic_model` variables are not silently overwritten by the last cluster.
    cluster_df_topics, cluster_model = apply_bertopic(cluster_df['processed_text'])

    # Keep the topic table in memory and write the bar chart next to the input data.
    cluster_topics[cluster] = cluster_df_topics
    fig = cluster_model.visualize_barchart()
    fig.write_html(os.path.join(output_dir, f"topic_barchart_cluster_{cluster}.html"))

    print(f"Cluster {cluster} topic modeling complete. Results saved.")

Processing Cluster 1...
Cluster 1 topic modeling complete. Results saved.
Processing Cluster 0...
Cluster 0 topic modeling complete. Results saved.
Processing Cluster 2...
Cluster 2 topic modeling complete. Results saved.


In [13]:
# Print the topic table of every cluster that was modelled. The loop variable
# is deliberately not called `df_topics`, so the corpus-wide topic table of
# that name is left untouched.
for cluster, cluster_topic_info in cluster_topics.items():
    print(f"\nCluster {cluster} Topics:\n", cluster_topic_info)


Cluster 1 Topics:
     Topic  Count                                      Name  \
0      -1    130                   -1_go_going_iran_israel   
1       0    110             0_yemen_missile_houthi_israel   
2       1     76           1_syria_damascus_syrian_israeli   
3       2     75          2_hezbollah_lebanon_idf_southern   
4       3     58         3_beirut_israeli_southern_lebanon   
5       4     54           4_rockets_barrage_impact_rocket   
6       5     44                    5_west_bank_idf_attack   
7       6     43                 6_red_sirens_across_sound   
8       7     38                7_drone_launched_says_iraq   
9       8     36  8_explosions_reported_interceptions_aviv   
10      9     31               9_front_guidelines_home_idf   
11     10     18           10_army_maariv_israeli_missiles   
12     11     15               11_gaza_strip_northern_city   
13     12     15    12_evacuation_buildings_issued_suburbs   
14     13     13              13_alerts_red_telavi

In [15]:
# Remove the helper column once topic modelling is done. A non-inplace drop
# with errors='ignore' keeps this cell safe to re-run: the original in-place
# version raised KeyError on a second run because the column was already gone.
df1 = df1.drop(columns=['processed_text'], errors='ignore')

In [16]:
# Inspect the remaining column labels; 'processed_text' should no longer be
# listed after the drop in the previous cell.
df1.columns

Index(['Message_ID', 'Date', 'Combined_text', 'Total_Comments',
       'Text_Positive_Reactions', 'Text_Negative_Reactions', 'Cluster'],
      dtype='object')