## Connect to your Google Drive




In [1]:
# Mount Google Drive under /content/drive so the notebook can read and write files there.
# NOTE(review): force_remount=True presumably re-mounts even if the drive is already mounted — confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Import relevant packages for topic modeling

In [2]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [None]:
from bertopic import BERTopic
import pandas as pd

## Define the topic modeling function

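> `topic_modelling` fits a BERTopic model on a list of documents and returns the topic assignments, the topic summary table, and the fitted model.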



In [None]:
def topic_modelling(text, min_topic_size=35, language="english"):
    """
    Performs topic modeling on a given text using BERTopic.

    Parameters:
        text (list of str): List of text documents to perform topic modeling on.
        min_topic_size (int): Minimum size of topics to be considered.
        language (str): Language of text documents.

    Returns:
        tuple: A tuple containing the topics (first value returned by
        ``fit_transform``), the topic information table (``get_topic_info()``),
        and the fitted BERTopic model.

    Raises:
        ValueError: If ``text`` contains no documents.
    """
    # Fail early with a clear message instead of an opaque error from deep
    # inside the embedding / clustering pipeline.
    if len(text) == 0:
        raise ValueError("`text` must contain at least one document.")

    model = BERTopic(
        n_gram_range=(1, 2),
        verbose=True,
        language=language,
        low_memory=True,
        min_topic_size=min_topic_size,
    )
    # fit_transform returns a second value that this function does not use.
    topics, _ = model.fit_transform(text)
    topic_info = model.get_topic_info()

    return topics, topic_info, model

## Define path and data columns

In [None]:
# Folder on Google Drive that holds the input data and receives the exports.
# File names are appended directly to this string, so keep the trailing slash.
PATH = "/content/drive/MyDrive/topic_modelling_june_25/"  # add here the correct path on Google Drive

# Optional organisation:

DATE_COLUMN = "Post.Created.Date"
TEXT_COLUMN = "text_translated"  # adapt to the correct column name for Facebook and Instagram
AUTHOR_COLUMN = "username"
LIKES_COLUMN = "Likes"
REACTIONS_COLUMN = "Total.Interactions"
LANGUAGE = "english"
# Words only: ASCII letters plus the accented Latin range À-ú.
# The character class was mis-encoded (mojibake), which changed the set of
# characters it matched; the intended range is restored here.
REGEX = u'[A-Za-zÀ-ú]+'

## Load data

In [None]:
# Read the dataset from the Drive folder. Change the file name to match your data;
# the full path has the form /content/drive/MyDrive/FileName.csv.
df = pd.read_csv(f"{PATH}all_data_clean.csv")

## Run topic modeling and explore output

In [None]:
# Drop rows with NaN values in TEXT_COLUMN.
# Why: the modelling call converts this column with .astype(str), which would turn
# missing values into the literal string "nan" rather than skipping them.
df = df.dropna(subset=[TEXT_COLUMN])

In [None]:
# Perform topic modeling on the text column (cast to str so every document is a string).
# LANGUAGE from the configuration cell is passed through so that changing it there
# actually takes effect instead of silently falling back to the function default.
topics, topic_info, model = topic_modelling(
    df[TEXT_COLUMN].astype(str).tolist(),
    min_topic_size=15,
    language=LANGUAGE,
)


2025-06-16 12:52:43,277 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/228 [00:00<?, ?it/s]

2025-06-16 12:55:28,325 - BERTopic - Embedding - Completed ✓
2025-06-16 12:55:28,327 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-16 12:55:48,807 - BERTopic - Dimensionality - Completed ✓
2025-06-16 12:55:48,809 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-16 12:55:49,292 - BERTopic - Cluster - Completed ✓
2025-06-16 12:55:49,300 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-16 12:55:50,440 - BERTopic - Representation - Completed ✓


In [None]:
# Display the topic summary table returned by topic_modelling (rendered as this cell's output).
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1891,-1_wind_power_rt_energy,"[wind, power, rt, energy, wind power, wind ene...",[makes move cdu lower saxony surprising . advo...
1,0,403,0_germany_wind_energy_electricity,"[germany, wind, energy, electricity, power, wi...",[german wind energy association ( # bweev ) : ...
2,1,329,1_nuclear_power_nuclear power_energy,"[nuclear, power, nuclear power, energy, solar,...","[yes , # wind power cheaper # nuclear power . ..."
3,2,181,2_distance_rule_altmaier_distance rule,"[distance, rule, altmaier, distance rule, mini...",[best news day energy transition : altmaier wa...
4,3,145,3_alpine_alpine republic_energy alpine_republi...,"[alpine, alpine republic, energy alpine, repub...","[wind energy alpine republic austria, wind ene..."
5,4,107,4_offshore_offshore wind_onshore_onshore wind,"[offshore, offshore wind, onshore, onshore win...",[handelsblatt : renewables : orsted builds fir...
6,5,85,5_allowance_spd_citizen_participation,"[allowance, spd, citizen, participation, accep...",[roundup : acceptance wind power increase – wi...
7,6,77,6_climate_climate protection_protection_rt,"[climate, climate protection, protection, rt, ...",[rt @ vorwaerts : `` can # climate protection ...
8,7,73,7_species_conservation_species conservation_pr...,"[species, conservation, species conservation, ...","[combining wind energy species conservation -,..."
9,8,69,8_turbine_wind turbine_turbines_wind turbines,"[turbine, wind turbine, turbines, wind turbine...","[550 wind turbines paderborn district , one-th..."


In [None]:
# Re-fetch the topic summary from the fitted model. topic_modelling makes the same
# get_topic_info() call, so this only refreshes the variable.
topic_info = model.get_topic_info()

## Save topic modeling data

In [None]:
# Write the topic summary table to a CSV file in the Drive folder (row index omitted).
topic_info.to_csv(
    f"{PATH}unique_data_topics_info.csv",
    sep=",",
    encoding="utf-8",
    index=False,
)

In [None]:
# Attach each entry's topic number as a new "Topics" column, then save the
# annotated dataset to CSV (choose the file name as needed).
# NOTE(review): no index=False here, so the row index is written as the first
# column, unlike the topic-info export — confirm this is intended.
df["Topics"] = topics
df.to_csv(
    f"{PATH}unique_data_topics_list.csv",
    sep=",",
    encoding="utf-8",
)