<a href="https://colab.research.google.com/github/snavasg/NLP_Analysis/blob/main/Excercise2_code_Navas_Gomez.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 2: Text clustering

## Libraries and Downloads

In [1]:
# Libraries
# Import necessary libraries for the code
import csv
import pandas as pd
from bertopic import BERTopic # pip install bertopic
import string
import nltk
from nltk.corpus import stopwords

# Download
# Fetch NLTK's English-capable stop-word corpus so `stopwords.words(...)` can be
# used by the cells below. NLTK reports "already up-to-date" and skips the
# transfer when the package is present locally, so re-running this cell is cheap.
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Helper Function Definitions


In [2]:
###### Stop Words #######
# Stop-word sets already loaded from NLTK, keyed by language name. The function
# below is applied once per DataFrame row, so the corpus is read a single time
# per language instead of on every call.
_STOP_WORDS_CACHE = {}


def eliminar_stop_words(texto, stop_words=None, idioma='english'):
    """Remove stop words from a text.

    The text is split on whitespace, every word whose lowercase form is a stop
    word is dropped, and the remaining words are joined back with single spaces.
    Punctuation is not stripped, so a token such as "the," is kept.

    Args:
        texto: The input string.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the NLTK stop-word list for `idioma` is used.
        idioma: NLTK language name used when `stop_words` is not given.
            Defaults to 'english', which matches the previous behaviour.

    Returns:
        The text without stop words; an empty string if nothing remains.
    """
    if stop_words is None:
        if idioma not in _STOP_WORDS_CACHE:
            # frozenset: constant-time membership tests and no accidental mutation.
            _STOP_WORDS_CACHE[idioma] = frozenset(stopwords.words(idioma))
        stop_words = _STOP_WORDS_CACHE[idioma]

    palabras_filtradas = [
        palabra for palabra in texto.split() if palabra.lower() not in stop_words
    ]
    return ' '.join(palabras_filtradas)


## Load Data

In [3]:
# Location of the input CSV.
DATA_PATH = '/content/File2.csv'


def _a_minusculas(valor):
    """Return `valor` lowercased when it is a string; any other value unchanged."""
    return valor.lower() if isinstance(valor, str) else valor


# Open the CSV file as UTF-8. `newline=''` is what the csv module documentation
# requires so that line breaks embedded inside quoted fields are parsed correctly.
with open(DATA_PATH, mode='r', encoding='utf-8', newline='') as file:
    # Configure the CSV reader with delimiter (comma) and quote character (double quotes)
    reader = csv.reader(file, delimiter=',', quotechar='"')
    # The first row holds the column names; an empty file has none, so fail with
    # a clear message instead of a bare StopIteration.
    header = next(reader, None)
    if header is None:
        raise ValueError(f"{DATA_PATH} is empty: no header row found")
    # Every remaining row becomes a DataFrame row; all values are read as strings.
    df = pd.DataFrame(list(reader), columns=header)

# Convert all text data to lowercase while preserving non-string values.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# checking the class keeps the cell working on older pandas versions too.
_aplicar_por_celda = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = _aplicar_por_celda(_a_minusculas)

# Apply the function to remove stop words to the 'OBJECTIVE_NAME' column
df['OBJECTIVE_NAME'] = df['OBJECTIVE_NAME'].apply(eliminar_stop_words)

In [4]:
# BERTopic usually copes well with stop words left in the text. They are removed
# here anyway: the objectives are very short sentences about narrow, specific
# subjects, where stop words tend to add more noise than signal.
#
# NOTE(review): no random seed is configured for the model, so topic IDs and
# counts may differ between runs - confirm against the BERTopic documentation.

# Documents to cluster: the 'OBJECTIVE_NAME' column of the DataFrame.
OBJECTIVE_NAME = df['OBJECTIVE_NAME']

# Default-configured BERTopic model.
topic_model = BERTopic()

# Fit the model and cluster the objectives in one step; the call returns two
# values, kept as `topics` and `probs`.
topics, probs = topic_model.fit_transform(OBJECTIVE_NAME)

# Show the per-topic summary table as the cell output.
topic_info = topic_model.get_topic_info()
topic_info



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,85,0_water_drinking_improved_service,"[water, drinking, improved, service, quality, ...",[expanded improved drinking water service cove...
1,1,83,1_sanitation_coverage_wastewater_treatment,"[sanitation, coverage, wastewater, treatment, ...",[expanded improved sanitation service coverage...
2,2,13,2_waste_disposal_solid_collection,"[waste, disposal, solid, collection, urban, la...","[increase adequate disposal urban solid waste,..."


In [5]:
# Automatic labels
# Ask BERTopic for a label per topic built from 3 words joined by " - ", without
# the topic prefix, and register those labels on the model.
topic_labels = topic_model.generate_topic_labels(
    nr_words=3,
    topic_prefix=False,
    separator=" - ",
)
topic_model.set_topic_labels(topic_labels)

# Hand-written labels
# Human-readable names keyed by topic ID; passing the mapping to the model
# sets the labels for topics 0, 1 and 2.
custom_topic_names = {
    0: "Drinking Water and Improved Service",
    1: "Sanitation, Coverage, and Wastewater Treatment",
    2: "Waste Solid Disposal and Collection",
}
topic_model.set_topic_labels(custom_topic_names)

In [6]:
# Topic similarity heatmap
# Build the heatmap figure that shows how similar the topics are to one another;
# leaving the figure as the last expression renders it as the cell output.
heatmap_fig = topic_model.visualize_heatmap()
heatmap_fig

In [7]:
# Document plot with custom labels
# Plot the objectives for topics 0, 1 and 2 only, labelling each topic with the
# custom label currently set on the model.
selected_topics = [0, 1, 2]
documents_fig = topic_model.visualize_documents(
    OBJECTIVE_NAME,
    topics=selected_topics,
    custom_labels=True,
)
documents_fig


In [None]:
# The similarity heatmap and the cluster plot show that the topics are not similar
# enough to justify merging them, so the number of topics does not need to be reduced.

In [8]:
# Create DataFrames for Topics and Objectives
# 'df_model' pairs each objective with the topic BERTopic assigned to it.
df_model = pd.DataFrame({"Topics": topics, "OBJECTIVE_NAME": OBJECTIVE_NAME})
# 'model_topic_names' maps every topic ID to its custom topic name. The topic
# table is fetched once and reused for both columns.
topic_info = topic_model.get_topic_info()
model_topic_names = pd.DataFrame({'Topics': topic_info['Topic'], 'Topic Name': topic_info['CustomName']})

# Attach the topic of each document to its original row
# `topics` comes from fitting the model on df['OBJECTIVE_NAME'], so it holds one
# entry per row of 'df' in row order. Assign it by position: joining 'df' and
# 'df_model' on the OBJECTIVE_NAME text instead is a many-to-many merge, and an
# objective that appears k times would produce k * k rows in the output.
if len(topics) != len(df):
    raise ValueError(
        f"Expected one topic per row of df, got {len(topics)} topics for {len(df)} rows"
    )
df_merged = df.assign(Topics=topics)

# Add the custom topic names (left join keeps every row of 'df_merged').
# validate='many_to_one' makes pandas raise if a topic ID is ever listed twice
# in 'model_topic_names', which would otherwise duplicate rows silently.
df_merged = pd.merge(df_merged, model_topic_names, on='Topics', how='left', validate='many_to_one')



In [9]:
# Write the labelled objectives to a CSV file, leaving out the DataFrame index.
OUTPUT_CSV = "Excercise2_output_Navas-Gomez.csv"
df_merged.to_csv(OUTPUT_CSV, index=False)