<a href="https://colab.research.google.com/github/snavasg/NLP_Analysis/blob/main/Excercise1_code_Navas_Gomez.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 1: Data pre-processing

## Libraries and Downloads

In [None]:
# Libraries
# Import necessary libraries for the code
import csv
import pandas as pd
from googletrans import Translator  # Version used to install googletrans==4.0.0-rc1 (pip install googletrans==4.0.0-rc1)
import string
import nltk
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec

# Downloads
# Fetch the NLTK English stop-word list (read later through stopwords.words('english')).
nltk.download('stopwords')

# Fetch the Punkt tokenizer data (presumably what nltk.word_tokenize relies on — verify for the installed NLTK version).
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Function Definitions


In [None]:
###### Stop Words #######
# Function to remove common stop words from a given text.
def eliminar_stop_words(texto):
    """Return ``texto`` with English stop words removed.

    The text is split on whitespace, every word whose lowercase form is in
    NLTK's English stop-word list is dropped, and the surviving words are
    joined back together with single spaces.
    """
    vocabulario_vacio = set(stopwords.words('english'))
    conservadas = []
    for palabra in texto.split():
        # Match case-insensitively, but keep the word's original casing.
        if palabra.lower() in vocabulario_vacio:
            continue
        conservadas.append(palabra)
    return ' '.join(conservadas)

###### Special Characters #######
# Function to clean the text by removing special characters and punctuation.
def limpiar_texto(texto):
    """Return ``texto`` without any character listed in ``string.punctuation``.

    Every other character (letters, digits, whitespace, non-ASCII symbols)
    is kept in its original order.
    """
    signos = string.punctuation
    conservados = (simbolo for simbolo in texto if simbolo not in signos)
    return ''.join(conservados)

###### Tokenization ########
# Function to tokenize the input text into individual words.
def tokenizar_texto(texto):
    """Tokenize ``texto`` with NLTK's ``word_tokenize`` and return the result."""
    return nltk.word_tokenize(texto)

###### Vectorization ########
# Function to vectorize the given text data using Word2Vec.
# Parameters:
#   - texto_column: A list of tokenized sentences or text data. This input data will be used to train the Word2Vec model.

#   - vector_size: The dimensionality of the word vectors. This parameter determines the length of the word vectors and,
#     consequently, the amount of information they can capture. It's typically set to a value between 100 and 300, where higher
#     values can capture more information but may require more data.

############
# NOTE: vector_size defaults to 6 because longer vectors are very hard to read in the output .csv
############

#   - window: The maximum distance between a sentence's current and predicted word. It defines the context window size for
#     the model. Words outside this window are not considered for context. A smaller value, such as 5, captures the local context,
#     while a larger value, such as 10, captures the broader context.

#   - min_count: Ignores all words with a total frequency lower than this value. Setting it to 1 means that even infrequent
#     words are included in the model, while higher values filter out rare words. This parameter helps control the vocabulary size.

#   - sg: Training algorithm for Word2Vec. Use 0 for Continuous Bag of Words (CBOW) and 1 for Skip-gram. CBOW predicts the
#     target word based on its context words, while Skip-gram predicts context words given the target word. The choice between
#     CBOW and Skip-gram affects the focus of the word vectors.

def vectorizar_texto(texto_column, vector_size=6, window=5, min_count=1, sg=0):
    """Train a Word2Vec model on ``texto_column`` and return it.

    Parameters:
        texto_column: the tokenized sentences used as training data.
        vector_size: dimensionality of the word vectors.
        window: context window size.
        min_count: words with a total frequency below this are ignored.
        sg: training algorithm, 0 for CBOW and 1 for Skip-gram.

    Returns:
        The trained ``Word2Vec`` model.
    """
    hiperparametros = {
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'sg': sg,
    }
    return Word2Vec(texto_column, **hiperparametros)


## Load Data

In [None]:
# Open the CSV file with 'latin-1' encoding (ISO-8859-1).
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handled by the reader instead of being translated by open().
with open('/content/File1.csv', mode='r', encoding='latin-1', newline='') as file:
    # Configure the CSV reader with delimiter (comma) and quote character (double quotes)
    reader = csv.reader(file, delimiter=',', quotechar='"')
    # The first row holds the column names; it is consumed explicitly so the
    # DataFrame is built only from the remaining data rows.
    header = next(reader)
    df = pd.DataFrame(list(reader), columns=header)


### Data Pre-processing


In [None]:
# Lowercase every string cell of the DataFrame; non-string values pass through untouched
def _a_minusculas(valor):
    return valor.lower() if isinstance(valor, str) else valor


df = df.applymap(_a_minusculas)

# A single Translator instance is shared by all rows
translator = Translator()


# Translate one text to English and lowercase the translated result
def _traducir_a_ingles(text):
    resultado = translator.translate(text, dest='en')
    return resultado.text.lower()


# Translate the 'INDICATOR_NAME_RAW' column row by row
df['INDICATOR_NAME_RAW_ENGLISH'] = df['INDICATOR_NAME_RAW'].apply(_traducir_a_ingles)

In [None]:
# ORDER MATTERS: stop words are removed BEFORE punctuation is stripped.
# set(stopwords.words('english')) contains contractions such as "weren't", "shouldn't", "aren't".
# If the apostrophe were removed first they would become "werent", "shouldnt", "arent"
# and would no longer be recognized as stop words.

# Remove stop words from the translated text
df['INDICATOR_NAME_RAW_ENGLISH_CLEAN'] = df['INDICATOR_NAME_RAW_ENGLISH'].apply(eliminar_stop_words)
# Strip punctuation / special characters from the stop-word-free text
df['INDICATOR_NAME_RAW_ENGLISH_CLEAN'] = df['INDICATOR_NAME_RAW_ENGLISH_CLEAN'].apply(limpiar_texto)
# Tokenize the cleaned text
df['TOKENS'] = df['INDICATOR_NAME_RAW_ENGLISH_CLEAN'].apply(tokenizar_texto)
# Train Word2Vec on all token lists and keep its keyed word vectors
word_vectors = vectorizar_texto(df['TOKENS']).wv
# MODEL_EMBEDDINGS holds the keyed-vectors object itself; it cannot be displayed
# when exporting, so it is kept only for contextual purposes.
df['MODEL_EMBEDDINGS'] = word_vectors
# EMBEDDINGS_LIST holds, for each row, the vectors of that row's own tokens as plain
# lists (one inner list per token, in token order). The lookup is driven by the row's
# tokens: indexing the keyed vectors with a fixed integer such as 0 would return the
# vector of a single vocabulary entry, identical for every row.
# Tokens missing from the vocabulary are skipped; rows without tokens get [].
df['EMBEDDINGS_LIST'] = df['TOKENS'].apply(
    lambda tokens: [word_vectors[token].tolist() for token in tokens if token in word_vectors]
)



In [None]:
# Export the DataFrame to CSV without writing the index column
df.to_csv("Excercise1_output_Navas-Gomez.csv", index=False)  # When loading Excercise1_output_Navas-Gomez.csv, I suggest using the Python code below

In [None]:
# with open('/content/Excercise1_output_Navas-Gomez.csv', mode='r', encoding='latin-1') as file:
#     # Configure the CSV reader with delimiter (comma) and quote character (double quotes)
#     reader = csv.reader(file, delimiter=',', quotechar='"')
#     # Create a DataFrame from the CSV data and set column names
#     df = pd.DataFrame(reader, columns=next(reader))