In [None]:
# Import libraries
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from unidecode import unidecode

In [None]:
# Read the input CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='./df_pre_process_I.csv')

In [None]:
# Cast the three text columns to str so they can be concatenated safely.
df['title'] = df['title'].astype(str)
df['content'] = df['content'].astype(str)
df['answers'] = df['answers'].astype(str)

full_text_col = 'full_text'
required_columns = ["title", "content", "answers", full_text_col]
# Join the columns with a space between EVERY pair. Without the separator
# between `content` and `answers`, the last word of the content fuses with
# the first characters of the answers into a single token, which the
# whole-token stop-word filter applied later can no longer recognise.
df[full_text_col] = df["title"] + " " + df["content"] + " " + df["answers"]

###############

# Make sure the NLTK stop-word corpus is available before reading it
nltk.download('stopwords')

# Portuguese stop words from NLTK, extended with extra terms to discard
stop_words = set(stopwords.words('portuguese')) | {
    "pra", "etc", "none", "vai", "ter", "nan", "user", "author", "title",
    "none", "name", "score", "content", "down", "votes", "created",
    "comments", "comment", "answercontent", "vote", "type", "points", "aqui",
    "pode", "sobre", "fazer", "alguem", "tudo", "regular", "coisa", "bem",
    "vou", "sei", "boca", "algum", "alguns", "alguma", "algo", "nada", "bom",
    "entao", "acho", "quer", "the", "and", "you", "cara", "coisas", "sim",
    "ainda", "ver", "usar", "assim", "index",
}

# Transliterate every stop word to its unaccented form so the set matches
# text that has gone through unidecode as well
stop_words_without_accents = set(map(unidecode, stop_words))
stop_words = stop_words_without_accents

# Letter runs that disqualify a token: 5 or more consecutive consonants,
# or 6 or more consecutive vowels. Tokens containing them are discarded.
_CONSONANT_RUN = re.compile(r'[bcdfghjklmnpqrstvwxyz]{5,}')
_VOWEL_RUN = re.compile(r'[aeiou]{6,}')


def preprocess_text(text):
    """Normalize one raw document into a space-separated string of tokens.

    Steps, in order:
      1. transliterate accented characters to ASCII and lowercase;
      2. replace literal backslash-n sequences with a space;
      3. strip HTML tags;
      4. remove .onion links and http(s) URLs;
      5. remove ``'title': ...,``, ``'name': ...,``, ``'type': ...,`` and
         ``'author': ...,`` fragments (everything up to the next comma);
      6. remove runs of two or more ``k`` together with the rest of the token;
      7. replace every non-letter character with a space;
      8. drop tokens that are stop words, have two characters or fewer, or
         contain a run of 5+ consonants or 6+ vowels.

    Relies on the module-level ``stop_words`` set.

    Args:
        text (str): Raw text of one document.

    Returns:
        str: Remaining tokens joined by single spaces; the empty string when
        no token survives.
    """
    # Replace characters with accents with their unaccented equivalents
    text = unidecode(text)
    # Convert the text to lowercase
    text = text.lower()
    # Replace literal "\n" sequences (backslash + n) with a white space
    text = re.sub(r'\s*\\n\s*', ' ', text)
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Remove .onion links
    text = re.sub(r'\S*\.onion\S*', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove the titles that appear in answers
    text = re.sub(r"'title': [^,]*,", ' ', text)
    # Remove the usernames that appear in answers
    text = re.sub(r"'name': [^,]*,", ' ', text)
    # Remove the types that appear in answers
    text = re.sub(r"'type': [^,]*,", ' ', text)
    # Remove the authors that appear in answers
    text = re.sub(r"'author': [^,]*,", ' ', text)
    # Remove sequences of kkkk
    text = re.sub(r'k{2,}\S*', ' ', text)

    # Replace non-letter characters with white spaces
    text = re.sub(r'[^A-Za-z]+', ' ', text)

    # split() already discards leading/trailing/repeated whitespace, so no
    # separate whitespace-normalisation pass is needed.
    # The consonant/vowel check is done per token so that the WHOLE term is
    # removed; deleting only the offending run would leave meaningless
    # fragments of the word behind as tokens. The character classes contain
    # letters only, so checking after the non-letter split finds the same runs.
    words = [
        word
        for word in text.split()
        if len(word) > 2
        and word not in stop_words
        and not _CONSONANT_RUN.search(word)
        and not _VOWEL_RUN.search(word)
    ]
    return ' '.join(words)


In [None]:
# Clean every document of the combined text column, element by element
df[full_text_col] = df[full_text_col].map(preprocess_text)

In [None]:
# Show the processed text of the row labelled 2 as a quick sanity check
df.loc[2, full_text_col]

In [None]:
# Exclude rows whose processed text is null OR empty.
# preprocess_text always returns a string, so a document that lost all of
# its tokens is stored as "" rather than as a missing value; dropna() alone
# would therefore never remove it.
df = df[df[full_text_col].notna() & (df[full_text_col] != "")]

# Reset the index
df = df.reset_index(drop=True)

In [None]:
# Persist the processed DataFrame as CSV, without the index column
df.to_csv(path_or_buf='df_pre_process_II.csv', index=False)

# Persist the same data as an Excel workbook, without the index column
df.to_excel(excel_writer='df_pre_process_II.xlsx', index=False)