##### Sentiment Analysis
This notebook leverages an NLP model pre-trained on English reviews. Reviews are first translated to English using `Narrativa/mbart-large-50-finetuned-opus-pt-en-translation`, along with language identification via `facebook/fasttext-language-identification1`, and then classified with `cardiffnlp/twitter-xlm-roberta-base-sentiment`.

In [None]:
# Libraries setup
import os
import re
import string
import unicodedata
import nltk
import torch

import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from dotenv import load_dotenv
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, MBartForConditionalGeneration, MBart50TokenizerFast

# Load environment variables (FPATH_DATA is read from the environment further down)
load_dotenv()

# Setup pytorch: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NLTK resources used by the preprocessing helpers:
#   - 'stopwords' backs nltk.corpus.stopwords
#   - 'punkt' / 'punkt_tab' back nltk.tokenize.word_tokenize
# Recent NLTK releases look up 'punkt_tab' instead of 'punkt'; older releases
# that do not know the resource are expected to just report a failed download.
# NOTE(review): confirm against the NLTK version pinned for this project.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Define paths
fpath_data = os.environ.get("FPATH_DATA")
if not fpath_data:
    # Fail early with an actionable message instead of a TypeError on None + str
    raise RuntimeError(
        "Environment variable FPATH_DATA is not set; "
        "point it at the directory that contains order_reviews.csv"
    )

# Load reviews data (os.path.join works with or without a trailing separator)
reviews = pd.read_csv(os.path.join(fpath_data, "order_reviews.csv"))

In [None]:
# Number of missing values per column
reviews.isna().sum()

In [None]:
# It looks like all reviews are from Brazil according to customers.csv

# Convert to datetime column review_creation_date
reviews['review_creation_date'] = pd.to_datetime(reviews['review_creation_date'])

# For the sake of simplicity, merge the review comment title and message.
# The ', ' separator is only inserted when BOTH parts are present, so a review
# with just a title or just a message does not get a dangling ', '.
review_title = reviews['review_comment_title'].fillna('').str.strip()
review_message = reviews['review_comment_message'].fillna('').str.strip()
separator = (review_title.ne('') & review_message.ne('')).map({True: ', ', False: ''})
reviews['review_concat'] = review_title + separator + review_message

# Drop reviews that have neither a title nor a message
reviews = reviews[reviews['review_concat'] != '']

# Keep only relevant columns; .copy() detaches the result from the filtered
# slice so later column assignments do not raise SettingWithCopyWarning
reviews = reviews[['review_id', 'order_id', 'review_concat', 'review_creation_date']].copy()

##### Preprocessing

In [None]:
# Let's check whether any reviews contain numbers - this will be helpful for the second task
def search_numbers(text):
    """Return True when ``text`` contains at least one digit, otherwise False."""
    digit_match = re.search(r'\d+', text)
    return digit_match is not None

# Boolean flag per review: does the merged title/message contain a digit?
reviews['contain_num'] = reviews['review_concat'].map(search_numbers)

In [None]:
# Built once when the cell runs instead of inside every call.
# NOTE: the character class is very broad. Taken together the ranges cover every
# code point from U+24C2 upwards plus a handful of single characters, so scripts
# such as CJK are removed as well. Latin letters with accents are not affected.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters and everything up to U+1F251
    "\U0001f926-\U0001f937"  # gesture emoji
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"          # female sign .. male sign
    "\u2600-\u2B55"          # misc symbols up to heavy large circle
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Remove emojis and other pictographic symbols from the text.

    Args:
        text: The value to clean. Only ``str`` instances are processed.

    Returns:
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        removed. Non-string inputs (e.g. ``None`` or a list of tokens) are
        returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub('', text)

In [None]:
def keep_only_ascii(text):
    """Strip every non-ASCII character from each token and drop emptied tokens.

    The previous implementation combined its ASCII test with a Unicode
    category test using ``or``; because every ASCII character already passes
    the category test, the ASCII check never had any effect and non-ASCII
    characters were kept. This version does what the name says.

    Args:
        text: An iterable of string tokens.

    Returns:
        A list with the ASCII-only version of each token, in the original
        order. Tokens that become empty are omitted.
    """
    ascii_tokens = (
        ''.join(char for char in token if char.isascii())
        for token in text
    )
    return [token for token in ascii_tokens if token]  # Remove any empty strings

In [None]:
# Stopword sets keyed by language, filled lazily so the NLTK word list is only
# loaded once per language instead of once per call (i.e. once per review).
_STOPWORD_SETS = {}


def remove_stopwords(tokens, lang):
    """Eliminate common stopwords from the tokenized text.

    Stop words are commonly used words like "the," "is," or "and" that don't
    carry much meaning and can be removed to reduce noise in the data.

    Args:
        tokens: Iterable of word tokens.
        lang: Language name passed to ``stopwords.words`` (e.g. ``'english'``).

    Returns:
        A list of the tokens that are not in the stopword list for ``lang``,
        in their original order. Matching is exact, so it is case-sensitive.
    """
    if lang not in _STOPWORD_SETS:
        _STOPWORD_SETS[lang] = frozenset(stopwords.words(lang))
    stop_words = _STOPWORD_SETS[lang]
    return [word for word in tokens if word not in stop_words]

In [None]:
def tokenize_text(text):
    """Break ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_punctuation(text):
    """Return ``text`` with all characters from ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

def remove_extra_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Tabs and newlines count as whitespace. Leading and trailing whitespace is
    collapsed to one space as well; it is not stripped.
    """
    # No IGNORECASE flag: the pattern contains no letters, so it had no effect
    return re.sub(r'\s+', ' ', text)

def remove_single_chars(text):
    """Remove single-letter words that are surrounded by whitespace.

    A whole run of consecutive single letters (e.g. ``" a b c "``) is matched
    at once. The previous pattern consumed the whitespace shared by two
    neighbouring letters, so every second letter in such a run was left behind.

    Each matched run, including its surrounding whitespace, is replaced by one
    space. Single letters at the very start or end of ``text`` are not touched
    because they have no whitespace on both sides.
    """
    return re.sub(r'(?:\s+[a-zA-Z])+\s+', ' ', text)

def remove_special_chars(text):
    """Replace each non-word character (anything matching ``\\W``) with a space."""
    non_word_char = re.compile(r'\W')
    return non_word_char.sub(' ', text)

def remove_not_alphabetical(text):
    """Delete everything except ASCII letters and whitespace from ``text``."""
    disallowed = re.compile(r'[^a-zA-Z\s]')
    return disallowed.sub('', text)

In [None]:
# Final preprocessing pipeline; some of the helper functions defined above are not used here
def preprocess_en(text):
    """Turn a (translated) English review into a list of cleaned tokens.

    Steps, in order: lowercase, remove emojis, collapse whitespace, remove
    punctuation, tokenize, drop English stopwords, then apply
    ``keep_only_ascii`` to the tokens.

    Emoji removal runs BEFORE tokenization because ``remove_emojis`` only
    processes ``str`` input; calling it on the token list (as was done
    previously) silently did nothing.

    Args:
        text: The review text. Non-string values are converted with ``str()``.

    Returns:
        The list of tokens produced by the steps above.
    """
    text = str(text).lower()  # To lower
    text = remove_emojis(text)
    text = remove_extra_whitespaces(text)
    # Digits are kept on purpose; remove_numbers is not applied
    text = remove_punctuation(text)
    tokens = tokenize_text(text)
    tokens = remove_stopwords(tokens, 'english')
    tokens = keep_only_ascii(tokens)
    return tokens

##### Translation to English

In [None]:
model_name = "Narrativa/mbart-large-50-finetuned-opus-pt-en-translation"

# Tokenizer for the translation checkpoint; the source language is fixed to Portuguese.
# Further should be expanded with facebook/fasttext-language-identification to identify language instead of hardcoding PT
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = 'pt_XX'

# Translation model, moved to the selected device (GPU when available)
model = MBartForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

In [None]:
def translate_pt_to_en(text):
    """Translate a Portuguese text to English with the loaded mBART model.

    Uses the module-level ``tokenizer`` and ``model``. Inputs are moved to
    whatever device the model lives on, so this also works on CPU-only
    machines (previously 'cuda' was hard-coded).

    NOTE: ``model`` and ``tokenizer`` are rebound to the sentiment model
    further down in the notebook, so this function must be run before that
    cell is executed.

    Args:
        text: The Portuguese source text.

    Returns:
        The decoded English translation with special tokens removed.
    """
    target_device = model.device
    inputs = tokenizer(text, return_tensors='pt')
    input_ids = inputs.input_ids.to(target_device)
    attention_mask = inputs.attention_mask.to(target_device)
    # Force the decoder to start with the English language token
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        forced_bos_token_id=tokenizer.lang_code_to_id['en_XX'],
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Register tqdm's progress_apply on pandas objects, then translate review by review
tqdm.pandas(desc="Translating")
reviews['reviews_en'] = reviews['review_concat'].progress_apply(translate_pt_to_en)

# Preview only the first rows instead of displaying the whole frame
reviews.head()

##### Preprocessing of English reviews
Step applied after the translation

In [None]:
# Token list per translated review
reviews['reviews_en_preprocessed'] = reviews['reviews_en'].map(preprocess_en)
# Same tokens joined back into one space-separated string
reviews['reviews_en_preprocessed_str'] = reviews['reviews_en_preprocessed'].map(' '.join)

In [None]:
# Save the frame, including the translated and preprocessed columns, to
# reviews_en.csv in the current working directory. With the default to_csv
# settings the DataFrame index is written as the first column.
reviews.to_csv('reviews_en.csv')

##### Word cloud

In [None]:
# Join all preprocessed reviews into one text and build a word cloud from it
reviews_aio = ' '.join(reviews['reviews_en_preprocessed_str'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(reviews_aio)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Translated Reviews Word Cloud')
plt.show()

In [None]:
# Character length of each translated review
reviews['review_length'] = reviews['reviews_en'].str.len()

# Histogram of review length
fig, ax = plt.subplots(figsize=(15, 6))
ax.hist(reviews['review_length'], bins=20, edgecolor='black')
ax.set(
    title='Histogram of a Review String Length',
    xlabel='Review String Length',
    ylabel='Frequency',
)
plt.show()

##### Sentiment analysis
This section uses the `ramonmedeiro1/bertimbau-products-reviews-pt-br` pre-trained model from Hugging Face to classify the reviews.

In [None]:
model_name = "ramonmedeiro1/bertimbau-products-reviews-pt-br"

# NOTE: `model` and `tokenizer` are rebound here, replacing the translation
# model loaded earlier; translate_pt_to_en must not be called after this cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

In [None]:
# Label for each class index returned by the classifier.
# NOTE(review): assumes the model has 5 outputs in exactly this order - confirm
# against the model card / model.config.id2label.
sentiment_classes = ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']

def get_sentiment(text):
    """Classify ``text`` and return its label from ``sentiment_classes``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input
    is truncated to at most 256 tokens.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=256, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probabilities = torch.nn.functional.softmax(logits, dim=-1)
    class_index = torch.argmax(class_probabilities, dim=1).item()

    return sentiment_classes[class_index]

In [None]:
# The column 'reviews_processed_str' is never created in this notebook (KeyError);
# the preprocessed text lives in 'reviews_en_preprocessed_str'.
# NOTE(review): the loaded classifier is a pt-br model, so the original Portuguese
# text in 'review_concat' may be the better input - confirm the intended column.
reviews['sentiment'] = reviews['reviews_en_preprocessed_str'].apply(get_sentiment)
reviews.head()

##### Results visualization

In [None]:
# Index by creation date so the reviews can be grouped by month ('ME' = month end)
results = reviews.set_index('review_creation_date')
monthly_counts = results.resample('ME')['sentiment'].value_counts()
# One column per sentiment label; months without a given label count as 0
sentiment_over_time = monthly_counts.unstack().fillna(0)

fig, ax = plt.subplots(figsize=(15, 8))
for sentiment in sentiment_over_time.columns:
    ax.plot(sentiment_over_time.index, sentiment_over_time[sentiment], label=sentiment, marker='o')

ax.set_title('Sentiment of Reviews by Year Month')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)

In [None]:
# WordCloud for a very positive sentiment
reviews_aio = ' '.join(reviews[reviews['sentiment']=='Very Positive']['reviews_processed_str'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(reviews_aio)
plt.figure(figsize=(12,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Very Positive Reviews Word Cloud')
plt.show()

In [None]:
# WordCloud for a very negative sentiment
reviews_aio = ' '.join(reviews[reviews['sentiment']=='Very Negative']['reviews_processed_str'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(reviews_aio)
plt.figure(figsize=(12,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Very Negative Reviews Word Cloud')
plt.show()