In [17]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample


In [2]:
import kagglehub

path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")
print("Path to dataset files:", path)

customers = pd.read_csv(f"{path}/olist_customers_dataset.csv")
orders = pd.read_csv(f"{path}/olist_orders_dataset.csv")
order_items = pd.read_csv(f"{path}/olist_order_items_dataset.csv")
payments = pd.read_csv(f"{path}/olist_order_payments_dataset.csv")
reviews = pd.read_csv(f"{path}/olist_order_reviews_dataset.csv")
sellers = pd.read_csv(f"{path}/olist_sellers_dataset.csv")

Using Colab cache for faster access to the 'brazilian-ecommerce' dataset.
Path to dataset files: /kaggle/input/brazilian-ecommerce


In [3]:
reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [4]:
# Step 1: Create sentiment labels
def label_sentiment(score):
    if score <= 2:
        return "negative"
    elif score == 3:
        return "neutral"
    else:
        return "positive"

reviews['sentiment'] = reviews['review_score'].apply(label_sentiment)
reviews['review_comment_message'] = reviews['review_comment_message'].fillna("no_comment")




In [18]:
# Split classes
df_pos = reviews[reviews['sentiment'] == 'positive']
df_neg = reviews[reviews['sentiment'] == 'negative']
df_neu = reviews[reviews['sentiment'] == 'neutral']

In [19]:
# Downsample positives to match negatives (you can also use len(df_neu))
df_pos_downsampled = resample(
    df_pos,
    replace=False,
    n_samples=len(df_neg),
    random_state=42
)

In [21]:
#  Combine into balanced dataset
reviews_balanced = pd.concat([df_pos_downsampled, df_neg, df_neu])

print("Original distribution:\n", reviews['sentiment'].value_counts())
print("\nBalanced distribution:\n", reviews_balanced['sentiment'].value_counts())

Original distribution:
 sentiment
positive    76470
negative    14575
neutral      8179
Name: count, dtype: int64

Balanced distribution:
 sentiment
positive    14575
negative    14575
neutral      8179
Name: count, dtype: int64


In [22]:
reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,sentiment,clean_text
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,no_comment,2018-01-18 00:00:00,2018-01-18 21:46:59,positive,nocomment
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,no_comment,2018-03-10 00:00:00,2018-03-11 03:05:13,positive,nocomment
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,no_comment,2018-02-17 00:00:00,2018-02-18 14:36:24,positive,nocomment
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,positive,recebi bem antes do prazo estipulado
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,positive,parabns lojas lannister adorei comprar pela in...


In [23]:
# Step 2: Preprocess text
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove non-letters
    return text

reviews['clean_text'] = reviews['review_comment_message'].astype(str).apply(clean_text)

In [7]:
reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,sentiment,clean_text
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,no_comment,2018-01-18 00:00:00,2018-01-18 21:46:59,positive,nocomment
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,no_comment,2018-03-10 00:00:00,2018-03-11 03:05:13,positive,nocomment
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,no_comment,2018-02-17 00:00:00,2018-02-18 14:36:24,positive,nocomment
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,positive,recebi bem antes do prazo estipulado
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,positive,parabns lojas lannister adorei comprar pela in...


In [24]:
# Step 3: TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(reviews_balanced['clean_text'])
y = reviews_balanced['sentiment']

In [25]:
# Step 4: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [26]:
# Step 5: Train Model
model = LogisticRegression(max_iter=200, class_weight="balanced",
    solver="liblinear")
model.fit(X_train, y_train)

In [27]:
# Step 6: Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.79      0.69      0.73      2915
     neutral       0.45      0.13      0.20      1636
    positive       0.59      0.90      0.72      2915

    accuracy                           0.65      7466
   macro avg       0.61      0.57      0.55      7466
weighted avg       0.64      0.65      0.61      7466

