# Explorer Models

## Import libs

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from gensim.models import KeyedVectors

## Load product reviews preprocessing

In [3]:
df_reviews = pd.read_csv('./data/product_reviews_preprocessed.csv')
df_reviews.head()

Unnamed: 0,review_score,review_comment_message,review_creation_date,label,processed_review_comment
0,5,"Só achei ela pequena pra seis xícaras ,mais é ...",2017-08-08 00:00:00,1,"achar pequeno pra seis xícara , bom produto"
1,5,Entrega antes da data marcada. Excelente,2018-06-20 00:00:00,1,entregar antes data marcar . excelente
2,5,estou satisfeito,2018-08-15 00:00:00,1,satisfeito
3,5,Mais uma ve satisfeito,2018-05-09 00:00:00,1,ve satisfeito
4,5,"Muito boa a compra, dentro do prazo.",2017-12-08 00:00:00,1,"bom compra , dentro prazo ."


## BoW, TF-IDF and Word2Vec

Functions that return values from textual representations in BoW, TF-IDF and Word2Vec

In [4]:
def bow_representation(df):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['processed_review_comment'])
    y = df['label']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def tfidf_representation(df):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['processed_review_comment'])
    y = df['label']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def word2vec_representation(df, model_path):
    model = KeyedVectors.load_word2vec_format(model_path, unicode_errors='ignore')

    def get_vector(text):
        words = text.split()
        word_vectors = [model[word] for word in words if word in model]
        if len(word_vectors) == 0:
            return np.zeros(50)
        return np.mean(word_vectors, axis=0)

    X = df['processed_review_comment'].apply(get_vector)
    y = df['label']
    return train_test_split(np.vstack(X.values), y, test_size=0.2, random_state=42)


## Models Machine Learning 
Logistic Regression, Naive Bayes and SVM

In [33]:
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, representation_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Modelo: {model_name} - Representação: {representation_name}\nAcurácia: {accuracy}\nRelatório de Classificação:\n{report}\n")

# Modelos a serem usados
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced'),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced')
}

for model_name, model in models.items():
    # Bag-of-Words
    X_train, X_test, y_train, y_test = bow_representation(df_reviews)
    train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Bag-of-Words")

    # TF-IDF
    X_train, X_test, y_train, y_test = tfidf_representation(df_reviews)
    train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "TF-IDF")

    # Word2Vec (excluindo Naive Bayes)
    if model_name != 'Naive Bayes':
        X_train, X_test, y_train, y_test = word2vec_representation(df_reviews, './data/cbow_s50.txt')
        train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Word2Vec")


Modelo: Logistic Regression - Representação: Bag-of-Words
Acurácia: 0.9526074700493306
Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1644
           1       0.97      0.96      0.97      4032

    accuracy                           0.95      5676
   macro avg       0.94      0.95      0.94      5676
weighted avg       0.95      0.95      0.95      5676


Modelo: Logistic Regression - Representação: TF-IDF
Acurácia: 0.9506694855532065
Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1644
           1       0.98      0.95      0.96      4032

    accuracy                           0.95      5676
   macro avg       0.93      0.95      0.94      5676
weighted avg       0.95      0.95      0.95      5676


Modelo: Logistic Regression - Representação: Word2Vec
Acurácia: 0.8537702607470049
Relatório de Classificação:
         

In [6]:
# Modelos a serem usados
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced'),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced')
}

for model_name, model in models.items():
    # Bag-of-Words
    X_train, X_test, y_train, y_test = bow_representation(df_reviews)
    train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Bag-of-Words")

    # TF-IDF
    X_train, X_test, y_train, y_test = tfidf_representation(df_reviews)
    train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "TF-IDF")

    # Word2Vec (excluindo Naive Bayes)
    if model_name != 'Naive Bayes':
        X_train, X_test, y_train, y_test = word2vec_representation(df_reviews, './data/cbow_s50.txt')
        train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Word2Vec")

def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, representation_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    print(f"Modelo: {model_name} - Representação: {representation_name}\nAcurácia: {accuracy}\nRelatório de Classificação:\n{report_dict}\n")
    
    # Armazenando as métricas em um dicionário
    metrics = {
        'Modelo': model_name,
        'Representação': representation_name,
        'Acurácia': accuracy,
        'Precision 0': report_dict['0']['precision'],
        'Recall 0': report_dict['0']['recall'],
        'F1-Score 0': report_dict['0']['f1-score'],
        'Precision 1': report_dict['1']['precision'],
        'Recall 1': report_dict['1']['recall'],
        'F1-Score 1': report_dict['1']['f1-score']
    }
    return metrics

# Lista para armazenar os resultados
results = []



for model_name, model in models.items():
    # Repita para cada representação como Bag-of-Words, TF-IDF, Word2Vec
    # Exemplo para Bag-of-Words
    X_train, X_test, y_train, y_test = bow_representation(df_reviews)
    result = train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Bag-of-Words")

    # TF-IDF
    X_train, X_test, y_train, y_test = tfidf_representation(df_reviews)
    result = train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "TF-IDF")

    # Word2Vec (excluindo Naive Bayes)
    if model_name != 'Naive Bayes':
        X_train, X_test, y_train, y_test = word2vec_representation(df_reviews, './data/cbow_s50.txt')
        result = train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Word2Vec")

    results.append(result)

# Convertendo a lista de resultados em um DataFrame
results_df = pd.DataFrame(results)
display(results_df)

Modelo: Logistic Regression - Representação: Bag-of-Words
Acurácia: 0.9526074700493306
Relatório de Classificação:
{'0': {'precision': 0.9109384339509863, 'recall': 0.927007299270073, 'f1-score': 0.9189026228519745, 'support': 1644.0}, '1': {'precision': 0.9700224831376467, 'recall': 0.9630456349206349, 'f1-score': 0.9665214685749844, 'support': 4032.0}, 'accuracy': 0.9526074700493306, 'macro avg': {'precision': 0.9404804585443165, 'recall': 0.9450264670953539, 'f1-score': 0.9427120457134794, 'support': 5676.0}, 'weighted avg': {'precision': 0.9529093441554639, 'recall': 0.9526074700493306, 'f1-score': 0.952729117911026, 'support': 5676.0}}

Modelo: Logistic Regression - Representação: TF-IDF
Acurácia: 0.9506694855532065
Relatório de Classificação:
{'0': {'precision': 0.8870601589103292, 'recall': 0.9507299270072993, 'f1-score': 0.9177921315325894, 'support': 1644.0}, '1': {'precision': 0.9793050587634133, 'recall': 0.9506448412698413, 'f1-score': 0.9647621444752076, 'support': 4032.0}

In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, representation_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    print(f"Model: {model_name} - Representation: {representation_name}\nAccuracy: {accuracy}\nClassification Report:\n{report_dict}\n")
    
    # Creating metric rows for each class
    metrics_0 = {
        'Model': model_name,
        'Representation': representation_name,
        'Class': 0,
        'Accuracy': accuracy,
        'Precision': report_dict['0']['precision'],
        'Recall': report_dict['0']['recall'],
        'F1-Score': report_dict['0']['f1-score']
    }

    metrics_1 = {
        'Model': model_name,
        'Representation': representation_name,
        'Class': 1,
        'Accuracy': accuracy,
        'Precision': report_dict['1']['precision'],
        'Recall': report_dict['1']['recall'],
        'F1-Score': report_dict['1']['f1-score']
    }

    return metrics_0, metrics_1

# List to store the results
results = []

for model_name, model in models.items():
    # Repeat for each representation like Bag-of-Words, TF-IDF, Word2Vec
    # Example for Bag-of-Words
    X_train, X_test, y_train, y_test = bow_representation(df_reviews)
    result_0, result_1 = train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Bag-of-Words")
    results.extend([result_0, result_1])

    # TF-IDF
    X_train, X_test, y_train, y_test = tfidf_representation(df_reviews)
    result_0, result_1 = train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "TF-IDF")
    results.extend([result_0, result_1])

    # Word2Vec (excluding Naive Bayes)
    if model_name != 'Naive Bayes':
        X_train, X_test, y_train, y_test = word2vec_representation(df_reviews, './data/cbow_s50.txt')
        result_0, result_1 = train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Word2Vec")
        results.extend([result_0, result_1])

# Converting the list of results into a DataFrame
results_df = pd.DataFrame(results)
display(results_df)

Model: Logistic Regression - Representation: Bag-of-Words
Accuracy: 0.9526074700493306
Classification Report:
{'0': {'precision': 0.9109384339509863, 'recall': 0.927007299270073, 'f1-score': 0.9189026228519745, 'support': 1644.0}, '1': {'precision': 0.9700224831376467, 'recall': 0.9630456349206349, 'f1-score': 0.9665214685749844, 'support': 4032.0}, 'accuracy': 0.9526074700493306, 'macro avg': {'precision': 0.9404804585443165, 'recall': 0.9450264670953539, 'f1-score': 0.9427120457134794, 'support': 5676.0}, 'weighted avg': {'precision': 0.9529093441554639, 'recall': 0.9526074700493306, 'f1-score': 0.952729117911026, 'support': 5676.0}}

Model: Logistic Regression - Representation: TF-IDF
Accuracy: 0.9506694855532065
Classification Report:
{'0': {'precision': 0.8870601589103292, 'recall': 0.9507299270072993, 'f1-score': 0.9177921315325894, 'support': 1644.0}, '1': {'precision': 0.9793050587634133, 'recall': 0.9506448412698413, 'f1-score': 0.9647621444752076, 'support': 4032.0}, 'accurac

Unnamed: 0,Model,Representation,Class,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,Bag-of-Words,0,0.952607,0.910938,0.927007,0.918903
1,Logistic Regression,Bag-of-Words,1,0.952607,0.970022,0.963046,0.966521
2,Logistic Regression,TF-IDF,0,0.950669,0.88706,0.95073,0.917792
3,Logistic Regression,TF-IDF,1,0.950669,0.979305,0.950645,0.964762
4,Logistic Regression,Word2Vec,0,0.85377,0.694737,0.883212,0.777718
5,Logistic Regression,Word2Vec,1,0.85377,0.946458,0.841766,0.891048
6,Naive Bayes,Bag-of-Words,0,0.945913,0.898154,0.917275,0.907614
7,Naive Bayes,Bag-of-Words,1,0.945913,0.965974,0.957589,0.961764
8,Naive Bayes,TF-IDF,0,0.936751,0.912123,0.864964,0.887918
9,Naive Bayes,TF-IDF,1,0.936751,0.946077,0.966022,0.955946


In [10]:
pivot_df = results_df.pivot_table(index=['Model', 'Representation'], columns='Class', values=['Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Renaming columns for clarity
pivot_df.columns = [f'{col[0]} {col[1]} ({["Negative","Positive"][col[1]]}' for col in pivot_df.columns]

display(pivot_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy 0 (Negative,Accuracy 1 (Positive,F1-Score 0 (Negative,F1-Score 1 (Positive,Precision 0 (Negative,Precision 1 (Positive,Recall 0 (Negative,Recall 1 (Positive
Model,Representation,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Logistic Regression,Bag-of-Words,0.952607,0.952607,0.918903,0.966521,0.910938,0.970022,0.927007,0.963046
Logistic Regression,TF-IDF,0.950669,0.950669,0.917792,0.964762,0.88706,0.979305,0.95073,0.950645
Logistic Regression,Word2Vec,0.85377,0.85377,0.777718,0.891048,0.694737,0.946458,0.883212,0.841766
Naive Bayes,Bag-of-Words,0.945913,0.945913,0.907614,0.961764,0.898154,0.965974,0.917275,0.957589
Naive Bayes,TF-IDF,0.936751,0.936751,0.887918,0.955946,0.912123,0.946077,0.864964,0.966022
Random Forest,Bag-of-Words,0.946617,0.946617,0.907594,0.962467,0.910092,0.961396,0.905109,0.963542
Random Forest,TF-IDF,0.945208,0.945208,0.905786,0.961371,0.902233,0.962926,0.909367,0.959821
Random Forest,Word2Vec,0.87315,0.87315,0.758875,0.913937,0.844262,0.882095,0.689173,0.948165
SVM,Bag-of-Words,0.946441,0.946441,0.910641,0.961761,0.881115,0.975753,0.942214,0.948165
SVM,TF-IDF,0.956307,0.956307,0.926103,0.968984,0.90771,0.977296,0.945255,0.960813
