# Explorer Models

## Import libs

In [28]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from gensim.models import KeyedVectors

## Load product reviews preprocessing

In [15]:
df_reviews = pd.read_csv('./data/product_reviews_preprocessed.csv')
df_reviews.head()

Unnamed: 0,review_score,review_comment_message,review_creation_date,label,processed_review_comment
0,5,"Só achei ela pequena pra seis xícaras ,mais é ...",2017-08-08 00:00:00,1,"achar pequeno pra seis xícara , bom produto"
1,5,Entrega antes da data marcada. Excelente,2018-06-20 00:00:00,1,entregar antes data marcar . excelente
2,5,estou satisfeito,2018-08-15 00:00:00,1,satisfeito
3,5,Mais uma ve satisfeito,2018-05-09 00:00:00,1,ve satisfeito
4,5,"Muito boa a compra, dentro do prazo.",2017-12-08 00:00:00,1,"bom compra , dentro prazo ."


## BoW, TF-IDF and Word2Vec

Functions that return values from textual representations in BoW, TF-IDF and Word2Vec

In [26]:
def bow_representation(df):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['processed_review_comment'])
    y = df['label']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def tfidf_representation(df):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['processed_review_comment'])
    y = df['label']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def word2vec_representation(df, model_path):
    model = KeyedVectors.load_word2vec_format(model_path, unicode_errors='ignore')

    def get_vector(text):
        words = text.split()
        word_vectors = [model[word] for word in words if word in model]
        if len(word_vectors) == 0:
            return np.zeros(50)
        return np.mean(word_vectors, axis=0)

    X = df['processed_review_comment'].apply(get_vector)
    y = df['label']
    return train_test_split(np.vstack(X.values), y, test_size=0.2, random_state=42)


## Models Machine Learning 
Logistic Regression, Naive Bayes and SVM

In [33]:
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, representation_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Modelo: {model_name} - Representação: {representation_name}\nAcurácia: {accuracy}\nRelatório de Classificação:\n{report}\n")

# Modelos a serem usados
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced'),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(class_weight='balanced'),
    'Random Forest': RandomForestClassifier(class_weight='balanced')
}

for model_name, model in models.items():
    # Bag-of-Words
    X_train, X_test, y_train, y_test = bow_representation(df_reviews)
    train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Bag-of-Words")

    # TF-IDF
    X_train, X_test, y_train, y_test = tfidf_representation(df_reviews)
    train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "TF-IDF")

    # Word2Vec (excluindo Naive Bayes)
    if model_name != 'Naive Bayes':
        X_train, X_test, y_train, y_test = word2vec_representation(df_reviews, './data/cbow_s50.txt')
        train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name, "Word2Vec")


Modelo: Logistic Regression - Representação: Bag-of-Words
Acurácia: 0.9526074700493306
Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1644
           1       0.97      0.96      0.97      4032

    accuracy                           0.95      5676
   macro avg       0.94      0.95      0.94      5676
weighted avg       0.95      0.95      0.95      5676


Modelo: Logistic Regression - Representação: TF-IDF
Acurácia: 0.9506694855532065
Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1644
           1       0.98      0.95      0.96      4032

    accuracy                           0.95      5676
   macro avg       0.93      0.95      0.94      5676
weighted avg       0.95      0.95      0.95      5676


Modelo: Logistic Regression - Representação: Word2Vec
Acurácia: 0.8537702607470049
Relatório de Classificação:
         