# Projeto: Análise de Sentimentos em Redes Sociais
## Etapa: Otimização
**Autor:** Vinícius Ramos  
**Fonte do Dataset:** [Kaggle](https://www.kaggle.com/datasets/kashishparmar02/social-media-sentiments-analysis-dataset)

## 1. Importações e Carregamento de Dados

In [1]:
import re
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Text pre-processing dependencies (ideally this code would become a function in a .py script)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Load the treated dataset and keep only the rows that actually have text.
df = pd.read_csv('../data/processed/sentimentdataset_tratado.csv').dropna(subset=['Text'])

In [3]:
# Shared resources used by preprocess_text.
# English stop words from NLTK, extended with the extra word 'like'.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) | {'like'}

In [4]:
def preprocess_text(text):
    """Normalize a raw post into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and '#' symbols,
    drop every character that is not a-z or whitespace, split on whitespace,
    remove tokens found in the module-level ``stop_words`` set and lemmatize
    the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw text (must support ``.lower()``, i.e. a string).

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).

    NOTE(review): punctuation is removed before the stop-word filter, so a
    contraction such as "don't" becomes "dont" and may no longer match an
    entry in ``stop_words`` - confirm whether that is intended.
    """
    cleaned = text.lower()
    # Links first, so their fragments do not survive as pseudo-words.
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    # Whole @mentions are dropped; hashtags keep their word and lose only the '#'.
    cleaned = re.sub(r'\@\w+|\#', '', cleaned)
    # Anything outside a-z / whitespace (digits, punctuation, accents, emoji) is removed.
    cleaned = re.sub(r'[^a-z\s]', '', cleaned)

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if token not in stop_words
    ]
    return " ".join(lemmas)

In [5]:
# Clean every post element-wise and store the result in a new column.
df['Text_Processed'] = df['Text'].map(preprocess_text)

In [6]:
# Features are the cleaned texts; the target is the simplified sentiment label.
X, y = df['Text_Processed'], df['Sentiment_Simplified']

# 80/20 hold-out split, stratified on the label so both sides keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## 2. Criação dos Pipelines Otimizados

In [20]:
# Both pipelines vectorize with the same TF-IDF settings (unigrams + bigrams,
# at most 5000 features) and use class_weight='balanced' in the classifier.

# Logistic Regression wrapped in a one-vs-rest scheme
pipeline_lr_balanced = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    (
        'classifier',
        OneVsRestClassifier(
            estimator=LogisticRegression(
                solver='liblinear',
                class_weight='balanced',
                random_state=42,
            )
        ),
    ),
])

# Random Forest
pipeline_rf_balanced = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            n_jobs=-1,
            random_state=42,
        ),
    ),
])


## 3. Treinamento e Avaliação dos Modelos

In [12]:
# Fit the Logistic Regression pipeline on the training split, then predict the test split.
y_pred_lr_b = pipeline_lr_balanced.fit(X_train, y_train).predict(X_test)

In [21]:
# Test-set accuracy of the balanced Logistic Regression pipeline.
accuracy_score(y_true=y_test, y_pred=y_pred_lr_b)

0.7414965986394558

In [14]:
# Per-class precision / recall / F1 for the balanced Logistic Regression pipeline.
print(classification_report(y_true=y_test, y_pred=y_pred_lr_b))

              precision    recall  f1-score   support

    Negativo       0.74      0.72      0.73        32
      Neutro       0.69      0.55      0.61        44
    Positivo       0.77      0.87      0.82        71

    accuracy                           0.74       147
   macro avg       0.73      0.71      0.72       147
weighted avg       0.74      0.74      0.73       147



In [22]:
# Fit the Random Forest pipeline on the training split, then predict the test split.
y_pred_rf_b = pipeline_rf_balanced.fit(X_train, y_train).predict(X_test)

In [23]:
# Test-set accuracy of the balanced Random Forest pipeline.
accuracy_score(y_true=y_test, y_pred=y_pred_rf_b)

0.6598639455782312

In [25]:
# Per-class precision / recall / F1 for the balanced Random Forest pipeline.
print(classification_report(y_true=y_test, y_pred=y_pred_rf_b))

              precision    recall  f1-score   support

    Negativo       0.79      0.47      0.59        32
      Neutro       0.92      0.27      0.42        44
    Positivo       0.61      0.99      0.75        71

    accuracy                           0.66       147
   macro avg       0.77      0.58      0.59       147
weighted avg       0.74      0.66      0.62       147



In [None]:
# Re-displays the Logistic Regression accuracy for side-by-side comparison with
# the Random Forest result. NOTE(review): this repeats an earlier cell and has
# no execution count - consider removing it or re-running the notebook top to bottom.
accuracy_score(y_true=y_test, y_pred=y_pred_lr_b)

0.7414965986394558

## 4. Salvando o Modelo Otimizado

In [28]:
# Compare the two balanced pipelines by test-set accuracy and persist the winner.
caminho_modelo_final = '../data/modelos_salvos/modelo_final_otimizado.joblib'

score_lr = accuracy_score(y_test, y_pred_lr_b)
score_rf = accuracy_score(y_test, y_pred_rf_b)

# joblib.dump opens the target path directly and does not create missing
# folders, so make sure the destination directory exists before saving
# (otherwise a fresh checkout fails with FileNotFoundError).
Path(caminho_modelo_final).parent.mkdir(parents=True, exist_ok=True)

# Strict '>' comparison: on a tie the Logistic Regression pipeline is kept.
if score_rf > score_lr:
    print(f"\nRandom Forest Balanceado foi o melhor ({score_rf:.2%}) e será salvo.")
    joblib.dump(pipeline_rf_balanced, caminho_modelo_final)
else:
    print(f"\nRegressão Logística Balanceada foi a melhor ({score_lr:.2%}) e será salva.")
    joblib.dump(pipeline_lr_balanced, caminho_modelo_final)

print(f"Modelo final salvo em: {caminho_modelo_final}")


Regressão Logística Balanceada foi a melhor (74.15%) e será salva.
Modelo final salvo em: ../data/modelos_salvos/modelo_final_otimizado.joblib
