In [1]:
import re
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob

In [2]:
# Fetch the NLTK stopword corpus used by the cleaning step.
# quiet=True suppresses the downloader log, which otherwise embeds the local
# machine's nltk_data path in the saved notebook output.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:

# Load the raw reviews dataset.
candy = pd.read_csv('../data/Candy-Crush-Saga.csv')

# Require only the columns this pipeline actually uses to be present; a
# blanket dropna() would also discard reviews whose unrelated columns are empty.
candy = candy.dropna(subset=['reviewId', 'content'])
candy = candy.drop_duplicates(subset=['reviewId', 'content'], keep='first')

# Normalize the review text to lowercase.
candy['content'] = candy['content'].str.lower()

# Strip punctuation, symbols and emojis. Spanish accented letters are kept so
# Spanish words are not mangled and can still match the Spanish stopword list
# (text is already lowercase, so only lowercase letters are listed).
CARACTERES_NO_PERMITIDOS = re.compile(r'[^a-z0-9áéíóúüñ\s]')
candy['content'] = candy['content'].apply(lambda x: CARACTERES_NO_PERMITIDOS.sub('', x))

# Remove English and Spanish stopwords.
stop_words = stopwords.words('english') + stopwords.words('spanish')
# Membership tests against a set are O(1) instead of scanning the list per token.
stop_words_set = set(stop_words)
candy['content'] = candy['content'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words_set)
)

# Drop reviews with fewer than MIN_PALABRAS words (reviews with exactly
# MIN_PALABRAS words are kept). .copy() detaches the result from the filtered
# view so later column assignments do not raise SettingWithCopyWarning.
MIN_PALABRAS = 3
candy = candy[candy['content'].str.split().str.len() >= MIN_PALABRAS].copy()

# Sentiment labelling helper
def analizar_sentimiento(texto, umbral=0.0):
    """Label a text as 'positivo', 'negativo' or 'neutral' from its TextBlob polarity.

    Args:
        texto: Text to score; passed directly to ``TextBlob``.
        umbral: Half-width of the neutral band around zero. Polarities in
            ``[-umbral, umbral]`` are labelled 'neutral'. The default of 0.0
            labels strictly by the sign of the polarity.

    Returns:
        'positivo' if polarity > umbral, 'negativo' if polarity < -umbral,
        otherwise 'neutral'.

    Raises:
        ValueError: If ``umbral`` is negative (the bands would overlap).
    """
    if umbral < 0:
        raise ValueError('umbral must be non-negative')

    polaridad = TextBlob(texto).sentiment.polarity
    if polaridad > umbral:
        return 'positivo'
    if polaridad < -umbral:
        return 'negativo'
    return 'neutral'

# Label every review with its sentiment class. assign() builds a new frame
# rather than writing into a possibly-sliced one.
# NOTE(review): the text scored here has already had stopwords removed, which
# may include negations such as "not"; consider scoring the raw text instead.
candy = candy.assign(sentiment=candy['content'].apply(analizar_sentimiento))

# Persist the cleaned, labelled data. The output directory is created first
# because to_csv fails when the target directory does not exist.
ruta_salida = Path('../data_process/candy_clean.csv')
ruta_salida.parent.mkdir(parents=True, exist_ok=True)
candy.to_csv(ruta_salida, index=False)

In [None]:
# Bar chart of how many reviews fall into each sentiment class.
sentiment_counts = candy['sentiment'].value_counts()

# value_counts() orders classes by frequency, so a positional colour list can
# paint the wrong class. Look colours up by label instead.
colores_por_etiqueta = {'positivo': 'green', 'neutral': 'gray', 'negativo': 'red'}
colores = [colores_por_etiqueta.get(etiqueta, 'blue') for etiqueta in sentiment_counts.index]

plt.figure(figsize=(6, 4))
plt.bar(sentiment_counts.index, sentiment_counts.values, color=colores)
plt.title('Distribución de Sentimientos')
plt.xlabel('Sentimiento')
plt.ylabel('Cantidad')
plt.show()

# Inputs are the cleaned review texts; targets are the sentiment labels.
X = candy['content']
y = candy['sentiment']

# Split BEFORE vectorizing: fitting TF-IDF on the full corpus would let the
# held-out reviews influence the vocabulary and IDF weights (data leakage),
# which inflates the evaluation scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the vocabulary/IDF from the training texts only, then reuse that
# fitted vectorizer to encode the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train the Naive Bayes classifier.
modelo = MultinomialNB()
modelo.fit(X_train, y_train)

# Predict on the held-out reviews.
y_pred = modelo.predict(X_test)

# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

# Persist the model together with the vectorizer it was trained with.
joblib.dump(modelo, 'modelo_sentimientos.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')