In [35]:
import gdown
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:

# Data loading: the reviews CSV is hosted on Google Drive.
url = 'https://drive.google.com/uc?id=1nONpng5xOXNr2kK-bdtLL4RsaingezVK'
amazonReview = 'Reviews.csv'
try:
    # Reuse the local copy when it is already on disk so that re-running the
    # notebook does not download the file again.
    data = pd.read_csv(amazonReview)
except FileNotFoundError:
    # First run: fetch the file, then load it.
    gdown.download(url, amazonReview, quiet=False)
    data = pd.read_csv(amazonReview)


In [None]:
# Data exploration: column dtypes and number of missing values per column.
# Only the last expression of a cell is rendered, so both summaries are
# combined into a single table instead of two bare expressions (the first of
# which would never be shown).
pd.DataFrame({'dtype': data.dtypes, 'nulos': data.isnull().sum()})

In [None]:
# Frequency plot of the review scores (one bar per Score value).
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='Score', data=data, ax=ax)
ax.set_title('Distribución de Scores')
ax.set_xlabel('Score')
ax.set_ylabel('Frecuencia')
# Tilt the x tick labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
#Wordcloud

In [None]:
# Download the NLTK stopwords corpus; limpiar_texto reads it through
# stopwords.words('english').
nltk.download('stopwords')

In [26]:
# Text-cleaning function.

# Stopword sets keyed by language. Filled lazily so the NLTK corpus is read
# once per language instead of once per word.
_STOPWORDS_CACHE = {}


def _stopwords_de(idioma):
    """Return the NLTK stopwords for ``idioma`` as a frozenset, loading them on first use."""
    if idioma not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[idioma] = frozenset(stopwords.words(idioma))
    return _STOPWORDS_CACHE[idioma]


def limpiar_texto(texto, palabras_vacias=None):
    """Normalize a piece of text for word clouds and bag-of-words models.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, the result is split on whitespace, and stopwords
    are dropped.

    Parameters
    ----------
    texto : str
        Text to clean. Any non-string value (for example the NaN that pandas
        uses for a missing cell, or None) is treated as empty text.
    palabras_vacias : collection of str, optional
        Words to remove. Defaults to the NLTK English stopword list.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' when nothing is left).
    """
    if not isinstance(texto, str):
        # Missing values arrive as float NaN / None and have no .lower().
        return ''
    if palabras_vacias is None:
        palabras_vacias = _stopwords_de('english')
    texto = re.sub(r'[^a-zA-Z\s]', '', texto.lower())
    # split() with no arguments is the tokenization step.
    return ' '.join(palabra for palabra in texto.split() if palabra not in palabras_vacias)

In [28]:
# Word cloud input built from every review summary.
# fillna/astype return a new Series rather than modifying the column, so the
# result is stored back; otherwise missing summaries stay as NaN and the
# cleaning step fails on them.
data['Summary'] = data['Summary'].fillna('').astype(str)
reviews = data['Summary']
reviews_limpio = reviews.apply(limpiar_texto)
texto_wc = ' '.join(reviews_limpio)



In [None]:
# Render the word cloud for all reviews.
wc = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(texto_wc)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')  # the axes carry no information for an image
plt.show()

In [31]:
# Classification of reviews as positive or negative.
# A score of 3 is considered a neutral review, so those rows are removed.
# .copy() makes the filtered result an independent DataFrame, so columns can
# be added or overwritten on it later without pandas' SettingWithCopyWarning.
data = data[data['Score'] != 3].copy()

In [None]:
# Create the sentiment column: 1 for Score > 3, -1 for Score < 3, 0 otherwise.
# The labels are built in a standalone Series and attached with assign(), which
# returns a new DataFrame instead of writing into a frame that may be a
# filtered slice (the situation pandas flags with SettingWithCopyWarning).
etiquetas_sentimiento = pd.Series(0, index=data.index)
etiquetas_sentimiento[data['Score'] > 3] = 1  # positive reviews
etiquetas_sentimiento[data['Score'] < 3] = -1  # negative reviews
data = data.assign(sentiment=etiquetas_sentimiento)

print(data['sentiment'].value_counts())
print(data.columns)



In [33]:
# Word cloud input: summaries of the positive reviews (sentiment == 1).
es_positiva = data['sentiment'] == 1
reviews_positivas = data.loc[es_positiva, 'Summary']
reviews_positivas_limpio = reviews_positivas.apply(limpiar_texto)
texto_wc_positivo = ' '.join(reviews_positivas_limpio)



In [None]:
# Render the word cloud for the positive reviews.
wc = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(texto_wc_positivo)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame around the image
plt.show()

In [39]:
# Word cloud input: summaries of the negative reviews (sentiment == -1).
es_negativa = data['sentiment'] == -1
reviews_negativas = data.loc[es_negativa, 'Summary']
reviews_negativas_limpio = reviews_negativas.apply(limpiar_texto)
texto_wc_negativo = ' '.join(reviews_negativas_limpio)

In [None]:
# Render the word cloud for the negative reviews.
wc = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(texto_wc_negativo)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')  # hide ticks and frame around the image
plt.show()

In [None]:
# Distribution of positive and negative reviews.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='sentiment', data=data, ax=ax)
ax.set_title('Distribución de Sentimientos')
ax.set_xlabel('Sentimiento')
ax.set_ylabel('Frecuencia')
# Replace the tick labels at bar positions 0 and 1 with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Negativo', 'Positivo'])
plt.show()

In [None]:

# Replace every summary with its cleaned version (see limpiar_texto) so the
# model is trained on normalized text.
resumenes_limpios = data['Summary'].apply(limpiar_texto)
data['Summary'] = resumenes_limpios

In [57]:
# New DataFrame holding only the Summary and sentiment columns.
data_nuevo = data.loc[:, ['Summary', 'sentiment']]

# X is the model input (text) and Y the model output (sentiment label).
X, Y = data_nuevo['Summary'], data_nuevo['sentiment']

# Hold out 20% of the rows for testing; the seed is fixed so the split is
# reproducible.
X_training, X_test, Y_training, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Bag-of-words features: the vocabulary is fitted on the training texts only,
# and the test texts are transformed with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_training_vectorized = vectorizer.fit_transform(X_training)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Total occurrences of each vocabulary term across the training set.
word_count = X_training_vectorized.sum(axis=0)
words = vectorizer.get_feature_names_out()
# The column sums come back as a 1 x n_terms matrix; flatten to one value per term.
conteos = np.asarray(word_count).ravel()
word_count_df = pd.DataFrame({'Count': conteos}, index=words)
# Show the ten most frequent terms.
mas_frecuentes = word_count_df.sort_values(by='Count', ascending=False).head(10)
print(mas_frecuentes)

In [None]:
# Logistic regression model.
# max_iter is raised above scikit-learn's default of 100 iterations, which is
# frequently too few for the solver to converge on large sparse text features
# (scikit-learn then emits a ConvergenceWarning). If the solver already
# converges earlier, the fitted model is unchanged.
model = LogisticRegression(max_iter=1000)
model.fit(X_training_vectorized, Y_training)

predictions = model.predict(X_test_vectorized)
accuracy = accuracy_score(Y_test, predictions)

# accuracy_score is the fraction of correct predictions ("exactitud"); the
# label avoids "precisión", which names a different metric.
print(f'Exactitud del modelo: {accuracy:.2f}')

In [None]:
# Confusion matrix and classification report.
# The class order is fixed once and passed to both metrics so that the rows and
# columns of the matrix and the names in the report always refer to the same
# classes: -1 -> 'Negativo', 1 -> 'Positivo'.
clases = [-1, 1]
nombres_clases = ['Negativo', 'Positivo']

cm = confusion_matrix(Y_test, predictions, labels=clases)
cm_array = np.array(cm)
print(cm_array)

# Classification report (precision, recall, F1 and support per class).
report = classification_report(
    Y_test,
    predictions,
    labels=clases,
    target_names=nombres_clases,
)
print(report)


