# Prerrequisitos

In [None]:
# !pip install -q selenium

# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download es_core_news_sm
# !python -m spacy_spanish_lemmatizer download wiki

# !pip install wordcloud

In [20]:
# Ratings page that the scraper walks through.
URL = 'https://www.solotodo.cl/stores/12/ratings'

# Star colours, each given in its hex and its rgb spelling.
HexBlanca, RgbBlanca = 'color:#333', 'rgb(51, 51, 51)'
HexAmarilla, RgbAmarilla = 'color:#ffb400', 'rgb(255, 180, 0)'

# Reviews grouped by star count (5 down to 1); every review stores
# its date, product, stars and opinion text.
comentarios = {puntaje: [] for puntaje in range(5, 0, -1)}

# Reviews with this many stars or fewer count as bad ratings.
malaCalificacion = 3


# Web scraping: solotodo

In [21]:
import locale
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By


# Spanish locale so that the '%b' month abbreviation in the review dates
# can be parsed by strptime below.
locale.setlocale(locale.LC_ALL, 'es_ES')

# Headless Chrome configuration (same flags as kora.selenium).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# The driver binary is looked up by Selenium itself; passing the path as a
# positional argument is rejected by recent Selenium releases.
wd = webdriver.Chrome(options=options)

# try/finally guarantees the browser is closed even if a page fails to
# load or a review cannot be parsed.
try:
    wd.get(URL)
    # The last-but-one pagination link carries the number of the last page.
    totalPaginas = wd.find_elements(By.XPATH, "//ul[@class='pagination']")[0].find_elements(
        By.XPATH, "./li[@class='page-item']//a")[-2].text

    for i in range(1, int(totalPaginas) + 1):
        pagina = f"{URL}?page={i}"
        wd.get(pagina)
        opiniones_div = wd.find_elements(
            By.XPATH, "//div[@class='col-12 rating-container']//div[@class='content-card mb-3']")

        for opinion in opiniones_div:
            estrellas_div = opinion.find_elements(
                By.XPATH,
                ".//div[@class='rating-stars']//div[@class='dv-star-rating dv-star-rating-non-editable']//label")

            # A star counts when its inline style carries the yellow colour.
            # `or ''` guards against labels without a style attribute.
            estrellas = sum(
                1 for estrella in estrellas_div
                if RgbAmarilla in (estrella.get_attribute('style') or '')
            )

            # NOTE(review): a card with no yellow star yields 0, which is not
            # a key of `comentarios` and raises KeyError; confirm the site
            # never renders such a review.
            card = opinion.find_elements(By.XPATH, ".//dl//dd")
            comentarios[estrellas].append({
                'estrellas': estrellas,
                'fecha': datetime.strptime(card[0].text, '%d de %b de %Y %H:%M'),
                'producto': card[1].find_element(By.XPATH, ".//a").text,
                'opinion': card[-1].text
            })
finally:
    wd.quit()

# Reordenar reseñas

In [None]:
import pandas as pd
import numpy as np

# One column per star count. Each list is wrapped in a Series so that
# columns of different length can live in the same DataFrame.
comentarios_df = pd.DataFrame({k: pd.Series(v) for k, v in comentarios.items()})

# Summary: total number of reviews, and how many fall in the columns from
# `malaCalificacion` onwards (the bad ratings).
print(
    "\n"
    f"Total de opiniones: {sum(comentarios_df.count())}\n"
    f"Cantidad de estrellas igual o menor a {malaCalificacion}: "
    f"{sum(comentarios_df.loc[:, malaCalificacion:].count())}",
    end='\n' * 4,
)

# Gráfico de datos obtenidos

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _agrupar_por_year(resenas, years):
    """Return a DataFrame with one column per year in *years* holding that year's reviews.

    Reviews are stacked from the first row down and shorter columns are
    padded with NaN; a year without reviews yields an empty column, so
    ``.count()`` gives the number of reviews per year.
    """
    por_year = {year: [] for year in years}
    for resena in resenas:
        por_year[resena['fecha'].year].append(resena)
    return pd.DataFrame({year: pd.Series(lista, dtype=object) for year, lista in por_year.items()})


labels = list(comentarios.keys())
estrellas = [len(v) for v in comentarios.values()]
total = sum(comentarios_df.count())

# Bar chart - ratings
fig, ax = plt.subplots(figsize=(10, 5))
y_pos = labels
ax.bar(y_pos, estrellas, align='center', alpha=0.5)
# Write each count above its own bar; the x position is the rating itself.
for x, d in zip(labels, estrellas):
    plt.text(x=x, y=d+1, s=f"{d}", fontdict=dict(fontsize=14), ha='center')
plt.ylabel('Cantidad de estrellas')
plt.title(f'Valoración de PC Factory en solotodo.cl (total: {total})')
fig.set_facecolor('white')
plt.show()
print()

# Pie chart - ratings
fig, ax = plt.subplots()
ax.pie(estrellas, autopct='%1.0f%%', shadow=True, startangle=90)
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax.legend(labels=labels, title='Cantidad de estrellas', loc='lower right', bbox_to_anchor=(1, 0), bbox_transform=plt.gcf().transFigure)
plt.title(f'Valoración de PC Factory en solotodo.cl (total: {total})')
fig.set_facecolor('white')
plt.show()
print()

# Bar chart - reviews per year
# Flatten the star-indexed table into one list of review dicts. This avoids
# DataFrame.iteritems and row-by-row DataFrame.append, both removed in pandas 2.0.
resenas = [r for _, columna in comentarios_df.items() for r in columna if pd.notnull(r)]
# Distinct years in order of first appearance; shared by the three tables
# below so that their columns line up.
years = list(dict.fromkeys(r['fecha'].year for r in resenas))
comentarios_df_year = _agrupar_por_year(resenas, years)

fig, ax = plt.subplots(figsize=(10, 5))
y_pos = comentarios_df_year.count().keys()
ax.bar(y_pos, comentarios_df_year.count().values, align='center', alpha=0.5)
plt.ylabel('Cantidad de comentarios')
plt.title(f'Calificaciones puestas por año de PC Factory en solotodo.cl (total: {total})')
plt.show()
print()


# Bar chart - good/bad reviews per year
comentarios_df_yearPos = _agrupar_por_year(
    [r for r in resenas if r['estrellas'] > malaCalificacion], years)
comentarios_df_yearNeg = _agrupar_por_year(
    [r for r in resenas if r['estrellas'] <= malaCalificacion], years)

fig, ax = plt.subplots(figsize=(10, 5))
y_pos = comentarios_df_year.count().keys()
# Red (bad) and green (good) bars are shifted to either side of the year tick.
ax.bar(y_pos-.1, comentarios_df_yearNeg.count().values, width=.2, color='red', align='center', alpha=0.5)
ax.bar(y_pos+.1, comentarios_df_yearPos.count().values, width=.2, color='green', align='center', alpha=0.5)
plt.ylabel('Cantidad de comentarios')
plt.title(f'Calificaciones positivas/negativas puestas por año de PC Factory en solotodo.cl (total: {total})')
plt.show()
print()


# NLP

In [None]:
import re
import string
import numpy as np
import spacy
# from spacy.lang.es import Spanish
import spacy_spanish_lemmatizer
from spacy.lang.es.stop_words import STOP_WORDS
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Built once: maps accented vowels to their plain form and deletes ASCII
# punctuation plus the Spanish inverted marks.
_TABLA_LIMPIEZA = str.maketrans('áéíóúü', 'aeiouu', string.punctuation + '¡¿')


def limpiar(s: str) -> str:
    """Normalise *s* for text analysis.

    The text is lower-cased, accented vowels lose their diacritic,
    punctuation is deleted and every run of whitespace collapses to a
    single space. Lower-casing happens before the translation so that
    upper-case accented letters are normalised as well.

    Args:
        s: Raw text.

    Returns:
        The normalised text.
    """
    return re.sub(r'\s+', ' ', s.lower().translate(_TABLA_LIMPIEZA))

def jion(l: list):
    """Join the ``'opinion'`` text of every non-null entry of *l* with single spaces."""
    validas = filter(pd.notnull, l)
    return ' '.join(resena['opinion'] for resena in validas)

def wordcloud(l: list, title: str):
    """Plot a word cloud of the adjectives found in the reviews of *l*.

    The opinions are joined into one text and parsed with the module-level
    ``nlp`` pipeline. The lemmas of the tokens tagged ``ADJ`` that are not
    stop words are then fed to ``WordCloud``, and the image is shown with
    matplotlib.

    Args:
        l: Review dicts with an ``'opinion'`` key; null entries are skipped.
        title: Text appended to ``'Adjetivos: '`` in the plot title.
    """
    # jion skips null entries, so a list that still holds None/NaN cells
    # does not fail on item access.
    doc = nlp(jion(l))
    adjetivos = ' '.join(
        token.lemma_ for token in doc
        if not token.is_stop and token.pos_ == 'ADJ'
    )
    wc = WordCloud(background_color="white",
        max_words=350,
        width=1000,
        height=600,
        random_state=1).generate(adjetivos)

    plt.figure(figsize=(15,15))
    plt.imshow(wc)
    plt.axis("off")
    plt.title(f'Adjetivos: {title}')
    plt.show()
    print()

nlp = spacy.load("es_core_news_sm")


def _limpiar_resena(x):
    """Return a copy of review *x* with its opinion cleaned, or None for a null cell."""
    return {**x, 'opinion': limpiar(x['opinion'])} if pd.notnull(x) else None


# Element-wise cleaning done column by column with Series.map, because
# DataFrame.applymap is deprecated in recent pandas releases.
comentarios_df = comentarios_df.apply(lambda columna: columna.map(_limpiar_resena))

# Good ratings: every column whose star count is above the threshold.
# Filtering the labels avoids the hard-coded 5 and label slicing on the
# descending columns.
lista = [
    i
    for k in comentarios_df.columns if k > malaCalificacion
    for i in comentarios_df[k] if pd.notnull(i)
]
wordcloud(l=lista, title='Buenas calificaciones\n')

# Bad ratings: the threshold column and everything below it.
lista = [
    i
    for k in comentarios_df.columns if k <= malaCalificacion
    for i in comentarios_df[k] if pd.notnull(i)
]
wordcloud(l=lista, title='Malas calificaciones\n')
