# Librerías necesarias

In [None]:
!pip install -r requirements.txt
!pip install utils.py



In [None]:
import random
import numpy as np
import pandas as pd
from collections import Counter # Exploración
from nltk import ngrams # Exploración
from nltk.probability import FreqDist # Exploración
from stop_words import get_stop_words # Preprocesado
import unicodedata # Preprocesado
import re # Preprocesado
import gzip


from sklearn.model_selection import train_test_split # Modelado
from sklearn.pipeline import Pipeline # Modelado
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Modelado
from sklearn.feature_selection import chi2 # Reporte
from sklearn.linear_model import LogisticRegression # Reporte
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve # Reporte

import matplotlib.pyplot as plt

# Preprocesado

En este cuaderno se resumirán los trabajos de preprocesamiento que se aplicarán en las reviews. Al final del mismo, se resumirá en un pipeline como último paso de cara a presentar el preprocesamiento realizado.

### Función de preprocesado de texto

Se implementa un preprocesado en las reviews para estandarizar el texto contenido en las mismas, de cara a unificar los criterios antes de procesar el texto en los distintos modelos. Esto consiste en:

* Eliminar símbolos
* Eliminar todo carácter que no sea una letra
* Elimar stopwords

Como las reviews están en inglés, las tildes no aplican.

In [None]:
import pandas as pd

df = pd.read_csv("/content/cds_&_vinyl_expl_done.csv", index_col = None)

In [None]:
def sentence_normalization(sentence):
    sentence = unicodedata.normalize('NFKD', sentence).lower().encode('ascii', errors='ignore').decode('utf-8')
    sentence = re.sub(' +', ' ', ' '.join([word if word.isalpha() else '' for word in sentence.split()])).strip()
    return sentence

In [None]:
def remove_stopwords(sentence, sw_list):
    sentence = ' '.join([word for word in sentence.split() if word not in sw_list])
    return sentence

In [None]:
from stop_words import get_stop_words
sw_list = get_stop_words('en')

In [None]:
def process_reviews(reviews, sw_list):
    processed_sentences = []
    for sent in df['reviewText']:
        if not sent != sent:  # check if sent is not nan
            sent = sentence_normalization(sent)
            sent = remove_stopwords(sent, sw_list)
            processed_sentences.append(sent)
        else:
            processed_sentences.append('None')
    return processed_sentences

In [None]:
processed_reviews = process_reviews(df['reviewText'], sw_list)

Vemos lo que conseguimos con nuestro preprocesado

In [None]:
print('Review original: {}'.format(df['reviewText'].values[0]))
print('Review procesada: {}'.format(processed_reviews[0]))

Review original: What happened?????? They were so good back in the day, this is absolute garbage!!!! I couldn't even sell it to my buddy who is a diehard Axe fan!! C'mon guys, stick with the winning formula, Metallica didn't, and look at'em now. PLease do not blow your money on this, it's a waste of the plastic in the CD.
Review procesada: good back absolute even sell buddy diehard axe stick winning metallica look please blow money waste plastic


Vemos en esta review que la persona habla que eran buenos antes (pero que ahora ya no lo son!). Habría que tener en cuenta estos detalles a la hora de procesar el texto y aplicar modelos que incluyan un contexto con las palabras de alrededor para poder comprender lo que está ocurriendo.

In [None]:
df.loc[:, 'processedReview'] = processed_reviews

In [None]:
df['processedReview'] = df['processedReview'].replace('', np.nan)
df = df.dropna(subset=['processedReview'])

In [None]:
df

Unnamed: 0,reviewText,overall,overall_binary,processedReview
0,What happened?????? They were so good back in ...,1,0,good back absolute even sell buddy diehard axe...
1,"Thanks to people like Curtis ""I been shot 9 ti...",1,0,thanks people like curtis shot drug woman crim...
2,dont waste your bucks .. its poor poor po...,1,0,dont waste bucks poor poor poor
3,the worst female performer ever seen with shri...,1,0,worst female performer ever seen shrieking voi...
4,What happened to the days when you could liste...,1,0,happened days listen metallica cd sounded fres...
...,...,...,...,...
7995,Thank you.,5,1,thank
7996,Now finally available a new superb 3 CD set on...,5,1,now finally available new superb cd set coveri...
7997,What is there to say. You like Nirvana or you ...,5,1,like nirvana hate happen enjoy music much mood
7998,All is ok,5,1,ok


#Preprocesado Pipeline

In [None]:
import pandas as pd

In [None]:
def review_normalizer(data_path='/content/', file_name='cds_&_vinyl_expl_done.csv', column_name='reviewText', sw_list=None):

  def sentence_normalization(sentence):
    sentence = unicodedata.normalize('NFKD', sentence).lower().encode('ascii', errors='ignore').decode('utf-8')
    sentence = re.sub(' +', ' ', ' '.join([word if word.isalpha() else '' for word in sentence.split()])).strip()
    return sentence

  def remove_stopwords(sentence, sw_list):
    sentence = ' '.join([word for word in sentence.split() if word not in sw_list])
    return sentence

  processed_sentences = []

  if sw_list is None:
      sw_list = []

  df = pd.read_csv(data_path + file_name, index_col = None)

  for sent in df[column_name]:
      if not sent != sent:  # check if sent is not nan
          sent = sentence_normalization(sent)
          sent = remove_stopwords(sent, sw_list)
          processed_sentences.append(sent)
      else:
          processed_sentences.append('None')

  df.loc[:, 'processedReview'] = processed_sentences
  df['processedReview'] = df['processedReview'].replace('', np.nan)
  df = df.dropna(subset=['processedReview'])

  return df

In [None]:
from stop_words import get_stop_words
sw_list = get_stop_words('en')

In [None]:
df = review_normalizer(data_path='/content/', file_name='cds_&_vinyl_expl_done.csv', column_name='reviewText', sw_list=sw_list)

In [None]:
df

Unnamed: 0,reviewText,overall,overall_binary,processedReview
0,What happened?????? They were so good back in ...,1,0,good back absolute even sell buddy diehard axe...
1,"Thanks to people like Curtis ""I been shot 9 ti...",1,0,thanks people like curtis shot drug woman crim...
2,dont waste your bucks .. its poor poor po...,1,0,dont waste bucks poor poor poor
3,the worst female performer ever seen with shri...,1,0,worst female performer ever seen shrieking voi...
4,What happened to the days when you could liste...,1,0,happened days listen metallica cd sounded fres...
...,...,...,...,...
7995,Thank you.,5,1,thank
7996,Now finally available a new superb 3 CD set on...,5,1,now finally available new superb cd set coveri...
7997,What is there to say. You like Nirvana or you ...,5,1,like nirvana hate happen enjoy music much mood
7998,All is ok,5,1,ok


Como se puede observar, ejecutando de forma correcta el pipeline, obtenemos el mismo resultado del proceso que tuvimos en el paso a paso durante el desarrollo del preprocesado. Guardamos el dataframe obtenido para usarlo en el siguiente: Modelado.

Por último, vemos que han desaparecido algunos registros por carencia de información 'relevante' según el preprocesado realizado.

In [None]:
df.to_csv('cds_&_vinyl_prepoc_done.csv', index=False)