# Laboratorio 6: Análisis de sentimientos
# * Eric Mendoza
# * Javier Jo
# * Marlon Fuentes
# ---

# 1. Preparación de entorno de ejecución

## Carga de librerías

In [None]:
# Matematica
import pandas as pd
import numpy as np

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
color = sns.color_palette()
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import plotly.tools as tls

from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

from wordcloud import WordCloud, STOPWORDS

import warnings
warnings.filterwarnings('ignore')

import os
os.listdir("../input/grammar-and-online-product-reviews")

## Carga de data

In [None]:
# Load the product-review dataset from the Kaggle input directory into `df`.
df=pd.read_csv('../input/grammar-and-online-product-reviews/GrammarandProductReviews.csv')
# Preview the first rows (shown as the cell output when run in a notebook).
df.head()

# 3. Preprocesamiento

## Procesamiento de nulos
# El análisis debe realizarse utilizando los reviews de las personas, por lo que una línea sin un review es inservible. Se eliminarán estas.

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Drop the rows whose review text is missing; nulls in other columns are kept.
df = df.dropna(subset=['reviews.text'])

## Convertir mayúsculas a minúsculas

In [None]:
# Normalise every review to lower case so that word counts are case-insensitive.
df['reviews.text'] = df['reviews.text'].map(lambda review: review.lower())

## Eliminar caracteres especiales

In [None]:
import string

# Characters stripped from the reviews: ASCII punctuation plus the Spanish
# opening marks, which string.punctuation does not include.
punc_ext = string.punctuation + '¡¿'


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``punc_ext`` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(punc_ext))
    return text.translate(deletion_table)

In [None]:
# Quick check: '¡', '!' and '?' are all in punc_ext, so this yields 'hola TIO'.
remove_punctuation('¡hola! TIO!?')

In [None]:
# Strip punctuation from every review (the function is passed directly, no lambda).
df['reviews.text'] = df['reviews.text'].apply(remove_punctuation)

## Eliminar URL's

In [None]:
import re

# An http(s) URL anywhere in the text, running up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://\S+')


def remove_url(text):
    """Return ``text`` with every http:// or https:// URL removed.

    The previous pattern was anchored with ``^`` and consumed ``.*``, so it
    only matched URLs at the very start of a line and then deleted the whole
    rest of that line. URLs in the middle of a sentence were never removed.
    Only the URL token itself is removed now; surrounding text and whitespace
    are left untouched.

    NOTE(review): in this notebook punctuation is stripped from the reviews
    before this function is applied, which already destroys the '://' this
    pattern relies on. Consider running the URL removal before the
    punctuation removal.
    """
    return _URL_PATTERN.sub('', text)

In [None]:
# Run the URL remover over every review (the function is passed directly, no lambda).
df['reviews.text'] = df['reviews.text'].apply(remove_url)

## Convertir emoticones a texto
Para tratar los emojis de los reviews se utilizará la librería *emoji*: esta los convertirá a texto para que así puedan ser utilizados en el análisis de calidad de los reviews.

In [None]:
import emoji

# Replace each emoji in the reviews with the text form produced by emoji.demojize.
df['reviews.text'] = df['reviews.text'].apply(emoji.demojize)

### Eliminar stop-words

In [None]:
import nltk
from nltk.corpus import stopwords
# English stop words as a set; used later to skip these words when tallying
# the vocabulary of good and bad reviews.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# in the environment — confirm.
stop_words = set(stopwords.words('english'))

# 4. Análisis exploratorio

## Frecuencia de tipos de calificaciones

In [None]:
sns.set(style="darkgrid")
# Bar chart with the number of reviews per star rating. The column is passed
# by keyword because a positional Series is no longer interpreted as `x` in
# current seaborn releases.
sns.countplot(x='reviews.rating', data=df)

## Palabras más utilizadas según tipo de calificación

In [None]:
# One single-column DataFrame of review texts per star rating (1 to 5).
# `.loc` replaces the `.ix` indexer, which has been removed from pandas.
r1, r2, r3, r4, r5 = (
    df.loc[df['reviews.rating'] == rating, ['reviews.text']]
    for rating in range(1, 6)
)

In [None]:
# Stop words excluded from the word clouds (the set shipped with `wordcloud`).
# NOTE: this rebinds the name `stopwords`, which until now referred to the
# nltk.corpus module imported earlier in the notebook.
stopwords = set(STOPWORDS)


def most_used_words(data):
    """Draw a word cloud of the most frequent words found in ``data``.

    ``data`` may be a pandas DataFrame or Series of texts, or any other object
    whose ``str()`` is the text to analyse. Nothing is returned; the figure is
    shown with matplotlib.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        # str() of a pandas object is its *display* repr: long frames are cut
        # to a few rows, cells are clipped, and index/column labels are mixed
        # in. Join the actual cell values so every review contributes.
        text = ' '.join(data.astype(str).values.ravel())
    else:
        text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1).generate(text)

    plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    plt.imshow(wordcloud)
    plt.show()

### Rating 1

In [None]:
# Word cloud for the reviews selected into r1 (rating == 1).
most_used_words(r1)

### Rating 2

In [None]:
# Word cloud for the reviews selected into r2 (rating == 2).
most_used_words(r2)

### Rating 3

In [None]:
# Word cloud for the reviews selected into r3 (rating == 3).
most_used_words(r3)

### Rating 4

In [None]:
# Word cloud for the reviews selected into r4 (rating == 4).
most_used_words(r4)

### Rating 5

In [None]:
# Word cloud for the reviews selected into r5 (rating == 5).
most_used_words(r5)

## Longitud del review (en caracteres) según calificación

In [None]:
# Length of each review in characters (not words).
df['reviews_length'] = df['reviews.text'].str.len()
# One histogram of review length per star rating. FacetGrid's `size` argument
# was renamed to `height`; the old name is rejected by current seaborn.
g = sns.FacetGrid(df, col='reviews.rating', height=5)
g.map(plt.hist, 'reviews_length', range=(0, 1200))

## Correlación entre variables

In [None]:
# Correlate only the numeric/boolean columns. Recent pandas versions no longer
# drop text columns silently in DataFrame.corr() and raise instead, so the
# selection is made explicit (this also works on older pandas).
corr = df.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(10, 5))
# Annotated heatmap of the correlation matrix, drawn on the axes created above.
sns.heatmap(corr, cbar=True, annot=True,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

## Determinación de palabras positivas y negativas

### Creación de nuevas categorías de clasificación

In [None]:
def classifyRating(rate):
    """Map a numeric rating to a label.

    Returns 'Bad' when ``rate`` is below 3, 'Neutral' when it equals 3, and
    'Good' for anything else.
    """
    if rate < 3:
        return 'Bad'
    if rate == 3:
        return 'Neutral'
    return 'Good'

In [None]:
# Label every review as 'Bad', 'Neutral' or 'Good' from its star rating.
df['ReviewType'] = df['reviews.rating'].apply(classifyRating)

In [None]:
def classifyRating2(rate):
    """Map a numeric rating to a class code given as a string.

    Returns '0' when ``rate`` is below 3, '1' when it equals 3, and '2' for
    anything else.
    """
    if rate < 3:
        return '0'
    if rate == 3:
        return '1'
    return '2'

In [None]:
# Binary target for the classifier: True when the rating is below 4, False otherwise.
df['numberRate'] = df['reviews.rating'].lt(4)

### Clasificación de palabras buenas y malas

In [None]:
from collections import Counter

# Word frequencies for 'Good' (buenas) and 'Bad' (malas) reviews. Counter is a
# dict subclass, so code that treats these as plain dicts keeps working.
buenas = Counter()
malas = Counter()
_tally_by_type = {'Good': buenas, 'Bad': malas}

for review, classType in zip(df['reviews.text'], df['ReviewType']):
    tally = _tally_by_type.get(classType)
    if tally is None:
        # 'Neutral' reviews count towards neither tally.
        continue
    # Splitting on a single space can yield empty strings; skip those and the
    # stop words.
    tally.update(
        word for word in review.split(' ')
        if word and word not in stop_words
    )
                

In [None]:
import operator

# (word, count) pairs from the good-review tally, most frequent first.
# Sorting ascending and then reversing keeps exactly the original tie order.
best_good = sorted(buenas.items(), key=operator.itemgetter(1))[::-1]
best_good

In [None]:
# (word, count) pairs from the bad-review tally, most frequent first.
# Sorting ascending and then reversing keeps exactly the original tie order.
best_bad = sorted(malas.items(), key=operator.itemgetter(1))[::-1]
best_bad

In [None]:
def return_next(modelo, texto):
    """Return the slice ``[:6]`` of the global ``diccionario``.

    Neither ``modelo`` nor ``texto`` is used by the body.
    """
    # NOTE(review): `diccionario` is not defined anywhere in the visible
    # notebook, so this raises NameError unless it is defined elsewhere —
    # TODO confirm what it is meant to hold.
    return diccionario[:6]

# 5. Algoritmo de clasificación

### Terminar de cargar librerías

In [None]:
from keras.layers import Dense, Input, Flatten
from keras.layers import GlobalAveragePooling1D, Embedding
from keras.models import Model

In [None]:
np.random.seed(32)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.manifold import TSNE

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout
from keras.utils.np_utils import to_categorical

### División en datasets y preparación de librerías

In [None]:
# Hold out 30% of the reviews for testing. Features are the cleaned review
# texts and the label is the boolean numberRate column.
review_texts, review_labels = df['reviews.text'], df['numberRate']
train_text, test_text, train_y, test_y = train_test_split(
    review_texts, review_labels, test_size=0.3)

In [None]:
# Upper bound on the vocabulary size kept by the tokenizer.
MAX_NB_WORDS = 20000

texts_train = train_text.astype(str)
texts_test = test_text.astype(str)

# `num_words` is the current name of the vocabulary-size argument; `nb_words`
# is the Keras 1 spelling.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
# The vocabulary is learned from the training texts only and then applied to
# both splits.
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

# Mapping word -> integer index learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Reverse lookup (integer index -> word), then decode the first training
# sequence back into text as a sanity check.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
" ".join(index_to_word[token_id] for token_id in sequences[0])

In [None]:
# Fixed sequence length fed to the network.
MAX_SEQUENCE_LENGTH = 150

# Bring every tokenised review to MAX_SEQUENCE_LENGTH, using pad_sequences'
# default padding/truncating settings.
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
y_train = train_y
y_test = test_y

# Only the training labels are one-hot encoded (the models are compiled with
# categorical_crossentropy). y_test is left as-is because it is passed
# directly to roc_auc_score later on.
y_train = to_categorical(np.asarray(y_train))
print('Shape of label tensor:', y_train.shape)

In [None]:
# These imports repeat the ones made in an earlier cell.
from keras.layers import Dense, Input, Flatten
from keras.layers import GlobalAveragePooling1D, Embedding
from keras.models import Model

# Size of each word vector and number of output classes.
EMBEDDING_DIM = 50
N_CLASSES = 2

# input: a sequence of MAX_SEQUENCE_LENGTH integers
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Trainable embedding over a vocabulary of MAX_NB_WORDS indices.
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
embedded_sequences = embedding_layer(sequence_input)

# Baseline classifier: average the word vectors, then a softmax layer.
average = GlobalAveragePooling1D()(embedded_sequences)
predictions = Dense(N_CLASSES, activation='softmax')(average)

# NOTE: `model` is rebound to the LSTM model in a later cell before any call
# to fit, so this averaged-embedding model is never trained in this notebook.
model = Model(sequence_input, predictions)
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['acc'])

In [None]:
# New input tensor passed through the same `embedding_layer` object created
# for the previous model; presumably the embedding weights are therefore
# shared between both models — confirm.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

### LSTM

In [None]:
# Single LSTM layer with 128 units and dropout on both its inputs and its
# recurrent state, followed by a 2-way softmax.
x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)
predictions = Dense(2, activation='softmax')(x)


# Rebinds `model`: from here on it refers to the LSTM classifier.
model = Model(sequence_input, predictions)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
# Train for two epochs, holding out 10% of the training data for validation.
# `epochs` is the current name of the argument formerly called `nb_epoch`,
# which recent Keras releases reject.
model.fit(x_train, y_train, validation_split=0.1,
          epochs=2, batch_size=128)

In [None]:
# Class probabilities for the held-out reviews.
output_test = model.predict(x_test)
# ROC AUC computed from column 1 of the softmax output against y_test (the
# boolean numberRate labels). Presumably column 1 corresponds to True, i.e.
# rating below 4 — confirm.
print("test auc:", roc_auc_score(y_test,output_test[:,1]))

# 6. Resultados

Ahora se presenta el algoritmo de predicción de palabras. Este utiliza los modelos generados en las secciones anteriores junto con el diccionario desarrollado.

In [None]:
def predictor(modelo, texto):
    """Forward ``modelo`` and ``texto`` to ``return_next`` and return its result."""
    resultado = return_next(modelo, texto)
    return resultado
    

In [None]:
# Try the predictor on a clearly negative sample review. The call is forwarded
# to return_next, which reads the global `diccionario`.
predictor(model, 'this is a test of a bad bad bad bad bad review i hated this product a lot it sucked')

# 7. Estrategia para producto con menos reviews

Según el análisis realizado, el producto que tiene el peor modelo presenta una mayor cantidad de palabras negativas y emoticones de desagrado. Para mejorar sus reviews se recomienda identificar las palabras que más se utilizan para describir despectivamente al producto y enfocarse en corregir exactamente esos puntos.