# NLP - olist review dataset

In this exercise, you will go back to the Olist dataset. Run the code below to load the data.

In [21]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error, accuracy_score

In [2]:
data = pd.read_csv("data/olist_review.csv")
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,review_id,length_review,review_score,order_id,product_category_name,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,7bc2406110b926393aa56f80a40eba40,0,4,73fc7af87114b39712e6da79b0a377eb,esporte_lazer,,,2018-01-18 00:00:00,2018-01-18 21:46:59,41dcb106f807e993532d446263290104,delivered,2018-01-11 15:30:49,2018-01-11 15:47:59,2018-01-12 21:57:22,2018-01-17 18:42:41,2018-02-02 00:00:00
1,80e641a11e56f04c1ad469d5645fdfde,0,5,a548910a1c6147796b98fdf73dbeba33,informatica_acessorios,,,2018-03-10 00:00:00,2018-03-11 03:05:13,8a2e7ef9053dea531e4dc76bd6d853e6,delivered,2018-02-28 12:25:19,2018-02-28 12:48:39,2018-03-02 19:08:15,2018-03-09 23:17:20,2018-03-14 00:00:00
2,228ce5500dc1d8e020d8d1322874b6f0,0,5,f9e4b658b201a9f2ecdecbb34bed034b,informatica_acessorios,,,2018-02-17 00:00:00,2018-02-18 14:36:24,e226dfed6544df5b7b87a48208690feb,delivered,2018-02-03 09:56:22,2018-02-03 10:33:41,2018-02-06 16:18:28,2018-02-16 17:28:48,2018-03-09 00:00:00
3,e64fb393e7b32834bb789ff8bb30750e,37,5,658677c97b385a9be170737859d3511b,ferramentas_jardim,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,de6dff97e5f1ba84a3cd9a3bc97df5f6,delivered,2017-04-09 17:41:13,2017-04-09 17:55:19,2017-04-10 14:24:47,2017-04-20 09:08:35,2017-05-10 00:00:00
4,f7c4243c7fe1938f181bec41a392bdeb,100,5,8e6bfb81e283fa7e4f11123a3fb894f1,esporte_lazer,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,5986b333ca0d44534a156a52a8e33a83,delivered,2018-02-10 10:59:03,2018-02-10 15:48:21,2018-02-15 19:36:14,2018-02-28 16:33:35,2018-03-09 00:00:00


In [3]:
data['review_score'] = pd.to_numeric(data['review_score'], errors='coerce', downcast='integer')
data.dropna(subset=['review_comment_message', 'review_score'], inplace=True)

## Clean data

In [7]:
def remove_punctuation(text):
    import string 
    for punctuation in string.punctuation:
        text = text.str.replace(punctuation, '')
    return text

def lower_text(text):
    return text.str.lower()

def remove_numbers(text):
    numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    for number in numbers:
        text = text.replace(number, '')
    return text

def remove_stopwords(text):
    stop_words = stopwords.words('portuguese')
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w in stop_words] 
    return ' '.join(text)

def lemm_text(text):
    lemmatizer = WordNetLemmatizer()
    text = word_tokenize(text) 
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    return ' '.join(lemmatized)

In [14]:
data['clean_text'] = remove_punctuation(data['review_comment_message'])
data['clean_text'] = lower_text(data['clean_text'])
data['clean_text'] = remove_numbers(data['clean_text'])
data['clean_text'] = data['clean_text'].map(lambda x: remove_stopwords(x))
data['clean_text'] = data['clean_text'].map(lambda x: lemm_text(x))

  text = text.str.replace(punctuation, '')


## Model

In [16]:
X = data['clean_text']
y = data['review_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### NB

In [17]:
pipeline_1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

parameters_1 = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.01,0.1,1),}

grid_search_1 = GridSearchCV(pipeline_1, parameters_1, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search_1.fit(data['clean_text'], y)

print('tfid')
print(grid_search_1.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
tfid
{'nb__alpha': 0.1, 'tfidf__ngram_range': (1, 1)}


In [18]:
pipe = grid_search_1.best_estimator_
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('nb', MultinomialNB(alpha=0.1))])

In [19]:
pred = pipe.predict(X_test)

In [22]:
r2_score(y_test, pred)

0.4830813151903116

### LDA

In [23]:
vectorizer = TfidfVectorizer().fit(X_train)

data_vectorized = vectorizer.transform(X_train)

lda_model = LatentDirichletAllocation(n_components=2).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])

print_topics(lda_model, vectorizer)

Topic 0:
[('bom', 1508.9156288751578), ('prazo', 1365.9637110282197), ('produto', 1304.8779278098755), ('ante', 1067.9345446655702), ('entrega', 1011.8755650669912), ('chegou', 944.0049783139564), ('entregue', 681.8583748624602), ('tudo', 568.497316523499), ('bem', 528.800970440287), ('gostei', 508.691543533337)]
Topic 1:
[('produto', 721.1735550509152), ('recebi', 619.5945462307276), ('excelente', 603.665498251648), ('ótimo', 600.1042737751467), ('recomendo', 540.2638271814518), ('boa', 398.2980100096355), ('ainda', 349.58384670222364), ('comprei', 330.2737566478495), ('qualidade', 317.6694664510917), ('otimo', 302.6415787850103)]


