## Análise dos dados do dataset

In [137]:
# Standard library
import re

# Third-party
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk import word_tokenize, download
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

# NLTK resources, each requested exactly once (the original cell asked for
# 'stopwords' twice). Downloads are kept after the imports so that all
# dependencies are visible at the top of the cell.
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
nltk.download('rslp')       # rule files for RSLPStemmer
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when word_tokenize runs - confirm against the installed version.
nltk.download('punkt')      # tokenizer models for word_tokenize
# NOTE(review): 'wordnet' is not referenced in the cells visible here - confirm
# whether it is still needed.
nltk.download('wordnet')

# Keep rendered DataFrames short but allow wide tables.
pd.set_option("display.max_rows", 5)
pd.set_option('display.max_columns',100)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STEFA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/B2W-Reviews01.csv')

# DataFrame.info() prints its summary and returns None, so it is called on its
# own instead of being passed to display(), which would render a stray "None".
df.info()

# Show one randomly chosen row as a quick look at the data.
display(df.sample())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132373 entries, 0 to 132372
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   submission_date        132373 non-null  object 
 1   reviewer_id            132373 non-null  object 
 2   product_id             132373 non-null  object 
 3   product_name           132289 non-null  object 
 4   product_brand          40982 non-null   object 
 5   site_category_lv1      132367 non-null  object 
 6   site_category_lv2      128360 non-null  object 
 7   review_title           132071 non-null  object 
 8   overall_rating         132373 non-null  int64  
 9   recommend_to_a_friend  132355 non-null  object 
 10  review_text            129098 non-null  object 
 11  reviewer_birth_year    126389 non-null  float64
 12  reviewer_gender        128237 non-null  object 
 13  reviewer_state         128382 non-null  object 
dtypes: float64(1), int64(1), object(12)


  exec(code_obj, self.user_global_ns, self.user_ns)


None

Unnamed: 0,submission_date,reviewer_id,product_id,product_name,product_brand,site_category_lv1,site_category_lv2,review_title,overall_rating,recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state
9882,2018-01-08 07:12:16,805fc232bf7f3e46f35c0c87c7ee88b1ef2a6b4bd637b5...,11380724,Bomba Elétrica A Pilha Para Galão De Água,,Utilidades Domésticas,Utensílios e Acessórios Domésticos,Produto de má qualidade,1,No,Produto não funcionou. Veio com o motor fraco ...,1969.0,M,SP


In [3]:
# Remove, in place, every row with a missing value in any of these columns.
df.dropna(
    subset=[
        'review_title',
        'overall_rating',
        'recommend_to_a_friend',
        'review_text',
        'reviewer_birth_year',
    ],
    inplace=True,
)

# The birth year is read as float64; with its NaN rows dropped above it can be
# stored as an integer column.
df['reviewer_birth_year'] = df['reviewer_birth_year'].astype('int')

## NLP

##### Remoção de caracteres especiais, números e pontuação

In [171]:
# Patterns are compiled once, and written as raw strings so that the regex
# escapes (\[, \], \s, \-, \/) are not parsed as invalid Python string escapes.
_REMOVED_CHARS_RE = re.compile(r"""[$.;:!'?@,"()\[\]]""")
_SPACED_CHARS_RE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
_DIGITS_RE = re.compile(r"[0-9]+")


def pre_processed(text):
    """Normalize a raw text string before tokenization.

    Steps, in order:
      1. delete the characters ``$ . ; : ! ' ? @ , " ( ) [ ]``;
      2. lowercase the text and replace each doubled ``<br />`` tag, hyphen
         and slash with a single space;
      3. delete every run of ASCII digits.

    Whitespace is not collapsed, so the result may contain repeated or
    trailing spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    text = _REMOVED_CHARS_RE.sub("", text)
    text = _SPACED_CHARS_RE.sub(" ", text.lower())
    return _DIGITS_RE.sub("", text)

##### Stop words

In [170]:
# NLTK's Portuguese stop-word list, stored as a set so the token filters can
# test membership quickly.
stop_words = {*stopwords.words('portuguese')}

##### Processamento textual usando stemming do NLTK

In [None]:
# Shared stemmer instance; created lazily so it is built once instead of once
# per processed text.
_RSLP_STEMMER = None


def _get_rslp_stemmer():
    """Return the shared RSLPStemmer, creating it on first use."""
    global _RSLP_STEMMER
    if _RSLP_STEMMER is None:
        _RSLP_STEMMER = RSLPStemmer()
    return _RSLP_STEMMER


def processing_sw_stemmer(text):
    """Clean, tokenize, drop stop words and stem a text.

    The text is cleaned with ``pre_processed``, split with NLTK's Portuguese
    ``word_tokenize``, filtered against ``stop_words`` and each remaining token
    is reduced with the RSLP stemmer.

    Args:
        text: the raw string to process.

    Returns:
        A list with the stem of every token that is not a stop word.
    """
    stemmer = _get_rslp_stemmer()
    tokens = word_tokenize(pre_processed(text), language='portuguese')
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

# Stem every review; the result is a Series holding one list of stems per row.
test_stemm = df.loc[:, 'review_text'].apply(processing_sw_stemmer)

##### Processamento utilizando a lematização do spaCy

In [152]:
# Shared spaCy pipeline; loading the model is expensive, so it is done once on
# first use instead of on every call.
_LEMMA_NLP = None


def _get_lemma_nlp():
    """Return the shared 'pt_core_news_sm' pipeline, loading it on first use."""
    global _LEMMA_NLP
    if _LEMMA_NLP is None:
        _LEMMA_NLP = spacy.load('pt_core_news_sm')
    return _LEMMA_NLP


def processing_sw_lemma(text):
    """Clean a text, drop stop words and return lemmas of nouns and adjectives.

    The text is cleaned with ``pre_processed``, split with NLTK's Portuguese
    ``word_tokenize`` and filtered against ``stop_words``. The surviving tokens
    are then analysed by spaCy.

    Args:
        text: the raw string to process.

    Returns:
        A list with the lemma of every token spaCy tags as NOUN or ADJ.
    """
    nlp = _get_lemma_nlp()
    tokens = word_tokenize(pre_processed(text), language='portuguese')
    content_tokens = [token for token in tokens if token not in stop_words]
    # Join with spaces: str(list) would hand spaCy the list repr, including
    # brackets, quotes and commas, as if they were part of the review.
    doc = nlp(" ".join(content_tokens))
    return [token.lemma_ for token in doc if token.pos_ in ('NOUN', 'ADJ')]

# Lemmatize every review; the result is a Series with one list of lemmas per row.
test_lemma = df.loc[:, 'review_text'].apply(processing_sw_lemma)

##### Processamento utilizando o stemming do NLTK e a lematização do spaCy

In [166]:
# Lazily filled cache for the spaCy pipeline and the RSLP stemmer, so both are
# built once instead of on every call.
_LEMMA_STEMMER_RESOURCES = {}


def _get_lemma_stemmer_resources():
    """Return the shared (spaCy pipeline, RSLP stemmer) pair, creating it on first use."""
    if not _LEMMA_STEMMER_RESOURCES:
        nlp = spacy.load('pt_core_news_sm')
        stemmer = RSLPStemmer()
        # Stored together only after both were created successfully.
        _LEMMA_STEMMER_RESOURCES.update(nlp=nlp, stemmer=stemmer)
    return _LEMMA_STEMMER_RESOURCES['nlp'], _LEMMA_STEMMER_RESOURCES['stemmer']


def processing_sw_lemma_stemmer(text):
    """Clean a text, drop stop words, lemmatize with spaCy and stem the lemmas.

    The text is cleaned with ``pre_processed``, split with NLTK's Portuguese
    ``word_tokenize`` and filtered against ``stop_words``. The surviving tokens
    are lemmatized by spaCy and each lemma is reduced with the RSLP stemmer.

    Args:
        text: the raw string to process.

    Returns:
        A list with the stem of each lemma.
    """
    nlp, stemmer = _get_lemma_stemmer_resources()
    tokens = word_tokenize(pre_processed(text), language='portuguese')
    content_tokens = [token for token in tokens if token not in stop_words]
    # Join with spaces: str(list) would hand spaCy the list repr, including
    # brackets, quotes and commas, as if they were part of the review.
    doc = nlp(" ".join(content_tokens))
    lemmas = [token.lemma_ for token in doc]
    # Stem the lemmas; the original stemmed the un-lemmatized tokens, which
    # left the spaCy step without any effect on the result.
    return [stemmer.stem(lemma) for lemma in lemmas]

# Try the combined lemma + stem pipeline on the first five reviews only
# (positional slice).
test_lemma_stemm = df['review_text'].iloc[:5].apply(processing_sw_lemma_stemmer)