# 6º Período - Pré-processamento, Representação Vetorial e Classificação de Textos

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from pathlib import Path  
import glob
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
#import contractions

In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Dados

In [5]:
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

dataset_path = '/content/gdrive/MyDrive/Musical_instruments_reviews.csv'

Mounted at /content/gdrive


In [9]:
df = pd.read_csv(dataset_path)

df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [10]:
df.shape

(10261, 9)

In [11]:
df.isna().sum()

reviewerID         0
asin               0
reviewerName      27
helpful            0
reviewText         7
overall            0
summary            0
unixReviewTime     0
reviewTime         0
dtype: int64

Como o número de NAs é insignificante, não há problema em ignorar essas linhas

In [12]:
df = df.dropna()
df.isna().sum()

reviewerID        0
asin              0
reviewerName      0
helpful           0
reviewText        0
overall           0
summary           0
unixReviewTime    0
reviewTime        0
dtype: int64

In [13]:
df.shape

(10227, 9)

## Função de pré-processamento dos dados usando RegEx e NLTK;

(lowercasing, remover pontuações, stopwords, lemmatize, etc);

In [None]:
def pre_processing_regex(texto, patterns):
  novo = texto.tolist()
  for frase in novo:
    novo_texto = re.sub(patterns, lambda m: m.group(0).lower(), frase)
    novo_texto = novo_texto.translate(str.maketrans('', '', string.punctuation))
    novo.append(novo_texto)
  return novo

#phrase = ['TESTANDO, 1. 2. 3. 123!!','Que vontade DE UM STROGONOFF CARA!!! SERIO BRO??? ;-;']

phrase = df['reviewerName'].head(1)
padrao= '[A-Z]+'

pre_processing_regex(phrase,padrao)

Apesar de criar a função e ela estar funcional, não a utilziaremos por existirem diversos métodos mais eficientes e intuitivos para pré processar os dados, como utilizando o NLTK:

In [14]:
data = df[['reviewText','summary']]

A **primeira ação** ao texto é expandir as contrações das palavras, ou seja, "I´ve" se torna "I have" e são tokenizados juntos, ou seja, não são alocadas diferentes como em "I" "have". A **segunda ação** é a Tokenização, ou seja, cada palavra individual vai ser dividida em um token. A **terceira ação** é converter todas as palavras para a letra minúscula. A **quarta ação** é para remover as pontuações. A **quinta ação** é para remover as chamadas "stop words", também chamadas de palavras vazias, ou seja, palavras sem significado (já definidas em uma função). A **sexta e última ação** é a chamada de "Lemmatization", que classifica as palavras em adjetivo, verbo, pronome, substantivo e etc.

In [15]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def pre_processing_nltk(data):
  for column in data.columns:
 #   data[column] = data[column].apply(lambda x: [contractions.fix(word) for word in x.split()])

    data[column] = data[column].apply(word_tokenize)

    data[column] = data[column].apply(lambda x: [word.lower() for word in x])
 
    data[column] = data[column].apply(lambda x: [word for word in x if word not in string.punctuation])

    data[column] = data[column].apply(lambda x: [word for word in x if word not in stop_words])

    data[column] = data[column].apply(nltk.tag.pos_tag)

    data[column] = data[column].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

    wnl = WordNetLemmatizer()
    data[column] = data[column].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

  return data

def preprocessnltk(data):
  data['reviewText'] = data['reviewText'].apply(word_tokenize)

  data['reviewText'] = data['reviewText'].apply(lambda x: [word.lower() for word in x])
 
  data['reviewText'] = data['reviewText'].apply(lambda x: [word for word in x if word not in string.punctuation])

  data['reviewText'] = data['reviewText'].apply(lambda x: [word for word in x if word not in stop_words])

  data['reviewText'] = data['reviewText'].apply(nltk.tag.pos_tag)

  data['reviewText'] = data['reviewText'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

  wnl = WordNetLemmatizer()
  data['reviewText'] = data['reviewText'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

  return data

#data = pre_processing_nltk(data)

In [None]:
data = pre_processing_nltk(data)

In [18]:
data.head()

Unnamed: 0,reviewText,summary
0,"[much, write, exactly, 's, suppose, filter, po...",[good]
1,"[product, exactly, quite, affordable.i, realiz...",[jake]
2,"[primary, job, device, block, breath, would, o...","[job, well]"
3,"[nice, windscreen, protects, mxl, mic, prevent...","[good, windscreen, money]"
4,"[pop, filter, great, look, performs, like, stu...","[pop, record, vocal]"


## Tokenizando os textos com os métodos de BOW e TFIFD (sklearn);

In [19]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)  

In [24]:
tfidf.fit(data['reviewText'])
tfidf.vocabulary_

{'much': 16086,
 'write': 27445,
 'exactly': 9463,
 "'s": 215,
 'suppose': 23731,
 'filter': 10030,
 'pop': 18802,
 'sound': 22565,
 'recording': 20144,
 'crisp': 7098,
 'one': 17214,
 'low': 14738,
 'price': 19130,
 'amazon': 3111,
 'might': 15569,
 'well': 26918,
 'buy': 5311,
 'honestly': 12388,
 'work': 27324,
 'despite': 7774,
 'product': 19300,
 'quite': 19752,
 'affordable.i': 2811,
 'realize': 19995,
 'double': 8338,
 'screen': 21291,
 'arrive': 3571,
 'even': 9413,
 'expected.as': 9583,
 'add': 2685,
 'bonus': 4804,
 'carry': 5625,
 'small': 22226,
 'hint': 12275,
 'smell': 22257,
 'old': 17155,
 'grape': 11341,
 'candy': 5513,
 'use': 26151,
 'reminiscent': 20358,
 'sake': 21077,
 'stop': 23218,
 'put': 19601,
 'next': 16520,
 'nose': 16788,
 'record': 20125,
 'dif': 7900,
 'need': 16411,
 'expensive': 9598,
 'may': 15169,
 'come': 6387,
 'please': 18633,
 'aroma': 3548,
 'like': 14417,
 'mine': 15623,
 'primary': 19196,
 'job': 13686,
 'device': 7821,
 'block': 4681,
 'breat

In [26]:
vect = CountVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

vect.fit(data['reviewText'])

CountVectorizer(preprocessor=<function dummy_fun at 0x7f3778b24a70>,
                token_pattern=None,
                tokenizer=<function dummy_fun at 0x7f3778b24a70>)

In [27]:
vect.vocabulary_

{'much': 16086,
 'write': 27445,
 'exactly': 9463,
 "'s": 215,
 'suppose': 23731,
 'filter': 10030,
 'pop': 18802,
 'sound': 22565,
 'recording': 20144,
 'crisp': 7098,
 'one': 17214,
 'low': 14738,
 'price': 19130,
 'amazon': 3111,
 'might': 15569,
 'well': 26918,
 'buy': 5311,
 'honestly': 12388,
 'work': 27324,
 'despite': 7774,
 'product': 19300,
 'quite': 19752,
 'affordable.i': 2811,
 'realize': 19995,
 'double': 8338,
 'screen': 21291,
 'arrive': 3571,
 'even': 9413,
 'expected.as': 9583,
 'add': 2685,
 'bonus': 4804,
 'carry': 5625,
 'small': 22226,
 'hint': 12275,
 'smell': 22257,
 'old': 17155,
 'grape': 11341,
 'candy': 5513,
 'use': 26151,
 'reminiscent': 20358,
 'sake': 21077,
 'stop': 23218,
 'put': 19601,
 'next': 16520,
 'nose': 16788,
 'record': 20125,
 'dif': 7900,
 'need': 16411,
 'expensive': 9598,
 'may': 15169,
 'come': 6387,
 'please': 18633,
 'aroma': 3548,
 'like': 14417,
 'mine': 15623,
 'primary': 19196,
 'job': 13686,
 'device': 7821,
 'block': 4681,
 'breat

## Utilizando classificadores clássicos de Machine Learning;


## Tokenização dos textos usando o Tensorflow (Tokenizer e método text_to_sequences);

## Rede Neural Recorrente Simples com camada de Embedding;

## Substituindo a RNN simples por uma LSTM unidirecional e uma bidimensional;