### Data import

In [21]:
import pandas as pd
import seaborn as sns
import sys

data = pd.read_json('jsons/reviews_class_nubank.json')
data.describe()
#data.loc[:,["text"]]

Unnamed: 0,text
count,240
unique,240
top,por algum motivo o aplicativo nao esta querend...
freq,1


### Cleaning text

In [22]:
import spacy
import re
import unicodedata
import string

def setup_abbr():
    file = open("abbr_portuguese.txt", encoding='utf-8')
    abbr_dict = {}

    for line in file:
        w = line.split(";")
        abbr_dict[w[0]] = w[1].replace("\n", "")
    file.close()

    return abbr_dict

def clean(data):
    doc = nlp(data)
    doc_lower = doc.text.lower()
    punct = string.punctuation
    for c in punct:
        doc_lower = doc_lower.replace(c, "")
    doc_without_emoji = emoji_pattern.sub(r'', doc_lower)
    doc_punctuation = u"".join([c for c in unicodedata.normalize('NFKD', doc_without_emoji) if not unicodedata.combining(c)])
    doc_corrected = nlp(" ".join([abbr_dict.get(w, w) for w in doc_punctuation.split()]))
    
    return doc_corrected.text

nlp = spacy.load('pt_core_news_sm')
abbr_dict = setup_abbr()
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

data['cleaned_reviews'] = data['text'].apply(clean)
#data.loc[:,["cleaned_reviews"]]

### Tagging

In [23]:
import joblib
from nltk import word_tokenize

def wordTag(text):
    tagger = joblib.load('POS_tagger_brill.pkl')
    text = tagger.tag(word_tokenize(text))
    return text
 
data['tag_reviews'] = data['cleaned_reviews'].apply(wordTag)
data.loc[:,["cleaned_reviews", "tag_reviews"]]

Unnamed: 0,cleaned_reviews,tag_reviews
0,por algum motivo o aplicativo nao esta querend...,"[(por, PREP), (algum, PROADJ), (motivo, N), (o..."
1,o aplicativo tem muito problema e bug faz um m...,"[(o, ART), (aplicativo, N), (tem, V), (muito, ..."
2,a nubank veio piorando muito esses dias agora ...,"[(a, ART), (nubank, NPROP), (veio, V), (pioran..."
3,amo usar o nubank e pratico de facil acesso in...,"[(amo, N), (usar, V), (o, ART), (nubank, NPROP..."
4,pessimo banco recebi um bloqueio do nada por s...,"[(pessimo, ADJ), (banco, N), (recebi, V), (um,..."
...,...,...
235,pior analise de credito que ha no mercado nao ...,"[(pior, ADJ), (analise, V), (de, PREP), (credi..."
236,ao pressionar para colar a chave do pix o apli...,"[(ao, PREP), (pressionar, V), (para, PREP), (c..."
237,a entrada no aplicativo digitando o cpf e muit...,"[(a, ART), (entrada, N), (no, PREP), (aplicati..."
238,ja tem um tempo tinha esquecido porque da outr...,"[(ja, N), (tem, V), (um, ART), (tempo, N), (ti..."


### Tokenization

In [24]:
import nltk
from nltk.tokenize import word_tokenize

def tokenize(text):
    text = word_tokenize(text)
    return text

data['tokenized_reviews'] = data['cleaned_reviews'].apply(tokenize)
data.loc[:,["tokenized_reviews"]]

Unnamed: 0,tokenized_reviews
0,"[por, algum, motivo, o, aplicativo, nao, esta,..."
1,"[o, aplicativo, tem, muito, problema, e, bug, ..."
2,"[a, nubank, veio, piorando, muito, esses, dias..."
3,"[amo, usar, o, nubank, e, pratico, de, facil, ..."
4,"[pessimo, banco, recebi, um, bloqueio, do, nad..."
...,...
235,"[pior, analise, de, credito, que, ha, no, merc..."
236,"[ao, pressionar, para, colar, a, chave, do, pi..."
237,"[a, entrada, no, aplicativo, digitando, o, cpf..."
238,"[ja, tem, um, tempo, tinha, esquecido, porque,..."


### Stopwords remove

In [25]:
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS

def stopwordsRemove(text):
    STOP_WORDS.update(['nao', 'sim', 'caixa', 'nubank', 'aplicativo', 'dinheiro', 'acessar', 'consigo', 'banco', 'email', 'pra', 'pro', 'ta', 'ja', 'so', 'fica'])
    stop_words = list(set(stopwords.words('portuguese') + list(STOP_WORDS)))
    phrase = []
    for word in text:
        if word not in stop_words:
            phrase.append(word)
    return phrase

data['stopwords_reviews'] = data['tokenized_reviews'].apply(stopwordsRemove)
data.loc[:,["stopwords_reviews"]]

Unnamed: 0,stopwords_reviews
0,"[algum, motivo, querendo, abrir, celular, mand..."
1,"[problema, bug, mes, tento, mudaram, entrar, d..."
2,"[veio, piorando, dias, voce, colocar, cartao, ..."
3,"[amo, pratico, facil, acesso, interface, linda..."
4,"[pessimo, recebi, bloqueio, simplesmente, cont..."
...,...
235,"[pior, analise, credito, ha, mercado, levam, c..."
236,"[pressionar, colar, chave, pix, demora, aciona..."
237,"[entrada, digitando, cpf, ruim, deveria, atrav..."
238,"[esquecido, ate, desisti, tento, conta, pj, mo..."


### Stemming

In [37]:
import nltk
nltk.download('rslp')
from nltk.stem import RSLPStemmer

def stemming(text):
    stemmer = RSLPStemmer()
    phrase = []
    for word in text:
        phrase.append(stemmer.stem(word))
    return phrase

data['stem_reviews'] = data['tokenized_reviews'].apply(stemming)
#data.loc[:,["stem_reviews"]]

[nltk_data] Downloading package rslp to /Users/moabsouza/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


### Lemmatizer

In [39]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    lemma = " "
    for word in text:
        lemma = wordnet_lemmatizer.lemmatize(word)
        
    return lemma

data['lemma_reviews'] = data['stopwords_reviews'].apply(lemmatize)
data.loc[:,["lemma_reviews"]]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/moabsouza/nltk_data...


Unnamed: 0,lemma_reviews
0,conta
1,piorar
2,acesso
3,duvidas
4,prendem
...,...
235,excelencia
236,comando
237,pronto
238,lixo


### Phrase junction

In [40]:
def juction(text):
    phrase = []
    for word in text:
        phrase.append(word)
    
    phraseStr = ' '.join(phrase)
    return phraseStr

data['junction'] = data['stem_reviews'].apply(juction)
data.loc[:,["junction"]]

Unnamed: 0,junction
0,por algum motiv o aplic nao est quer abr no me...
1,o aplic tem muit problem e bug faz um me ja qu...
2,a nubank vei pior muit ess dia agor voc tem qu...
3,amo us o nubank e pra de facil acess interfac ...
4,pess banc receb um bloquei do nad por simples ...
...,...
235,pi analis de credit que ha no merc nao lev em ...
236,ao press par col a chav do pix o aplic dem par...
237,a entr no aplic digit o cpf e muit ruim dev se...
238,ja tem um temp tinh esquec porqu da outr vez e...


### Security posts extraction

In [41]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('pt_core_news_sm')

def securityReviewsClassifier(text):
    securityReviews = []
    doc = nlp(text)
    
    securityTerms = mostCommonStem
    patterns = [nlp(term) for term in securityTerms]
    
    matcher = PhraseMatcher(nlp.vocab) 
    matcher.add("SECURITY_PATTERN", patterns)
    
    matches = matcher(doc)
    
    for i in range(0,len(matches)):
        token = doc[matches[i][1]:matches[i][2]]
        securityReviews.append(str(token))
            
    return securityReviews

def class1(text):
    class1Reviews = []
    doc = nlp(text)
    
    class1Terms = ['senh', 'acess']
    patterns = [nlp(term) for term in class1Terms]
    
    matcher = PhraseMatcher(nlp.vocab) 
    matcher.add("SECURITY_PATTERN", patterns)
    
    matches = matcher(doc)
    
    for i in range(0,len(matches)):
        token = doc[matches[i][1]:matches[i][2]]
        class1Reviews.append(str(token))
            
    return class1Reviews

def class2(text):
    class2Reviews = []
    doc = nlp(text)
    
    class2Terms = ['assinat', 'eletron', 'biometr', 'reconhec', 'fac']
    patterns = [nlp(term) for term in class2Terms]
    
    matcher = PhraseMatcher(nlp.vocab) 
    matcher.add("SECURITY_PATTERN", patterns)
    
    matches = matcher(doc)
    
    for i in range(0,len(matches)):
        token = doc[matches[i][1]:matches[i][2]]
        class2Terms.append(str(token))
            
    return class2Reviews

def class3(text):
    class3Reviews = []
    doc = nlp(text)
    
    class3Terms = ['bloquei', 'chav']
    patterns = [nlp(term) for term in class3Terms]
    
    matcher = PhraseMatcher(nlp.vocab) 
    matcher.add("SECURITY_PATTERN", patterns)
    
    matches = matcher(doc)
    
    for i in range(0,len(matches)):
        token = doc[matches[i][1]:matches[i][2]]
        class3Reviews.append(str(token))
            
    return class3Reviews

def class4(text):
    class4Reviews = []
    doc = nlp(text)
    
    class4Terms = ['fraud', 'golp', 'clon', 'roub']
    patterns = [nlp(term) for term in class4Terms]
    
    matcher = PhraseMatcher(nlp.vocab) 
    matcher.add("SECURITY_PATTERN", patterns)
    
    matches = matcher(doc)
    
    for i in range(0,len(matches)):
        token = doc[matches[i][1]:matches[i][2]]
        class4Reviews.append(str(token))
            
    return class4Reviews

# data['security_reviews'] = data['junction'].apply(securityReviewsClassifier)
# data['class1'] = data['junction'].apply(class1)
# data['class2'] = data['junction'].apply(class2)
# data['class3'] = data['junction'].apply(class3)
data['class4'] = data['junction'].apply(class4)
# data.loc[:, 'class1']
# data.loc[:, 'class2']
# data.loc[:, 'class3']
# data.loc[:, 'class4']

In [14]:
import pandas as pd

extracted = []

for i in range(len(data)):
    if len(data.loc[i,'class4'])!=0:
        extracted.append(data.loc[i,'cleaned_reviews'])
        
dfExtracted = pd.DataFrame(extracted, columns=["reviews_classified_all"])
dfExtracted.to_csv("csvresult_class4_caixa.csv", columns = ["reviews_classified_all"])

#data.to_csv("csvresult_all.csv", columns = ["security_reviews"])

In [20]:
import nltk
from nltk.stem import RSLPStemmer

def stemming(text):
    stemmer = RSLPStemmer()
    phrase = []
    
    for word in text:
        phrase.append(stemmer.stem(word))
        
    return phrase 

words = ['acesso', 'senha', 'assinatura', 'eletronica', 'biometria', 'reconhecimento', 'facial', 'bloqueio', 'chave', 'fraude', 'golpe', 'clonagem', 'roubo']

stem = stemming(words)

print(stem)

LookupError: 
**********************************************************************
  Resource [93mrslp[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('rslp')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mstemmers/rslp/step0.pt[0m

  Searched in:
    - '/Users/moabsouza/nltk_data'
    - '/opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/nltk_data'
    - '/opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/share/nltk_data'
    - '/opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************
