# Setup do projeto

In [315]:
import os
import urllib

import pandas as pd
import numpy as np 
import matplotlib
import matplotlib.pyplot as plt 
import re
import nltk
import spacy
import sklearn

# Recuperação dos dados

In [316]:
#Dados obtivos de https://github.com/amitt001/Android-App-Reviews-Dataset
URL_ROOT = 'https://raw.githubusercontent.com/amitt001/Android-App-Reviews-Dataset/master/'

POSITIVE_REVIEWS_DATA_URL = URL_ROOT+'positive10k.txt'
NEGATIVE_REVIEWS_DATA_URL = URL_ROOT+'negative10k.txt'

DATA_PATH = os.path.join('..', 'data', 'raw')
POSITIVE_DATA_FILE = os.path.join(DATA_PATH, 'positive10k.txt')
NEGATIVE_DATA_FILE = os.path.join(DATA_PATH, 'negative10k.txt')


In [317]:
def download_data(data_url, data_path, data_file):
    os.makedirs(data_path, exist_ok=True)
    urllib.request.urlretrieve(data_url, data_file)

download_data(POSITIVE_REVIEWS_DATA_URL, DATA_PATH, POSITIVE_DATA_FILE)
download_data(NEGATIVE_REVIEWS_DATA_URL, DATA_PATH, NEGATIVE_DATA_FILE)

In [318]:
positive_df = pd.read_csv(POSITIVE_DATA_FILE,  delimiter = "\t", header=None, names=['reviews'])
negative_df = pd.read_csv(NEGATIVE_DATA_FILE,  delimiter = "\t", header=None, names=['reviews'])

In [319]:
#Adiciona coluna com 0 pra negativo e 1 pra positivo e concatena os dataframes
positive_df['label'] = 1
negative_df['label'] = 0
data = pd.concat([positive_df, negative_df])

In [320]:
data.head()

Unnamed: 0,reviews,label
0,Very simple and effective way for new words fo...,1
1,Fh d Fcfatgv,1
2,My son loved it. It is easy even though my son...,1
3,Brilliant A brilliant app that is challenging ...,1
4,Good I have gotten several updates and new gam...,1


In [321]:
data.shape

(19655, 2)

In [322]:
#Pegando só um amostra pra melhorar o desempenho
data = data.sample(frac=0.4, replace=False, random_state=42)

In [323]:
data.shape

(7862, 2)

# Preparação dos dados

In [324]:
# import spacy.cli
# spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [325]:
nlp = spacy.load('en_core_web_sm')

In [331]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [379]:
from nltk.corpus import stopwords
stopwords_en = stopwords.words("english")
to_remove = [ 'not',  "aren't", "couldn't",]


In [387]:
doc = nlp("not aren't couldn't worst bad")
for word in doc:
    print(word.text)

not
are
n't
could
n't
worst
bad


In [380]:
stopwords_en

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [388]:
# Passar texto para minuscula, filtrar só as letras, retirar stopwords, lematizar
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[\W\d_]+", " ", text)
    #text = [word for word in text.split() if word not in stopwords_en]
    npl_text = nlp(text)
    tokens = [word.lemma_ if word.lemma_ != "-PRON-" else word.lower_ for word in npl_text]
    return " ".join(tokens)

In [389]:
data['reviews'] = data['reviews'].apply(clean_text)

In [390]:
data.head()

Unnamed: 0,reviews,label
2909,bore without well control display,0
4546,food great game amongst definately entertainme...,1
5927,excellent use free version till entry upgrade ...,1
5466,thank love wonderful thing well nail salon ama...,1
4797,home screen app ok aster recent update homepag...,0


In [391]:
from sklearn.feature_extraction.text import CountVectorizer
# Instanciando o CountVectorizer, binary=True faz a codificacao binaria
vectorizer = CountVectorizer(binary=True, max_features=5000)
text = data['reviews']

# Vetorizando o texto
X_bow = vectorizer.fit_transform(text)

In [392]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(max_features=5000)
# Vetorizando
X_tfidf = tfidf_vect.fit_transform(text)

In [393]:
X_bow

<7862x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 72853 stored elements in Compressed Sparse Row format>

In [394]:
X_tfidf.shape

(7862, 5000)

In [395]:
from sklearn.model_selection import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(X_bow, data['label'], test_size=0.3, random_state = 42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_tfidf, data['label'], test_size=0.3, random_state = 42)


In [428]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Instanciando a reg. logistica
reglog = SVC()

# Aplicando o modelo
reglog.fit(X1_train, y1_train)

# Predicao
y1_reglog_pred = reglog.predict(X1_test)

In [426]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

def mostra_metricas(y_true, y_pred):
  ''' Função que recebe o y real, o y predito e mostra as
  principais metricas.
  '''
  print("Acurácia: ", accuracy_score(y_true, y_pred))
  print("\nAUROC:", roc_auc_score(y_true, y_pred))
  print("\nF1-Score:", f1_score(y_true, y_pred, average='weighted'))
  print("\nMatriz de confusão:")
  sns.heatmap(confusion_matrix(y_true, y_pred), annot=True)
  plt.show()

In [429]:
mostra_metricas(y1_test, y1_reglog_pred)

Acurácia:  0.8664688427299704

AUROC: 0.8666917114837076

F1-Score: 0.8664706183856835

Matriz de confusão:


NameError: name 'sns' is not defined

In [421]:
def nova_predicao(texto):
    texto = clean_text(texto)
    texto_vetorizado = tfidf_vect.transform([texto])
    pred = reglog.predict(texto_vetorizado)

    if pred == 0:
        print("Negativa")
    elif pred == 1:
        print("Positiva")
    else:
        print("nem sei")    


In [431]:
nova_predicao('the best app in the world, i like more than other one')

Negativa


In [400]:
data['label'].value_counts()

1    4000
0    3862
Name: label, dtype: int64

In [401]:
import spacy
from spacy.pipeline.textcat import exclusive_classes

nlp = spacy.blank("en")
if "textcat" not in nlp.pipe_names:
    nlp.add_pipe( 'textcat', config=exclusive_classes, architecture='bow')



ImportError: cannot import name 'exclusive_classes' from 'spacy.pipeline.textcat' (/opt/anaconda3/lib/python3.8/site-packages/spacy/pipeline/textcat.py)

In [215]:
data.head()

Unnamed: 0,reviews,label
0,simple effective way new word kid,1
1,fh fcfatgv,1
2,son love easy even though son first grade high...,1
3,brilliant brilliant app challenge great fun th...,1
4,good get several update new game help alot,1


In [224]:
train_texts = data['reviews'].values
train_labels = [{'cats' : {'positive': label == 1, 'negative': label==0}} for label in data['label']]
train_data = list(zip(train_texts, train_labels))

In [227]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.