# Recuperação dos dados

Os dados foram obtidos de https://github.com/amitt001/Android-App-Reviews-Dataset

In [None]:
import os

# Base URL of the raw files in the dataset's GitHub repository.
URL_ROOT = 'https://raw.githubusercontent.com/amitt001/Android-App-Reviews-Dataset/master/'

# Remote locations of the positive and the negative review files.
POSITIVE_REVIEWS_DATA_URL = f'{URL_ROOT}positive10k.txt'
NEGATIVE_REVIEWS_DATA_URL = f'{URL_ROOT}negative10k.txt'

# Local directory (relative to the working directory) and target files for the downloads.
DATA_PATH = os.path.join('..', 'data', 'raw')
NEGATIVE_DATA_FILE = os.path.join(DATA_PATH, 'negative10k.txt')
POSITIVE_DATA_FILE = os.path.join(DATA_PATH, 'positive10k.txt')


In [None]:
import urllib

def download_data(data_url, data_path, data_file):
    """Download ``data_url`` and store it as ``data_file``.

    Args:
        data_url: URL of the resource to fetch.
        data_path: Directory that must exist before writing; it is created
            (including parents) when missing.
        data_file: Path of the file the downloaded content is written to.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # ``urllib.request`` may be missing at call time; import it explicitly.
    import urllib.request

    os.makedirs(data_path, exist_ok=True)
    urllib.request.urlretrieve(data_url, data_file)

# Fetch both review files into DATA_PATH: positive reviews first, then negative ones.
download_data(
    data_url=POSITIVE_REVIEWS_DATA_URL,
    data_path=DATA_PATH,
    data_file=POSITIVE_DATA_FILE,
)
download_data(
    data_url=NEGATIVE_REVIEWS_DATA_URL,
    data_path=DATA_PATH,
    data_file=NEGATIVE_DATA_FILE,
)

In [None]:
import pandas as pd

import csv

# The files are read as a single 'reviews' column of free text, so the text must
# be taken verbatim instead of being interpreted with the CSV defaults:
#   * quoting=csv.QUOTE_NONE: with default quoting, a line that starts with a
#     double quote opens a quoted field, which can swallow the following lines
#     into a single row (and strips the quote characters);
#   * keep_default_na=False: otherwise reviews such as "NA", "None" or "null"
#     are parsed as NaN (a float), on which clean_text's ``.lower()`` fails.
read_options = dict(
    delimiter="\t",
    header=None,
    names=['reviews'],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
positive_df = pd.read_csv(POSITIVE_DATA_FILE, **read_options)
negative_df = pd.read_csv(NEGATIVE_DATA_FILE, **read_options)

In [None]:
# Tag every review with its sentiment (0 = negative, 1 = positive) and stack
# both frames into one dataset, positive reviews first.
negative_df['label'] = 0
positive_df['label'] = 1
data = pd.concat(objs=[positive_df, negative_df], axis=0)

In [None]:
# Preview the first rows of the combined, labelled dataset.
data.head()

In [None]:
# (rows, columns) of the full dataset, before sampling.
data.shape

In [None]:
# Keep only a 40% random sample, drawn without replacement and with a fixed
# seed, so the rest of the notebook runs faster.
data = data.sample(frac=0.4, random_state=42)

In [None]:
# (rows, columns) of the dataset after sampling.
data.shape

# Preparação dos dados

Download do modelo do spaCy e das stopwords do NLTK. Descomentar as linhas abaixo ao rodar pela primeira vez.

In [None]:
# import spacy.cli
# spacy.cli.download("en_core_web_sm")

In [None]:
# import nltk
# nltk.download('stopwords')

## Preparando os dados

In [None]:
import spacy
# Only the lemmas of the parsed tokens are read (see clean_text), so the
# dependency parser and the named-entity recognizer are disabled: they are not
# needed for lemmatization and skipping them makes text processing faster.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
from nltk.corpus import stopwords
# Stored as a set: clean_text tests every word of every review for membership,
# which is O(1) on a set instead of a linear scan of the list.
stopwords_en = set(stopwords.words("english"))

Removendo do conjunto de stopwords as palavras que são úteis para a análise (negações).

In [None]:
# Negation words carry sentiment, so they must survive the stop-word filter.
# clean_text replaces every non-letter character (apostrophes included) with a
# space BEFORE filtering, so "aren't" reaches the filter as "aren" + "t".
# The apostrophe-free stems are therefore listed too; removing only the
# apostrophe forms would leave the negation filtered out.
to_remove = ['not', "aren't", "couldn't", 'aren', 'couldn']

for word in to_remove:
    # Guarded so that re-running this cell (or a stop-word list that lacks one
    # of the words) does not raise on an already-removed entry.
    if word in stopwords_en:
        stopwords_en.remove(word)


Função que limpa o texto filtrando somente as letras, retirando as stopwords e lematizando.

In [None]:
import re
def clean_text(text):
    """Normalize a review: letters only, no stop words, lemmatized.

    The text is lower-cased, every run of non-word characters, digits and
    underscores is replaced by a single space, words found in ``stopwords_en``
    are dropped, and the remainder is processed by the ``nlp`` pipeline.

    Args:
        text: The raw review text.

    Returns:
        The lemmas of the remaining tokens joined by single spaces. A token
        whose lemma is the ``"-PRON-"`` placeholder contributes its
        lower-cased form instead.
    """
    letters_only = re.sub(r"[\W\d_]+", " ", text.lower())
    kept_words = filter(lambda candidate: candidate not in stopwords_en, letters_only.split())
    parsed = nlp(" ".join(kept_words))

    lemmas = []
    for token in parsed:
        if token.lemma_ == "-PRON-":
            lemmas.append(token.lower_)
        else:
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [None]:
# Replace every raw review with its cleaned, lemmatized version.
data['reviews'] = [clean_text(raw_review) for raw_review in data['reviews']]

In [None]:
# Preview the first rows after text cleaning.
data.head()

# Testando os modelos

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

text = data['reviews']

vectorizer = CountVectorizer(binary=True, max_features=5000)
X_bow = vectorizer.fit_transform(text)

tfidf_vect = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vect.fit_transform(text)
y = data['label']

In [None]:
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X_bow, y, test_size=0.3, random_state = 42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_tfidf, y, test_size=0.3, random_state = 42)


## Testando os modelos com o GridSearchCV

In [None]:
import sklearn

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
def build_classifiers():
    """Return the candidate models to be tuned.

    Returns:
        A list of ``(name, estimator, param_grid)`` tuples, in this order:
        k-nearest neighbours (``'knn'``), logistic regression (``'lr'``),
        decision tree (``'dt'``) and random forest (``'rf'``).
    """
    # Each entry: classifier name, classifier instance, hyperparameter grid.
    knn = (
        'knn',
        KNeighborsClassifier(),
        {'n_neighbors': range(1, 33, 2)},
    )
    logistic_regression = (
        'lr',
        LogisticRegression(max_iter=1000),
        {'penalty': ['l2'], 'C': [100, 10, 1, 0.1, 0.01]},
    )
    decision_tree = (
        'dt',
        DecisionTreeClassifier(),
        {'max_depth': [2, 4, 6, 8, 10, 12]},
    )
    random_forest = (
        'rf',
        RandomForestClassifier(),
        {'n_estimators': [10, 50, 100]},
    )
    return [knn, logistic_regression, decision_tree, random_forest]

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold

classifiers = build_classifiers()

# Every candidate is tuned once per text representation.
feature_sets = (
    ('BOW', X1_train, y1_train),
    ('TFIDF', X2_train, y2_train),
)

for name, model, parameters in classifiers:
    print('\n>> Classifier name: {}'.format(name))
    # 5-fold cross-validated grid search; the same search object is re-fitted
    # for each representation, so results are printed right after each fit.
    gs = GridSearchCV(estimator=model, param_grid=parameters, refit=True, cv=5, n_jobs=3)
    for features_name, X_train, y_train in feature_sets:
        gs.fit(X_train, y_train)
        print("{} Best parameters: {}".format(features_name, gs.best_params_))
        print("{} Best score: {}".format(features_name, gs.best_score_))

In [None]:
# Final model: L2-regularized logistic regression (C=1) trained on the TF-IDF
# training split, then used to predict the TF-IDF test split.
clf = LogisticRegression(penalty='l2', C=1, max_iter=1000).fit(X2_train, y2_train)
y_pred = clf.predict(X2_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()

def mostra_metricas(y_true, y_pred, y_score=None):
    """Print the main classification metrics and plot the confusion matrix.

    Prints accuracy, AUROC and the weighted F1-score, then shows the confusion
    matrix as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_score: Optional scores for the positive class (e.g. predicted
            probabilities) used for the AUROC. When omitted, the AUROC is
            computed from ``y_pred`` as before, which evaluates a single
            decision threshold only.
    """
    auroc_input = y_pred if y_score is None else y_score

    print("Acurácia: ", accuracy_score(y_true, y_pred))
    print("\nAUROC:", roc_auc_score(y_true, auroc_input))
    print("\nF1-Score:", f1_score(y_true, y_pred, average='weighted'))
    print("\nMatriz de confusão:")
    # fmt='d': the cells are integer counts; the default annotation format
    # ('.2g') would display e.g. 1234 as 1.2e+03.
    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d')
    plt.show()

In [None]:
# Report the metrics of the predictions made for the TF-IDF test split.
mostra_metricas(y2_test, y_pred)

In [None]:
def predict_label(text):
    """Classify a single raw review and return its sentiment tag.

    The text is cleaned with ``clean_text``, vectorized with the fitted
    ``tfidf_vect`` and classified by ``clf``.

    Args:
        text: The raw review text.

    Returns:
        ``"Negativo"`` when the predicted label is 0, ``"Positivo"`` when it
        is 1, and ``"Neutro"`` for any other label.
    """
    features = tfidf_vect.transform([clean_text(text)])
    predicted = clf.predict(features)[0]
    if predicted == 0:
        return "Negativo"
    return "Positivo" if predicted == 1 else "Neutro"



In [None]:
# Quick smoke test of the classifier on a few hand-written reviews.
reviews = [
    'Its a good app',
    "its a bad app",
    "Better app",
    "Worst app",
    "I like it",
    "I dont like it",
]
for sample_review in reviews:
    predicted_tag = predict_label(sample_review)
    print(f"Review: {sample_review} --> TAG: {predicted_tag}")


# Deploy do modelo


In [None]:
import joblib

# Target location of the persisted artifact: ../model/finalized.sav
DEPLOY_FILE = 'finalized.sav'
DEPLOY_PATH = os.path.join('..', 'model')
deploy_path = os.path.join(DEPLOY_PATH, DEPLOY_FILE)

# The target directory must exist before the dump.
os.makedirs(DEPLOY_PATH, exist_ok=True)

# NOTE(review): joblib serializes with pickle, and pickle stores a plain
# function by reference (module and name), not its code nor the globals it
# uses (clf, tfidf_vect, nlp). Confirm that the consumer of this file can
# actually load it; persisting the fitted vectorizer and classifier themselves
# is probably what is needed.
joblib.dump(value=predict_label, filename=deploy_path)