## Testes de valência para o projeto final de IA369Y 2 Semestre 2018

Passos para tratar os dados com valência, testar e escolher um classificador para utilizar no projeto final de IA369Y.

1) Remover espaços duplos, quebras de linha, números e links do dataset e das frases a serem testadas.

2) Remover stopwords e aplicar o stemmer.

3) Treinar os classificadores.

4) Realizar as predições com os classificadores.

5) Avaliar as medidas obtidas com os classificadores.

In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

def highlight_max(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)


def highlight_min(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_min = data == data.min()
        return [attr if v else '' for v in is_min]
    else:  # from .apply(axis=None)
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''),
                            index=data.index, columns=data.columns)   

### Matriz de Resultados

In [2]:
classf = {
    'MultinomialNB': 0,
    'ComplementNB': 0,
    'LogisticRegression': 0,
    'RandomForestClassifier': 0,
    'KNeighborsClassifier': 0,
    'MLPClassifier': 0,
    'LinearSVC': 0,
    'SVC': 0
}

matriz_resultados = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

train_time = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

predict_time = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

## Datasets

Para validar, serão utilizados dois datasets.

Os dos datasets foram obtidos do site minerando dados.

O primeiro deles tem tweets de política de Minas Gerais com rótulos de valência: positivo, negativo e neutro. Foi feito um tratamento para eliminar tweets repetidos e dessa forma sobraram 3016 tweets.

O segundo contém 2123 títulos de notícias com rótulos de valência: positivo, negativo e neutro.

In [3]:
path = '/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/valencia/'

#Carregando os datasets
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

In [4]:
#Carrega os datasets em separado
tweets_mg = carregar(f'{path}tweets_mg_tratados.csv')
shuffle(tweets_mg)
titulo_noticias = carregar(f'{path}titulo_noticias.txt')
shuffle(titulo_noticias)

frases = tweets_mg + titulo_noticias
shuffle(frases)

print(frases[:5])
print('-' * 20)
print(tweets_mg[:5])
print('-' * 20)
print(titulo_noticias[:5])

[('POSITIVO', 'vcs ir viaj pra min lug afet febr amarel favor procur vacin'), ('POSITIVO', 'ivec vend ônibus govern min ger jornaut'), ('POSITIVO', 'pmg pmg prisã tráfic drog estelionat'), ('NEUTRO', 'alert nom utiliz golp estar'), ('NEUTRO', 'ceciliedit nã sab pra quê serv vereador nã fiscaliz nad govern min ger')]
--------------------
[('NEUTRO', 'bitcoin super dól máxim três ano estar min'), ('NEUTRO', 'new post quart estar garant direit amament públic'), ('NEUTRO', 'julianaciprian ministéri públic restring entrevist promotor imprens politic estar min'), ('POSITIVO', 'cham inicial design iníci dia janeir'), ('POSITIVO', 'pmg frustr tentat furt bom esperanc')]
--------------------
[('NEUTRO', 'sabesp quer capt mais biliã litr volum mort notíc paul'), ('POSITIVO', 'ford vend ecosport frot localiz rent car'), ('POSITIVO', 'vend lançament mrv cresc trimestr exam'), ('POSITIVO', 'petrobr descart risc ativ argentin após med eua notic uol econom'), ('NEGATIVO', 'empresári relat ameac diret

In [5]:
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = ' '.join(row[0].strip().splitlines())
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

titulo_noticias2 = carregar(f'{path}titulo_noticias.txt')
only_neutro = []
for sent, new in titulo_noticias2:
    if sent == 'NEUTRO':
        only_neutro.append(new)
with open(f'{path}titulo_noticias_neutro.txt', 'w') as h:
    for k in only_neutro:
        h.write(f'{k}|NEUTRO\n')

In [6]:
#all_datasets
afrases = []
avalencias = []
for valencia, frase in frases:
    afrases.append(frase)
    avalencias.append(valencia)
    
print(afrases[:5])
print(avalencias[:5])
print('-' * 20)

#tweets_mg
atweets_mg = []
aval_tweets_mg = []
for valencia, frase in tweets_mg:
    atweets_mg.append(frase)
    aval_tweets_mg.append(valencia)

print(atweets_mg[:5])
print(aval_tweets_mg[:5])
print('-' * 20)

#titulo_noticias
atitulo_noticias = []
aval_titulo_noticias = []
for valencia, frase in titulo_noticias:
    atitulo_noticias.append(frase)
    aval_titulo_noticias.append(valencia)

print(atitulo_noticias[:5])
print(aval_titulo_noticias[:5])

['vcs ir viaj pra min lug afet febr amarel favor procur vacin', 'ivec vend ônibus govern min ger jornaut', 'pmg pmg prisã tráfic drog estelionat', 'alert nom utiliz golp estar', 'ceciliedit nã sab pra quê serv vereador nã fiscaliz nad govern min ger']
['POSITIVO', 'POSITIVO', 'POSITIVO', 'NEUTRO', 'NEUTRO']
--------------------
['bitcoin super dól máxim três ano estar min', 'new post quart estar garant direit amament públic', 'julianaciprian ministéri públic restring entrevist promotor imprens politic estar min', 'cham inicial design iníci dia janeir', 'pmg frustr tentat furt bom esperanc']
['NEUTRO', 'NEUTRO', 'NEUTRO', 'POSITIVO', 'POSITIVO']
--------------------
['sabesp quer capt mais biliã litr volum mort notíc paul', 'ford vend ecosport frot localiz rent car', 'vend lançament mrv cresc trimestr exam', 'petrobr descart risc ativ argentin após med eua notic uol econom', 'empresári relat ameac diretor petrobr']
['NEUTRO', 'POSITIVO', 'POSITIVO', 'POSITIVO', 'NEGATIVO']


In [7]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    start = time.time()
    model.fit(X_train, y_train)
    train_time = round(time.time() - start, 5)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    start = time.time()
    y_pred = model.predict(X_test)
    predict_time = round(time.time() - start, 5)
    print(f'Modelo      : {model.__class__.__name__}')
    print(f'Train Time  : {train_time}s')
    print(f'Predict Time: {predict_time}s')
    print(f'Acurácia    : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy, train_time, predict_time

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

## Classificadores

In [8]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=0),
    LinearSVC(max_iter=150, random_state=0),
    SVC(gamma='auto', max_iter=150),
)

## TF-IDF

In [9]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)

vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)

In [10]:
print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf, avalencias))
        matriz_resultados['all']['tfidf'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass      


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00832s
Predict Time: 0.00054s
Acurácia    : 59.27%
              precision    recall  f1-score   support

    NEGATIVO       0.66      0.22      0.33       447
      NEUTRO       0.57      0.48      0.52       518
    POSITIVO       0.59      0.90      0.71       724

   micro avg       0.59      0.59      0.59      1689
   macro avg       0.61      0.54      0.52      1689
weighted avg       0.60      0.59      0.55      1689

Matrix de Confusão: 
[[100 127 220]
 [ 37 250 231]
 [ 15  58 651]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00935s
Predict Time: 0.00051s
Acurácia    : 66.25%
              precision    recall  f1-score   support

    NEGATIVO       0.54      0.51      0.53       447
      NEUTRO       0.58      0.61      0.59       518
    POSITIVO       0.79      0.80      0.79       724

   micro avg       0.66      0.66      0.66      1689
   macro avg       0.64      0.64      0.64  

Modelo      : SVC
Train Time  : 0.14998s
Predict Time: 0.08666s
Acurácia    : 50.76%
              precision    recall  f1-score   support

    NEGATIVO       0.29      0.26      0.28       189
      NEUTRO       0.59      0.22      0.32       384
    POSITIVO       0.55      0.88      0.68       416

   micro avg       0.51      0.51      0.51       989
   macro avg       0.47      0.46      0.42       989
weighted avg       0.51      0.51      0.46       989

Matrix de Confusão: 
[[ 50  46  93]
 [ 88  85 211]
 [ 36  13 367]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00289s
Predict Time: 0.00026s
Acurácia    : 62.86%
              precision    recall  f1-score   support

    NEGATIVO       0.75      0.56      0.65       248
      NEUTRO       0.78      0.10      0.17       142
    POSITIVO       0.58      0.92      0.71       310

   micro avg       0.63      0.63      0.63       700
   macro avg       0.70      0.53      0.51       7

## LSA (usando TF-IDF)

In [34]:
#all_datasets
svd = TruncatedSVD(n_components=70, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except Exception as e:
        print(e)

#tweets_mg
X_svd = lsa.fit_transform(X_tfidf_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


#titulo_noticias
X_svd = lsa.fit_transform(X_tfidf_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00447s
Predict Time: 0.00026s
Acurácia    : 42.81%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       447
      NEUTRO       0.00      0.00      0.00       518
    POSITIVO       0.43      1.00      0.60       724

   micro avg       0.43      0.43      0.43      1689
   macro avg       0.14      0.33      0.20      1689
weighted avg       0.18      0.43      0.26      1689

Matrix de Confusão: 
[[  0   2 445]
 [  0   0 518]
 [  0   1 723]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00437s
Predict Time: 0.00025s
Acurácia    : 53.52%
              precision    recall  f1-score   support

    NEGATIVO       0.53      0.22      0.32       447
      NEUTRO       0.46      0.50      0.48       518
    POSITIVO       0.58      0.75      0.65       724

   micro avg       0.54      0.54      0.54      1689
   macro avg       0.53      0.49      0.48  

Modelo      : SVC
Train Time  : 0.13051s
Predict Time: 0.06352s
Acurácia    : 42.77%
              precision    recall  f1-score   support

    NEGATIVO       0.33      0.09      0.14       189
      NEUTRO       0.36      0.04      0.08       384
    POSITIVO       0.44      0.94      0.60       416

   micro avg       0.43      0.43      0.43       989
   macro avg       0.38      0.36      0.27       989
weighted avg       0.39      0.43      0.31       989

Matrix de Confusão: 
[[ 17  12 160]
 [ 26  17 341]
 [  9  18 389]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00226s
Predict Time: 0.00018s
Acurácia    : 44.29%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       248
      NEUTRO       0.00      0.00      0.00       142
    POSITIVO       0.44      1.00      0.61       310

   micro avg       0.44      0.44      0.44       700
   macro avg       0.15      0.33      0.20       7

## LDA (usando TF-IDF)

In [12]:
#all_datasets
lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#tweets_mg
X_lda = lda.fit_transform(X_tfidf_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#titulo_noticias
X_lda = lda.fit_transform(X_tfidf_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.01183s
Predict Time: 0.00058s
Acurácia    : 45.06%
              precision    recall  f1-score   support

    NEGATIVO       0.21      0.01      0.01       447
      NEUTRO       0.42      0.23      0.29       518
    POSITIVO       0.46      0.89      0.60       724

   micro avg       0.45      0.45      0.45      1689
   macro avg       0.36      0.37      0.30      1689
weighted avg       0.38      0.45      0.35      1689

Matrix de Confusão: 
[[  3  82 362]
 [  8 117 393]
 [  3  80 641]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.01269s
Predict Time: 0.00221s
Acurácia    : 42.51%
              precision    recall  f1-score   support

    NEGATIVO       0.31      0.34      0.32       447
      NEUTRO       0.40      0.44      0.42       518
    POSITIVO       0.54      0.47      0.50       724

   micro avg       0.43      0.43      0.43      1689
   macro avg       0.42      0.42      0.41  

Modelo      : SVC
Train Time  : 0.16703s
Predict Time: 0.08994s
Acurácia    : 48.13%
              precision    recall  f1-score   support

    NEGATIVO       0.28      0.12      0.16       189
      NEUTRO       0.49      0.36      0.42       384
    POSITIVO       0.50      0.76      0.60       416

   micro avg       0.48      0.48      0.48       989
   macro avg       0.43      0.41      0.39       989
weighted avg       0.46      0.48      0.45       989

Matrix de Confusão: 
[[ 22  60 107]
 [ 38 138 208]
 [ 18  82 316]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00287s
Predict Time: 0.00025s
Acurácia    : 46.0%
              precision    recall  f1-score   support

    NEGATIVO       0.43      0.30      0.36       248
      NEUTRO       0.40      0.03      0.05       142
    POSITIVO       0.47      0.78      0.59       310

   micro avg       0.46      0.46      0.46       700
   macro avg       0.43      0.37      0.33       70

## Count

In [13]:
#all_datasets
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count, avalencias))
        matriz_resultados['all']['count'][classifier.__class__.__name__] = acc
        train_time['all']['count'][classifier.__class__.__name__] = tt
        predict_time['all']['count'][classifier.__class__.__name__] = pt
    except:
        pass

      
#tweets_mg
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tmg = vec_count.fit_transform(atweets_mg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['count'][classifier.__class__.__name__] = acc
        train_time['tweets']['count'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulo_noticias
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tn = vec_count.fit_transform(atitulo_noticias)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00558s
Predict Time: 0.00052s
Acurácia    : 65.84%
              precision    recall  f1-score   support

    NEGATIVO       0.55      0.51      0.53       447
      NEUTRO       0.57      0.62      0.59       518
    POSITIVO       0.79      0.78      0.79       724

   micro avg       0.66      0.66      0.66      1689
   macro avg       0.64      0.64      0.64      1689
weighted avg       0.66      0.66      0.66      1689

Matrix de Confusão: 
[[227 160  60]
 [110 321  87]
 [ 77  83 564]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00677s
Predict Time: 0.00049s
Acurácia    : 67.02%
              precision    recall  f1-score   support

    NEGATIVO       0.54      0.56      0.55       447
      NEUTRO       0.58      0.62      0.60       518
    POSITIVO       0.83      0.77      0.80       724

   micro avg       0.67      0.67      0.67      1689
   macro avg       0.65      0.65      0.65  

Modelo      : SVC
Train Time  : 0.12703s
Predict Time: 0.06713s
Acurácia    : 49.24%
              precision    recall  f1-score   support

    NEGATIVO       0.33      0.14      0.19       189
      NEUTRO       0.53      0.17      0.26       384
    POSITIVO       0.50      0.95      0.66       416

   micro avg       0.49      0.49      0.49       989
   macro avg       0.45      0.42      0.37       989
weighted avg       0.48      0.49      0.42       989

Matrix de Confusão: 
[[ 26  44 119]
 [ 47  67 270]
 [  7  15 394]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00286s
Predict Time: 0.00025s
Acurácia    : 66.14%
              precision    recall  f1-score   support

    NEGATIVO       0.70      0.66      0.68       248
      NEUTRO       0.56      0.41      0.47       142
    POSITIVO       0.67      0.78      0.72       310

   micro avg       0.66      0.66      0.66       700
   macro avg       0.64      0.62      0.62       7

## LSA (usando Count)

In [39]:
#all_datasets
svd = TruncatedSVD(n_components=70, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['all']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['all']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass
      

#tweets_mg
X_svd = lda.fit_transform(X_count_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['tweets']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulos_noticias
X_svd = lda.fit_transform(X_count_tn)

print("\ntitulos_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets

tweets_mg
Modelo      : MultinomialNB
Train Time  : 0.00345s
Predict Time: 0.00022s
Acurácia    : 50.35%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       189
      NEUTRO       0.52      0.35      0.42       384
    POSITIVO       0.50      0.87      0.63       416

   micro avg       0.50      0.50      0.50       989
   macro avg       0.34      0.41      0.35       989
weighted avg       0.41      0.50      0.43       989

Matrix de Confusão: 
[[  0  71 118]
 [  0 136 248]
 [  0  54 362]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00313s
Predict Time: 0.00021s
Acurácia    : 58.14%
              precision    recall  f1-score   support

    NEGATIVO       0.33      0.01      0.01       189
      NEUTRO       0.51      0.72      0.60       384
    POSITIVO       0.67      0.72      0.69       416

   micro avg       0.58      0.58      0.58       989
   macro avg       0.50      0.48 

Modelo      : SVC
Train Time  : 0.09786s
Predict Time: 0.04241s
Acurácia    : 44.71%
              precision    recall  f1-score   support

    NEGATIVO       0.60      0.11      0.18       248
      NEUTRO       0.18      0.07      0.10       142
    POSITIVO       0.46      0.89      0.61       310

   micro avg       0.45      0.45      0.45       700
   macro avg       0.41      0.36      0.30       700
weighted avg       0.45      0.45      0.35       700

Matrix de Confusão: 
[[ 27  22 199]
 [  8  10 124]
 [ 10  24 276]]
------------------------------



## LDA (usando Count)

In [15]:
#all_datasets
lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count)


print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['count+lda'][classifier.__class__.__name__] = acc
        train_time['all']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['all']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#tweets_mg
X_lda = lda.fit_transform(X_count_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['count+lda'][classifier.__class__.__name__] = acc
        train_time['tweets']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulo_noticias
X_lda = lda.fit_transform(X_count_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lda'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00493s
Predict Time: 0.00033s
Acurácia    : 47.72%
              precision    recall  f1-score   support

    NEGATIVO       0.29      0.04      0.07       447
      NEUTRO       0.46      0.40      0.43       518
    POSITIVO       0.49      0.80      0.61       724

   micro avg       0.48      0.48      0.48      1689
   macro avg       0.41      0.41      0.37      1689
weighted avg       0.43      0.48      0.41      1689

Matrix de Confusão: 
[[ 18 123 306]
 [ 24 209 285]
 [ 20 125 579]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00456s
Predict Time: 0.0003s
Acurácia    : 48.67%
              precision    recall  f1-score   support

    NEGATIVO       0.34      0.31      0.33       447
      NEUTRO       0.43      0.56      0.49       518
    POSITIVO       0.65      0.54      0.59       724

   micro avg       0.49      0.49      0.49      1689
   macro avg       0.47      0.47      0.47   

Modelo      : SVC
Train Time  : 0.1676s
Predict Time: 0.09281s
Acurácia    : 42.97%
              precision    recall  f1-score   support

    NEGATIVO       0.27      0.14      0.19       189
      NEUTRO       0.35      0.03      0.05       384
    POSITIVO       0.45      0.93      0.61       416

   micro avg       0.43      0.43      0.43       989
   macro avg       0.36      0.37      0.28       989
weighted avg       0.38      0.43      0.31       989

Matrix de Confusão: 
[[ 27  10 152]
 [ 53  11 320]
 [ 19  10 387]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00353s
Predict Time: 0.00022s
Acurácia    : 48.14%
              precision    recall  f1-score   support

    NEGATIVO       0.46      0.40      0.43       248
      NEUTRO       0.41      0.09      0.15       142
    POSITIVO       0.50      0.73      0.59       310

   micro avg       0.48      0.48      0.48       700
   macro avg       0.45      0.41      0.39       70

## Count + TF-IDF + Word2Vec

In [16]:
#all_datasets
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(37608500, 43012000)

In [17]:
#all_datasets
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, avalencias))
        matriz_resultados['all']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : LogisticRegression
Train Time  : 5.66224s
Predict Time: 0.01869s
Acurácia    : 66.02%
              precision    recall  f1-score   support

    NEGATIVO       0.62      0.40      0.49       447
      NEUTRO       0.56      0.66      0.61       518
    POSITIVO       0.75      0.82      0.78       724

   micro avg       0.66      0.66      0.66      1689
   macro avg       0.64      0.63      0.63      1689
weighted avg       0.66      0.66      0.65      1689

Matrix de Confusão: 
[[179 174  94]
 [ 72 344 102]
 [ 37  95 592]]
------------------------------

Modelo      : RandomForestClassifier
Train Time  : 6.02349s
Predict Time: 0.11378s
Acurácia    : 65.25%
              precision    recall  f1-score   support

    NEGATIVO       0.60      0.38      0.47       447
      NEUTRO       0.53      0.72      0.61       518
    POSITIVO       0.80      0.77      0.78       724

   micro avg       0.65      0.65      0.65      1689
   macro avg       0.64      0

In [18]:
#tweets_mg
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atweets_mg)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atweets_mg)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atweets_mg:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(21303001, 26219000)

In [19]:
#tweets_mg
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


tweets_mg
Modelo      : LogisticRegression
Train Time  : 2.10596s
Predict Time: 0.00616s
Acurácia    : 63.7%
              precision    recall  f1-score   support

    NEGATIVO       0.26      0.07      0.12       189
      NEUTRO       0.55      0.79      0.65       384
    POSITIVO       0.81      0.75      0.78       416

   micro avg       0.64      0.64      0.64       989
   macro avg       0.54      0.54      0.51       989
weighted avg       0.60      0.64      0.60       989

Matrix de Confusão: 
[[ 14 151  24]
 [ 31 303  50]
 [  8  95 313]]
------------------------------

Modelo      : RandomForestClassifier
Train Time  : 2.3992s
Predict Time: 0.05927s
Acurácia    : 66.13%
              precision    recall  f1-score   support

    NEGATIVO       0.25      0.07      0.11       189
      NEUTRO       0.56      0.86      0.68       384
    POSITIVO       0.90      0.74      0.81       416

   micro avg       0.66      0.66      0.66       989
   macro avg       0.57      0.56  

In [20]:
#titulo_noticias
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atitulo_noticias)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atitulo_noticias)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atitulo_noticias:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(13988239, 16793000)

In [21]:
#titulo_noticias
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wjv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.01034s
Predict Time: 0.00238s
Acurácia    : 62.29%
              precision    recall  f1-score   support

    NEGATIVO       0.70      0.58      0.63       248
      NEUTRO       0.74      0.14      0.24       142
    POSITIVO       0.58      0.88      0.70       310

   micro avg       0.62      0.62      0.62       700
   macro avg       0.67      0.53      0.52       700
weighted avg       0.66      0.62      0.58       700

Matrix de Confusão: 
[[143   2 103]
 [ 29  20  93]
 [ 32   5 273]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.01042s
Predict Time: 0.00713s
Acurácia    : 65.43%
              precision    recall  f1-score   support

    NEGATIVO       0.69      0.66      0.67       248
      NEUTRO       0.53      0.37      0.43       142
    POSITIVO       0.67      0.78      0.72       310

   micro avg       0.65      0.65      0.65       700
   macro avg       0.63      0.60      0.6

### Resultado dos Classificadores

#### Resultado dos Classificadores para todas as frases

In [40]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['all'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,66.25%,53.52%,42.51%,67.02%,51.98%,48.67%,0.00%
KNeighborsClassifier,60.57%,56.96%,40.62%,40.26%,57.08%,44.52%,62.94%
LinearSVC,68.32%,61.87%,46.42%,66.73%,62.23%,49.14%,66.49%
LogisticRegression,65.25%,61.93%,46.06%,67.55%,62.17%,48.96%,66.02%
MLPClassifier,65.36%,62.11%,45.23%,65.42%,62.52%,47.96%,63.29%
MultinomialNB,59.27%,42.81%,45.06%,65.84%,44.35%,47.72%,0.00%
RandomForestClassifier,65.54%,62.34%,46.24%,65.60%,62.05%,50.03%,65.25%
SVC,53.46%,43.64%,42.87%,51.33%,44.94%,42.75%,52.10%


In [41]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['all'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00935,0.00437,0.01269,0.00677,0.00435,0.00456,0.0
KNeighborsClassifier,0.00324,0.00754,0.01185,0.00145,0.00718,0.01017,1.04194
LinearSVC,0.0774,0.3273,0.26569,0.1358,0.3289,0.17671,0.06683
LogisticRegression,1.74976,0.2931,0.34268,1.40931,0.22584,0.09258,5.66224
MLPClassifier,142.3455,4.94202,38.46367,95.33391,3.35291,10.02127,47.47517
MultinomialNB,0.00832,0.00447,0.01183,0.00558,0.00442,0.00493,0.0
RandomForestClassifier,4.09342,1.08415,1.78255,2.81718,0.98208,0.57475,6.02349
SVC,0.24947,0.24086,0.35283,0.20592,0.22276,0.28523,12.53051


In [42]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['all'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00051,0.00025,0.00221,0.00049,0.00024,0.0003,0.0
KNeighborsClassifier,0.16644,0.63737,0.62866,0.1183,0.5713,0.50721,50.18383
LinearSVC,0.00035,0.00026,0.00361,0.0003,0.00027,0.00032,0.01861
LogisticRegression,0.00053,0.00028,0.00219,0.00052,0.00026,0.00041,0.01869
MLPClassifier,0.01457,0.00215,0.0072,0.00546,0.00214,0.00274,0.03811
MultinomialNB,0.00054,0.00026,0.00058,0.00052,0.00026,0.00033,0.0
RandomForestClassifier,0.074,0.01687,0.01994,0.05494,0.01617,0.02437,0.11378
SVC,0.15376,0.12406,0.18183,0.11997,0.10885,0.15884,7.97576


#### Resultado dos Classificadores para os Tweets MG

In [43]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['tweets'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,61.48%,56.22%,48.43%,61.68%,58.14%,51.37%,0.00%
KNeighborsClassifier,60.97%,59.86%,50.15%,53.08%,60.36%,50.96%,62.29%
LinearSVC,63.80%,64.61%,52.78%,63.40%,65.22%,55.71%,63.80%
LogisticRegression,65.42%,65.72%,52.58%,63.60%,63.60%,55.92%,63.70%
MLPClassifier,63.50%,65.72%,51.37%,59.86%,66.33%,54.10%,61.48%
MultinomialNB,62.49%,43.28%,52.17%,62.69%,50.35%,54.30%,0.00%
RandomForestClassifier,65.82%,63.70%,51.97%,65.62%,64.71%,53.08%,66.13%
SVC,50.76%,42.77%,48.13%,49.24%,42.16%,42.97%,52.07%


In [44]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['tweets'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.0046,0.00283,0.00873,0.00455,0.00313,0.00295,0.0
KNeighborsClassifier,0.00121,0.00589,0.00648,0.00101,0.00574,0.0057,0.37202
LinearSVC,0.04887,0.18542,0.0885,0.07498,0.19485,0.08679,0.03399
LogisticRegression,1.13157,0.16104,0.24545,0.78002,0.29276,0.06736,2.10596
MLPClassifier,55.5361,3.15065,8.5575,19.7481,3.45858,6.27532,22.48731
MultinomialNB,0.00383,0.00289,0.01024,0.00372,0.00345,0.00299,0.0
RandomForestClassifier,1.16313,0.58644,0.85569,1.13529,0.56945,0.28052,2.3992
SVC,0.14998,0.13051,0.16703,0.12703,0.13193,0.1676,5.67275


In [45]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['tweets'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00034,0.0003,0.0017,0.00032,0.00021,0.00021,0.0
KNeighborsClassifier,0.04634,0.26826,0.25279,0.04617,0.29945,0.21488,16.10939
LinearSVC,0.00021,0.00054,0.00022,0.0002,0.00022,0.00022,0.00634
LogisticRegression,0.00038,0.00022,0.00174,0.00034,0.00022,0.00022,0.00616
MLPClassifier,0.00354,0.0031,0.00157,0.0032,0.00155,0.00156,0.04648
MultinomialNB,0.00037,0.00022,0.00177,0.00035,0.00022,0.00022,0.0
RandomForestClassifier,0.02998,0.01163,0.01258,0.0295,0.01238,0.01482,0.05927
SVC,0.08666,0.06352,0.08994,0.06713,0.06607,0.09281,3.15827


#### Resultados dos Classificadores para os Títulos de Notícias

In [46]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['titulo_noticias'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,65.43%,56.57%,39.14%,66.00%,48.00%,41.43%,65.43%
KNeighborsClassifier,63.29%,53.86%,39.29%,40.71%,51.57%,41.86%,62.57%
LinearSVC,68.57%,62.29%,45.29%,69.43%,60.86%,47.29%,67.57%
LogisticRegression,65.86%,60.14%,45.71%,68.57%,60.71%,47.86%,65.86%
MLPClassifier,66.29%,60.43%,44.00%,68.29%,57.00%,40.71%,64.86%
MultinomialNB,62.86%,44.29%,46.00%,66.14%,44.29%,48.14%,62.29%
RandomForestClassifier,64.00%,58.71%,45.14%,63.57%,60.29%,46.29%,65.57%
SVC,58.86%,44.00%,43.00%,50.29%,44.71%,43.86%,53.29%


In [47]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['titulo_noticias'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00352,0.00218,0.00238,0.00329,0.00486,0.00227,0.01042
KNeighborsClassifier,0.00093,0.00284,0.00362,0.00081,0.00289,0.00371,0.12917
LinearSVC,0.01939,0.14858,0.06041,0.0523,0.12729,0.06235,0.01959
LogisticRegression,0.58221,0.14202,0.0455,0.40483,0.21385,0.04819,0.73175
MLPClassifier,41.06379,2.3593,4.8293,12.90877,1.5975,4.13456,7.47005
MultinomialNB,0.00289,0.00226,0.00287,0.00286,0.00513,0.00353,0.01034
RandomForestClassifier,0.86183,0.39672,0.60482,0.7174,0.37491,0.26086,1.01343
SVC,0.10089,0.09667,0.11383,0.08896,0.09786,0.11339,2.38047


In [48]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['titulo_noticias'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00025,0.00017,0.0002,0.00024,0.00017,0.0002,0.00713
KNeighborsClassifier,0.02002,0.11933,0.11897,0.02032,0.12277,0.11563,4.08739
LinearSVC,0.00017,0.00019,0.00021,0.00016,0.00022,0.00021,0.00237
LogisticRegression,0.00029,0.00019,0.0002,0.00026,0.00018,0.00021,0.00237
MLPClassifier,0.00228,0.002,0.00118,0.00215,0.00097,0.00118,0.00719
MultinomialNB,0.00026,0.00018,0.00025,0.00025,0.00018,0.00022,0.00238
RandomForestClassifier,0.02071,0.00898,0.00959,0.01935,0.00913,0.01265,0.02748
SVC,0.05809,0.04381,0.0616,0.04538,0.04241,0.05903,1.3367


### Modelo escolhido e salvo

In [49]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(n_components=70, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

svm = LinearSVC(max_iter=150, random_state=0)
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, avalencias)

accuracy = np.round(model.score(X_svd, avalencias) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')

filename = 'tfidf_valence.sav'
joblib.dump(vec_tfidf, filename)

filename = 'lsa_valence.sav'
joblib.dump(lsa, filename)

filename = 'model_valence.sav'
joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 63.44%


['model_valence.sav']

In [32]:
model.classes_

array(['NEGATIVO', 'NEUTRO', 'POSITIVO'], dtype='<U8')

In [33]:
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[15.93, 18.94, 65.12]