## Testes de valência para o projeto final de IA369Y 2 Semestre 2018

Passos para tratar os dados com valência, testar e escolher um classificador para utilizar no projeto final de IA369Y.

1) Remover espaços duplos, quebras de linha, números e links do dataset e das frases a serem testadas.

2) Remover stopwords e aplicar o stemmer.

3) Treinar os classificadores.

4) Realizar as predições com os classificadores.

5) Avaliar as medidas obtidas com os classificadores.

In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

def highlight_max(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)


def highlight_min(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_min = data == data.min()
        return [attr if v else '' for v in is_min]
    else:  # from .apply(axis=None)
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''),
                            index=data.index, columns=data.columns)   

### Matriz de Resultados

In [2]:
classf = {
    'MultinomialNB': 0,
    'ComplementNB': 0,
    'LogisticRegression': 0,
    'RandomForestClassifier': 0,
    'KNeighborsClassifier': 0,
    'MLPClassifier': 0,
    'LinearSVC': 0,
    'SVC': 0
}

matriz_resultados = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

train_time = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

predict_time = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

## Datasets

Para validar, serão utilizados dois datasets.

Os dos datasets foram obtidos do site minerando dados.

O primeiro deles tem tweets de política de Minas Gerais com rótulos de valência: positivo, negativo e neutro. Foi feito um tratamento para eliminar tweets repetidos e dessa forma sobraram 3016 tweets.

O segundo contém 2123 títulos de notícias com rótulos de valência: positivo, negativo e neutro.

In [3]:
path = '/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/valencia/'

#Carregando os datasets
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

In [4]:
#Carrega os datasets em separado
tweets_mg = carregar(f'{path}tweets_mg_tratados.csv')
shuffle(tweets_mg)
titulo_noticias = carregar(f'{path}titulo_noticias.txt')
shuffle(titulo_noticias)

frases = tweets_mg + titulo_noticias
shuffle(frases)

print(frases[:5])
print('-' * 20)
print(tweets_mg[:5])
print('-' * 20)
print(titulo_noticias[:5])

[('POSITIVO', 'minsaud acontec saud forc nacional sus reforc ajud estar min ger saudemg'), ('NEGATIVO', 'empres cop bat nã bolã bols'), ('POSITIVO', 'bovesp índic ibovesp fech alto quint feir abril'), ('NEGATIVO', 'pdg tend entrar rol trabalh análog escrav'), ('NEGATIVO', 'hong kong confirm grip suín cheg sud asiát grã bretanh primeir cas pacient nã méxic eua vir cas aument')]
--------------------
[('NEGATIVO', 'fórum regional ir eleg nov prefeit vereador colegi execut avanc particip popul'), ('NEUTRO', 'claudfeb mayrasmendoz cfkargentin andar con min pelotud pagarl con guit del estar'), ('POSITIVO', 'new post hom pres sab suspeit estupr roub celul'), ('NEUTRO', 'jov mort tir durant bail funk pampulh ger estar min'), ('NEUTRO', 'govern min ger ridícul entrar ipva veícul médi pass vergonh forapt')]
--------------------
[('POSITIVO', 'mpf investig president bovesp cas vend açõ ogx econom notíc ver'), ('NEUTRO', 'sabesp econom fabul açã reduz envi águ cotidian folh paul'), ('NEUTRO', 'rev

In [5]:
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = ' '.join(row[0].strip().splitlines())
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

titulo_noticias2 = carregar(f'{path}titulo_noticias.txt')
only_neutro = []
for sent, new in titulo_noticias2:
    if sent == 'NEUTRO':
        only_neutro.append(new)
with open(f'{path}titulo_noticias_neutro.txt', 'w') as h:
    for k in only_neutro:
        h.write(f'{k}|NEUTRO\n')

In [6]:
#all_datasets
afrases = []
avalencias = []
for valencia, frase in frases:
    afrases.append(frase)
    avalencias.append(valencia)
    
print(afrases[:5])
print(avalencias[:5])
print('-' * 20)

#tweets_mg
atweets_mg = []
aval_tweets_mg = []
for valencia, frase in tweets_mg:
    atweets_mg.append(frase)
    aval_tweets_mg.append(valencia)

print(atweets_mg[:5])
print(aval_tweets_mg[:5])
print('-' * 20)

#titulo_noticias
atitulo_noticias = []
aval_titulo_noticias = []
for valencia, frase in titulo_noticias:
    atitulo_noticias.append(frase)
    aval_titulo_noticias.append(valencia)

print(atitulo_noticias[:5])
print(aval_titulo_noticias[:5])

['minsaud acontec saud forc nacional sus reforc ajud estar min ger saudemg', 'empres cop bat nã bolã bols', 'bovesp índic ibovesp fech alto quint feir abril', 'pdg tend entrar rol trabalh análog escrav', 'hong kong confirm grip suín cheg sud asiát grã bretanh primeir cas pacient nã méxic eua vir cas aument']
['POSITIVO', 'NEGATIVO', 'POSITIVO', 'NEGATIVO', 'NEGATIVO']
--------------------
['fórum regional ir eleg nov prefeit vereador colegi execut avanc particip popul', 'claudfeb mayrasmendoz cfkargentin andar con min pelotud pagarl con guit del estar', 'new post hom pres sab suspeit estupr roub celul', 'jov mort tir durant bail funk pampulh ger estar min', 'govern min ger ridícul entrar ipva veícul médi pass vergonh forapt']
['NEGATIVO', 'NEUTRO', 'POSITIVO', 'NEUTRO', 'NEUTRO']
--------------------
['mpf investig president bovesp cas vend açõ ogx econom notíc ver', 'sabesp econom fabul açã reduz envi águ cotidian folh paul', 'revist espírit livr blog archiv institut japã ir banc bras

In [7]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    start = time.time()
    model.fit(X_train, y_train)
    train_time = round(time.time() - start, 5)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    start = time.time()
    y_pred = model.predict(X_test)
    predict_time = round(time.time() - start, 5)
    print(f'Modelo      : {model.__class__.__name__}')
    print(f'Train Time  : {train_time}s')
    print(f'Predict Time: {predict_time}s')
    print(f'Acurácia    : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy, train_time, predict_time

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

## Classificadores

In [8]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=10),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=10),
    LinearSVC(max_iter=150, random_state=10),
    SVC(gamma='auto', max_iter=150),
)

## TF-IDF

In [9]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)

vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)

In [10]:
print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf, avalencias))
        matriz_resultados['all']['tfidf'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass      


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00894s
Predict Time: 0.00095s
Acurácia    : 55.98%
              precision    recall  f1-score   support

    NEGATIVO       0.61      0.47      0.53       871
      NEUTRO       0.57      0.15      0.24       511
    POSITIVO       0.54      0.86      0.66       949

   micro avg       0.56      0.56      0.56      2331
   macro avg       0.57      0.50      0.48      2331
weighted avg       0.57      0.56      0.52      2331

Matrix de Confusão: 
[[410  40 421]
 [151  79 281]
 [113  20 816]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00853s
Predict Time: 0.00071s
Acurácia    : 63.45%
              precision    recall  f1-score   support

    NEGATIVO       0.63      0.62      0.63       871
      NEUTRO       0.55      0.54      0.54       511
    POSITIVO       0.69      0.69      0.69       949

   micro avg       0.63      0.63      0.63      2331
   macro avg       0.62      0.62      0.62  

Modelo      : SVC
Train Time  : 0.16331s
Predict Time: 0.08716s
Acurácia    : 54.8%
              precision    recall  f1-score   support

    NEGATIVO       0.34      0.17      0.23       191
      NEUTRO       0.59      0.30      0.40       354
    POSITIVO       0.57      0.91      0.70       444

   micro avg       0.55      0.55      0.55       989
   macro avg       0.50      0.46      0.44       989
weighted avg       0.53      0.55      0.50       989

Matrix de Confusão: 
[[ 33  51 107]
 [ 48 106 200]
 [ 17  24 403]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00456s
Predict Time: 0.00053s
Acurácia    : 65.57%
              precision    recall  f1-score   support

    NEGATIVO       0.64      0.87      0.74       632
      NEUTRO       1.00      0.01      0.02       132
    POSITIVO       0.68      0.57      0.62       578

   micro avg       0.66      0.66      0.66      1342
   macro avg       0.77      0.48      0.46      134

## LSA (usando TF-IDF)

In [11]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

#all_datasets
svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except Exception as e:
        print(e)

#tweets_mg
vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)
X_svd = lsa.fit_transform(X_tfidf_svd_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


#titulo_noticias
vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)
X_svd = lsa.fit_transform(X_tfidf_svd_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00788s
Predict Time: 0.00075s
Acurácia    : 40.84%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       871
      NEUTRO       1.00      0.01      0.01       511
    POSITIVO       0.41      1.00      0.58       949

   micro avg       0.41      0.41      0.41      2331
   macro avg       0.47      0.34      0.20      2331
weighted avg       0.39      0.41      0.24      2331

Matrix de Confusão: 
[[  0   0 871]
 [  0   3 508]
 [  0   0 949]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00773s
Predict Time: 0.00067s
Acurácia    : 54.14%
              precision    recall  f1-score   support

    NEGATIVO       0.58      0.41      0.48       871
      NEUTRO       0.49      0.56      0.52       511
    POSITIVO       0.55      0.65      0.60       949

   micro avg       0.54      0.54      0.54      2331
   macro avg       0.54      0.54      0.53  

Modelo      : SVC
Train Time  : 0.26797s
Predict Time: 0.13386s
Acurácia    : 48.13%
              precision    recall  f1-score   support

    NEGATIVO       0.18      0.06      0.09       191
      NEUTRO       0.52      0.23      0.32       354
    POSITIVO       0.50      0.86      0.63       444

   micro avg       0.48      0.48      0.48       989
   macro avg       0.40      0.39      0.35       989
weighted avg       0.44      0.48      0.42       989

Matrix de Confusão: 
[[ 12  39 140]
 [ 29  83 242]
 [ 24  39 381]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.01203s
Predict Time: 0.00201s
Acurácia    : 47.17%
              precision    recall  f1-score   support

    NEGATIVO       0.47      1.00      0.64       632
      NEUTRO       0.00      0.00      0.00       132
    POSITIVO       1.00      0.00      0.00       578

   micro avg       0.47      0.47      0.47      1342
   macro avg       0.49      0.33      0.21      13

## LDA (usando TF-IDF)

In [12]:
#all_datasets
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=10, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf_svd)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#tweets_mg
vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)
X_lda = lda.fit_transform(X_tfidf_svd_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#titulo_noticias
vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)
X_lda = lda.fit_transform(X_tfidf_svd_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00651s
Predict Time: 0.00103s
Acurácia    : 48.82%
              precision    recall  f1-score   support

    NEGATIVO       0.50      0.40      0.44       871
      NEUTRO       0.44      0.08      0.13       511
    POSITIVO       0.48      0.79      0.60       949

   micro avg       0.49      0.49      0.49      2331
   macro avg       0.48      0.42      0.39      2331
weighted avg       0.48      0.49      0.44      2331

Matrix de Confusão: 
[[346  33 492]
 [164  40 307]
 [180  17 752]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.01244s
Predict Time: 0.0013s
Acurácia    : 50.92%
              precision    recall  f1-score   support

    NEGATIVO       0.49      0.55      0.52       871
      NEUTRO       0.41      0.48      0.44       511
    POSITIVO       0.61      0.48      0.54       949

   micro avg       0.51      0.51      0.51      2331
   macro avg       0.50      0.51      0.50   

Modelo      : SVC
Train Time  : 0.25222s
Predict Time: 0.13112s
Acurácia    : 45.1%
              precision    recall  f1-score   support

    NEGATIVO       0.16      0.11      0.13       191
      NEUTRO       0.45      0.29      0.35       354
    POSITIVO       0.52      0.72      0.60       444

   micro avg       0.45      0.45      0.45       989
   macro avg       0.37      0.38      0.36       989
weighted avg       0.42      0.45      0.42       989

Matrix de Confusão: 
[[ 21  56 114]
 [ 62 104 188]
 [ 50  73 321]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00353s
Predict Time: 0.0004s
Acurácia    : 55.89%
              precision    recall  f1-score   support

    NEGATIVO       0.55      0.83      0.66       632
      NEUTRO       0.00      0.00      0.00       132
    POSITIVO       0.58      0.39      0.47       578

   micro avg       0.56      0.56      0.56      1342
   macro avg       0.38      0.41      0.38      1342

## Count

In [13]:
#all_datasets
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count, avalencias))
        matriz_resultados['all']['count'][classifier.__class__.__name__] = acc
        train_time['all']['count'][classifier.__class__.__name__] = tt
        predict_time['all']['count'][classifier.__class__.__name__] = pt
    except:
        pass

      
#tweets_mg
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tmg = vec_count.fit_transform(atweets_mg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['count'][classifier.__class__.__name__] = acc
        train_time['tweets']['count'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulo_noticias
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tn = vec_count.fit_transform(atitulo_noticias)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00952s
Predict Time: 0.00103s
Acurácia    : 64.22%
              precision    recall  f1-score   support

    NEGATIVO       0.63      0.65      0.64       871
      NEUTRO       0.55      0.54      0.55       511
    POSITIVO       0.70      0.69      0.70       949

   micro avg       0.64      0.64      0.64      2331
   macro avg       0.63      0.63      0.63      2331
weighted avg       0.64      0.64      0.64      2331

Matrix de Confusão: 
[[566 142 163]
 [123 276 112]
 [211  83 655]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.01002s
Predict Time: 0.00089s
Acurácia    : 64.01%
              precision    recall  f1-score   support

    NEGATIVO       0.65      0.61      0.63       871
      NEUTRO       0.51      0.67      0.58       511
    POSITIVO       0.74      0.65      0.69       949

   micro avg       0.64      0.64      0.64      2331
   macro avg       0.63      0.64      0.63  

Modelo      : SVC
Train Time  : 0.20207s
Predict Time: 0.08333s
Acurácia    : 51.06%
              precision    recall  f1-score   support

    NEGATIVO       0.30      0.06      0.10       191
      NEUTRO       0.69      0.17      0.27       354
    POSITIVO       0.50      0.98      0.66       444

   micro avg       0.51      0.51      0.51       989
   macro avg       0.50      0.40      0.35       989
weighted avg       0.53      0.51      0.42       989

Matrix de Confusão: 
[[ 12  20 159]
 [ 24  60 270]
 [  4   7 433]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00542s
Predict Time: 0.00063s
Acurácia    : 66.77%
              precision    recall  f1-score   support

    NEGATIVO       0.72      0.71      0.71       632
      NEUTRO       0.44      0.18      0.26       132
    POSITIVO       0.64      0.73      0.68       578

   micro avg       0.67      0.67      0.67      1342
   macro avg       0.60      0.54      0.55      13

## LSA (usando Count)

In [14]:
#all_datasets
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count_lsa)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['all']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['all']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass
      

#tweets_mg
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa_tmg = vec_count.fit_transform(atweets_mg)
X_svd = lda.fit_transform(X_count_lsa_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['tweets']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulos_noticias
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa_tn = vec_count.fit_transform(atitulo_noticias)
X_svd = lda.fit_transform(X_count_lsa_tn)

print("\ntitulos_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.01976s
Predict Time: 0.00305s
Acurácia    : 40.71%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       871
      NEUTRO       0.50      0.00      0.00       511
    POSITIVO       0.41      1.00      0.58       949

   micro avg       0.41      0.41      0.41      2331
   macro avg       0.30      0.33      0.19      2331
weighted avg       0.28      0.41      0.24      2331

Matrix de Confusão: 
[[  0   0 871]
 [  0   1 510]
 [  0   1 948]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00754s
Predict Time: 0.00063s
Acurácia    : 49.25%
              precision    recall  f1-score   support

    NEGATIVO       0.72      0.15      0.25       871
      NEUTRO       0.43      0.55      0.48       511
    POSITIVO       0.49      0.77      0.60       949

   micro avg       0.49      0.49      0.49      2331
   macro avg       0.55      0.49      0.45  

Modelo      : SVC
Train Time  : 0.25278s
Predict Time: 0.12125s
Acurácia    : 45.8%
              precision    recall  f1-score   support

    NEGATIVO       0.20      0.07      0.10       191
      NEUTRO       0.44      0.12      0.19       354
    POSITIVO       0.48      0.89      0.62       444

   micro avg       0.46      0.46      0.46       989
   macro avg       0.37      0.36      0.31       989
weighted avg       0.41      0.46      0.37       989

Matrix de Confusão: 
[[ 13  24 154]
 [ 35  43 276]
 [ 16  31 397]]
------------------------------


titulos_noticias
Modelo      : MultinomialNB
Train Time  : 0.01165s
Predict Time: 0.00041s
Acurácia    : 47.09%
              precision    recall  f1-score   support

    NEGATIVO       0.47      1.00      0.64       632
      NEUTRO       0.00      0.00      0.00       132
    POSITIVO       0.00      0.00      0.00       578

   micro avg       0.47      0.47      0.47      1342
   macro avg       0.16      0.33      0.21      13

## LDA (usando Count)

In [15]:
#all_datasets
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda = vec_tfidf.fit_transform(afrases)

lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=10, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count_lda)


print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['count+lda'][classifier.__class__.__name__] = acc
        train_time['all']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['all']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#tweets_mg
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda_tmg = vec_count.fit_transform(atweets_mg)
X_lda = lda.fit_transform(X_count_lda_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['count+lda'][classifier.__class__.__name__] = acc
        train_time['tweets']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulo_noticias
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda_tn = vec_count.fit_transform(atitulo_noticias)
X_lda = lda.fit_transform(X_count_lda_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lda'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00611s
Predict Time: 0.00062s
Acurácia    : 51.61%
              precision    recall  f1-score   support

    NEGATIVO       0.52      0.43      0.47       871
      NEUTRO       0.57      0.20      0.29       511
    POSITIVO       0.51      0.76      0.61       949

   micro avg       0.52      0.52      0.52      2331
   macro avg       0.53      0.46      0.46      2331
weighted avg       0.53      0.52      0.49      2331

Matrix de Confusão: 
[[378  50 443]
 [155 100 256]
 [199  25 725]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00754s
Predict Time: 0.0006s
Acurácia    : 54.27%
              precision    recall  f1-score   support

    NEGATIVO       0.52      0.61      0.56       871
      NEUTRO       0.47      0.57      0.51       511
    POSITIVO       0.64      0.47      0.54       949

   micro avg       0.54      0.54      0.54      2331
   macro avg       0.54      0.55      0.54   

Modelo      : SVC
Train Time  : 0.25544s
Predict Time: 0.14572s
Acurácia    : 49.04%
              precision    recall  f1-score   support

    NEGATIVO       0.26      0.26      0.26       191
      NEUTRO       0.47      0.21      0.29       354
    POSITIVO       0.57      0.81      0.67       444

   micro avg       0.49      0.49      0.49       989
   macro avg       0.43      0.43      0.41       989
weighted avg       0.47      0.49      0.45       989

Matrix de Confusão: 
[[ 50  48  93]
 [ 98  75 181]
 [ 47  37 360]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00356s
Predict Time: 0.00041s
Acurácia    : 48.06%
              precision    recall  f1-score   support

    NEGATIVO       0.49      0.66      0.57       632
      NEUTRO       0.00      0.00      0.00       132
    POSITIVO       0.46      0.39      0.42       578

   micro avg       0.48      0.48      0.48      1342
   macro avg       0.32      0.35      0.33      13

## Count + TF-IDF + Word2Vec

In [16]:
#all_datasets
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(32538793, 35214000)

In [17]:
#all_datasets
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, avalencias))
        matriz_resultados['all']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : LogisticRegression
Train Time  : 14.13429s
Predict Time: 0.03728s
Acurácia    : 62.85%
              precision    recall  f1-score   support

    NEGATIVO       0.62      0.62      0.62       871
      NEUTRO       0.56      0.46      0.50       511
    POSITIVO       0.66      0.73      0.69       949

   micro avg       0.63      0.63      0.63      2331
   macro avg       0.61      0.60      0.61      2331
weighted avg       0.62      0.63      0.62      2331

Matrix de Confusão: 
[[540 114 217]
 [139 235 137]
 [188  71 690]]
------------------------------

Modelo      : RandomForestClassifier
Train Time  : 21.79987s
Predict Time: 0.24184s
Acurácia    : 63.62%
              precision    recall  f1-score   support

    NEGATIVO       0.65      0.61      0.63       871
      NEUTRO       0.54      0.55      0.55       511
    POSITIVO       0.67      0.71      0.69       949

   micro avg       0.64      0.64      0.64      2331
   macro avg       0.62     

In [18]:
#tweets_mg
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atweets_mg)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atweets_mg)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atweets_mg:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(10651456, 13109500)

In [19]:
#tweets_mg
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


tweets_mg
Modelo      : LogisticRegression
Train Time  : 1.77083s
Predict Time: 0.00777s
Acurácia    : 64.91%
              precision    recall  f1-score   support

    NEGATIVO       0.27      0.08      0.12       191
      NEUTRO       0.54      0.79      0.64       354
    POSITIVO       0.83      0.79      0.81       444

   micro avg       0.65      0.65      0.65       989
   macro avg       0.55      0.55      0.52       989
weighted avg       0.62      0.65      0.62       989

Matrix de Confusão: 
[[ 15 152  24]
 [ 31 278  45]
 [ 10  85 349]]
------------------------------

Modelo      : RandomForestClassifier
Train Time  : 2.27283s
Predict Time: 0.04731s
Acurácia    : 66.53%
              precision    recall  f1-score   support

    NEGATIVO       0.31      0.09      0.14       191
      NEUTRO       0.55      0.85      0.67       354
    POSITIVO       0.88      0.76      0.82       444

   micro avg       0.67      0.67      0.67       989
   macro avg       0.58      0.57

In [20]:
#titulo_noticias
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atitulo_noticias)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atitulo_noticias)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atitulo_noticias:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(20773814, 22104500)

In [21]:
#titulo_noticias
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wjv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.07021s
Predict Time: 0.01795s
Acurácia    : 65.35%
              precision    recall  f1-score   support

    NEGATIVO       0.65      0.83      0.73       632
      NEUTRO       1.00      0.02      0.03       132
    POSITIVO       0.65      0.61      0.63       578

   micro avg       0.65      0.65      0.65      1342
   macro avg       0.77      0.48      0.46      1342
weighted avg       0.69      0.65      0.62      1342

Matrix de Confusão: 
[[523   0 109]
 [ 50   2  80]
 [226   0 352]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.04652s
Predict Time: 0.02224s
Acurácia    : 65.28%
              precision    recall  f1-score   support

    NEGATIVO       0.70      0.72      0.71       632
      NEUTRO       0.37      0.27      0.31       132
    POSITIVO       0.65      0.66      0.66       578

   micro avg       0.65      0.65      0.65      1342
   macro avg       0.57      0.55      0.5

### Resultado dos Classificadores

#### Resultado dos Classificadores para todas as frases

In [22]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['all'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,63.45%,54.14%,50.92%,64.01%,49.25%,54.27%,0.00%
KNeighborsClassifier,59.72%,55.90%,46.55%,32.35%,54.57%,50.49%,58.94%
LinearSVC,64.82%,59.63%,51.87%,63.02%,58.90%,55.43%,62.25%
LogisticRegression,61.86%,58.99%,52.17%,63.45%,57.83%,55.04%,62.85%
MLPClassifier,62.81%,59.24%,47.49%,62.72%,59.93%,50.92%,58.69%
MultinomialNB,55.98%,40.84%,48.82%,64.22%,40.71%,51.61%,0.00%
RandomForestClassifier,62.63%,60.15%,50.41%,63.49%,58.60%,56.67%,63.62%
SVC,46.93%,42.26%,42.26%,42.64%,42.26%,42.60%,49.03%


In [23]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['all'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00853,0.00773,0.01244,0.01002,0.00754,0.00754,0.0
KNeighborsClassifier,0.00172,0.01878,0.01762,0.00185,0.01561,0.02024,2.30508
LinearSVC,0.1219,0.88439,0.32223,0.30744,0.82938,0.37529,0.15334
LogisticRegression,2.01204,0.88096,0.42078,2.5711,0.47245,0.50174,14.13429
MLPClassifier,319.87141,12.28083,22.40865,366.48793,6.74653,15.15768,138.23474
MultinomialNB,0.00894,0.00788,0.00651,0.00952,0.01976,0.00611,0.0
RandomForestClassifier,6.08885,2.34992,3.28108,7.75725,2.10836,1.0961,21.79987
SVC,0.36216,0.63271,0.61725,0.35699,0.59285,0.59125,38.83282


In [24]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['all'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00071,0.00067,0.0013,0.00089,0.00063,0.0006,0.0
KNeighborsClassifier,0.24803,3.10853,1.90963,0.28075,2.44237,2.10386,209.03827
LinearSVC,0.00056,0.00065,0.00064,0.00054,0.00063,0.00083,0.04324
LogisticRegression,0.00111,0.00065,0.00117,0.00136,0.00063,0.00075,0.03728
MLPClassifier,0.01042,0.00433,0.00728,0.01092,0.00403,0.00558,0.1199
MultinomialNB,0.00095,0.00075,0.00103,0.00103,0.00305,0.00062,0.0
RandomForestClassifier,0.08935,0.02412,0.02894,0.11073,0.02354,0.0358,0.24184
SVC,0.24023,0.34915,0.36848,0.23114,0.30691,0.36522,25.43513


#### Resultado dos Classificadores para os Tweets MG

In [25]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['tweets'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,61.78%,62.08%,56.93%,60.47%,59.66%,50.76%,0.00%
KNeighborsClassifier,61.38%,62.69%,54.80%,47.93%,61.58%,47.62%,63.30%
LinearSVC,65.12%,65.72%,58.85%,63.70%,65.72%,55.31%,64.31%
LogisticRegression,66.03%,66.13%,59.86%,64.71%,65.22%,55.81%,64.91%
MLPClassifier,62.49%,65.72%,56.32%,61.58%,65.62%,53.29%,62.08%
MultinomialNB,63.90%,53.59%,59.35%,62.89%,56.02%,56.12%,0.00%
RandomForestClassifier,65.62%,64.71%,57.43%,65.82%,62.69%,55.41%,66.53%
SVC,54.80%,48.13%,45.10%,51.06%,45.80%,49.04%,54.40%


In [26]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['tweets'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00381,0.01396,0.00376,0.00399,0.00393,0.00373,0.0
KNeighborsClassifier,0.00089,0.01306,0.0062,0.00115,0.00596,0.00613,0.30196
LinearSVC,0.03377,0.30117,0.12221,0.10713,0.29865,0.1388,0.03622
LogisticRegression,0.58644,0.21441,0.0854,0.83963,0.1823,0.10357,1.77083
MLPClassifier,24.00586,4.93032,7.75917,18.81883,3.45911,7.35322,12.51817
MultinomialNB,0.00345,0.01112,0.00295,0.00353,0.00411,0.00289,0.0
RandomForestClassifier,1.11672,0.88895,0.87372,1.32617,0.75934,0.35525,2.27283
SVC,0.16331,0.26797,0.25222,0.20207,0.25278,0.25544,8.70687


In [27]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['tweets'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.0005,0.00173,0.00032,0.00033,0.00031,0.00033,0.0
KNeighborsClassifier,0.04198,0.51005,0.30036,0.04863,0.4584,0.28658,20.298
LinearSVC,0.00027,0.00035,0.00034,0.00022,0.00034,0.00036,0.00765
LogisticRegression,0.00041,0.00033,0.00034,0.00049,0.00033,0.00035,0.00777
MLPClassifier,0.0037,0.00302,0.00283,0.00404,0.00179,0.00225,0.02304
MultinomialNB,0.00029,0.00189,0.00033,0.00035,0.00033,0.00034,0.0
RandomForestClassifier,0.0239,0.01421,0.01359,0.02993,0.01114,0.01598,0.04731
SVC,0.08716,0.13386,0.13112,0.08333,0.12125,0.14572,4.86176


#### Resultados dos Classificadores para os Títulos de Notícias

In [28]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['titulo_noticias'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,65.65%,58.72%,52.16%,61.77%,48.36%,41.58%,65.28%
KNeighborsClassifier,62.44%,55.59%,51.64%,44.78%,58.05%,46.94%,62.00%
LinearSVC,68.11%,63.71%,56.48%,68.63%,62.97%,46.87%,67.59%
LogisticRegression,67.36%,64.08%,56.78%,69.08%,62.07%,46.65%,68.11%
MLPClassifier,67.06%,63.41%,49.03%,69.15%,62.07%,46.65%,66.24%
MultinomialNB,65.57%,47.17%,55.89%,66.77%,47.09%,48.06%,65.35%
RandomForestClassifier,64.23%,61.25%,52.16%,67.29%,61.40%,48.14%,64.61%
SVC,50.75%,44.93%,45.60%,45.45%,43.52%,42.77%,54.69%


In [29]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['titulo_noticias'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00567,0.03205,0.00479,0.00808,0.00476,0.00468,0.04652
KNeighborsClassifier,0.00116,0.00823,0.00806,0.00124,0.00792,0.00829,0.72326
LinearSVC,0.07837,0.4377,0.16231,0.15687,0.43131,0.23308,0.06055
LogisticRegression,1.06921,0.2843,0.11679,1.1283,0.25938,0.10973,3.84153
MLPClassifier,74.25707,6.58972,7.84679,40.05524,7.25294,9.6936,43.92755
MultinomialNB,0.00456,0.01203,0.00353,0.00542,0.01165,0.00356,0.07021
RandomForestClassifier,3.257,1.13198,1.3543,3.63911,1.07433,0.64239,7.94276
SVC,0.27395,0.34762,0.36819,0.22916,0.34141,0.34013,16.0923


In [30]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['titulo_noticias'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00047,0.0004,0.00038,0.00072,0.0004,0.00038,0.02224
KNeighborsClassifier,0.0929,0.80643,0.67521,0.07224,0.82296,0.46336,48.35723
LinearSVC,0.00128,0.0004,0.00043,0.00034,0.0004,0.00041,0.01426
LogisticRegression,0.00094,0.0004,0.00041,0.00091,0.00079,0.0004,0.01599
MLPClassifier,0.00807,0.00248,0.00324,0.00678,0.0024,0.00297,0.04213
MultinomialNB,0.00053,0.00201,0.0004,0.00063,0.00041,0.00041,0.01795
RandomForestClassifier,0.05195,0.01462,0.01683,0.06257,0.01447,0.02358,0.10173
SVC,0.17547,0.17901,0.1814,0.13193,0.16259,0.18725,10.22473


### Modelo escolhido e salvo

In [11]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

svm = LinearSVC(max_iter=150, random_state=10)
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, avalencias)

accuracy = np.round(model.score(X_svd, avalencias) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')

filename = 'tfidf_valence.sav'
joblib.dump(vec_tfidf, filename)

filename = 'lsa_valence.sav'
joblib.dump(lsa, filename)

filename = 'model_valence.sav'
joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 62.33%


['model_valence.sav']

In [32]:
model.classes_

array(['NEGATIVO', 'NEUTRO', 'POSITIVO'], dtype='<U8')

In [33]:
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[28.65, 32.32, 39.03]