## Testes de valência para o projeto final de IA369Y 2 Semestre 2018

Passos para tratar os dados com valência, testar e escolher um classificador para utilizar no projeto final de IA369Y.

1) Remover espaços duplos, quebras de linha, números e links do dataset e das frases a serem testadas.

2) Remover stopwords e aplicar o stemmer.

3) Treinar os classificadores.

4) Realizar as predições com os classificadores.

5) Avaliar as medidas obtidas com os classificadores.

In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

def highlight_max(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)


def highlight_min(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_min = data == data.min()
        return [attr if v else '' for v in is_min]
    else:  # from .apply(axis=None)
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''),
                            index=data.index, columns=data.columns)   

### Matriz de Resultados

In [2]:
classf = {
    'MultinomialNB': 0,
    'ComplementNB': 0,
    'LogisticRegression': 0,
    'RandomForestClassifier': 0,
    'KNeighborsClassifier': 0,
    'MLPClassifier': 0,
    'LinearSVC': 0,
    'SVC': 0
}

matriz_resultados = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

train_time = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

predict_time = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

## Datasets

Para validar, serão utilizados dois datasets.

Os dos datasets foram obtidos do site minerando dados.

O primeiro deles tem tweets de política de Minas Gerais com rótulos de valência: positivo, negativo e neutro. Foi feito um tratamento para eliminar tweets repetidos e dessa forma sobraram 3016 tweets.

O segundo contém 2123 títulos de notícias com rótulos de valência: positivo, negativo e neutro.

In [3]:
path = '/ssd/programas/sentiment-analysis-2018-president-election/dataset/valencia/'

#Carregando os datasets
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

In [4]:
#Carrega os datasets em separado
tweets_mg = carregar(f'{path}tweets_mg_tratados.csv')
shuffle(tweets_mg)
titulo_noticias = carregar(f'{path}titulo_noticias.txt')
shuffle(titulo_noticias)

frases = tweets_mg + titulo_noticias
shuffle(frases)

print(frases[:5])
print('-' * 20)
print(tweets_mg[:5])
print('-' * 20)
print(titulo_noticias[:5])

[('NEUTRO', 'implic org govern dilm pression ditadur venezuel pag dív bilionári odebrecht'), ('NEUTRO', 'par fing estar melhor min ger brasil'), ('NEUTRO', 'govern min cham vagabund ladrã shoping paul'), ('POSITIVO', 'pmg veícul produt roub recuper'), ('NEGATIVO', 'campanh eleitoral ver torped internet cri polêm irã jov bombard mail mensag polít celul país milhã internaut milhã usar aparelh móvel')]
--------------------
[('NEGATIVO', 'ser aind gent apó ver'), ('POSITIVO', 'sinomarp mut vacin contr febr amarel realiz ipating'), ('NEGATIVO', 'calam financeir govern compr dois helicópter'), ('POSITIVO', 'timbet glob betim contag intensific vacin contr febr amarel nest sáb'), ('NEUTRO', 'petist pimentel')]
--------------------
[('NEGATIVO', 'dia descobr beij pesso esquec outr bobag'), ('POSITIVO', 'usimin lucr milhã tri melhor result desd notic uol econom'), ('NEGATIVO', 'petrobr mult val recomend mais notíc rad'), ('NEGATIVO', 'maior pens sensibil sent pensament hom vulg sent viv pens sab

In [5]:
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = ' '.join(row[0].strip().splitlines())
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

titulo_noticias2 = carregar(f'{path}titulo_noticias.txt')
only_neutro = []
for sent, new in titulo_noticias2:
    if sent == 'NEUTRO':
        only_neutro.append(new)
with open(f'{path}titulo_noticias_neutro.txt', 'w') as h:
    for k in only_neutro:
        h.write(f'{k}|NEUTRO\n')

In [6]:
#all_datasets
afrases = []
avalencias = []
for valencia, frase in frases:
    afrases.append(frase)
    avalencias.append(valencia)
    
print(afrases[:5])
print(avalencias[:5])
print('-' * 20)

#tweets_mg
atweets_mg = []
aval_tweets_mg = []
for valencia, frase in tweets_mg:
    atweets_mg.append(frase)
    aval_tweets_mg.append(valencia)

print(atweets_mg[:5])
print(aval_tweets_mg[:5])
print('-' * 20)

#titulo_noticias
atitulo_noticias = []
aval_titulo_noticias = []
for valencia, frase in titulo_noticias:
    atitulo_noticias.append(frase)
    aval_titulo_noticias.append(valencia)

print(atitulo_noticias[:5])
print(aval_titulo_noticias[:5])

['implic org govern dilm pression ditadur venezuel pag dív bilionári odebrecht', 'par fing estar melhor min ger brasil', 'govern min cham vagabund ladrã shoping paul', 'pmg veícul produt roub recuper', 'campanh eleitoral ver torped internet cri polêm irã jov bombard mail mensag polít celul país milhã internaut milhã usar aparelh móvel']
['NEUTRO', 'NEUTRO', 'NEUTRO', 'POSITIVO', 'NEGATIVO']
--------------------
['ser aind gent apó ver', 'sinomarp mut vacin contr febr amarel realiz ipating', 'calam financeir govern compr dois helicópter', 'timbet glob betim contag intensific vacin contr febr amarel nest sáb', 'petist pimentel']
['NEGATIVO', 'POSITIVO', 'NEGATIVO', 'POSITIVO', 'NEUTRO']
--------------------
['dia descobr beij pesso esquec outr bobag', 'usimin lucr milhã tri melhor result desd notic uol econom', 'petrobr mult val recomend mais notíc rad', 'maior pens sensibil sent pensament hom vulg sent viv pens sab viv mim pens viv sent nã mais aliment pens', 'cientist usar twit questio

In [7]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    start = time.time()
    model.fit(X_train, y_train)
    train_time = round(time.time() - start, 5)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    start = time.time()
    y_pred = model.predict(X_test)
    predict_time = round(time.time() - start, 5)
    print(f'Modelo      : {model.__class__.__name__}')
    print(f'Train Time  : {train_time}s')
    print(f'Predict Time: {predict_time}s')
    print(f'Acurácia    : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy, train_time, predict_time

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

## Classificadores

In [8]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=10),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=10),
    LinearSVC(max_iter=150, random_state=10),
    SVC(gamma='auto', max_iter=150),
)

## TF-IDF

In [9]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)

vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)

In [10]:
print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf, avalencias))
        matriz_resultados['all']['tfidf'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass      


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.01453s
Predict Time: 0.00138s
Acurácia    : 58.86%
              precision    recall  f1-score   support

    NEGATIVO       0.54      0.60      0.57       790
      NEUTRO       0.61      0.17      0.26       529
    POSITIVO       0.62      0.79      0.69      1029

   micro avg       0.59      0.59      0.59      2348
   macro avg       0.59      0.52      0.51      2348
weighted avg       0.59      0.59      0.56      2348

Matrix de Confusão: 
[[476  36 278]
 [207  89 233]
 [191  21 817]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.01132s
Predict Time: 0.00073s
Acurácia    : 63.12%
              precision    recall  f1-score   support

    NEGATIVO       0.57      0.65      0.61       790
      NEUTRO       0.55      0.51      0.53       529
    POSITIVO       0.73      0.68      0.70      1029

   micro avg       0.63      0.63      0.63      2348
   macro avg       0.62      0.61      0.61  

[[ 32 149  20]
 [ 48 273  41]
 [ 12  87 327]]
------------------------------

Modelo      : SVC
Train Time  : 0.60397s
Predict Time: 0.34286s
Acurácia    : 53.69%
              precision    recall  f1-score   support

    NEGATIVO       0.33      0.22      0.26       201
      NEUTRO       0.52      0.35      0.42       362
    POSITIVO       0.59      0.85      0.69       426

   micro avg       0.54      0.54      0.54       989
   macro avg       0.48      0.47      0.46       989
weighted avg       0.51      0.54      0.51       989

Matrix de Confusão: 
[[ 44  69  88]
 [ 68 127 167]
 [ 20  46 360]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.01788s
Predict Time: 0.00189s
Acurácia    : 65.34%
              precision    recall  f1-score   support

    NEGATIVO       0.65      0.84      0.73       635
      NEUTRO       1.00      0.01      0.02       130
    POSITIVO       0.66      0.60      0.63       594

   micro avg       0.65    

## LSA (usando TF-IDF)

In [11]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

#all_datasets
svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except Exception as e:
        print(e)

#tweets_mg
vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)
X_svd = lsa.fit_transform(X_tfidf_svd_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


#titulo_noticias
vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)
X_svd = lsa.fit_transform(X_tfidf_svd_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.0242s
Predict Time: 0.0012s
Acurácia    : 43.91%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       790
      NEUTRO       1.00      0.00      0.01       529
    POSITIVO       0.44      1.00      0.61      1029

   micro avg       0.44      0.44      0.44      2348
   macro avg       0.48      0.33      0.21      2348
weighted avg       0.42      0.44      0.27      2348

Matrix de Confusão: 
[[   0    0  790]
 [   0    2  527]
 [   0    0 1029]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.02366s
Predict Time: 0.00117s
Acurácia    : 54.64%
              precision    recall  f1-score   support

    NEGATIVO       0.53      0.45      0.49       790
      NEUTRO       0.47      0.57      0.52       529
    POSITIVO       0.61      0.60      0.60      1029

   micro avg       0.55      0.55      0.55      2348
   macro avg       0.54      0.54     

Modelo      : SVC
Train Time  : 0.76614s
Predict Time: 0.38243s
Acurácia    : 46.01%
              precision    recall  f1-score   support

    NEGATIVO       0.28      0.11      0.16       201
      NEUTRO       0.48      0.11      0.18       362
    POSITIVO       0.48      0.92      0.63       426

   micro avg       0.46      0.46      0.46       989
   macro avg       0.41      0.38      0.32       989
weighted avg       0.44      0.46      0.37       989

Matrix de Confusão: 
[[ 22  28 151]
 [ 39  40 283]
 [ 17  16 393]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.01477s
Predict Time: 0.00198s
Acurácia    : 46.73%
              precision    recall  f1-score   support

    NEGATIVO       0.47      1.00      0.64       635
      NEUTRO       0.00      0.00      0.00       130
    POSITIVO       0.00      0.00      0.00       594

   micro avg       0.47      0.47      0.47      1359
   macro avg       0.16      0.33      0.21      13

## LDA (usando TF-IDF)

In [12]:
#all_datasets
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=10, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf_svd)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#tweets_mg
vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)
X_lda = lda.fit_transform(X_tfidf_svd_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#titulo_noticias
vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)
X_lda = lda.fit_transform(X_tfidf_svd_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.0186s
Predict Time: 0.00125s
Acurácia    : 48.47%
              precision    recall  f1-score   support

    NEGATIVO       0.42      0.47      0.44       790
      NEUTRO       0.57      0.12      0.19       529
    POSITIVO       0.52      0.69      0.59      1029

   micro avg       0.48      0.48      0.48      2348
   macro avg       0.50      0.42      0.41      2348
weighted avg       0.50      0.48      0.45      2348

Matrix de Confusão: 
[[370  23 397]
 [216  62 251]
 [299  24 706]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.02311s
Predict Time: 0.00117s
Acurácia    : 50.0%
              precision    recall  f1-score   support

    NEGATIVO       0.44      0.52      0.48       790
      NEUTRO       0.43      0.48      0.45       529
    POSITIVO       0.62      0.49      0.55      1029

   micro avg       0.50      0.50      0.50      2348
   macro avg       0.50      0.50      0.49    

Modelo      : SVC
Train Time  : 0.76851s
Predict Time: 0.43126s
Acurácia    : 46.21%
              precision    recall  f1-score   support

    NEGATIVO       0.24      0.14      0.18       201
      NEUTRO       0.49      0.18      0.26       362
    POSITIVO       0.49      0.85      0.62       426

   micro avg       0.46      0.46      0.46       989
   macro avg       0.41      0.39      0.36       989
weighted avg       0.44      0.46      0.40       989

Matrix de Confusão: 
[[ 29  37 135]
 [ 59  65 238]
 [ 33  30 363]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.0108s
Predict Time: 0.00099s
Acurácia    : 53.57%
              precision    recall  f1-score   support

    NEGATIVO       0.54      0.76      0.63       635
      NEUTRO       0.00      0.00      0.00       130
    POSITIVO       0.53      0.41      0.46       594

   micro avg       0.54      0.54      0.54      1359
   macro avg       0.36      0.39      0.36      135

## Count

In [13]:
#all_datasets
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count, avalencias))
        matriz_resultados['all']['count'][classifier.__class__.__name__] = acc
        train_time['all']['count'][classifier.__class__.__name__] = tt
        predict_time['all']['count'][classifier.__class__.__name__] = pt
    except:
        pass

      
#tweets_mg
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tmg = vec_count.fit_transform(atweets_mg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['count'][classifier.__class__.__name__] = acc
        train_time['tweets']['count'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulo_noticias
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tn = vec_count.fit_transform(atitulo_noticias)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.0262s
Predict Time: 0.00281s
Acurácia    : 63.2%
              precision    recall  f1-score   support

    NEGATIVO       0.57      0.65      0.61       790
      NEUTRO       0.55      0.51      0.53       529
    POSITIVO       0.74      0.68      0.71      1029

   micro avg       0.63      0.63      0.63      2348
   macro avg       0.62      0.61      0.61      2348
weighted avg       0.64      0.63      0.63      2348

Matrix de Confusão: 
[[516 138 136]
 [147 270 112]
 [244  87 698]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.035s
Predict Time: 0.00291s
Acurácia    : 63.12%
              precision    recall  f1-score   support

    NEGATIVO       0.60      0.61      0.60       790
      NEUTRO       0.50      0.65      0.56       529
    POSITIVO       0.77      0.64      0.70      1029

   micro avg       0.63      0.63      0.63      2348
   macro avg       0.62      0.63      0.62      

Modelo      : SVC
Train Time  : 0.25913s
Predict Time: 0.18555s
Acurácia    : 48.63%
              precision    recall  f1-score   support

    NEGATIVO       0.34      0.10      0.16       201
      NEUTRO       0.54      0.17      0.26       362
    POSITIVO       0.49      0.94      0.64       426

   micro avg       0.49      0.49      0.49       989
   macro avg       0.46      0.40      0.35       989
weighted avg       0.48      0.49      0.40       989

Matrix de Confusão: 
[[ 21  32 148]
 [ 33  61 268]
 [  7  20 399]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00879s
Predict Time: 0.00106s
Acurácia    : 65.56%
              precision    recall  f1-score   support

    NEGATIVO       0.71      0.67      0.69       635
      NEUTRO       0.48      0.22      0.31       130
    POSITIVO       0.62      0.73      0.67       594

   micro avg       0.66      0.66      0.66      1359
   macro avg       0.61      0.54      0.56      13

## LSA (usando Count)

In [14]:
#all_datasets
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count_lsa)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['all']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['all']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass
      

#tweets_mg
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa_tmg = vec_count.fit_transform(atweets_mg)
X_svd = lda.fit_transform(X_count_lsa_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['tweets']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulos_noticias
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa_tn = vec_count.fit_transform(atitulo_noticias)
X_svd = lda.fit_transform(X_count_lsa_tn)

print("\ntitulos_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.02481s
Predict Time: 0.00102s
Acurácia    : 43.82%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       790
      NEUTRO       0.33      0.00      0.00       529
    POSITIVO       0.44      1.00      0.61      1029

   micro avg       0.44      0.44      0.44      2348
   macro avg       0.26      0.33      0.20      2348
weighted avg       0.27      0.44      0.27      2348

Matrix de Confusão: 
[[   0    1  789]
 [   0    1  528]
 [   0    1 1028]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.01517s
Predict Time: 0.00157s
Acurácia    : 50.6%
              precision    recall  f1-score   support

    NEGATIVO       0.61      0.18      0.27       790
      NEUTRO       0.43      0.57      0.49       529
    POSITIVO       0.53      0.73      0.61      1029

   micro avg       0.51      0.51      0.51      2348
   macro avg       0.52      0.49    

Modelo      : SVC
Train Time  : 0.34286s
Predict Time: 0.16097s
Acurácia    : 44.49%
              precision    recall  f1-score   support

    NEGATIVO       0.29      0.10      0.15       201
      NEUTRO       0.45      0.08      0.14       362
    POSITIVO       0.46      0.92      0.61       426

   micro avg       0.44      0.44      0.44       989
   macro avg       0.40      0.37      0.30       989
weighted avg       0.42      0.44      0.34       989

Matrix de Confusão: 
[[ 20  12 169]
 [ 37  29 296]
 [ 11  24 391]]
------------------------------


titulos_noticias
Modelo      : MultinomialNB
Train Time  : 0.00921s
Predict Time: 0.00057s
Acurácia    : 46.73%
              precision    recall  f1-score   support

    NEGATIVO       0.47      1.00      0.64       635
      NEUTRO       0.00      0.00      0.00       130
    POSITIVO       0.00      0.00      0.00       594

   micro avg       0.47      0.47      0.47      1359
   macro avg       0.16      0.33      0.21      1

## LDA (usando Count)

In [15]:
#all_datasets
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda = vec_tfidf.fit_transform(afrases)

lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=10, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count_lda)


print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['count+lda'][classifier.__class__.__name__] = acc
        train_time['all']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['all']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass


#tweets_mg
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda_tmg = vec_count.fit_transform(atweets_mg)
X_lda = lda.fit_transform(X_count_lda_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['count+lda'][classifier.__class__.__name__] = acc
        train_time['tweets']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['tweets']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass
      
      
#titulo_noticias
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda_tn = vec_count.fit_transform(atitulo_noticias)
X_lda = lda.fit_transform(X_count_lda_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lda'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['count+lda'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : MultinomialNB
Train Time  : 0.00936s
Predict Time: 0.0009s
Acurácia    : 50.55%
              precision    recall  f1-score   support

    NEGATIVO       0.44      0.51      0.47       790
      NEUTRO       0.53      0.18      0.27       529
    POSITIVO       0.55      0.67      0.60      1029

   micro avg       0.51      0.51      0.51      2348
   macro avg       0.51      0.45      0.45      2348
weighted avg       0.51      0.51      0.48      2348

Matrix de Confusão: 
[[399  45 346]
 [206  96 227]
 [296  41 692]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.01651s
Predict Time: 0.00087s
Acurácia    : 53.88%
              precision    recall  f1-score   support

    NEGATIVO       0.49      0.55      0.52       790
      NEUTRO       0.47      0.56      0.51       529
    POSITIVO       0.65      0.52      0.57      1029

   micro avg       0.54      0.54      0.54      2348
   macro avg       0.54      0.54      0.54   

Modelo      : SVC
Train Time  : 0.32485s
Predict Time: 0.15401s
Acurácia    : 44.29%
              precision    recall  f1-score   support

    NEGATIVO       0.26      0.17      0.21       201
      NEUTRO       0.50      0.04      0.08       362
    POSITIVO       0.47      0.91      0.62       426

   micro avg       0.44      0.44      0.44       989
   macro avg       0.41      0.37      0.30       989
weighted avg       0.44      0.44      0.34       989

Matrix de Confusão: 
[[ 34   9 158]
 [ 65  16 281]
 [ 31   7 388]]
------------------------------


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.00855s
Predict Time: 0.00631s
Acurácia    : 47.46%
              precision    recall  f1-score   support

    NEGATIVO       0.48      0.65      0.55       635
      NEUTRO       0.00      0.00      0.00       130
    POSITIVO       0.46      0.39      0.42       594

   micro avg       0.47      0.47      0.47      1359
   macro avg       0.31      0.35      0.33      13

## Count + TF-IDF + Word2Vec

In [16]:
#all_datasets
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(32746561, 35424500)

In [17]:
#all_datasets
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, avalencias))
        matriz_resultados['all']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['all']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['all']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


all_datasets
Modelo      : LogisticRegression
Train Time  : 21.4986s
Predict Time: 0.08318s
Acurácia    : 64.22%
              precision    recall  f1-score   support

    NEGATIVO       0.58      0.67      0.62       790
      NEUTRO       0.57      0.43      0.49       529
    POSITIVO       0.73      0.73      0.73      1029

   micro avg       0.64      0.64      0.64      2348
   macro avg       0.62      0.61      0.61      2348
weighted avg       0.64      0.64      0.64      2348

Matrix de Confusão: 
[[532 106 152]
 [174 229 126]
 [213  69 747]]
------------------------------

Modelo      : RandomForestClassifier
Train Time  : 28.98565s
Predict Time: 0.29381s
Acurácia    : 62.95%
              precision    recall  f1-score   support

    NEGATIVO       0.57      0.65      0.61       790
      NEUTRO       0.56      0.51      0.53       529
    POSITIVO       0.72      0.67      0.69      1029

   micro avg       0.63      0.63      0.63      2348
   macro avg       0.62      

In [18]:
#tweets_mg
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atweets_mg)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atweets_mg)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atweets_mg:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(10651726, 13109500)

In [19]:
#tweets_mg
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


tweets_mg
Modelo      : LogisticRegression
Train Time  : 2.02521s
Predict Time: 0.00892s
Acurácia    : 62.49%
              precision    recall  f1-score   support

    NEGATIVO       0.27      0.05      0.08       201
      NEUTRO       0.52      0.78      0.63       362
    POSITIVO       0.79      0.76      0.78       426

   micro avg       0.62      0.62      0.62       989
   macro avg       0.53      0.53      0.50       989
weighted avg       0.59      0.62      0.58       989

Matrix de Confusão: 
[[ 10 163  28]
 [ 22 284  56]
 [  5  97 324]]
------------------------------

Modelo      : RandomForestClassifier
Train Time  : 1.11889s
Predict Time: 0.03274s
Acurácia    : 63.9%
              precision    recall  f1-score   support

    NEGATIVO       0.33      0.10      0.15       201
      NEUTRO       0.53      0.82      0.64       362
    POSITIVO       0.87      0.74      0.80       426

   micro avg       0.64      0.64      0.64       989
   macro avg       0.57      0.55 

In [20]:
#titulo_noticias
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atitulo_noticias)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atitulo_noticias)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atitulo_noticias:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(20972654, 22315000)

In [21]:
#titulo_noticias
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wjv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass


titulo_noticias
Modelo      : MultinomialNB
Train Time  : 0.03827s
Predict Time: 0.00794s
Acurácia    : 66.23%
              precision    recall  f1-score   support

    NEGATIVO       0.67      0.79      0.73       635
      NEUTRO       1.00      0.03      0.06       130
    POSITIVO       0.65      0.66      0.65       594

   micro avg       0.66      0.66      0.66      1359
   macro avg       0.77      0.49      0.48      1359
weighted avg       0.69      0.66      0.63      1359

Matrix de Confusão: 
[[503   0 132]
 [ 42   4  84]
 [201   0 393]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.03733s
Predict Time: 0.00786s
Acurácia    : 64.61%
              precision    recall  f1-score   support

    NEGATIVO       0.70      0.69      0.70       635
      NEUTRO       0.41      0.31      0.35       130
    POSITIVO       0.63      0.67      0.65       594

   micro avg       0.65      0.65      0.65      1359
   macro avg       0.58      0.56      0.5

### Resultado dos Classificadores

#### Resultado dos Classificadores para todas as frases

In [22]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['all'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,63.12%,54.64%,50.00%,63.12%,50.60%,53.88%,0.00%
KNeighborsClassifier,57.84%,54.98%,45.36%,34.11%,54.43%,50.68%,59.41%
LinearSVC,64.78%,58.99%,50.13%,62.73%,58.90%,54.05%,62.52%
LogisticRegression,62.69%,58.73%,50.13%,64.18%,59.28%,54.22%,64.22%
MLPClassifier,61.97%,57.37%,48.94%,61.75%,59.75%,50.77%,59.28%
MultinomialNB,58.86%,43.91%,48.47%,63.20%,43.82%,50.55%,0.00%
RandomForestClassifier,62.61%,57.75%,50.21%,62.82%,59.07%,53.32%,62.95%
SVC,48.17%,46.25%,41.23%,48.04%,44.85%,46.64%,46.59%


In [23]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['all'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.01132,0.02366,0.02311,0.035,0.01517,0.01651,0.0
KNeighborsClassifier,0.00243,0.04872,0.02413,0.00272,0.02668,0.01475,3.27147
LinearSVC,0.22786,2.21711,0.79302,0.33628,0.77893,0.31192,0.08492
LogisticRegression,6.29359,1.06266,0.73606,10.64482,0.90832,0.56125,21.4986
MLPClassifier,500.15232,8.38732,32.79967,420.74058,5.18085,17.66538,66.92027
MultinomialNB,0.01453,0.0242,0.0186,0.0262,0.02481,0.00936,0.0
RandomForestClassifier,8.89272,6.65577,8.03179,21.97293,2.45774,1.15241,28.98565
SVC,0.8579,1.80184,1.81846,0.43936,0.61788,0.62617,25.01459


In [24]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['all'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00073,0.00117,0.00117,0.00291,0.00157,0.00087,0.0
KNeighborsClassifier,0.42257,7.79778,2.84385,0.62637,4.58188,2.04595,269.98029
LinearSVC,0.00103,0.00967,0.00147,0.00056,0.00067,0.00079,0.02392
LogisticRegression,0.00242,0.00126,0.00124,0.00464,0.00079,0.0016,0.08318
MLPClassifier,0.02753,0.00986,0.01163,0.01651,0.00675,0.00676,0.04415
MultinomialNB,0.00138,0.0012,0.00125,0.00281,0.00102,0.0009,0.0
RandomForestClassifier,0.13253,0.07394,0.0438,0.15753,0.03132,0.03065,0.29381
SVC,0.50209,1.08235,1.0698,0.30641,0.27102,0.38366,16.56266


#### Resultado dos Classificadores para os Tweets MG

In [25]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['tweets'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,59.25%,57.84%,52.07%,59.56%,57.13%,50.35%,0.00%
KNeighborsClassifier,61.17%,60.47%,53.59%,50.76%,58.04%,48.33%,61.38%
LinearSVC,63.90%,64.31%,55.71%,61.48%,62.49%,54.80%,62.89%
LogisticRegression,63.40%,63.40%,55.31%,63.30%,61.98%,54.50%,62.49%
MLPClassifier,61.68%,60.26%,53.69%,61.07%,64.21%,53.19%,61.27%
MultinomialNB,61.98%,50.56%,55.71%,60.57%,53.08%,53.69%,0.00%
RandomForestClassifier,63.40%,61.27%,53.79%,62.59%,60.97%,53.19%,63.90%
SVC,53.69%,46.01%,46.21%,48.63%,44.49%,44.29%,55.71%


In [26]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['tweets'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00645,0.02089,0.01123,0.0062,0.007,0.00678,0.0
KNeighborsClassifier,0.00186,0.02979,0.01879,0.00149,0.00686,0.00604,0.19286
LinearSVC,0.11271,0.82166,0.31729,0.1838,0.47325,0.17695,0.02654
LogisticRegression,2.41728,0.74969,0.30723,2.64707,0.34512,0.17787,2.02521
MLPClassifier,41.46804,6.01657,12.12654,33.42182,2.52885,8.12206,10.24124
MultinomialNB,0.00563,0.01163,0.00858,0.00594,0.00694,0.00333,0.0
RandomForestClassifier,3.62224,2.3482,2.51137,1.71502,0.79983,0.41222,1.11889
SVC,0.60397,0.76614,0.76851,0.25913,0.34286,0.32485,5.61091


In [27]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['tweets'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00053,0.00066,0.00069,0.00041,0.00043,0.00047,0.0
KNeighborsClassifier,0.14588,1.00287,1.03105,0.10617,0.56981,0.26528,12.9362
LinearSVC,0.00067,0.00076,0.00181,0.00035,0.00059,0.01041,0.00368
LogisticRegression,0.00165,0.0147,0.00077,0.00103,0.00186,0.00045,0.00892
MLPClassifier,0.01048,0.00564,0.0057,0.0262,0.00231,0.00311,0.01035
MultinomialNB,0.00054,0.01165,0.00087,0.00087,0.00047,0.00041,0.0
RandomForestClassifier,0.05584,0.03586,0.04095,0.03709,0.01153,0.01528,0.03274
SVC,0.34286,0.38243,0.43126,0.18555,0.16097,0.15401,3.16858


#### Resultados dos Classificadores para os Títulos de Notícias

In [28]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados['titulo_noticias'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,65.19%,57.17%,48.64%,60.49%,48.64%,39.74%,64.61%
KNeighborsClassifier,60.49%,54.97%,50.40%,47.76%,54.89%,47.83%,60.56%
LinearSVC,68.65%,62.99%,53.42%,65.86%,61.59%,47.53%,66.08%
LogisticRegression,65.86%,63.72%,53.79%,66.96%,60.71%,47.31%,66.52%
MLPClassifier,66.00%,61.52%,48.86%,67.18%,58.65%,46.43%,64.75%
MultinomialNB,65.34%,46.73%,53.57%,65.56%,46.73%,47.46%,66.23%
RandomForestClassifier,62.62%,60.41%,52.24%,63.80%,60.71%,48.93%,64.68%
SVC,51.29%,45.33%,45.18%,45.18%,46.14%,43.93%,53.79%


In [29]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time['titulo_noticias'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.0225,0.01448,0.01521,0.01115,0.00869,0.00805,0.03733
KNeighborsClassifier,0.00397,0.02049,0.02605,0.00167,0.01027,0.00959,0.38568
LinearSVC,0.17626,1.18968,0.42384,0.25415,0.55289,0.19812,0.04004
LogisticRegression,6.15197,0.65222,0.39617,6.46886,0.41299,0.2256,3.4431
MLPClassifier,149.8301,9.2845,17.96285,79.28158,4.4502,13.7403,20.13697
MultinomialNB,0.01788,0.01477,0.0108,0.00879,0.00921,0.00855,0.03827
RandomForestClassifier,11.44937,3.48411,4.32926,5.72836,1.08461,0.56448,3.97777
SVC,0.83014,1.0472,1.04952,0.33907,0.40392,0.36003,10.48507


In [30]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time['titulo_noticias'])
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00187,0.00082,0.0008,0.00092,0.00051,0.00056,0.00786
KNeighborsClassifier,0.44656,1.30266,1.53888,0.15371,0.94733,0.43912,30.83668
LinearSVC,0.00096,0.00096,0.00084,0.00052,0.00048,0.00047,0.00716
LogisticRegression,0.00667,0.00164,0.00103,0.00307,0.00128,0.00056,0.01708
MLPClassifier,0.01744,0.00498,0.007,0.02133,0.00337,0.00434,0.01733
MultinomialNB,0.00189,0.00198,0.00099,0.00106,0.00057,0.00631,0.00794
RandomForestClassifier,0.19044,0.03911,0.0536,0.06809,0.01543,0.0211,0.06379
SVC,0.51707,0.55322,0.58414,0.18923,0.19878,0.18362,6.32095


### Modelo escolhido e salvo

In [31]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

svm = LinearSVC(max_iter=150, random_state=10)
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, avalencias)

accuracy = np.round(model.score(X_svd, avalencias) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')

# filename = 'tfidf_valence.sav'
# joblib.dump(vec_tfidf, filename)

# filename = 'lsa_valence.sav'
# joblib.dump(lsa, filename)

# filename = 'model_valence.sav'
# joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 62.03%


In [32]:
model.classes_

array(['NEGATIVO', 'NEUTRO', 'POSITIVO'], dtype='<U8')

In [33]:
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[37.31, 41.47, 21.22]