## Testes de valência para o projeto final de IA369Y 2 Semestre 2018

Passos para tratar os dados com valência, testar e escolher um classificador para utilizar no projeto final de IA369Y.

1) Remover espaços duplos, quebras de linha, números e links do dataset e das frases a serem testadas.

2) Remover stopwords e aplicar o stemmer.

3) Treinar os classificadores.

4) Realizar as predições com os classificadores.

5) Avaliar as medidas obtidas com os classificadores.

In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

def highlight_max(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

### Matriz de Resultados

In [2]:
classf = {
    'MultinomialNB': 0,
    'ComplementNB': 0,
    'LogisticRegression': 0,
    'RandomForestClassifier': 0,
    'KNeighborsClassifier': 0,
    'MLPClassifier': 0,
    'LinearSVC': 0,
    'SVC': 0
}

matriz_resultados = {
    'all': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    },
    'tweets': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),       
    },
    'titulo_noticias': {
        'tfidf': copy.deepcopy(classf),
        'tfidf+lsa': copy.deepcopy(classf),
        'tfidf+lda': copy.deepcopy(classf),
        'count': copy.deepcopy(classf),
        'count+lsa': copy.deepcopy(classf),
        'count+lda': copy.deepcopy(classf),
        'tfidf+count+w2c': copy.deepcopy(classf),
    }
}

## Datasets

Para validar, serão utilizados dois datasets.

Os dos datasets foram obtidos do site minerando dados.

O primeiro deles tem tweets de política de Minas Gerais com rótulos de valência: positivo, negativo e neutro. Foi feito um tratamento para eliminar tweets repetidos e dessa forma sobraram 3016 tweets.

O segundo contém 2123 títulos de notícias com rótulos de valência: positivo, negativo e neutro.

In [3]:
path = '/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/valencia/'

#Carregando os datasets
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

In [4]:
#Carrega os datasets em separado
tweets_mg = carregar(f'{path}tweets_mg_tratados.csv')
shuffle(tweets_mg)
titulo_noticias = carregar(f'{path}titulo_noticias.txt')
shuffle(titulo_noticias)

frases = tweets_mg + titulo_noticias
shuffle(frases)

print(frases[:5])
print('-' * 20)
print(tweets_mg[:5])
print('-' * 20)
print(titulo_noticias[:5])

[('POSITIVO', 'quer ser líd dev ser primeir serv quer lider dev serv'), ('POSITIVO', 'prevençã contr febr amarel reforc zon rural sul'), ('NEGATIVO', 'bovesp desvaloriz set mundial terr brasil'), ('NEGATIVO', 'nã outr vid além dest vid brincadeir cruel hipocris maldad egoísm traiçã'), ('POSITIVO', 'escrev esquec literatur maneir mais agrad ignor vid músic embal arte visual anim arte viv danc arte represent entret primeir porém afast vid faz son segund contud nã afast vid umar porqu usar fórmul visível portant vital outr porqu viv mesm vid human')]
--------------------
[('POSITIVO', 'tradersclubr calam financeir compr dois helicópter'), ('POSITIVO', 'uol semanári francês charli hebd resist dois ano após atent estar min dois ano atent'), ('NEGATIVO', 'muit falt vergonh principal respeit popul assim outr maneir'), ('NEUTRO', 'bhlinux comun assim desgrac tom cont país govern vagabund'), ('NEUTRO', 'grenfield forc mudanc funcef econom estar min')]
--------------------
[('POSITIVO', 'políc d

In [5]:
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = ' '.join(row[0].strip().splitlines())
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

titulo_noticias2 = carregar(f'{path}titulo_noticias.txt')
only_neutro = []
for sent, new in titulo_noticias2:
    if sent == 'NEUTRO':
        only_neutro.append(new)
with open(f'{path}titulo_noticias_neutro.txt', 'w') as h:
    for k in only_neutro:
        h.write(f'{k}|NEUTRO\n')

In [6]:
#all_datasets
afrases = []
avalencias = []
for valencia, frase in frases:
    afrases.append(frase)
    avalencias.append(valencia)
    
print(afrases[:5])
print(avalencias[:5])
print('-' * 20)

#tweets_mg
atweets_mg = []
aval_tweets_mg = []
for valencia, frase in tweets_mg:
    atweets_mg.append(frase)
    aval_tweets_mg.append(valencia)

print(atweets_mg[:5])
print(aval_tweets_mg[:5])
print('-' * 20)

#titulo_noticias
atitulo_noticias = []
aval_titulo_noticias = []
for valencia, frase in titulo_noticias:
    atitulo_noticias.append(frase)
    aval_titulo_noticias.append(valencia)

print(atitulo_noticias[:5])
print(aval_titulo_noticias[:5])

['quer ser líd dev ser primeir serv quer lider dev serv', 'prevençã contr febr amarel reforc zon rural sul', 'bovesp desvaloriz set mundial terr brasil', 'nã outr vid além dest vid brincadeir cruel hipocris maldad egoísm traiçã', 'escrev esquec literatur maneir mais agrad ignor vid músic embal arte visual anim arte viv danc arte represent entret primeir porém afast vid faz son segund contud nã afast vid umar porqu usar fórmul visível portant vital outr porqu viv mesm vid human']
['POSITIVO', 'POSITIVO', 'NEGATIVO', 'NEGATIVO', 'POSITIVO']
--------------------
['tradersclubr calam financeir compr dois helicópter', 'uol semanári francês charli hebd resist dois ano após atent estar min dois ano atent', 'muit falt vergonh principal respeit popul assim outr maneir', 'bhlinux comun assim desgrac tom cont país govern vagabund', 'grenfield forc mudanc funcef econom estar min']
['POSITIVO', 'POSITIVO', 'NEGATIVO', 'NEUTRO', 'NEUTRO']
--------------------
['políc distribu panflet incentiv denúnc

In [7]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    y_pred = model.predict(X_test)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

## Classificadores

In [8]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=5, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=0),
    LinearSVC(max_iter=500),
    SVC(gamma='auto', max_iter=500),
)

## TF-IDF

In [9]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)

vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)

In [10]:
print("\nall_datasets")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_tfidf, avalencias))
        matriz_resultados['all']['tfidf'][classifier.__class__.__name__] = acc
    except:
        pass

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_tfidf_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf'][classifier.__class__.__name__] = acc
    except:
        pass

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_tfidf_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf'][classifier.__class__.__name__] = acc
    except:
        pass      


all_datasets
Modelo   : MultinomialNB
Acurácia : 60.02%
              precision    recall  f1-score   support

    NEGATIVO       0.60      0.59      0.59       813
      NEUTRO       0.63      0.16      0.25       507
    POSITIVO       0.60      0.83      0.70      1011

   micro avg       0.60      0.60      0.60      2331
   macro avg       0.61      0.53      0.51      2331
weighted avg       0.61      0.60      0.56      2331

Matrix de Confusão: 
[[480  28 305]
 [171  80 256]
 [153  19 839]]
------------------------------

Modelo   : ComplementNB
Acurácia : 63.62%
              precision    recall  f1-score   support

    NEGATIVO       0.61      0.63      0.62       813
      NEUTRO       0.53      0.52      0.53       507
    POSITIVO       0.71      0.69      0.70      1011

   micro avg       0.64      0.64      0.64      2331
   macro avg       0.62      0.62      0.62      2331
weighted avg       0.64      0.64      0.64      2331

Matrix de Confusão: 
[[516 137 160]
 [12

Modelo   : LogisticRegression
Acurácia : 66.1%
              precision    recall  f1-score   support

    NEGATIVO       0.66      0.82      0.73       638
      NEUTRO       0.83      0.08      0.14       128
    POSITIVO       0.67      0.61      0.64       576

   micro avg       0.66      0.66      0.66      1342
   macro avg       0.72      0.50      0.50      1342
weighted avg       0.68      0.66      0.63      1342

Matrix de Confusão: 
[[523   0 115]
 [ 55  10  63]
 [220   2 354]]
------------------------------

Modelo   : RandomForestClassifier
Acurácia : 63.79%
              precision    recall  f1-score   support

    NEGATIVO       0.62      0.85      0.72       638
      NEUTRO       0.66      0.15      0.24       128
    POSITIVO       0.67      0.51      0.58       576

   micro avg       0.64      0.64      0.64      1342
   macro avg       0.65      0.50      0.51      1342
weighted avg       0.65      0.64      0.61      1342

Matrix de Confusão: 
[[541   3  94]
 [ 6

## LSA (usando TF-IDF)

In [11]:
#all_datasets
svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['tfidf+lsa'][classifier.__class__.__name__] = acc
    except Exception as e:
        print(e)

#tweets_mg
X_svd = lsa.fit_transform(X_tfidf_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lsa'][classifier.__class__.__name__] = acc
    except:
        pass


#titulo_noticias
X_svd = lsa.fit_transform(X_tfidf_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lsa'][classifier.__class__.__name__] = acc
    except:
        pass


all_datasets
Modelo   : MultinomialNB
Acurácia : 43.37%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       813
      NEUTRO       0.00      0.00      0.00       507
    POSITIVO       0.43      1.00      0.61      1011

   micro avg       0.43      0.43      0.43      2331
   macro avg       0.14      0.33      0.20      2331
weighted avg       0.19      0.43      0.26      2331

Matrix de Confusão: 
[[   0    0  813]
 [   0    0  507]
 [   0    0 1011]]
------------------------------

Modelo   : ComplementNB
Acurácia : 53.8%
              precision    recall  f1-score   support

    NEGATIVO       0.55      0.57      0.56       813
      NEUTRO       0.42      0.52      0.46       507
    POSITIVO       0.61      0.52      0.56      1011

   micro avg       0.54      0.54      0.54      2331
   macro avg       0.53      0.54      0.53      2331
weighted avg       0.55      0.54      0.54      2331

Matrix de Confusão: 
[[465 146 2


titulo_noticias
Modelo   : MultinomialNB
Acurácia : 47.99%
              precision    recall  f1-score   support

    NEGATIVO       0.48      0.99      0.65       638
      NEUTRO       0.00      0.00      0.00       128
    POSITIVO       0.52      0.02      0.05       576

   micro avg       0.48      0.48      0.48      1342
   macro avg       0.33      0.34      0.23      1342
weighted avg       0.45      0.48      0.33      1342

Matrix de Confusão: 
[[630   0   8]
 [123   0   5]
 [562   0  14]]
------------------------------

Modelo   : ComplementNB
Acurácia : 55.07%
              precision    recall  f1-score   support

    NEGATIVO       0.56      0.73      0.64       638
      NEUTRO       0.30      0.23      0.26       128
    POSITIVO       0.58      0.43      0.49       576

   micro avg       0.55      0.55      0.55      1342
   macro avg       0.48      0.46      0.46      1342
weighted avg       0.55      0.55      0.54      1342

Matrix de Confusão: 
[[463  37 138]
 

## LDA (usando TF-IDF)

In [12]:
#all_datasets
lda = LatentDirichletAllocation(n_components=70, max_iter=50, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['tfidf+lda'][classifier.__class__.__name__] = acc
    except:
        pass


#tweets_mg
X_lda = lda.fit_transform(X_tfidf_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+lda'][classifier.__class__.__name__] = acc
    except:
        pass


#titulo_noticias
X_lda = lda.fit_transform(X_tfidf_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+lda'][classifier.__class__.__name__] = acc
    except:
        pass


all_datasets
Modelo   : MultinomialNB
Acurácia : 44.19%
              precision    recall  f1-score   support

    NEGATIVO       0.39      0.37      0.38       813
      NEUTRO       0.36      0.02      0.04       507
    POSITIVO       0.47      0.71      0.57      1011

   micro avg       0.44      0.44      0.44      2331
   macro avg       0.41      0.37      0.33      2331
weighted avg       0.42      0.44      0.39      2331

Matrix de Confusão: 
[[299  13 501]
 [181  12 314]
 [284   8 719]]
------------------------------

Modelo   : ComplementNB
Acurácia : 42.3%
              precision    recall  f1-score   support

    NEGATIVO       0.40      0.41      0.41       813
      NEUTRO       0.30      0.40      0.35       507
    POSITIVO       0.54      0.44      0.48      1011

   micro avg       0.42      0.42      0.42      2331
   macro avg       0.42      0.42      0.41      2331
weighted avg       0.44      0.42      0.43      2331

Matrix de Confusão: 
[[336 230 247]
 [169


titulo_noticias
Modelo   : MultinomialNB
Acurácia : 47.54%
              precision    recall  f1-score   support

    NEGATIVO       0.48      0.71      0.58       638
      NEUTRO       0.00      0.00      0.00       128
    POSITIVO       0.45      0.32      0.38       576

   micro avg       0.48      0.48      0.48      1342
   macro avg       0.31      0.34      0.32      1342
weighted avg       0.43      0.48      0.44      1342

Matrix de Confusão: 
[[452   0 186]
 [ 90   0  38]
 [390   0 186]]
------------------------------

Modelo   : ComplementNB
Acurácia : 41.95%
              precision    recall  f1-score   support

    NEGATIVO       0.51      0.43      0.47       638
      NEUTRO       0.12      0.23      0.16       128
    POSITIVO       0.47      0.45      0.46       576

   micro avg       0.42      0.42      0.42      1342
   macro avg       0.37      0.37      0.36      1342
weighted avg       0.46      0.42      0.43      1342

Matrix de Confusão: 
[[274 118 246]
 

## Count

In [13]:
#all_datasets
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_count, avalencias))
        matriz_resultados['all']['count'][classifier.__class__.__name__] = acc
    except:
        pass

      
#tweets_mg
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tmg = vec_count.fit_transform(atweets_mg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_count_tmg, aval_tweets_mg))
        matriz_resultados['tweets']['count'][classifier.__class__.__name__] = acc
    except:
        pass
      
      
#titulo_noticias
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tn = vec_count.fit_transform(atitulo_noticias)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_count_tn, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count'][classifier.__class__.__name__] = acc
    except:
        pass


all_datasets
Modelo   : MultinomialNB
Acurácia : 64.14%
              precision    recall  f1-score   support

    NEGATIVO       0.61      0.65      0.62       813
      NEUTRO       0.53      0.53      0.53       507
    POSITIVO       0.73      0.69      0.71      1011

   micro avg       0.64      0.64      0.64      2331
   macro avg       0.62      0.62      0.62      2331
weighted avg       0.64      0.64      0.64      2331

Matrix de Confusão: 
[[525 141 147]
 [125 270 112]
 [217  94 700]]
------------------------------

Modelo   : ComplementNB
Acurácia : 63.45%
              precision    recall  f1-score   support

    NEGATIVO       0.62      0.60      0.61       813
      NEUTRO       0.49      0.65      0.56       507
    POSITIVO       0.75      0.65      0.70      1011

   micro avg       0.63      0.63      0.63      2331
   macro avg       0.62      0.64      0.62      2331
weighted avg       0.65      0.63      0.64      2331

Matrix de Confusão: 
[[489 195 129]
 [ 9

Modelo   : ComplementNB
Acurácia : 59.76%
              precision    recall  f1-score   support

    NEGATIVO       0.72      0.59      0.65       638
      NEUTRO       0.25      0.51      0.33       128
    POSITIVO       0.65      0.63      0.64       576

   micro avg       0.60      0.60      0.60      1342
   macro avg       0.54      0.57      0.54      1342
weighted avg       0.64      0.60      0.61      1342

Matrix de Confusão: 
[[376 109 153]
 [ 19  65  44]
 [127  88 361]]
------------------------------

Modelo   : LogisticRegression
Acurácia : 66.77%
              precision    recall  f1-score   support

    NEGATIVO       0.69      0.75      0.72       638
      NEUTRO       0.51      0.30      0.38       128
    POSITIVO       0.66      0.66      0.66       576

   micro avg       0.67      0.67      0.67      1342
   macro avg       0.62      0.57      0.59      1342
weighted avg       0.66      0.67      0.66      1342

Matrix de Confusão: 
[[478  17 143]
 [ 39  39  50

## LSA (usando Count)

In [14]:
#all_datasets
svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, avalencias))
        matriz_resultados['all']['count+lsa'][classifier.__class__.__name__] = acc
    except:
        pass
      

#tweets_mg
X_svd = lda.fit_transform(X_count_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
        matriz_resultados['tweets']['count+lsa'][classifier.__class__.__name__] = acc
    except:
        pass
      
      
#titulos_noticias
X_svd = lda.fit_transform(X_count_tn)

print("\ntitulos_noticias")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lsa'][classifier.__class__.__name__] = acc
    except:
        pass


all_datasets
Modelo   : MultinomialNB
Acurácia : 43.37%
              precision    recall  f1-score   support

    NEGATIVO       0.00      0.00      0.00       813
      NEUTRO       0.50      0.00      0.00       507
    POSITIVO       0.43      1.00      0.60      1011

   micro avg       0.43      0.43      0.43      2331
   macro avg       0.31      0.33      0.20      2331
weighted avg       0.30      0.43      0.26      2331

Matrix de Confusão: 
[[   0    0  813]
 [   0    1  506]
 [   0    1 1010]]
------------------------------

Modelo   : ComplementNB
Acurácia : 49.38%
              precision    recall  f1-score   support

    NEGATIVO       0.65      0.14      0.23       813
      NEUTRO       0.42      0.56      0.48       507
    POSITIVO       0.51      0.75      0.61      1011

   micro avg       0.49      0.49      0.49      2331
   macro avg       0.53      0.48      0.44      2331
weighted avg       0.54      0.49      0.45      2331

Matrix de Confusão: 
[[112 181 


titulos_noticias
Modelo   : MultinomialNB
Acurácia : 47.54%
              precision    recall  f1-score   support

    NEGATIVO       0.48      1.00      0.64       638
      NEUTRO       0.00      0.00      0.00       128
    POSITIVO       0.00      0.00      0.00       576

   micro avg       0.48      0.48      0.48      1342
   macro avg       0.16      0.33      0.21      1342
weighted avg       0.23      0.48      0.31      1342

Matrix de Confusão: 
[[638   0   0]
 [128   0   0]
 [576   0   0]]
------------------------------

Modelo   : ComplementNB
Acurácia : 47.17%
              precision    recall  f1-score   support

    NEGATIVO       0.64      0.52      0.57       638
      NEUTRO       0.17      0.70      0.28       128
    POSITIVO       0.68      0.37      0.48       576

   micro avg       0.47      0.47      0.47      1342
   macro avg       0.50      0.53      0.44      1342
weighted avg       0.61      0.47      0.51      1342

Matrix de Confusão: 
[[329 224  85]


## LDA (usando Count)

In [15]:
#all_datasets
lda = LatentDirichletAllocation(n_components=70, max_iter=50, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, avalencias))
        matriz_resultados['all']['count+lda'][classifier.__class__.__name__] = acc
    except:
        pass


#tweets_mg
X_lda = lda.fit_transform(X_count_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
        matriz_resultados['tweets']['count+lda'][classifier.__class__.__name__] = acc
    except:
        pass
      
      
#titulo_noticias
X_lda = lda.fit_transform(X_count_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['count+lda'][classifier.__class__.__name__] = acc
    except:
        pass


all_datasets
Modelo   : MultinomialNB
Acurácia : 44.87%
              precision    recall  f1-score   support

    NEGATIVO       0.40      0.41      0.40       813
      NEUTRO       0.43      0.08      0.14       507
    POSITIVO       0.48      0.67      0.56      1011

   micro avg       0.45      0.45      0.45      2331
   macro avg       0.44      0.39      0.37      2331
weighted avg       0.44      0.45      0.41      2331

Matrix de Confusão: 
[[331  27 455]
 [192  42 273]
 [310  28 673]]
------------------------------

Modelo   : ComplementNB
Acurácia : 44.14%
              precision    recall  f1-score   support

    NEGATIVO       0.42      0.44      0.43       813
      NEUTRO       0.34      0.45      0.39       507
    POSITIVO       0.56      0.44      0.49      1011

   micro avg       0.44      0.44      0.44      2331
   macro avg       0.44      0.44      0.43      2331
weighted avg       0.46      0.44      0.45      2331

Matrix de Confusão: 
[[358 221 234]
 [16


titulo_noticias
Modelo   : MultinomialNB
Acurácia : 47.76%
              precision    recall  f1-score   support

    NEGATIVO       0.49      0.69      0.57       638
      NEUTRO       0.00      0.00      0.00       128
    POSITIVO       0.45      0.35      0.40       576

   micro avg       0.48      0.48      0.48      1342
   macro avg       0.31      0.35      0.32      1342
weighted avg       0.43      0.48      0.44      1342

Matrix de Confusão: 
[[439   0 199]
 [ 83   0  45]
 [374   0 202]]
------------------------------

Modelo   : ComplementNB
Acurácia : 39.12%
              precision    recall  f1-score   support

    NEGATIVO       0.52      0.43      0.47       638
      NEUTRO       0.11      0.26      0.16       128
    POSITIVO       0.42      0.38      0.40       576

   micro avg       0.39      0.39      0.39      1342
   macro avg       0.35      0.36      0.34      1342
weighted avg       0.44      0.39      0.41      1342

Matrix de Confusão: 
[[272 119 247]
 

## Count + TF-IDF + Word2Vec

In [16]:
#all_datasets
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(65077171, 70428000)

In [17]:
#all_datasets
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\nall_datasets")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X, avalencias))
        matriz_resultados['all']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
    except:
        pass


all_datasets
Modelo   : LogisticRegression
Acurácia : 63.11%
              precision    recall  f1-score   support

    NEGATIVO       0.60      0.63      0.62       813
      NEUTRO       0.52      0.41      0.46       507
    POSITIVO       0.70      0.74      0.72      1011

   micro avg       0.63      0.63      0.63      2331
   macro avg       0.61      0.59      0.60      2331
weighted avg       0.62      0.63      0.63      2331

Matrix de Confusão: 
[[516 117 180]
 [154 209 144]
 [186  79 746]]
------------------------------

Modelo   : RandomForestClassifier
Acurácia : 61.95%
              precision    recall  f1-score   support

    NEGATIVO       0.59      0.63      0.61       813
      NEUTRO       0.52      0.49      0.50       507
    POSITIVO       0.69      0.68      0.69      1011

   micro avg       0.62      0.62      0.62      2331
   macro avg       0.60      0.60      0.60      2331
weighted avg       0.62      0.62      0.62      2331

Matrix de Confusão: 
[[51

In [18]:
#tweets_mg
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atweets_mg)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atweets_mg)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atweets_mg:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(21304093, 26219000)

In [19]:
#tweets_mg
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X, aval_tweets_mg))
        matriz_resultados['tweets']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
    except:
        pass


tweets_mg
Modelo   : LogisticRegression
Acurácia : 65.42%
              precision    recall  f1-score   support

    NEGATIVO       0.33      0.09      0.14       191
      NEUTRO       0.54      0.84      0.66       363
    POSITIVO       0.87      0.75      0.80       435

   micro avg       0.65      0.65      0.65       989
   macro avg       0.58      0.56      0.53       989
weighted avg       0.65      0.65      0.62       989

Matrix de Confusão: 
[[ 17 157  17]
 [ 26 305  32]
 [  8 102 325]]
------------------------------

Modelo   : RandomForestClassifier
Acurácia : 64.21%
              precision    recall  f1-score   support

    NEGATIVO       0.26      0.09      0.13       191
      NEUTRO       0.53      0.82      0.65       363
    POSITIVO       0.88      0.73      0.80       435

   micro avg       0.64      0.64      0.64       989
   macro avg       0.56      0.55      0.53       989
weighted avg       0.63      0.64      0.62       989

Matrix de Confusão: 
[[ 17 1

In [20]:
#titulo_noticias
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atitulo_noticias)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atitulo_noticias)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atitulo_noticias:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(41547632, 44209000)

In [21]:
#titulo_noticias
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wjv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X, aval_titulo_noticias))
        matriz_resultados['titulo_noticias']['tfidf+count+w2c'][classifier.__class__.__name__] = acc
    except:
        pass


titulo_noticias
Modelo   : MultinomialNB
Acurácia : 64.38%
              precision    recall  f1-score   support

    NEGATIVO       0.67      0.77      0.72       638
      NEUTRO       1.00      0.03      0.06       128
    POSITIVO       0.61      0.64      0.63       576

   micro avg       0.64      0.64      0.64      1342
   macro avg       0.76      0.48      0.47      1342
weighted avg       0.68      0.64      0.61      1342

Matrix de Confusão: 
[[492   0 146]
 [ 37   4  87]
 [208   0 368]]
------------------------------

Modelo   : ComplementNB
Acurácia : 64.31%
              precision    recall  f1-score   support

    NEGATIVO       0.69      0.69      0.69       638
      NEUTRO       0.41      0.31      0.36       128
    POSITIVO       0.63      0.66      0.64       576

   micro avg       0.64      0.64      0.64      1342
   macro avg       0.58      0.56      0.56      1342
weighted avg       0.64      0.64      0.64      1342

Matrix de Confusão: 
[[443  28 167]
 

### Resultado dos Classificadores

#### Resultado dos Classificadores para todas as frases

In [22]:
df = pd.DataFrame.from_dict(matriz_resultados['all'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,63.62%,53.80%,42.30%,63.45%,49.38%,44.14%,0.00%
KNeighborsClassifier,59.80%,56.63%,42.94%,35.14%,55.08%,45.60%,59.67%
LinearSVC,65.17%,58.86%,43.72%,63.84%,59.42%,45.39%,63.49%
LogisticRegression,63.58%,57.79%,43.93%,64.78%,57.66%,45.35%,63.11%
MLPClassifier,62.93%,57.96%,44.70%,62.03%,58.09%,45.47%,59.67%
MultinomialNB,60.02%,43.37%,44.19%,64.14%,43.37%,44.87%,0.00%
RandomForestClassifier,62.81%,59.33%,44.40%,63.28%,59.24%,48.05%,61.95%
SVC,54.14%,43.29%,43.54%,45.60%,45.39%,44.83%,50.36%


#### Resultado dos Classificadores para os Tweets MG

In [23]:
df = pd.DataFrame.from_dict(matriz_resultados['tweets'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,60.16%,57.43%,46.11%,60.06%,60.57%,52.88%,0.00%
KNeighborsClassifier,58.85%,60.36%,46.31%,49.34%,60.77%,50.66%,63.50%
LinearSVC,64.00%,65.32%,49.85%,63.60%,64.51%,56.52%,64.71%
LogisticRegression,65.42%,65.72%,51.37%,64.21%,65.22%,56.02%,65.42%
MLPClassifier,62.29%,62.99%,51.16%,62.08%,64.51%,52.78%,59.35%
MultinomialNB,62.59%,45.80%,51.06%,61.17%,54.40%,56.83%,0.00%
RandomForestClassifier,64.91%,61.98%,50.46%,64.71%,61.68%,58.24%,64.21%
SVC,62.49%,48.74%,47.62%,45.20%,55.11%,53.49%,62.99%


#### Resultados dos Classificadores para os Títulos de Notícias

In [24]:
df = pd.DataFrame.from_dict(matriz_resultados['titulo_noticias'])
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,65.20%,55.07%,41.95%,59.76%,47.17%,39.12%,64.31%
KNeighborsClassifier,62.22%,59.31%,46.87%,49.33%,57.15%,45.45%,61.70%
LinearSVC,67.59%,63.34%,48.06%,65.87%,62.22%,47.84%,66.32%
LogisticRegression,66.10%,63.19%,47.99%,66.77%,62.07%,47.84%,66.39%
MLPClassifier,66.99%,62.74%,46.94%,66.32%,61.40%,47.32%,63.79%
MultinomialNB,65.20%,47.99%,47.54%,65.72%,47.54%,47.76%,64.38%
RandomForestClassifier,63.79%,62.30%,45.31%,63.93%,59.46%,47.24%,65.35%
SVC,52.61%,45.38%,43.22%,44.26%,45.83%,43.67%,47.54%


### Modelo escolhido e salvo

In [25]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

svm = LinearSVC(max_iter=1200)
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, avalencias)

accuracy = np.round(model.score(X_svd, avalencias) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')


filename = 'tfidf_valence.sav'
joblib.dump(vec_tfidf, filename)

filename = 'lsa_valence.sav'
joblib.dump(lsa, filename)

filename = 'model_valence.sav'
joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 60.15%


['model_valence.sav']

In [26]:
model.classes_

array(['NEGATIVO', 'NEUTRO', 'POSITIVO'], dtype='<U8')

In [27]:
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[39.18, 10.02, 50.8]