In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

def highlight_max(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)


def highlight_min(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_min = data == data.min()
        return [attr if v else '' for v in is_min]
    else:  # from .apply(axis=None)
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''),
                            index=data.index, columns=data.columns)   

### Matriz de resultados

In [2]:
classf = {
    'MultinomialNB': 0,
    'ComplementNB': 0,
    'LogisticRegression': 0,
    'RandomForestClassifier': 0,
    'KNeighborsClassifier': 0,
    'MLPClassifier': 0,
    'LinearSVC': 0,
    'SVC': 0
}

matriz_resultados = {
    'tfidf': copy.deepcopy(classf),
    'tfidf+lsa': copy.deepcopy(classf),
    'tfidf+lda': copy.deepcopy(classf),
    'count': copy.deepcopy(classf),
    'count+lsa': copy.deepcopy(classf),
    'count+lda': copy.deepcopy(classf),
    'tfidf+count+w2c': copy.deepcopy(classf),
}

train_time = {
    'tfidf': copy.deepcopy(classf),
    'tfidf+lsa': copy.deepcopy(classf),
    'tfidf+lda': copy.deepcopy(classf),
    'count': copy.deepcopy(classf),
    'count+lsa': copy.deepcopy(classf),
    'count+lda': copy.deepcopy(classf),
    'tfidf+count+w2c': copy.deepcopy(classf),
}

predict_time = {
    'tfidf': copy.deepcopy(classf),
    'tfidf+lsa': copy.deepcopy(classf),
    'tfidf+lda': copy.deepcopy(classf),
    'count': copy.deepcopy(classf),
    'count+lsa': copy.deepcopy(classf),
    'count+lda': copy.deepcopy(classf),
    'tfidf+count+w2c': copy.deepcopy(classf),
}

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).
Alem disso, foi acrescido em torno de 700 novas frases rotuladas, totalizando 1720 frases.

In [3]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/frases/frases_todas.txt')

shuffle(frases)
print(frases[:5])

[('NEUTRO', 'hor vend açõ banc brasileir morgan stanley'), ('NEUTRO', 'justic mand tir fot rich sit copel sanep jornal pov paran notíc opiniã'), ('ALEGRIA', 'oposiçã confirm tim cpi petrobr president psdb sérgi guerr indic bas ali escal líd govern'), ('NEUTRO', 'central móvel sema rir pret receb requer restituiçã'), ('TRISTEZA', 'sen referend salári mínim votaçã confirm valor vigor desd fevereir relator rejeit propost alter númer')]


In [4]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

### Funções de Apoio

In [5]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    start = time.time()
    model.fit(X_train, y_train)
    train_time = round(time.time() - start, 5)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    start = time.time()
    y_pred = model.predict(X_test)
    predict_time = round(time.time() - start, 5)
    print(f'Modelo      : {model.__class__.__name__}')
    print(f'Train Time  : {train_time}s')
    print(f'Predict Time: {predict_time}s')
    print(f'Acurácia    : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy, train_time, predict_time

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [6]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=10),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=10),
    LinearSVC(max_iter=150, random_state=10),
    SVC(gamma='auto', max_iter=150),
)

### TF-IDF

In [7]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [27]:
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
        matriz_resultados['tfidf'][classifier.__class__.__name__] = acc
        train_time['tfidf'][classifier.__class__.__name__] = tt
        predict_time['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.02079s
Predict Time: 0.0027s
Acurácia    : 50.19%
              precision    recall  f1-score   support

     ALEGRIA       0.39      0.91      0.55       230
    DESGOSTO       1.00      0.01      0.02       121
        MEDO       0.00      0.00      0.00        35
      NEUTRO       0.99      0.64      0.78       144
       RAIVA       0.00      0.00      0.00        50
    SURPRESA       0.00      0.00      0.00        37
    TRISTEZA       0.58      0.54      0.56       190

   micro avg       0.50      0.50      0.50       807
   macro avg       0.42      0.30      0.27       807
weighted avg       0.57      0.50      0.43       807

Matrix de Confusão: 
[[209   0   0   1   0   0  20]
 [ 95   1   0   0   0   0  25]
 [ 26   0   0   0   0   0   9]
 [ 46   0   0  92   0   0   6]
 [ 41   0   0   0   0   0   9]
 [ 31   0   0   0   0   0   6]
 [ 87   0   0   0   0   0 103]]
------------------------------

Modelo      : ComplementNB
Train Time 

### LSA (usando TF-IDF)

In [28]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, asentimentos))
        matriz_resultados['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['tfidf+lsa'][classifier.__class__.__name__] = pt
    except Exception as e:
        print(e)

Modelo      : MultinomialNB
Train Time  : 0.00831s
Predict Time: 0.00123s
Acurácia    : 28.62%
              precision    recall  f1-score   support

     ALEGRIA       0.29      1.00      0.44       230
    DESGOSTO       0.00      0.00      0.00       121
        MEDO       0.00      0.00      0.00        35
      NEUTRO       0.00      0.00      0.00       144
       RAIVA       0.00      0.00      0.00        50
    SURPRESA       0.00      0.00      0.00        37
    TRISTEZA       1.00      0.01      0.01       190

   micro avg       0.29      0.29      0.29       807
   macro avg       0.18      0.14      0.06       807
weighted avg       0.32      0.29      0.13       807

Matrix de Confusão: 
[[230   0   0   0   0   0   0]
 [121   0   0   0   0   0   0]
 [ 35   0   0   0   0   0   0]
 [144   0   0   0   0   0   0]
 [ 50   0   0   0   0   0   0]
 [ 37   0   0   0   0   0   0]
 [189   0   0   0   0   0   1]]
------------------------------

Modelo      : ComplementNB
Train Time

### LDA (usando TF-IDF)

In [11]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_lda = vec_tfidf.fit_transform(afrases)

lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=10, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf_lda)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, asentimentos))
        matriz_resultados['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.00296s
Predict Time: 0.00027s
Acurácia    : 40.02%
              precision    recall  f1-score   support

     ALEGRIA       0.36      0.80      0.50       230
    DESGOSTO       0.00      0.00      0.00       121
        MEDO       0.00      0.00      0.00        35
      NEUTRO       0.68      0.40      0.50       144
       RAIVA       0.00      0.00      0.00        50
    SURPRESA       0.00      0.00      0.00        37
    TRISTEZA       0.39      0.43      0.41       190

   micro avg       0.40      0.40      0.40       807
   macro avg       0.20      0.23      0.20       807
weighted avg       0.31      0.40      0.33       807

Matrix de Confusão: 
[[185   0   0   7   0   0  38]
 [ 82   0   0   7   0   0  32]
 [ 19   0   0   2   0   0  14]
 [ 69   0   0  57   0   0  18]
 [ 32   0   0   1   0   0  17]
 [ 26   0   0   2   0   0   9]
 [101   0   0   8   0   0  81]]
------------------------------

Modelo      : ComplementNB
Train Time

### Count

In [12]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [13]:
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count, asentimentos))
        matriz_resultados['count'][classifier.__class__.__name__] = acc
        train_time['count'][classifier.__class__.__name__] = tt
        predict_time['count'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.01875s
Predict Time: 0.00195s
Acurácia    : 54.77%
              precision    recall  f1-score   support

     ALEGRIA       0.49      0.70      0.57       230
    DESGOSTO       0.42      0.36      0.39       121
        MEDO       0.00      0.00      0.00        35
      NEUTRO       0.91      0.85      0.88       144
       RAIVA       0.14      0.02      0.04        50
    SURPRESA       0.50      0.03      0.05        37
    TRISTEZA       0.51      0.59      0.55       190

   micro avg       0.55      0.55      0.55       807
   macro avg       0.42      0.36      0.35       807
weighted avg       0.51      0.55      0.51       807

Matrix de Confusão: 
[[162  19   2   5   1   1  40]
 [ 46  43   0   2   2   0  28]
 [ 16   4   0   0   2   0  13]
 [ 13   2   0 122   0   0   7]
 [ 30   3   1   2   1   0  13]
 [ 25   2   0   0   0   1   9]
 [ 42  29   2   3   1   0 113]]
------------------------------

Modelo      : ComplementNB
Train Time

### LSA (usando Count)

In [14]:
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count_lsa)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, asentimentos))
        matriz_resultados['count+lsa'][classifier.__class__.__name__] = acc
        train_time['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.0127s
Predict Time: 0.00143s
Acurácia    : 28.5%
              precision    recall  f1-score   support

     ALEGRIA       0.29      1.00      0.44       230
    DESGOSTO       0.00      0.00      0.00       121
        MEDO       0.00      0.00      0.00        35
      NEUTRO       0.00      0.00      0.00       144
       RAIVA       0.00      0.00      0.00        50
    SURPRESA       0.00      0.00      0.00        37
    TRISTEZA       0.00      0.00      0.00       190

   micro avg       0.29      0.29      0.29       807
   macro avg       0.04      0.14      0.06       807
weighted avg       0.08      0.29      0.13       807

Matrix de Confusão: 
[[230   0   0   0   0   0   0]
 [121   0   0   0   0   0   0]
 [ 35   0   0   0   0   0   0]
 [144   0   0   0   0   0   0]
 [ 50   0   0   0   0   0   0]
 [ 37   0   0   0   0   0   0]
 [190   0   0   0   0   0   0]]
------------------------------

Modelo      : ComplementNB
Train Time  

### LDA (usando Count)

In [15]:
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda = vec_tfidf.fit_transform(afrases)

lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=10, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count_lda)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, asentimentos))
        matriz_resultados['count+lda'][classifier.__class__.__name__] = acc
        train_time['count+lda'][classifier.__class__.__name__] = tt
        predict_time['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.0107s
Predict Time: 0.00028s
Acurácia    : 40.02%
              precision    recall  f1-score   support

     ALEGRIA       0.35      0.70      0.47       230
    DESGOSTO       0.57      0.03      0.06       121
        MEDO       0.00      0.00      0.00        35
      NEUTRO       0.61      0.54      0.58       144
       RAIVA       0.00      0.00      0.00        50
    SURPRESA       0.00      0.00      0.00        37
    TRISTEZA       0.37      0.42      0.39       190

   micro avg       0.40      0.40      0.40       807
   macro avg       0.27      0.24      0.21       807
weighted avg       0.38      0.40      0.34       807

Matrix de Confusão: 
[[162   3   0  16   0   0  49]
 [ 70   4   0  12   0   0  35]
 [ 24   0   0   1   0   0  10]
 [ 48   0   0  78   0   0  18]
 [ 34   0   0   2   0   0  14]
 [ 25   0   0   5   0   0   7]
 [ 98   0   0  13   0   0  79]]
------------------------------

Modelo      : ComplementNB
Train Time 

### Count + TF-IDF + Word2Vec

In [17]:
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(14728944, 15548000)

In [18]:
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [19]:
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, asentimentos))
        matriz_resultados['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : LogisticRegression
Train Time  : 1.91151s
Predict Time: 0.00699s
Acurácia    : 56.51%
              precision    recall  f1-score   support

     ALEGRIA       0.49      0.79      0.60       230
    DESGOSTO       0.52      0.21      0.30       121
        MEDO       0.00      0.00      0.00        35
      NEUTRO       0.88      0.89      0.89       144
       RAIVA       0.00      0.00      0.00        50
    SURPRESA       0.00      0.00      0.00        37
    TRISTEZA       0.50      0.64      0.56       190

   micro avg       0.57      0.57      0.57       807
   macro avg       0.34      0.36      0.34       807
weighted avg       0.49      0.57      0.51       807

Matrix de Confusão: 
[[181   5   0   9   0   0  35]
 [ 63  26   0   0   0   0  32]
 [ 18   2   0   0   0   0  15]
 [ 10   0   0 128   0   0   6]
 [ 25   1   0   2   0   0  22]
 [ 24   0   0   1   0   0  12]
 [ 48  16   0   5   0   0 121]]
------------------------------

Modelo      : RandomForestClassi

### Resultado dos Classificadores

In [29]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados)
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,54.89%,44.24%,35.94%,48.45%,35.81%,37.55%,0.00%
KNeighborsClassifier,52.54%,43.99%,33.83%,23.17%,40.64%,35.56%,51.92%
LinearSVC,56.38%,55.02%,40.52%,56.01%,53.78%,41.76%,55.39%
LogisticRegression,53.41%,54.40%,41.39%,56.26%,52.91%,40.77%,56.51%
MLPClassifier,51.92%,54.40%,33.33%,52.91%,50.81%,38.04%,51.55%
MultinomialNB,50.19%,28.62%,40.02%,54.77%,28.50%,40.02%,0.00%
RandomForestClassifier,55.27%,53.78%,39.03%,55.14%,48.33%,43.62%,56.51%
SVC,47.58%,37.30%,29.74%,30.11%,30.61%,28.13%,44.24%


In [30]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time)
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.01128,0.01027,0.00289,0.01124,0.0077,0.00279,0.0
KNeighborsClassifier,0.00277,0.00368,0.00427,0.00081,0.00435,0.0039,0.2689
LinearSVC,0.07074,0.41719,0.15406,0.19089,0.52021,0.2012,0.04311
LogisticRegression,1.42045,0.28492,0.24736,1.56225,0.22522,0.09658,1.91151
MLPClassifier,45.10419,2.86724,9.22549,59.31135,4.24025,8.95598,26.81558
MultinomialNB,0.02079,0.00831,0.00296,0.01875,0.0127,0.0107,0.0
RandomForestClassifier,1.47682,0.59461,0.72367,1.62414,0.54497,0.29329,2.92369
SVC,0.38333,0.44833,0.44278,0.41067,0.43031,0.47145,17.66588


In [31]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time)
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00087,0.00307,0.00026,0.00084,0.00121,0.00026,0.0
KNeighborsClassifier,0.0273,0.25405,0.18748,0.02162,0.24313,0.1768,12.75137
LinearSVC,0.00048,0.00028,0.0003,0.00039,0.0003,0.00039,0.00676
LogisticRegression,0.00102,0.00028,0.00028,0.00085,0.00028,0.00028,0.00699
MLPClassifier,0.00466,0.01296,0.00169,0.00928,0.00148,0.00183,0.03122
MultinomialNB,0.0027,0.00123,0.00027,0.00195,0.00143,0.00028,0.0
RandomForestClassifier,0.0267,0.00918,0.0103,0.02552,0.01002,0.01352,0.04061
SVC,0.19052,0.17571,0.18105,0.17398,0.19168,0.18073,8.56553


### Modelo escolhido e salvo

In [35]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

svm = LinearSVC(max_iter=150, random_state=10)
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, asentimentos)

accuracy = np.round(model.score(X_svd, asentimentos) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')

filename = 'tfidf_emotions.sav'
joblib.dump(vec_tfidf, filename)

filename = 'lsa_emotions.sav'
joblib.dump(lsa, filename)

filename = 'model_emotions.sav'
joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 58.96%


['model_emotions.sav']

In [24]:
model.classes_

array(['ALEGRIA', 'DESGOSTO', 'MEDO', 'NEUTRO', 'RAIVA', 'SURPRESA',
       'TRISTEZA'], dtype='<U8')

In [25]:
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[4.01, 5.03, 2.4, 72.65, 1.78, 2.24, 11.89]