## Testes de emoções para o projeto final de IA369Y — 2º Semestre de 2018

Passos para tratar os dados de emoções, testar e escolher um classificador para utilizar no projeto final de IA369Y.

1) Remover espaços duplos, quebras de linha, números e links do dataset e das frases a serem testadas.

2) Remover stopwords e aplicar o stemmer.

3) Treinar os classificadores.

4) Realizar as predições com os classificadores.

5) Avaliar as medidas obtidas com os classificadores.

In [1]:
import sys
sys.path.append('../..')

import codecs
import copy
import csv
import re
import time
import warnings
from random import shuffle

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

# Silence all warnings to keep the notebook output readable. The standard
# library `warnings` module is used directly: `np.warnings` was only an
# accidental alias of it and is not available in recent NumPy releases.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG (this does not seed Python's `random` module).
np.random.seed(12345)

def highlight_max(data, color='green'):
    """Build CSS styles that mark the largest value(s) in ``data``.

    Meant to be passed to ``DataFrame.style.apply``. Any ``%`` is removed
    and the values are cast to float before they are compared.

    Returns a list of style strings for 1-D input; otherwise a DataFrame of
    style strings sharing the index and columns of the input.
    """
    css = f'background-color: {color}; color: white; font-weight: bold;'
    numeric = data.replace('%', '', regex=True).astype(float)
    if numeric.ndim != 1:
        # Whole-frame call (axis=None): compare against the overall maximum.
        mask = numeric == numeric.max().max()
        return pd.DataFrame(np.where(mask, css, ''),
                            index=numeric.index, columns=numeric.columns)
    # Column-wise or row-wise call: compare against the Series maximum.
    top = numeric.max()
    return [css if hit else '' for hit in numeric == top]


def highlight_min(data, color='green'):
    """Build CSS styles that mark the smallest value(s) in ``data``.

    Meant to be passed to ``DataFrame.style.apply``. Any ``%`` is removed
    and the values are cast to float before they are compared.

    Returns a list of style strings for 1-D input; otherwise a DataFrame of
    style strings sharing the index and columns of the input.
    """
    css = f'background-color: {color}; color: white; font-weight: bold;'
    numeric = data.replace('%', '', regex=True).astype(float)
    if numeric.ndim != 1:
        # Whole-frame call (axis=None): compare against the overall minimum.
        mask = numeric == numeric.min().min()
        return pd.DataFrame(np.where(mask, css, ''),
                            index=numeric.index, columns=numeric.columns)
    # Column-wise or row-wise call: compare against the Series minimum.
    bottom = numeric.min()
    return [css if hit else '' for hit in numeric == bottom]

### Matriz de resultados

In [2]:
# Template row set: one entry per classifier evaluated below, all starting at 0.
classf = dict.fromkeys(
    (
        'MultinomialNB',
        'ComplementNB',
        'LogisticRegression',
        'RandomForestClassifier',
        'KNeighborsClassifier',
        'MLPClassifier',
        'LinearSVC',
        'SVC',
    ),
    0,
)

# Feature-extraction variants compared in this notebook (one column each).
_FEATURE_SETS = (
    'tfidf',
    'tfidf+lsa',
    'tfidf+lda',
    'count',
    'count+lsa',
    'count+lda',
    'tfidf+count+w2c',
)

# Each table gets its own independent copy of the template per feature set.
matriz_resultados = {nome: copy.deepcopy(classf) for nome in _FEATURE_SETS}

train_time = {nome: copy.deepcopy(classf) for nome in _FEATURE_SETS}

predict_time = {nome: copy.deepcopy(classf) for nome in _FEATURE_SETS}

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).
Além disso, foram acrescidas cerca de 700 novas frases rotuladas, totalizando 1720 frases.

In [3]:
def carregar(filename):
    """Load labelled sentences from a ``|``-delimited text file.

    Each row is expected to carry the sentence in the first column and its
    label in the second. The sentence is passed through ``tokenizer`` and
    stripped; the label is upper-cased. Rows with fewer than two columns
    are skipped, and so are sentences whose processed text has 5 characters
    or fewer.

    Returns:
        list: ``(label, sentence)`` tuples in file order.
    """
    frases = []
    # Explicit encoding so accented text does not depend on the platform
    # default (presumably the dataset is UTF-8 — confirm). newline='' is what
    # the csv module recommends for files handed to csv.reader.
    with open(filename, 'r', encoding='utf-8', newline='') as h:
        for row in csv.reader(h, delimiter='|'):
            if len(row) < 2:
                # Blank or malformed line: there is no label to pair with.
                continue
            frase = tokenizer(row[0]).strip()
            if len(frase) > 5:
                frases.append((row[1].upper(), frase))
    return frases

# Load the labelled corpus from a hard-coded absolute path.
frases = carregar('/ssd/programas/sentiment-analysis-2018-president-election/dataset/frases/frases_todas.txt')

# Shuffle in place. NOTE(review): `shuffle` is imported from Python's `random`
# module, which the `np.random.seed` call in this notebook does not seed, so
# the order presumably differs between runs — confirm whether that is intended.
shuffle(frases)
# Preview the first five (label, sentence) tuples.
print(frases[:5])

[('TRISTEZA', 'falt profissional qualific merc aviaçã sindicat diz sobrecarg trabalh anac tem prejudic cresciment merc'), ('SURPRESA', 'espanhol oferec servic domést sexual trabalh mistur ativ cozinh lav pass ativ erót'), ('NEUTRO', 'datafolh confund merc petrobr nã reag mais'), ('NEUTRO', 'loj american ir stf cobranc abus icms telecom convergent digital telecom'), ('TRISTEZA', 'saudad coraçã nã consegu grit')]


In [4]:
# Split the (label, sentence) pairs into two parallel lists.
afrases = [texto for _, texto in frases]
asentimentos = [rotulo for rotulo, _ in frases]

### Funções de Apoio

In [5]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, evaluate it on the test split and print a report.

    Prints the model class name, the fit and predict times, the accuracy,
    the classification report and the confusion matrix.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        X_train, y_train: data used to fit the model.
        X_test, y_test: data used to evaluate it.

    Returns:
        tuple: ``(accuracy, train_time, predict_time)`` — accuracy as a
        percentage rounded to 2 decimals, both times in seconds rounded to
        5 decimals.
    """
    start = time.time()
    model.fit(X_train, y_train)
    fit_seconds = round(time.time() - start, 5)

    start = time.time()
    y_pred = model.predict(X_test)
    predict_seconds = round(time.time() - start, 5)

    # Accuracy is derived from the predictions already made rather than from
    # model.score(), which would run a second full prediction pass.
    hits = np.asarray(y_pred) == np.asarray(y_test)
    accuracy = np.round(np.mean(hits) * 100, 2)

    print(f'Modelo      : {model.__class__.__name__}')
    print(f'Train Time  : {fit_seconds}s')
    print(f'Predict Time: {predict_seconds}s')
    print(f'Acurácia    : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy, fit_seconds, predict_seconds

def split_data(X, y):
    """Split features and labels into train and test parts.

    Holds out 33% of the samples using a fixed ``random_state`` of 0 and
    returns the pieces in a dict keyed by ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, ready to be unpacked as keyword arguments.
    """
    # train_test_split yields the pieces as X_train, X_test, y_train, y_test.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=.33, random_state=0)
    return dict(X_train=X_tr, y_train=y_tr, X_test=X_te, y_test=y_te)

### Classificadores

In [6]:
# Candidate estimators; each feature set below is evaluated with every one.
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(solver='lbfgs', multi_class='auto'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=10),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(
        activation='relu',
        batch_size=32,
        max_iter=500,
        random_state=10,
        solver='adam',
    ),
    LinearSVC(max_iter=150, random_state=10),
    SVC(C=1.5, kernel='rbf', random_state=10),
)

### TF-IDF

In [7]:
# TF-IDF features over unigrams and bigrams, fitted on the whole corpus.
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [8]:
# Evaluate every classifier on the TF-IDF features and record its metrics.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except Exception as e:
        # Report the failure instead of hiding it; the classifier keeps its
        # initial 0 in the result tables. Catching Exception (not a bare
        # except) also lets Ctrl-C interrupt a long training run.
        print(f'{nome}: {e}')
    else:
        matriz_resultados['tfidf'][nome] = acc
        train_time['tfidf'][nome] = tt
        predict_time['tfidf'][nome] = pt

Modelo      : MultinomialNB
Train Time  : 0.00999s
Predict Time: 0.00108s
Acurácia    : 50.36%
              precision    recall  f1-score   support

     ALEGRIA       0.41      0.90      0.57       243
    DESGOSTO       1.00      0.03      0.05       118
        MEDO       0.00      0.00      0.00        39
      NEUTRO       0.99      0.59      0.74       136
       RAIVA       0.00      0.00      0.00        49
    SURPRESA       0.00      0.00      0.00        47
    TRISTEZA       0.54      0.59      0.56       192

   micro avg       0.50      0.50      0.50       824
   macro avg       0.42      0.30      0.27       824
weighted avg       0.55      0.50      0.43       824

Matrix de Confusão: 
[[219   0   0   1   0   0  23]
 [ 85   3   0   0   0   0  30]
 [ 26   0   0   0   0   0  13]
 [ 48   0   0  80   0   0   8]
 [ 37   0   0   0   0   0  12]
 [ 38   0   0   0   0   0   9]
 [ 79   0   0   0   0   0 113]]
------------------------------

Modelo      : ComplementNB
Train Time

### LSA (usando TF-IDF)

In [9]:
# LSA features: TF-IDF capped at 1500 terms, reduced to 100 components with
# truncated SVD and then rescaled by MinMaxScaler.
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

# Run each classifier on the reduced features; failures are printed and skipped.
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except Exception as e:
        print(e)
    else:
        nome = classifier.__class__.__name__
        for tabela, valor in ((matriz_resultados, acc), (train_time, tt), (predict_time, pt)):
            tabela['tfidf+lsa'][nome] = valor

Modelo      : MultinomialNB
Train Time  : 0.0171s
Predict Time: 0.00241s
Acurácia    : 29.73%
              precision    recall  f1-score   support

     ALEGRIA       0.30      1.00      0.46       243
    DESGOSTO       0.00      0.00      0.00       118
        MEDO       0.00      0.00      0.00        39
      NEUTRO       0.00      0.00      0.00       136
       RAIVA       0.00      0.00      0.00        49
    SURPRESA       0.00      0.00      0.00        47
    TRISTEZA       0.67      0.01      0.02       192

   micro avg       0.30      0.30      0.30       824
   macro avg       0.14      0.14      0.07       824
weighted avg       0.24      0.30      0.14       824

Matrix de Confusão: 
[[243   0   0   0   0   0   0]
 [117   0   0   0   0   0   1]
 [ 39   0   0   0   0   0   0]
 [136   0   0   0   0   0   0]
 [ 49   0   0   0   0   0   0]
 [ 47   0   0   0   0   0   0]
 [190   0   0   0   0   0   2]]
------------------------------

Modelo      : ComplementNB
Train Time 

### LDA (usando TF-IDF)

In [10]:
# LDA features: TF-IDF capped at 1500 terms, mapped to 100 topics and then
# rescaled by MinMaxScaler.
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_lda = vec_tfidf.fit_transform(afrases)

# `lda` is first the bare estimator and is then rebound to the full pipeline.
lda = LatentDirichletAllocation(n_components=100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf_lda)

# Evaluate every classifier on the topic features and record its metrics.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except Exception as e:
        # Report the failure instead of hiding it; the classifier keeps its
        # initial 0 in the result tables. Catching Exception (not a bare
        # except) also lets Ctrl-C interrupt a long training run.
        print(f'{nome}: {e}')
    else:
        matriz_resultados['tfidf+lda'][nome] = acc
        train_time['tfidf+lda'][nome] = tt
        predict_time['tfidf+lda'][nome] = pt

Modelo      : MultinomialNB
Train Time  : 0.0055s
Predict Time: 0.00057s
Acurácia    : 40.17%
              precision    recall  f1-score   support

     ALEGRIA       0.38      0.72      0.50       243
    DESGOSTO       0.25      0.03      0.05       118
        MEDO       0.00      0.00      0.00        39
      NEUTRO       0.53      0.44      0.48       136
       RAIVA       0.00      0.00      0.00        49
    SURPRESA       0.00      0.00      0.00        47
    TRISTEZA       0.39      0.49      0.44       192

   micro avg       0.40      0.40      0.40       824
   macro avg       0.22      0.24      0.21       824
weighted avg       0.33      0.40      0.33       824

Matrix de Confusão: 
[[174   2   0  17   0   0  50]
 [ 71   3   0  11   0   0  33]
 [ 22   1   0   3   0   0  13]
 [ 49   0   0  60   0   0  27]
 [ 31   0   0   6   0   0  12]
 [ 31   0   0   5   0   0  11]
 [ 81   6   0  11   0   0  94]]
------------------------------

Modelo      : ComplementNB
Train Time 

### Count

In [11]:
# Raw term-count features over unigrams and bigrams, fitted on the whole corpus.
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [12]:
# Evaluate every classifier on the count features and record its metrics.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count, asentimentos))
    except Exception as e:
        # Report the failure instead of hiding it; the classifier keeps its
        # initial 0 in the result tables. Catching Exception (not a bare
        # except) also lets Ctrl-C interrupt a long training run.
        print(f'{nome}: {e}')
    else:
        matriz_resultados['count'][nome] = acc
        train_time['count'][nome] = tt
        predict_time['count'][nome] = pt

Modelo      : MultinomialNB
Train Time  : 0.01874s
Predict Time: 0.0014s
Acurácia    : 51.33%
              precision    recall  f1-score   support

     ALEGRIA       0.47      0.66      0.55       243
    DESGOSTO       0.39      0.30      0.34       118
        MEDO       0.33      0.03      0.05        39
      NEUTRO       0.87      0.81      0.84       136
       RAIVA       0.25      0.02      0.04        49
    SURPRESA       0.12      0.02      0.04        47
    TRISTEZA       0.45      0.60      0.51       192

   micro avg       0.51      0.51      0.51       824
   macro avg       0.41      0.35      0.34       824
weighted avg       0.48      0.51      0.48       824

Matrix de Confusão: 
[[160  21   0   9   2   2  49]
 [ 45  35   0   2   1   2  33]
 [ 19   2   1   0   0   0  17]
 [ 13   6   0 110   0   0   7]
 [ 21   5   0   3   1   0  19]
 [ 27   3   0   1   0   1  15]
 [ 53  17   2   2   0   3 115]]
------------------------------

Modelo      : ComplementNB
Train Time 

### LSA (usando Count)

In [13]:
# LSA on raw counts: CountVectorizer capped at 1500 terms, reduced to 100
# components with truncated SVD and then rescaled by MinMaxScaler.
# NOTE(review): the names `vec_tfidf` and `lda` are misleading here (they hold
# a CountVectorizer and an SVD pipeline); they are kept so the notebook's
# global names do not change.
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lsa = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count_lsa)

# Evaluate every classifier on the reduced features and record its metrics.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except Exception as e:
        # Report the failure instead of hiding it; the classifier keeps its
        # initial 0 in the result tables. Catching Exception (not a bare
        # except) also lets Ctrl-C interrupt a long training run.
        print(f'{nome}: {e}')
    else:
        matriz_resultados['count+lsa'][nome] = acc
        train_time['count+lsa'][nome] = tt
        predict_time['count+lsa'][nome] = pt

Modelo      : MultinomialNB
Train Time  : 0.01894s
Predict Time: 0.00232s
Acurácia    : 29.49%
              precision    recall  f1-score   support

     ALEGRIA       0.29      1.00      0.46       243
    DESGOSTO       0.00      0.00      0.00       118
        MEDO       0.00      0.00      0.00        39
      NEUTRO       0.00      0.00      0.00       136
       RAIVA       0.00      0.00      0.00        49
    SURPRESA       0.00      0.00      0.00        47
    TRISTEZA       0.00      0.00      0.00       192

   micro avg       0.29      0.29      0.29       824
   macro avg       0.04      0.14      0.07       824
weighted avg       0.09      0.29      0.13       824

Matrix de Confusão: 
[[243   0   0   0   0   0   0]
 [118   0   0   0   0   0   0]
 [ 39   0   0   0   0   0   0]
 [136   0   0   0   0   0   0]
 [ 49   0   0   0   0   0   0]
 [ 47   0   0   0   0   0   0]
 [192   0   0   0   0   0   0]]
------------------------------

Modelo      : ComplementNB
Train Time

### LDA (usando Count)

In [14]:
# LDA on raw counts: CountVectorizer capped at 1500 terms, mapped to 100
# topics and then rescaled by MinMaxScaler.
# NOTE(review): `vec_tfidf` holds a CountVectorizer here; the name is kept so
# the notebook's global names do not change.
vec_tfidf = CountVectorizer(ngram_range=(1, 2), max_features=1500)
X_count_lda = vec_tfidf.fit_transform(afrases)

# `lda` is first the bare estimator and is then rebound to the full pipeline.
lda = LatentDirichletAllocation(n_components=100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count_lda)

# Evaluate every classifier on the topic features and record its metrics.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except Exception as e:
        # Report the failure instead of hiding it; the classifier keeps its
        # initial 0 in the result tables. Catching Exception (not a bare
        # except) also lets Ctrl-C interrupt a long training run.
        print(f'{nome}: {e}')
    else:
        matriz_resultados['count+lda'][nome] = acc
        train_time['count+lda'][nome] = tt
        predict_time['count+lda'][nome] = pt

Modelo      : MultinomialNB
Train Time  : 0.00339s
Predict Time: 0.00046s
Acurácia    : 35.07%
              precision    recall  f1-score   support

     ALEGRIA       0.33      0.59      0.43       243
    DESGOSTO       0.44      0.03      0.06       118
        MEDO       0.00      0.00      0.00        39
      NEUTRO       0.51      0.37      0.43       136
       RAIVA       0.00      0.00      0.00        49
    SURPRESA       0.00      0.00      0.00        47
    TRISTEZA       0.32      0.47      0.38       192

   micro avg       0.35      0.35      0.35       824
   macro avg       0.23      0.21      0.19       824
weighted avg       0.32      0.35      0.29       824

Matrix de Confusão: 
[[144   1   0  18   0   0  80]
 [ 54   4   0  10   0   0  50]
 [ 28   1   0   0   0   0  10]
 [ 69   0   0  50   0   0  17]
 [ 27   0   0   3   0   0  19]
 [ 25   1   0   5   0   0  16]
 [ 87   2   0  12   0   0  91]]
------------------------------

Modelo      : ComplementNB
Train Time

### Count + TF-IDF + Word2Vec

In [15]:
# Count: raw term counts, stored as a (term x document) table indexed by term.
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF: same layout as the count table.
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing: one list of tokens per sentence.
# (The original cell also sketched a bigram augmentation but left it disabled;
# the unused next-token lookup that supported it has been removed.)
frases_w2v = [word_tokenize(frase) for frase in afrases]

# Word2Vec: skip-gram (sg=1) with hierarchical softmax (hs=1). size=1 gives
# every word a one-dimensional vector, i.e. a single scalar per word.
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=5,
    min_count=1,
    seed=0,
    workers=6)
# Continue training on the same sentences for 500 more epochs.
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=500)

(14921612, 15758500)

In [16]:
# Combine the three signals per vocabulary term:
# (tf-idf row + count row) scaled by the term's Word2Vec value.
# NOTE(review): indexing `weights_tfidf` with a position taken from
# `weights_count` assumes both vectorizers produced the same vocabulary in
# the same order — confirm.
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    try:
        w2c_val = model.wv[word]
    except KeyError:
        # Term missing from the Word2Vec vocabulary: fall back to a fixed
        # weight. Only the lookup failure is handled; other errors surface.
        w2c_val = .1
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
# Transpose so each row is a document and each column a term.
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [17]:
# Evaluate every classifier on the combined features and record its metrics.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, asentimentos))
    except Exception as e:
        # Report the failure instead of hiding it; the classifier keeps its
        # initial 0 in the result tables. Catching Exception (not a bare
        # except) also lets Ctrl-C interrupt a long training run.
        print(f'{nome}: {e}')
    else:
        matriz_resultados['tfidf+count+w2c'][nome] = acc
        train_time['tfidf+count+w2c'][nome] = tt
        predict_time['tfidf+count+w2c'][nome] = pt

Modelo      : LogisticRegression
Train Time  : 4.44799s
Predict Time: 0.01792s
Acurácia    : 53.64%
              precision    recall  f1-score   support

     ALEGRIA       0.49      0.77      0.60       243
    DESGOSTO       0.49      0.15      0.23       118
        MEDO       0.00      0.00      0.00        39
      NEUTRO       0.78      0.85      0.82       136
       RAIVA       1.00      0.02      0.04        49
    SURPRESA       0.00      0.00      0.00        47
    TRISTEZA       0.46      0.62      0.53       192

   micro avg       0.54      0.54      0.54       824
   macro avg       0.46      0.35      0.32       824
weighted avg       0.51      0.54      0.47       824

Matrix de Confusão: 
[[188   6   0  13   0   0  36]
 [ 57  18   0   5   0   0  38]
 [ 18   1   0   1   0   0  19]
 [ 10   0   0 116   0   0  10]
 [ 23   1   0   4   1   0  20]
 [ 29   0   0   2   0   0  16]
 [ 55  11   0   7   0   0 119]]
------------------------------

Modelo      : RandomForestClassi

### Resultado dos Classificadores

In [18]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados)
# Every feature-set column is rendered as a percentage with two decimals;
# the best value in each column is highlighted.
formato = '{:,.2f}%'.format
colunas = (
    'tfidf',
    'tfidf+lsa',
    'tfidf+lda',
    'count',
    'count+lsa',
    'count+lda',
    'tfidf+count+w2c',
)
df.style.apply(highlight_max).format(dict.fromkeys(colunas, formato))

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,51.58%,43.08%,34.47%,46.00%,35.80%,31.92%,0.00%
KNeighborsClassifier,50.24%,43.33%,26.94%,32.28%,38.71%,31.19%,46.84%
LinearSVC,54.25%,54.37%,36.17%,51.33%,50.61%,35.32%,51.82%
LogisticRegression,52.18%,53.88%,38.71%,53.40%,49.03%,36.04%,53.64%
MLPClassifier,53.03%,51.70%,30.58%,51.82%,49.51%,36.04%,50.49%
MultinomialNB,50.36%,29.73%,40.17%,51.33%,29.49%,35.07%,0.00%
RandomForestClassifier,51.21%,50.61%,37.14%,50.73%,48.42%,38.59%,51.09%
SVC,29.49%,29.98%,29.49%,29.49%,29.49%,29.49%,29.49%


In [19]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time)
# Every feature-set column is rendered with five decimals; the smallest
# value in each column is highlighted.
formato = '{:,.5f}'.format
colunas = (
    'tfidf',
    'tfidf+lsa',
    'tfidf+lda',
    'count',
    'count+lsa',
    'count+lda',
    'tfidf+count+w2c',
)
df.style.apply(highlight_min).format(dict.fromkeys(colunas, formato))

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.01076,0.00589,0.00847,0.01206,0.00886,0.00714,0.0
KNeighborsClassifier,0.00174,0.00622,0.00787,0.00094,0.00498,0.0042,0.31261
LinearSVC,0.13858,0.54554,0.22546,0.32589,0.47391,0.22048,0.15657
LogisticRegression,6.50192,0.4564,0.37246,4.91321,0.68922,0.37015,4.44799
MLPClassifier,453.93855,9.50933,54.92577,395.33429,8.39651,46.77398,170.0773
MultinomialNB,0.00999,0.0171,0.0055,0.01874,0.01894,0.00339,0.0
RandomForestClassifier,3.14551,0.92951,1.46694,2.03334,0.87288,0.39128,3.17291
SVC,0.94322,0.8383,0.91649,1.28656,0.92342,1.10614,49.55626


In [20]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time)
# Every feature-set column is rendered with five decimals; the smallest
# value in each column is highlighted.
formato = '{:,.5f}'.format
colunas = (
    'tfidf',
    'tfidf+lsa',
    'tfidf+lda',
    'count',
    'count+lsa',
    'count+lda',
    'tfidf+count+w2c',
)
df.style.apply(highlight_min).format(dict.fromkeys(colunas, formato))

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00069,0.00045,0.00065,0.00088,0.00048,0.00048,0.0
KNeighborsClassifier,0.09434,0.37465,0.4269,0.05422,0.26233,0.26016,14.24939
LinearSVC,0.0006,0.00043,0.00042,0.00053,0.0004,0.00041,0.02473
LogisticRegression,0.00351,0.00048,0.00128,0.00201,0.00054,0.00053,0.01792
MLPClassifier,0.009,0.00245,0.01006,0.01963,0.00197,0.00902,0.04517
MultinomialNB,0.00108,0.00241,0.00057,0.0014,0.00232,0.00046,0.0
RandomForestClassifier,0.0588,0.01727,0.01953,0.03211,0.01319,0.01371,0.07987
SVC,0.29518,0.28743,0.28461,0.43859,0.34577,0.23321,12.56116


### Modelo escolhido e salvo

In [25]:
# Rebuild the TF-IDF -> LSA features on the whole corpus for the chosen model.
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1500)
X_tfidf_svd = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(100, random_state=10)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf_svd)

# Chosen classifier: LinearSVC wrapped in CalibratedClassifierCV
# (presumably so that predict_proba is available — confirm).
svm = LinearSVC(max_iter=150, random_state=10)
# svm = LinearSVC(random_state=10)
# svm = SVC(**{'C': 1.5, 'kernel': 'rbf', 'random_state': 0})
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, asentimentos)

# NOTE(review): the score below is computed on the same data the model was
# fitted on, so it is a training accuracy and is not comparable with the
# held-out figures reported by run_ml_model.
accuracy = np.round(model.score(X_svd, asentimentos) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')

# Persistence of the vectorizer, LSA pipeline and model is currently disabled.
# filename = 'tfidf_emotions.sav'
# joblib.dump(vec_tfidf, filename)

# filename = 'lsa_emotions.sav'
# joblib.dump(lsa, filename)

# filename = 'model_emotions.sav'
# joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 58.37%


In [22]:
# Show the class labels known to the fitted model (displayed as the cell output).
model.classes_

array(['ALEGRIA', 'DESGOSTO', 'MEDO', 'NEUTRO', 'RAIVA', 'SURPRESA',
       'TRISTEZA'], dtype='<U8')

In [23]:
# Class probabilities for every sentence in X_svd (the data the model was
# fitted on); display the first row as percentages rounded to 2 decimals.
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[12.28, 6.23, 5.07, 39.29, 3.51, 3.1, 30.51]