In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

def highlight_max(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)


def highlight_min(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_min = data == data.min()
        return [attr if v else '' for v in is_min]
    else:  # from .apply(axis=None)
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''),
                            index=data.index, columns=data.columns)   

### Matriz de resultados

In [2]:
classf = {
    'MultinomialNB': 0,
    'ComplementNB': 0,
    'LogisticRegression': 0,
    'RandomForestClassifier': 0,
    'KNeighborsClassifier': 0,
    'MLPClassifier': 0,
    'LinearSVC': 0,
    'SVC': 0
}

matriz_resultados = {
    'tfidf': copy.deepcopy(classf),
    'tfidf+lsa': copy.deepcopy(classf),
    'tfidf+lda': copy.deepcopy(classf),
    'count': copy.deepcopy(classf),
    'count+lsa': copy.deepcopy(classf),
    'count+lda': copy.deepcopy(classf),
    'tfidf+count+w2c': copy.deepcopy(classf),
}

train_time = {
    'tfidf': copy.deepcopy(classf),
    'tfidf+lsa': copy.deepcopy(classf),
    'tfidf+lda': copy.deepcopy(classf),
    'count': copy.deepcopy(classf),
    'count+lsa': copy.deepcopy(classf),
    'count+lda': copy.deepcopy(classf),
    'tfidf+count+w2c': copy.deepcopy(classf),
}

predict_time = {
    'tfidf': copy.deepcopy(classf),
    'tfidf+lsa': copy.deepcopy(classf),
    'tfidf+lda': copy.deepcopy(classf),
    'count': copy.deepcopy(classf),
    'count+lsa': copy.deepcopy(classf),
    'count+lda': copy.deepcopy(classf),
    'tfidf+count+w2c': copy.deepcopy(classf),
}

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).
Alem disso, foi acrescido em torno de 700 novas frases rotuladas, totalizando 1720 frases.

In [3]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/frases/frases_todas.txt')

shuffle(frases)
print(frases[:5])

[('TRISTEZA', 'ameac tecnolog tropeçã fio panc cresc estud realiz eua consider acident períod ano cerc mil nort american vítim dess problem ano'), ('RAIVA', 'desej viv prepar combat nã dispor nest mund lut etern nã merec vid'), ('TRISTEZA', 'hoj saudad escorr olho'), ('TRISTEZA', 'asar temp tristez voar'), ('DESGOSTO', 'chuv deix mort filipin men pesso desaparec mau temp fech port provoc colaps red elétr')]


In [4]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

### Funções de Apoio

In [5]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    start = time.time()
    model.fit(X_train, y_train)
    train_time = round(time.time() - start, 5)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    start = time.time()
    y_pred = model.predict(X_test)
    predict_time = round(time.time() - start, 5)
    print(f'Modelo      : {model.__class__.__name__}')
    print(f'Train Time  : {train_time}s')
    print(f'Predict Time: {predict_time}s')
    print(f'Acurácia    : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy, train_time, predict_time

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [6]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=0),
    LinearSVC(max_iter=150, random_state=0),
    SVC(gamma='auto', max_iter=150),
)

### TF-IDF

In [28]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [29]:
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
        matriz_resultados['tfidf'][classifier.__class__.__name__] = acc
        train_time['tfidf'][classifier.__class__.__name__] = tt
        predict_time['tfidf'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.00824s
Predict Time: 0.00099s
Acurácia    : 46.3%
              precision    recall  f1-score   support

     ALEGRIA       0.42      0.83      0.56       184
    DESGOSTO       1.00      0.03      0.05       109
        MEDO       0.00      0.00      0.00        43
       RAIVA       0.00      0.00      0.00        33
    SURPRESA       0.00      0.00      0.00        31
    TRISTEZA       0.53      0.64      0.58       168

   micro avg       0.46      0.46      0.46       568
   macro avg       0.32      0.25      0.20       568
weighted avg       0.48      0.46      0.36       568

Matrix de Confusão: 
[[153   0   0   0   0  31]
 [ 79   3   0   0   0  27]
 [ 27   0   0   0   0  16]
 [ 19   0   0   0   0  14]
 [ 23   0   0   0   0   8]
 [ 61   0   0   0   0 107]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.0123s
Predict Time: 0.00106s
Acurácia    : 45.25%
              precision    recall  f1-score   support


### LSA (usando TF-IDF)

In [35]:
svd = TruncatedSVD(n_components=70, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, asentimentos))
        matriz_resultados['tfidf+lsa'][classifier.__class__.__name__] = acc
        train_time['tfidf+lsa'][classifier.__class__.__name__] = tt
        predict_time['tfidf+lsa'][classifier.__class__.__name__] = pt
    except Exception as e:
        print(e)

Modelo      : MultinomialNB
Train Time  : 0.00201s
Predict Time: 0.00017s
Acurácia    : 32.92%
              precision    recall  f1-score   support

     ALEGRIA       0.33      1.00      0.49       184
    DESGOSTO       0.00      0.00      0.00       109
        MEDO       0.00      0.00      0.00        43
       RAIVA       0.00      0.00      0.00        33
    SURPRESA       0.00      0.00      0.00        31
    TRISTEZA       0.75      0.02      0.03       168

   micro avg       0.33      0.33      0.33       568
   macro avg       0.18      0.17      0.09       568
weighted avg       0.33      0.33      0.17       568

Matrix de Confusão: 
[[184   0   0   0   0   0]
 [109   0   0   0   0   0]
 [ 43   0   0   0   0   0]
 [ 32   0   0   0   0   1]
 [ 31   0   0   0   0   0]
 [165   0   0   0   0   3]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00303s
Predict Time: 0.0002s
Acurácia    : 43.66%
              precision    recall  f1-score   support

### LDA (usando TF-IDF)

In [10]:
lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, asentimentos))
        matriz_resultados['tfidf+lda'][classifier.__class__.__name__] = acc
        train_time['tfidf+lda'][classifier.__class__.__name__] = tt
        predict_time['tfidf+lda'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.01385s
Predict Time: 0.00023s
Acurácia    : 32.92%
              precision    recall  f1-score   support

     ALEGRIA       0.33      0.59      0.42       184
    DESGOSTO       0.30      0.03      0.05       109
        MEDO       0.00      0.00      0.00        43
       RAIVA       0.00      0.00      0.00        33
    SURPRESA       0.00      0.00      0.00        31
    TRISTEZA       0.33      0.45      0.38       168

   micro avg       0.33      0.33      0.33       568
   macro avg       0.16      0.18      0.14       568
weighted avg       0.26      0.33      0.26       568

Matrix de Confusão: 
[[108   3   0   0   0  73]
 [ 63   3   0   0   0  43]
 [ 32   0   0   0   0  11]
 [ 18   0   0   0   0  15]
 [ 17   1   0   0   0  13]
 [ 89   3   0   0   0  76]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00268s
Predict Time: 0.00027s
Acurácia    : 21.65%
              precision    recall  f1-score   suppor

### Count

In [11]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [12]:
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_count, asentimentos))
        matriz_resultados['count'][classifier.__class__.__name__] = acc
        train_time['count'][classifier.__class__.__name__] = tt
        predict_time['count'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.00561s
Predict Time: 0.00063s
Acurácia    : 45.42%
              precision    recall  f1-score   support

     ALEGRIA       0.48      0.64      0.54       184
    DESGOSTO       0.38      0.39      0.38       109
        MEDO       0.25      0.02      0.04        43
       RAIVA       0.00      0.00      0.00        33
    SURPRESA       0.00      0.00      0.00        31
    TRISTEZA       0.48      0.58      0.53       168

   micro avg       0.45      0.45      0.45       568
   macro avg       0.26      0.27      0.25       568
weighted avg       0.39      0.45      0.41       568

Matrix de Confusão: 
[[117  28   2   1   2  34]
 [ 41  42   0   0   0  26]
 [ 19   2   1   0   0  21]
 [ 15   4   0   0   0  14]
 [ 16   4   0   0   0  11]
 [ 38  31   1   0   0  98]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00804s
Predict Time: 0.00055s
Acurácia    : 38.56%
              precision    recall  f1-score   suppor

### LSA (usando Count)

In [36]:
svd = TruncatedSVD(n_components=70, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_svd, asentimentos))
        matriz_resultados['count+lsa'][classifier.__class__.__name__] = acc
        train_time['count+lsa'][classifier.__class__.__name__] = tt
        predict_time['count+lsa'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.00562s
Predict Time: 0.00018s
Acurácia    : 32.75%
              precision    recall  f1-score   support

     ALEGRIA       0.33      1.00      0.49       184
    DESGOSTO       0.00      0.00      0.00       109
        MEDO       0.00      0.00      0.00        43
       RAIVA       0.00      0.00      0.00        33
    SURPRESA       0.00      0.00      0.00        31
    TRISTEZA       1.00      0.01      0.02       168

   micro avg       0.33      0.33      0.33       568
   macro avg       0.22      0.17      0.09       568
weighted avg       0.40      0.33      0.17       568

Matrix de Confusão: 
[[184   0   0   0   0   0]
 [109   0   0   0   0   0]
 [ 43   0   0   0   0   0]
 [ 33   0   0   0   0   0]
 [ 31   0   0   0   0   0]
 [166   0   0   0   0   2]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00416s
Predict Time: 0.00017s
Acurácia    : 41.02%
              precision    recall  f1-score   suppor

### LDA (usando Count)

In [14]:
lda = LatentDirichletAllocation(n_components=100, max_iter=25, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X_lda, asentimentos))
        matriz_resultados['count+lda'][classifier.__class__.__name__] = acc
        train_time['count+lda'][classifier.__class__.__name__] = tt
        predict_time['count+lda'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : MultinomialNB
Train Time  : 0.0113s
Predict Time: 0.00633s
Acurácia    : 29.93%
              precision    recall  f1-score   support

     ALEGRIA       0.32      0.53      0.40       184
    DESGOSTO       0.20      0.07      0.11       109
        MEDO       0.00      0.00      0.00        43
       RAIVA       0.00      0.00      0.00        33
    SURPRESA       0.00      0.00      0.00        31
    TRISTEZA       0.29      0.39      0.33       168

   micro avg       0.30      0.30      0.30       568
   macro avg       0.13      0.16      0.14       568
weighted avg       0.23      0.30      0.25       568

Matrix de Confusão: 
[[97 18  0  0  0 69]
 [58  8  0  0  0 43]
 [30  2  0  0  0 11]
 [13  1  0  0  0 19]
 [12  4  0  0  0 15]
 [96  7  0  0  0 65]]
------------------------------

Modelo      : ComplementNB
Train Time  : 0.00505s
Predict Time: 0.00087s
Acurácia    : 22.01%
              precision    recall  f1-score   support

     ALEGRIA       0.29      0.26 

### Count + TF-IDF + Word2Vec

In [15]:
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(23382733, 24849000)

In [16]:
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [17]:
for classifier in classifiers:
    try:
        acc, tt, pt = run_ml_model(classifier, **split_data(X, asentimentos))
        matriz_resultados['tfidf+count+w2c'][classifier.__class__.__name__] = acc
        train_time['tfidf+count+w2c'][classifier.__class__.__name__] = tt
        predict_time['tfidf+count+w2c'][classifier.__class__.__name__] = pt
    except:
        pass

Modelo      : LogisticRegression
Train Time  : 2.33798s
Predict Time: 0.00706s
Acurácia    : 46.65%
              precision    recall  f1-score   support

     ALEGRIA       0.46      0.77      0.58       184
    DESGOSTO       0.49      0.20      0.29       109
        MEDO       0.00      0.00      0.00        43
       RAIVA       0.00      0.00      0.00        33
    SURPRESA       0.00      0.00      0.00        31
    TRISTEZA       0.47      0.61      0.53       168

   micro avg       0.47      0.47      0.47       568
   macro avg       0.24      0.26      0.23       568
weighted avg       0.38      0.47      0.40       568

Matrix de Confusão: 
[[141   3   0   0   0  40]
 [ 58  22   0   0   0  29]
 [ 22   1   0   0   0  20]
 [ 16   2   0   0   0  15]
 [ 16   2   0   0   0  13]
 [ 51  15   0   0   0 102]]
------------------------------

Modelo      : RandomForestClassifier
Train Time  : 1.7544s
Predict Time: 0.05688s
Acurácia    : 46.48%
              precision    recall  f1-

### Resultado dos Classificadores

In [41]:
print('Acuraria (%):')
print('-' * 20)
df = pd.DataFrame.from_dict(matriz_resultados)
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Acuraria (%):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,45.25%,43.66%,21.65%,38.56%,41.02%,22.01%,0.00%
KNeighborsClassifier,44.01%,38.91%,29.93%,25.18%,37.32%,30.46%,43.66%
LinearSVC,49.12%,47.89%,31.51%,45.25%,44.89%,29.75%,44.89%
LogisticRegression,47.01%,47.36%,32.39%,47.54%,44.37%,29.23%,46.65%
MLPClassifier,47.01%,42.25%,30.46%,47.01%,44.37%,31.51%,40.67%
MultinomialNB,46.30%,32.92%,32.92%,45.42%,32.75%,29.93%,0.00%
RandomForestClassifier,46.13%,47.89%,33.27%,46.13%,39.61%,32.22%,46.48%
SVC,42.78%,32.04%,34.15%,35.92%,34.68%,33.10%,40.14%


In [42]:
print('Tempo para treinar o classificador (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(train_time)
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para treinar o classificador (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.0123,0.00303,0.00268,0.00804,0.00416,0.00505,0.0
KNeighborsClassifier,0.00085,0.00292,0.00297,0.00075,0.00228,0.00343,0.20067
LinearSVC,0.02689,0.17524,0.09014,0.06372,0.18478,0.15558,0.0307
LogisticRegression,1.27656,0.11657,0.06507,1.85829,0.15694,0.12894,2.33798
MLPClassifier,36.22989,4.43274,10.19349,36.59538,6.21379,14.23235,47.70024
MultinomialNB,0.00824,0.00201,0.01385,0.00561,0.00562,0.0113,0.0
RandomForestClassifier,0.92376,0.33086,0.42793,0.9681,0.32705,0.31514,1.7544
SVC,0.18585,0.13227,0.17654,0.27134,0.13267,0.21381,6.24931


In [43]:
print('Tempo para predizer novos itens (segundos):')
print('-' * 20)
df = pd.DataFrame.from_dict(predict_time)
df.style.apply(highlight_min).format({
    'tfidf': '{:,.5f}'.format,
    'tfidf+lsa': '{:,.5f}'.format,
    'tfidf+lda': '{:,.5f}'.format,
    'count': '{:,.5f}'.format,
    'count+lsa': '{:,.5f}'.format,
    'count+lda': '{:,.5f}'.format,
    'tfidf+count+w2c': '{:,.5f}'.format,
})

Tempo para predizer novos itens (segundos):
--------------------


Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,0.00106,0.0002,0.00027,0.00055,0.00017,0.00087,0.0
KNeighborsClassifier,0.01438,0.0905,0.08619,0.01559,0.08759,0.09102,5.85375
LinearSVC,0.00026,0.00017,0.00021,0.00063,0.00017,0.00139,0.00478
LogisticRegression,0.00052,0.00018,0.00019,0.00053,0.00018,0.00018,0.00706
MLPClassifier,0.00329,0.00092,0.00132,0.00648,0.00095,0.00365,0.07299
MultinomialNB,0.00099,0.00017,0.00023,0.00063,0.00018,0.00633,0.0
RandomForestClassifier,0.02215,0.00815,0.00887,0.02311,0.00846,0.01271,0.05688
SVC,0.09287,0.05394,0.07276,0.10811,0.0554,0.07291,2.82346


### Modelo escolhido e salvo

In [44]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(n_components=70, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

svm = LinearSVC(max_iter=150, random_state=0)
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, asentimentos)

accuracy = np.round(model.score(X_svd, asentimentos) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')

filename = 'tfidf_emotions.sav'
joblib.dump(vec_tfidf, filename)

filename = 'lsa_emotions.sav'
joblib.dump(lsa, filename)

filename = 'model_emotions.sav'
joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 53.95%


['model_emotions.sav']

In [22]:
model.classes_

array(['ALEGRIA', 'DESGOSTO', 'MEDO', 'RAIVA', 'SURPRESA', 'TRISTEZA'],
      dtype='<U8')

In [23]:
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[42.3, 19.37, 7.21, 2.73, 4.64, 23.74]