In [1]:
import sys
sys.path.append('../..')

import codecs
import copy
import csv
import re
import warnings
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

# Silence library warnings for the whole notebook. The stdlib `warnings` module
# is used directly: `np.warnings` was only an accidental re-export of it and is
# no longer available in recent NumPy releases.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so estimators without an explicit random_state are repeatable.
np.random.seed(12345)

def highlight_max(data, color='green'):
    """Build the CSS styling that highlights the maximum value(s) of ``data``.

    Intended as a callback for ``DataFrame.style.apply``.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values to inspect. Any ``%`` characters are stripped and the values are
        cast to float before comparing.
    color : str
        CSS background colour used for the highlighted cells.

    Returns
    -------
    list of str
        When ``data`` is one-dimensional: one CSS string per element (empty
        string for non-maximum entries).
    pandas.DataFrame
        Otherwise: a frame with the same index/columns holding the CSS strings,
        highlighting the global maximum.
    """
    css = f'background-color: {color}; color: white; font-weight: bold;'
    # Remove '%' and cast to float so formatted percentages compare numerically.
    numeric = data.replace('%', '', regex=True).astype(float)

    if numeric.ndim != 1:
        # Whole-frame call (axis=None): compare against the global maximum.
        mask = numeric == numeric.max().max()
        return pd.DataFrame(np.where(mask, css, ''),
                            index=numeric.index, columns=numeric.columns)

    # Row/column call: compare against that Series' own maximum.
    peak = numeric.max()
    return [css if flag else '' for flag in numeric == peak]

### Matriz de resultados

In [2]:
# Template row: one accuracy slot per classifier, initialised to 0.
classf = dict.fromkeys(
    [
        'MultinomialNB',
        'ComplementNB',
        'LogisticRegression',
        'RandomForestClassifier',
        'KNeighborsClassifier',
        'MLPClassifier',
        'LinearSVC',
        'SVC',
    ],
    0,
)

# Results matrix: feature representation -> {classifier name -> accuracy}.
# Every representation gets its own independent copy of the template.
matriz_resultados = {
    representacao: copy.deepcopy(classf)
    for representacao in (
        'tfidf',
        'tfidf+lsa',
        'tfidf+lda',
        'count',
        'count+lsa',
        'count+lda',
        'tfidf+count+w2c',
    )
}

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).
Além disso, foram acrescidas cerca de 700 novas frases rotuladas, totalizando 1720 frases.

In [3]:
def carregar(filename, encoding='utf-8'):
    """Load labelled sentences from a ``|``-delimited text file.

    Each row is expected to hold the sentence in the first column and its
    emotion label in the second one. The sentence is passed through the
    project's ``tokenizer`` helper and stripped; the label is upper-cased.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    encoding : str
        Text encoding of the file. Given explicitly so the accented text is
        decoded the same way regardless of the platform default.

    Returns
    -------
    list of tuple
        ``(sentimento, frase)`` pairs, keeping only sentences whose processed
        text is longer than 5 characters.
    """
    frases = []
    # newline='' lets the csv module handle line endings itself, as its docs require.
    with open(filename, 'r', encoding=encoding, newline='') as h:
        for row in csv.reader(h, delimiter='|'):
            if len(row) < 2:
                # Blank or label-less line: skip it instead of raising IndexError.
                continue
            frase = tokenizer(row[0]).strip()
            # Strip stray whitespace so ' ALEGRIA' and 'ALEGRIA' are the same class.
            sentimento = row[1].strip().upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

# NOTE(review): absolute, machine-specific path; consider a path relative to a
# configurable data directory so the notebook runs on other machines.
frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/frases/frases_todas.txt')

# Shuffle with a dedicated, fixed-seed generator. The previous `random.shuffle`
# call is not governed by np.random.seed(12345), so the row order (and every
# train/test split derived from it) changed on each run.
np.random.RandomState(12345).shuffle(frases)
print(frases[:5])

[('DESGOSTO', 'fundaçã sarney critic intervençã ministéri públic president exercíci diz estranh reprov cont proced administr nã conden diz fundaçã'), ('NEUTRO', 'remax portugal novidad crédit habit facebok'), ('TRISTEZA', 'sussurr baix jov dem pra tant problem'), ('ALEGRIA', 'pert concordat cham govern motors plan tesour eua prev control montador govern planej injet mais biliã viabiliz empres'), ('ALEGRIA', 'lul ped ministr apresent marc regulatóri pré sal solen rir marc iníci oper cam pré sal lul receb pequen barril petról extra hoj camp tup')]


In [4]:
# Unzip the (label, sentence) pairs into two parallel lists.
afrases = [texto for _, texto in frases]
asentimentos = [rotulo for rotulo, _ in frases]

### Funções de Apoio

In [5]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    """Fit a classifier, print its evaluation report and return its accuracy.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for the report.

    Returns
    -------
    float
        Accuracy on the test split as a percentage rounded to 2 decimals.
    """
    model.fit(X_train, y_train)
    # Predict once and reuse the result for every metric; calling model.score()
    # as well would run a second, redundant inference pass over X_test.
    y_pred = model.predict(X_test)
    accuracy = np.round(np.mean(np.asarray(y_pred) == np.asarray(y_test)) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print(classification_report(y_test, y_pred))
    print('Matrix de Confusão: ')
    print(confusion_matrix(y_test, y_pred))
    print('-' * 30)
    print()
    return accuracy

def split_data(X, y, test_size=.33, random_state=0):
    """Split features and labels into train/test parts.

    Parameters
    ----------
    X
        Feature matrix.
    y
        Labels aligned with the rows of ``X``.
    test_size : float
        Fraction of the samples reserved for the test split (default 0.33).
    random_state : int
        Seed forwarded to ``train_test_split`` (default 0).

    Returns
    -------
    dict
        Keys ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so the result
        can be unpacked with ``**`` into ``run_ml_model``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [6]:
# Candidate classifiers evaluated on every feature representation.
# The same instances are re-fitted in each evaluation loop.
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=0),
    # random_state pinned like the other stochastic estimators, so the result
    # does not depend on how much of the global NumPy RNG was consumed before.
    LinearSVC(max_iter=500, random_state=0),
    SVC(gamma='auto', max_iter=500),
)

### TF-IDF

In [7]:
# TF-IDF features over unigrams and bigrams, fitted on every sentence.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split is made downstream, so test rows also shape the
# vocabulary/IDF weights - confirm this is acceptable for the reported scores.
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [8]:
# Evaluate every classifier on the TF-IDF features and record its accuracy.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc = run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except Exception as e:
        # Report the failure instead of silently swallowing it; the classifier
        # keeps its 0 placeholder in the results matrix. Catching Exception
        # (not a bare except) also lets KeyboardInterrupt stop a long run.
        print(f'{nome} falhou (tfidf): {e}')
    else:
        matriz_resultados['tfidf'][nome] = acc

Modelo   : MultinomialNB
Acurácia : 45.11%
              precision    recall  f1-score   support

     ALEGRIA       0.34      0.94      0.49       206
    DESGOSTO       1.00      0.02      0.03       113
        MEDO       0.00      0.00      0.00        50
      NEUTRO       1.00      0.57      0.73       142
       RAIVA       0.00      0.00      0.00        47
    SURPRESA       0.00      0.00      0.00        49
    TRISTEZA       0.59      0.44      0.50       200

   micro avg       0.45      0.45      0.45       807
   macro avg       0.42      0.28      0.25       807
weighted avg       0.55      0.45      0.38       807

Matrix de Confusão: 
[[193   0   0   0   0   0  13]
 [ 91   2   0   0   0   0  20]
 [ 41   0   0   0   0   0   9]
 [ 57   0   0  81   0   0   4]
 [ 37   0   0   0   0   0  10]
 [ 43   0   0   0   0   0   6]
 [112   0   0   0   0   0  88]]
------------------------------

Modelo   : ComplementNB
Acurácia : 52.79%
              precision    recall  f1-score   s

### LSA (usando TF-IDF)

In [9]:
# LSA: reduce the TF-IDF matrix to 70 components with truncated SVD, then
# rescale every component with MinMaxScaler (presumably so the Naive Bayes
# models, which reject negative features, can be fitted - confirm).
svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except Exception as e:
        # Say which classifier failed; it keeps its 0 placeholder in the matrix.
        print(f'{nome} falhou (tfidf+lsa): {e}')
    else:
        matriz_resultados['tfidf+lsa'][nome] = acc

Modelo   : MultinomialNB
Acurácia : 25.53%
              precision    recall  f1-score   support

     ALEGRIA       0.26      1.00      0.41       206
    DESGOSTO       0.00      0.00      0.00       113
        MEDO       0.00      0.00      0.00        50
      NEUTRO       0.00      0.00      0.00       142
       RAIVA       0.00      0.00      0.00        47
    SURPRESA       0.00      0.00      0.00        49
    TRISTEZA       0.00      0.00      0.00       200

   micro avg       0.26      0.26      0.26       807
   macro avg       0.04      0.14      0.06       807
weighted avg       0.07      0.26      0.10       807

Matrix de Confusão: 
[[206   0   0   0   0   0   0]
 [113   0   0   0   0   0   0]
 [ 50   0   0   0   0   0   0]
 [142   0   0   0   0   0   0]
 [ 47   0   0   0   0   0   0]
 [ 49   0   0   0   0   0   0]
 [200   0   0   0   0   0   0]]
------------------------------

Modelo   : ComplementNB
Acurácia : 45.35%
              precision    recall  f1-score   s

### LDA (usando TF-IDF)

In [10]:
# LDA: 70 topics fitted on the TF-IDF matrix, followed by min-max scaling.
# The estimator is built inline so `lda` only ever names the fitted pipeline.
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(
    LatentDirichletAllocation(n_components=70, max_iter=50, random_state=0, n_jobs=5),
    normalizer,
)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except Exception as e:
        # Report the failure instead of silently swallowing it; the classifier
        # keeps its 0 placeholder in the results matrix.
        print(f'{nome} falhou (tfidf+lda): {e}')
    else:
        matriz_resultados['tfidf+lda'][nome] = acc

Modelo   : MultinomialNB
Acurácia : 26.02%
              precision    recall  f1-score   support

     ALEGRIA       0.25      0.67      0.36       206
    DESGOSTO       0.10      0.01      0.02       113
        MEDO       0.00      0.00      0.00        50
      NEUTRO       0.40      0.16      0.23       142
       RAIVA       0.00      0.00      0.00        47
    SURPRESA       0.00      0.00      0.00        49
    TRISTEZA       0.26      0.24      0.25       200

   micro avg       0.26      0.26      0.26       807
   macro avg       0.14      0.15      0.12       807
weighted avg       0.21      0.26      0.20       807

Matrix de Confusão: 
[[138   3   0  10   0   0  55]
 [ 78   1   0   7   0   0  27]
 [ 40   0   0   3   0   0   7]
 [ 90   2   0  23   0   0  27]
 [ 34   2   0   2   0   0   9]
 [ 36   0   0   1   0   0  12]
 [138   2   0  12   0   0  48]]
------------------------------

Modelo   : ComplementNB
Acurácia : 22.8%
              precision    recall  f1-score   su

### Count

In [11]:
# Raw term counts over unigrams and bigrams, fitted on every sentence.
# NOTE(review): fitted on the full corpus before the train/test split is made
# downstream, so the vocabulary also includes terms seen only in test rows.
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [12]:
# Evaluate every classifier on the raw count features and record its accuracy.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc = run_ml_model(classifier, **split_data(X_count, asentimentos))
    except Exception as e:
        # Report the failure instead of silently swallowing it; the classifier
        # keeps its 0 placeholder in the results matrix.
        print(f'{nome} falhou (count): {e}')
    else:
        matriz_resultados['count'][nome] = acc

Modelo   : MultinomialNB
Acurácia : 53.53%
              precision    recall  f1-score   support

     ALEGRIA       0.44      0.69      0.54       206
    DESGOSTO       0.42      0.39      0.40       113
        MEDO       0.50      0.02      0.04        50
      NEUTRO       0.94      0.82      0.88       142
       RAIVA       0.33      0.02      0.04        47
    SURPRESA       0.00      0.00      0.00        49
    TRISTEZA       0.51      0.64      0.57       200

   micro avg       0.54      0.54      0.54       807
   macro avg       0.45      0.37      0.35       807
weighted avg       0.51      0.54      0.49       807

Matrix de Confusão: 
[[143  22   0   5   2   1  33]
 [ 44  44   1   0   0   0  24]
 [ 26   4   1   0   0   0  19]
 [ 13   7   0 116   0   0   6]
 [ 23   4   0   0   1   0  19]
 [ 27   3   0   0   0   0  19]
 [ 50  21   0   2   0   0 127]]
------------------------------

Modelo   : ComplementNB
Acurácia : 48.45%
              precision    recall  f1-score   s

### LSA (usando Count)

In [13]:
# LSA on the count matrix: truncated SVD to 70 components, then min-max scaling.
svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
# Named `lsa` (it was `lda`, a copy-paste slip): this pipeline is an SVD
# projection, not a topic model.
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_count)

for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except Exception as e:
        # Report the failure instead of silently swallowing it; the classifier
        # keeps its 0 placeholder in the results matrix.
        print(f'{nome} falhou (count+lsa): {e}')
    else:
        matriz_resultados['count+lsa'][nome] = acc

Modelo   : MultinomialNB
Acurácia : 25.53%
              precision    recall  f1-score   support

     ALEGRIA       0.26      1.00      0.41       206
    DESGOSTO       0.00      0.00      0.00       113
        MEDO       0.00      0.00      0.00        50
      NEUTRO       0.00      0.00      0.00       142
       RAIVA       0.00      0.00      0.00        47
    SURPRESA       0.00      0.00      0.00        49
    TRISTEZA       0.00      0.00      0.00       200

   micro avg       0.26      0.26      0.26       807
   macro avg       0.04      0.14      0.06       807
weighted avg       0.07      0.26      0.10       807

Matrix de Confusão: 
[[206   0   0   0   0   0   0]
 [113   0   0   0   0   0   0]
 [ 50   0   0   0   0   0   0]
 [142   0   0   0   0   0   0]
 [ 47   0   0   0   0   0   0]
 [ 49   0   0   0   0   0   0]
 [200   0   0   0   0   0   0]]
------------------------------

Modelo   : ComplementNB
Acurácia : 36.93%
              precision    recall  f1-score   s

### LDA (usando Count)

In [14]:
# LDA: 70 topics fitted on the count matrix, followed by min-max scaling.
# The estimator is built inline so `lda` only ever names the fitted pipeline.
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(
    LatentDirichletAllocation(n_components=70, max_iter=50, random_state=0, n_jobs=5),
    normalizer,
)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except Exception as e:
        # Report the failure instead of silently swallowing it; the classifier
        # keeps its 0 placeholder in the results matrix.
        print(f'{nome} falhou (count+lda): {e}')
    else:
        matriz_resultados['count+lda'][nome] = acc

Modelo   : MultinomialNB
Acurácia : 26.39%
              precision    recall  f1-score   support

     ALEGRIA       0.26      0.62      0.37       206
    DESGOSTO       0.33      0.04      0.06       113
        MEDO       0.00      0.00      0.00        50
      NEUTRO       0.25      0.20      0.23       142
       RAIVA       0.00      0.00      0.00        47
    SURPRESA       0.00      0.00      0.00        49
    TRISTEZA       0.27      0.26      0.27       200

   micro avg       0.26      0.26      0.26       807
   macro avg       0.16      0.16      0.13       807
weighted avg       0.23      0.26      0.21       807

Matrix de Confusão: 
[[128   3   0  21   0   0  54]
 [ 56   4   0  20   0   0  33]
 [ 31   1   0   8   0   0  10]
 [ 88   1   0  29   0   0  24]
 [ 38   0   0   5   0   0   4]
 [ 26   0   0   8   0   0  15]
 [121   3   0  24   0   0  52]]
------------------------------

Modelo   : ComplementNB
Acurácia : 20.45%
              precision    recall  f1-score   s

### Count + TF-IDF + Word2Vec

In [15]:
# NOTE: this cell rebinds vec_count / X_count / vec_tfidf / X_tfidf to
# unigram-only versions (default ngram_range), replacing the bigram ones above.

# Count: term-by-document weights, one row per vocabulary term.
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF: same layout as the count weights.
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing: one list of tokens per sentence.
# (The former per-token loop only copied each token; its unused next-word
# lookup, bare `except` and commented-out bigram code were dropped.)
frases_w2v = [word_tokenize(frase) for frase in afrases]

# Word2Vec: skip-gram (sg=1) with hierarchical softmax (hs=1). size=1 gives each
# word a single-component vector.
# NOTE(review): per the gensim docs, passing `sentences` to the constructor
# already trains the model; the explicit train() call below then continues
# training for 1000 more epochs. workers=10 may make results vary between runs
# despite seed=0 - confirm this is intended.
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(29459294, 31096000)

In [16]:
# Combine the three signals per vocabulary term:
# (tfidf row + count row) * word2vec value.
r_words = {}
for word in vec_count.get_feature_names():
    # Row position of the term; the same position is used for both weight
    # tables (assumes both vectorizers produced the same vocabulary order).
    idx = weights_count.index.get_loc(word)
    try:
        w2c_val = model.wv[word]
    except KeyError:
        # Term missing from the Word2Vec vocabulary: fall back to a constant
        # weight. Only KeyError is caught so unrelated errors are not hidden.
        w2c_val = .1
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
# Back to one row per sentence, one column per term.
X = X.T

# Scale every sentence vector to unit norm.
normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [17]:
# Evaluate every classifier on the combined TF-IDF + count + Word2Vec features.
for classifier in classifiers:
    nome = classifier.__class__.__name__
    try:
        acc = run_ml_model(classifier, **split_data(X, asentimentos))
    except Exception as e:
        # Report the failure instead of silently swallowing it: a classifier
        # that cannot be fitted here would otherwise just show up as 0 in the
        # results matrix with no explanation.
        print(f'{nome} falhou (tfidf+count+w2c): {e}')
    else:
        matriz_resultados['tfidf+count+w2c'][nome] = acc

Modelo   : LogisticRegression
Acurácia : 53.66%
              precision    recall  f1-score   support

     ALEGRIA       0.41      0.75      0.53       206
    DESGOSTO       0.60      0.27      0.37       113
        MEDO       1.00      0.02      0.04        50
      NEUTRO       0.86      0.84      0.85       142
       RAIVA       0.00      0.00      0.00        47
    SURPRESA       0.00      0.00      0.00        49
    TRISTEZA       0.54      0.65      0.59       200

   micro avg       0.54      0.54      0.54       807
   macro avg       0.49      0.36      0.34       807
weighted avg       0.53      0.54      0.48       807

Matrix de Confusão: 
[[154   9   0  10   0   0  33]
 [ 63  30   0   2   0   0  18]
 [ 26   2   1   1   0   0  20]
 [ 16   0   0 119   0   0   7]
 [ 28   1   0   1   0   0  17]
 [ 32   1   0   0   0   0  16]
 [ 59   7   0   5   0   0 129]]
------------------------------

Modelo   : RandomForestClassifier
Acurácia : 53.04%
              precision    recal

### Resultado dos Classificadores

In [18]:
# One column per feature representation, one row per classifier.
df = pd.DataFrame.from_dict(matriz_resultados)

# Same percentage formatter for every representation column.
colunas = (
    'tfidf',
    'tfidf+lsa',
    'tfidf+lda',
    'count',
    'count+lsa',
    'count+lda',
    'tfidf+count+w2c',
)
formatadores = dict.fromkeys(colunas, '{:,.2f}%'.format)

# Highlight the best accuracy of each column and render values as percentages.
df.style.apply(highlight_max).format(formatadores)

Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,52.79%,45.35%,22.80%,48.45%,36.93%,20.45%,0.00%
KNeighborsClassifier,48.45%,47.46%,26.52%,26.52%,39.90%,28.00%,47.46%
LinearSVC,53.90%,53.66%,26.02%,50.93%,50.68%,26.52%,51.18%
LogisticRegression,50.19%,52.91%,26.27%,52.66%,49.44%,26.52%,53.66%
MLPClassifier,50.93%,51.30%,27.63%,50.93%,49.81%,31.85%,48.57%
MultinomialNB,45.11%,25.53%,26.02%,53.53%,25.53%,26.39%,0.00%
RandomForestClassifier,50.06%,50.93%,35.44%,52.54%,48.70%,28.00%,53.04%
SVC,25.53%,25.53%,25.53%,25.53%,25.53%,25.53%,25.53%


### Modelo escolhido e salvo

In [19]:
# Final pipeline: TF-IDF (unigrams + bigrams) -> LSA (SVD + L2 normalisation)
# -> calibrated LinearSVC, fitted on the whole dataset.
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

# CalibratedClassifierCV wraps the SVM so the saved model exposes predict_proba.
svm = LinearSVC(max_iter=1200)
model = CalibratedClassifierCV(svm)
model.fit(X_svd, asentimentos)

# This score is computed on the very data the model was fitted on, so it is a
# training accuracy and is labelled as such: it is not comparable with the
# held-out accuracies reported in the results table.
accuracy = np.round(model.score(X_svd, asentimentos) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia (treino): {accuracy}%')

# Persist the three fitted artefacts needed at prediction time.
filename = 'tfidf_emotions.sav'
joblib.dump(vec_tfidf, filename)

filename = 'lsa_emotions.sav'
joblib.dump(lsa, filename)

filename = 'model_emotions.sav'
joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 57.41%


['model_emotions.sav']

In [20]:
# Class labels known to the fitted model (scikit-learn orders the
# predict_proba columns the same way).
model.classes_

array(['ALEGRIA', 'DESGOSTO', 'MEDO', 'NEUTRO', 'RAIVA', 'SURPRESA',
       'TRISTEZA'], dtype='<U8')

In [21]:
# Class probabilities for every sentence, from the calibrated model.
y = model.predict_proba(X_svd)
# First sentence only, as percentages rounded to 2 decimals.
list((y[0] * 100).round(2))

[33.3, 33.52, 4.06, 0.09, 7.51, 2.77, 18.75]