In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.externals import joblib

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

def highlight_max(data, color='green'):
    attr = f'background-color: {color}; color: white; font-weight: bold;'
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

### Matriz de resultados

In [2]:
classf = {
    'MultinomialNB': 0,
    'ComplementNB': 0,
    'LogisticRegression': 0,
    'RandomForestClassifier': 0,
    'KNeighborsClassifier': 0,
    'MLPClassifier': 0,
    'LinearSVC': 0,
    'SVC': 0
}

matriz_resultados = {
    'tfidf': copy.deepcopy(classf),
    'tfidf+lsa': copy.deepcopy(classf),
    'tfidf+lda': copy.deepcopy(classf),
    'count': copy.deepcopy(classf),
    'count+lsa': copy.deepcopy(classf),
    'count+lda': copy.deepcopy(classf),
    'tfidf+count+w2c': copy.deepcopy(classf),
}

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).
Alem disso, foi acrescido em torno de 700 novas frases rotuladas, totalizando 1720 frases.

In [3]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/frases/frases_todas.txt')

shuffle(frases)
print(frases[:5])

[('ALEGRIA', 'alto consum laticíni infânc aument expect vid diz estud pesquis encontr ano crianc hav ser alvo estud ano'), ('MEDO', 'nunc deix raiv tir melhor'), ('RAIVA', 'vid chei obrig gent cumpr mais vontad infring deslav'), ('RAIVA', 'pesso afivel másc cab algum ano acredit piament verdadeir rost gent lha arranc fic carn viv dor desesper incapaz comprend gest violent melhor prov respeit pod dar'), ('TRISTEZA', 'currícul propor mec ide diz paul renat reinvent invent diz secretári segund propost const resolu cne')]


In [4]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

### Funções de Apoio

In [5]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [6]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=8, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=0),
    LinearSVC(max_iter=500),
    SVC(gamma='auto', max_iter=500),
)

### TF-IDF

In [7]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [8]:
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
        matriz_resultados['tfidf'][classifier.__class__.__name__] = acc
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 46.87%
--------------------
Modelo   : ComplementNB
Acurácia : 47.03%
--------------------
Modelo   : LogisticRegression
Acurácia : 47.19%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 46.39%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 43.98%
--------------------
Modelo   : MLPClassifier
Acurácia : 46.07%
--------------------
Modelo   : LinearSVC
Acurácia : 49.44%
--------------------
Modelo   : SVC
Acurácia : 33.07%
--------------------


### LSA (usando TF-IDF)

In [9]:
svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, asentimentos))
        matriz_resultados['tfidf+lsa'][classifier.__class__.__name__] = acc
    except Exception as e:
        print(e)

Modelo   : MultinomialNB
Acurácia : 33.07%
--------------------
Modelo   : ComplementNB
Acurácia : 46.39%
--------------------
Modelo   : LogisticRegression
Acurácia : 48.96%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 48.96%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 40.93%
--------------------
Modelo   : MLPClassifier
Acurácia : 46.23%
--------------------
Modelo   : LinearSVC
Acurácia : 49.6%
--------------------
Modelo   : SVC
Acurácia : 33.07%
--------------------


### LDA (usando TF-IDF)

In [10]:
lda = LatentDirichletAllocation(n_components=70, max_iter=50, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, asentimentos))
        matriz_resultados['tfidf+lda'][classifier.__class__.__name__] = acc
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 33.23%
--------------------
Modelo   : ComplementNB
Acurácia : 23.43%
--------------------
Modelo   : LogisticRegression
Acurácia : 31.94%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 36.12%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 33.07%
--------------------
Modelo   : MLPClassifier
Acurácia : 32.1%
--------------------
Modelo   : LinearSVC
Acurácia : 31.62%
--------------------
Modelo   : SVC
Acurácia : 33.07%
--------------------


### Count

In [11]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [12]:
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_count, asentimentos))
        matriz_resultados['count'][classifier.__class__.__name__] = acc
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 48.31%
--------------------
Modelo   : ComplementNB
Acurácia : 37.08%
--------------------
Modelo   : LogisticRegression
Acurácia : 48.15%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 47.51%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 17.5%
--------------------
Modelo   : MLPClassifier
Acurácia : 47.67%
--------------------
Modelo   : LinearSVC
Acurácia : 47.35%
--------------------
Modelo   : SVC
Acurácia : 33.07%
--------------------


### LSA (usando Count)

In [13]:
svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_svd, asentimentos))
        matriz_resultados['count+lsa'][classifier.__class__.__name__] = acc
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 33.07%
--------------------
Modelo   : ComplementNB
Acurácia : 41.73%
--------------------
Modelo   : LogisticRegression
Acurácia : 47.83%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 47.19%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 38.68%
--------------------
Modelo   : MLPClassifier
Acurácia : 44.62%
--------------------
Modelo   : LinearSVC
Acurácia : 48.31%
--------------------
Modelo   : SVC
Acurácia : 33.07%
--------------------


### LDA (usando Count)

In [14]:
lda = LatentDirichletAllocation(n_components=70, max_iter=50, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X_lda, asentimentos))
        matriz_resultados['count+lda'][classifier.__class__.__name__] = acc
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 29.37%
--------------------
Modelo   : ComplementNB
Acurácia : 22.79%
--------------------
Modelo   : LogisticRegression
Acurácia : 29.21%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 33.23%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 31.46%
--------------------
Modelo   : MLPClassifier
Acurácia : 33.23%
--------------------
Modelo   : LinearSVC
Acurácia : 28.25%
--------------------
Modelo   : SVC
Acurácia : 33.07%
--------------------


### Count + TF-IDF + Word2Vec

In [15]:
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(24952079, 26535000)

In [16]:
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [17]:
for classifier in classifiers:
    try:
        acc = run_ml_model(classifier, **split_data(X, asentimentos))
        matriz_resultados['tfidf+count+w2c'][classifier.__class__.__name__] = acc
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 48.48%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 48.64%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 45.59%
--------------------
Modelo   : MLPClassifier
Acurácia : 43.66%
--------------------
Modelo   : LinearSVC
Acurácia : 47.67%
--------------------
Modelo   : SVC
Acurácia : 33.07%
--------------------


### Resultado dos Classificadores

In [18]:
df = pd.DataFrame.from_dict(matriz_resultados)
df.style.apply(highlight_max).format({
    'tfidf': '{:,.2f}%'.format,
    'tfidf+lsa': '{:,.2f}%'.format,
    'tfidf+lda': '{:,.2f}%'.format,
    'count': '{:,.2f}%'.format,
    'count+lsa': '{:,.2f}%'.format,
    'count+lda': '{:,.2f}%'.format,
    'tfidf+count+w2c': '{:,.2f}%'.format,
})

Unnamed: 0,tfidf,tfidf+lsa,tfidf+lda,count,count+lsa,count+lda,tfidf+count+w2c
ComplementNB,47.03%,46.39%,23.43%,37.08%,41.73%,22.79%,0.00%
KNeighborsClassifier,43.98%,40.93%,33.07%,17.50%,38.68%,31.46%,45.59%
LinearSVC,49.44%,49.60%,31.62%,47.35%,48.31%,28.25%,47.67%
LogisticRegression,47.19%,48.96%,31.94%,48.15%,47.83%,29.21%,48.48%
MLPClassifier,46.07%,46.23%,32.10%,47.67%,44.62%,33.23%,43.66%
MultinomialNB,46.87%,33.07%,33.23%,48.31%,33.07%,29.37%,0.00%
RandomForestClassifier,46.39%,48.96%,36.12%,47.51%,47.19%,33.23%,48.64%
SVC,33.07%,33.07%,33.07%,33.07%,33.07%,33.07%,33.07%


### Modelo escolhido e salvo

In [38]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

svd = TruncatedSVD(n_components=70, n_iter=50, random_state=0)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

svm = LinearSVC(max_iter=1200)
model = CalibratedClassifierCV(svm) 
model.fit(X_svd, asentimentos)

accuracy = np.round(model.score(X_svd, asentimentos) * 100, 2)
print(f'Modelo   : {model.__class__.__name__}')
print(f'Acurácia : {accuracy}%')

filename = 'tfidf_emotions.sav'
joblib.dump(vec_tfidf, filename)

filename = 'lsa_emotions.sav'
joblib.dump(lsa, filename)

filename = 'model_emotions.sav'
joblib.dump(model, filename)

Modelo   : CalibratedClassifierCV
Acurácia : 52.44%


['model_emotions.sav']

In [39]:
y = model.predict_proba(X_svd)
list(np.round(y[0] * 100, 2))

[0.11, 53.75, 10.87, 5.6, 5.09, 5.19, 0.22, 19.18]