In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

In [2]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/g1_v1.csv')

shuffle(frases)

print(frases[:5])

[('ALEGRIA', 'gabriel destac rapidez extraçã óle tup barril entreg rio janeir president lul gabriel diss lágrim extraçã primeir óle tup'), ('DESGOSTO', 'estiag provoc queim regiã central país bombeir precis cont dois incêndi goiân julh dest ano registr foc'), ('DESGOSTO', 'rapaz pres cnhs fals camp grand segund políc vend document dit políc aprend faz falsific web'), ('ALEGRIA', 'senador piau recup bem após cirurg reduçã estômag heráclit fort dem caminh corredor hospital assessor diz senador dev fic licenc seman'), ('DESGOSTO', 'substitut faust sanct lib vend gad oportunity antes sair fér juiz cas determin sequestr gad apont lavag dinheir grup daniel dant fazend')]


In [3]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

In [4]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [5]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=5, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(250,), max_iter=1000),
    LinearSVC(max_iter=1000),
    SVC(gamma='auto', max_iter=1000),
)

### TF-IDF

In [6]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [7]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 45.92%
--------------------
Modelo   : ComplementNB
Acurácia : 45.02%
--------------------
Modelo   : LogisticRegression
Acurácia : 45.62%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 47.13%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 43.5%
--------------------
Modelo   : MLPClassifier
Acurácia : 48.04%
--------------------
Modelo   : LinearSVC
Acurácia : 50.76%
--------------------
Modelo   : SVC
Acurácia : 37.76%
--------------------


### LSA (usando TF-IDF)

In [8]:
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except Exception as e:
        print(e)

Modelo   : MultinomialNB
Acurácia : 38.37%
--------------------
Modelo   : ComplementNB
Acurácia : 41.99%
--------------------
Modelo   : LogisticRegression
Acurácia : 45.32%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 46.83%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 41.69%
--------------------
Modelo   : MLPClassifier
Acurácia : 47.43%
--------------------
Modelo   : LinearSVC
Acurácia : 43.81%
--------------------
Modelo   : SVC
Acurácia : 37.76%
--------------------


### LDA (usando TF-IDF)

In [9]:
lda = LatentDirichletAllocation(n_components=100, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 32.02%
--------------------
Modelo   : ComplementNB
Acurácia : 22.05%
--------------------
Modelo   : LogisticRegression
Acurácia : 32.02%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 31.42%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 31.42%
--------------------
Modelo   : MLPClassifier
Acurácia : 29.91%
--------------------
Modelo   : LinearSVC
Acurácia : 32.33%
--------------------
Modelo   : SVC
Acurácia : 37.76%
--------------------


### Count

In [10]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [11]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 47.43%
--------------------
Modelo   : ComplementNB
Acurácia : 35.95%
--------------------
Modelo   : LogisticRegression
Acurácia : 51.36%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 46.83%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 37.46%
--------------------
Modelo   : MLPClassifier
Acurácia : 48.94%
--------------------
Modelo   : LinearSVC
Acurácia : 50.76%
--------------------
Modelo   : SVC
Acurácia : 37.76%
--------------------


### LSA (usando Count)

In [12]:
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 37.76%
--------------------
Modelo   : ComplementNB
Acurácia : 38.97%
--------------------
Modelo   : LogisticRegression
Acurácia : 46.83%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 43.5%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 41.39%
--------------------
Modelo   : MLPClassifier
Acurácia : 44.71%
--------------------
Modelo   : LinearSVC
Acurácia : 44.71%
--------------------
Modelo   : SVC
Acurácia : 37.76%
--------------------


### LDA (usando Count)

In [13]:
lda = LatentDirichletAllocation(n_components=100, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 29.31%
--------------------
Modelo   : ComplementNB
Acurácia : 25.68%
--------------------
Modelo   : LogisticRegression
Acurácia : 29.31%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 30.51%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 29.31%
--------------------
Modelo   : MLPClassifier
Acurácia : 29.61%
--------------------
Modelo   : LinearSVC
Acurácia : 29.31%
--------------------
Modelo   : SVC
Acurácia : 37.76%
--------------------


### Count + TF-IDF + Word2Vec

In [14]:
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(17906692, 18842000)

In [15]:
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [16]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 52.27%
--------------------
Modelo   : ComplementNB
Acurácia : 46.22%
--------------------
Modelo   : LogisticRegression
Acurácia : 50.76%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 47.13%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 42.9%
--------------------
Modelo   : MLPClassifier
Acurácia : 45.92%
--------------------
Modelo   : LinearSVC
Acurácia : 49.55%
--------------------
Modelo   : SVC
Acurácia : 37.76%
--------------------
