In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

In [8]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/frases/g1_v1.csv')
frases += carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/frases/frases_tagged.txt')

shuffle(frases)
print(frases[:5])

[('TRISTEZA', 'asar temp tristez voar'), ('ALEGRIA', 'sandr anenberg sair quarenten após contra nov grip exam confirm diagnóst nest terc feir jornal apresent sintom doenc desd dia julh'), ('ALEGRIA', 'codefat aprov linh crédit milhã taxist diz lup recurs provenient fund ampar trabalh fat linh revendedor carr pratic esgot diz'), ('DESGOSTO', 'psdb protocol três represent contr sarney conselh étic med ter bas denúnc apresent arthur virgíli president sen respond agor nov açõ quebr decor'), ('MEDO', 'vacin contr tuberculos perig dem bebês vírus aids conclusã estud faz oms áfric sul onde hiv comum imuniz nã nã traz proteçã pod mat crianc')]


In [9]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

In [10]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [11]:
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=5, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(100, 25), max_iter=500, random_state=0),
    LinearSVC(max_iter=500),
    SVC(gamma='auto', max_iter=500),
)

### TF-IDF

In [12]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [13]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 46.96%
--------------------
Modelo   : ComplementNB
Acurácia : 45.26%
--------------------
Modelo   : LogisticRegression
Acurácia : 48.42%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 44.77%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 42.82%
--------------------
Modelo   : MLPClassifier
Acurácia : 47.69%
--------------------
Modelo   : LinearSVC
Acurácia : 50.12%
--------------------
Modelo   : SVC
Acurácia : 31.39%
--------------------


### LSA (usando TF-IDF)

In [14]:
svd = TruncatedSVD(n_components=100, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except Exception as e:
        print(e)

Modelo   : MultinomialNB
Acurácia : 31.39%
--------------------
Modelo   : ComplementNB
Acurácia : 46.96%
--------------------
Modelo   : LogisticRegression
Acurácia : 50.85%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 47.45%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 42.09%
--------------------
Modelo   : MLPClassifier
Acurácia : 46.72%
--------------------
Modelo   : LinearSVC
Acurácia : 50.36%
--------------------
Modelo   : SVC
Acurácia : 31.39%
--------------------


### LDA (usando TF-IDF)

In [15]:
lda = LatentDirichletAllocation(n_components=200, max_iter=50, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 35.77%
--------------------
Modelo   : ComplementNB
Acurácia : 30.9%
--------------------
Modelo   : LogisticRegression
Acurácia : 34.79%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 35.52%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 35.77%
--------------------
Modelo   : MLPClassifier
Acurácia : 33.58%
--------------------
Modelo   : LinearSVC
Acurácia : 32.85%
--------------------
Modelo   : SVC
Acurácia : 31.39%
--------------------


### Count

In [16]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [17]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 46.96%
--------------------
Modelo   : ComplementNB
Acurácia : 36.25%
--------------------
Modelo   : LogisticRegression
Acurácia : 49.88%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 47.2%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 30.66%
--------------------
Modelo   : MLPClassifier
Acurácia : 45.74%
--------------------
Modelo   : LinearSVC
Acurácia : 48.66%
--------------------
Modelo   : SVC
Acurácia : 31.39%
--------------------


### LSA (usando Count)

In [18]:
svd = TruncatedSVD(n_components=100, n_iter=50, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 32.12%
--------------------
Modelo   : ComplementNB
Acurácia : 44.28%
--------------------
Modelo   : LogisticRegression
Acurácia : 44.53%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 46.23%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 35.52%
--------------------
Modelo   : MLPClassifier
Acurácia : 46.23%
--------------------
Modelo   : LinearSVC
Acurácia : 45.01%
--------------------
Modelo   : SVC
Acurácia : 31.39%
--------------------


### LDA (usando Count)

In [19]:
lda = LatentDirichletAllocation(n_components=200, max_iter=50, random_state=0, n_jobs=5)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : MultinomialNB
Acurácia : 29.2%
--------------------
Modelo   : ComplementNB
Acurácia : 21.17%
--------------------
Modelo   : LogisticRegression
Acurácia : 30.17%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 36.25%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 31.63%
--------------------
Modelo   : MLPClassifier
Acurácia : 30.9%
--------------------
Modelo   : LinearSVC
Acurácia : 28.95%
--------------------
Modelo   : SVC
Acurácia : 31.39%
--------------------


### Count + TF-IDF + Word2Vec

In [20]:
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(19165324, 20346000)

In [25]:
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [26]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 52.55%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 48.91%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 42.09%
--------------------
Modelo   : MLPClassifier
Acurácia : 45.5%
--------------------
Modelo   : LinearSVC
Acurácia : 48.91%
--------------------
Modelo   : SVC
Acurácia : 31.39%
--------------------
