In [1]:
import sys
sys.path.append('../..')

import codecs
import copy
import csv
import random
import re
import warnings
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

# Silence library warnings for the whole notebook. The stdlib `warnings` module is
# used directly: `np.warnings` was only an incidental alias of it and is not
# available in recent NumPy releases.
warnings.filterwarnings('ignore')

# Seed both random generators used by the notebook: NumPy's global generator and
# Python's `random` module (which drives `shuffle(frases)`), so the sentence order
# is the same on every Restart & Run All.
np.random.seed(12345)
random.seed(12345)

### Dataset

O dataset, fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas segundo as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

In [42]:
def carregar(filename, encoding='utf-8', delimiter='|', min_length=5):
    """Load labelled sentences from a delimited text file.

    Each row is expected to hold the sentence in the first column and its
    emotion label in the second one. The sentence is passed through the
    project's `tokenizer` helper and stripped; the label is stripped and
    upper-cased.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    encoding : str, default 'utf-8'
        Text encoding of the file. Set explicitly so accented characters do
        not depend on the machine's locale.
    delimiter : str, default '|'
        Column separator.
    min_length : int, default 5
        Sentences whose processed text has `min_length` characters or fewer
        are discarded.

    Returns
    -------
    list of (str, str)
        `(label, processed_sentence)` tuples, in file order.
    """
    frases = []
    # newline='' is the mode the csv module asks for when it is given a file object.
    with open(filename, 'r', encoding=encoding, newline='') as h:
        reader = csv.reader(h, delimiter=delimiter)
        for row in reader:
            # Blank lines come back as [] and malformed lines may lack the
            # label column; skip them instead of failing with IndexError.
            if len(row) < 2:
                continue
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].strip().upper()
            if len(frase) > min_length:
                frases.append((sentimento, frase))
    return frases

# NOTE(review): absolute, machine-specific location of the PUC-PR file; point this
# at the local copy of the dataset before running.
DATASET_PATH = '/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/g1_v1.csv'

frases = carregar(DATASET_PATH)

# Randomise the sentence order in place, then preview the first five pairs.
shuffle(frases)

amostra = frases[:5]
print(amostra)

[('ALEGRIA', 'fri nã afast alun piscin curitib piscin aquec receb pesso dia segund prefeitur professor diz alun pass adapt iníci ativ'), ('ALEGRIA', 'eua alenc obtém autoriz pass nov tratament contr cânc vic president receb nov medic tratament atac célul provoc tumor'), ('DESGOSTO', 'milit faz barric torn cas líd nigér extrem islâm autodenomin talibã deix mais cem mort ataqu contr civ polic fim seman'), ('DESGOSTO', 'funcionári acus govern par favorec madeireir esquem pod ter moviment milhõ secretári mei ambient anunc saíd carg'), ('DESGOSTO', 'quatr palestin fic fer bombardei israelens tún gaz ataqu respost lançament foguet palestin contr israel cinc tún abastec faix gaz sid destruíd')]


In [43]:
# Unzip the (label, sentence) pairs into two parallel lists that share the same order.
afrases = [frase for _, frase in frases]
asentimentos = [sentimento for sentimento, _ in frases]

In [44]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data, print its test accuracy and return it.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `score(X, y)`.
    X_train, y_train
        Training features and labels, passed to `model.fit`.
    X_test, y_test
        Held-out features and labels, passed to `model.score`.

    Returns
    -------
    float
        `model.score(X_test, y_test)` as a percentage rounded to two decimals.
    """
    model.fit(X_train, y_train)
    # No separate predict() call: its result was never used, and for scikit-learn
    # classifiers score() already predicts on X_test internally.
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y, test_size=.33, random_state=0):
    """Split features and labels into train and test partitions.

    Parameters
    ----------
    X, y
        Features and labels, forwarded to `train_test_split`.
    test_size : float, default .33
        Share of the samples reserved for the test partition.
    random_state : int, default 0
        Seed forwarded to `train_test_split`; with a fixed value every call
        on the same data yields the same partition.

    Returns
    -------
    dict
        Keys `X_train`, `y_train`, `X_test`, `y_test`, matching the keyword
        arguments of `run_ml_model` so the result can be unpacked with `**`.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [45]:
# Classifiers compared on every feature representation. The same instances are
# re-fitted for each feature set by run_ml_model().
classifiers = (
    MultinomialNB(),
    ComplementNB(),
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=5, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    # random_state is pinned (as for the random forest) so these two models do not
    # depend on the global NumPy seed and on the order in which cells are executed.
    MLPClassifier(hidden_layer_sizes=(250,), max_iter=1000, random_state=0),
    LinearSVC(max_iter=1000, random_state=0),
    SVC(gamma='auto', max_iter=1000),
)

### TF-IDF

In [46]:
# TF-IDF features built from unigrams and bigrams (ngram_range=(1, 2)).
# NOTE(review): the vectorizer is fitted on every sentence in `afrases`, i.e. before
# split_data() separates train and test, so its statistics also see the test sentences.
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [47]:
# Split once outside the loop: split_data() uses a fixed random_state, so every
# classifier is evaluated on exactly the same train/test partition.
data_tfidf = split_data(X_tfidf, asentimentos)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **data_tfidf)
    except Exception as e:
        # Keep evaluating the remaining models, but report the failure instead of
        # hiding it (a bare `except:` would also swallow a kernel interrupt).
        print(f'{classifier.__class__.__name__}: {e}')

Modelo   : MultinomialNB
Acurácia : 51.36%
--------------------
Modelo   : ComplementNB
Acurácia : 45.92%
--------------------
Modelo   : LogisticRegression
Acurácia : 50.15%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 50.15%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 48.94%
--------------------
Modelo   : MLPClassifier
Acurácia : 51.06%
--------------------
Modelo   : LinearSVC
Acurácia : 53.17%
--------------------
Modelo   : SVC
Acurácia : 39.27%
--------------------


### LSA (usando TF-IDF)

In [48]:
# LSA: reduce the TF-IDF matrix to 100 components with truncated SVD, then rescale
# each component with MinMaxScaler (presumably so the Naive Bayes models, which
# expect non-negative features, can be fitted on the result).
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

# Split once outside the loop: split_data() uses a fixed random_state, so every
# classifier is evaluated on exactly the same train/test partition.
data_svd = split_data(X_svd, asentimentos)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **data_svd)
    except Exception as e:
        # Name the failing model so the message can be matched to a classifier.
        print(f'{classifier.__class__.__name__}: {e}')

Modelo   : MultinomialNB
Acurácia : 39.58%
--------------------
Modelo   : ComplementNB
Acurácia : 47.43%
--------------------
Modelo   : LogisticRegression
Acurácia : 52.57%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 50.15%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 43.5%
--------------------
Modelo   : MLPClassifier
Acurácia : 49.55%
--------------------
Modelo   : LinearSVC
Acurácia : 52.27%
--------------------
Modelo   : SVC
Acurácia : 39.27%
--------------------


### LDA (usando TF-IDF)

In [49]:
# LDA: describe each sentence by its weights over 100 topics, then rescale each
# topic column with MinMaxScaler.
# The estimator gets its own name so `lda` is bound only to the fitted pipeline.
lda_model = LatentDirichletAllocation(n_components=100, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda_model, normalizer)
X_lda = lda.fit_transform(X_tfidf)

# Split once outside the loop: split_data() uses a fixed random_state, so every
# classifier is evaluated on exactly the same train/test partition.
data_lda = split_data(X_lda, asentimentos)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **data_lda)
    except Exception as e:
        # Keep evaluating the remaining models, but report the failure instead of
        # hiding it (a bare `except:` would also swallow a kernel interrupt).
        print(f'{classifier.__class__.__name__}: {e}')

Modelo   : MultinomialNB
Acurácia : 35.65%
--------------------
Modelo   : ComplementNB
Acurácia : 27.49%
--------------------
Modelo   : LogisticRegression
Acurácia : 36.25%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 36.25%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 36.56%
--------------------
Modelo   : MLPClassifier
Acurácia : 36.56%
--------------------
Modelo   : LinearSVC
Acurácia : 34.74%
--------------------
Modelo   : SVC
Acurácia : 39.27%
--------------------


### Count

In [50]:
# Raw term-count features built from unigrams and bigrams (ngram_range=(1, 2)).
# NOTE(review): the vocabulary is learned from every sentence in `afrases`, i.e. before
# split_data() separates train and test, so it also contains test-only terms.
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [51]:
# Split once outside the loop: split_data() uses a fixed random_state, so every
# classifier is evaluated on exactly the same train/test partition.
data_count = split_data(X_count, asentimentos)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **data_count)
    except Exception as e:
        # Keep evaluating the remaining models, but report the failure instead of
        # hiding it (a bare `except:` would also swallow a kernel interrupt).
        print(f'{classifier.__class__.__name__}: {e}')

Modelo   : MultinomialNB
Acurácia : 48.64%
--------------------
Modelo   : ComplementNB
Acurácia : 34.14%
--------------------
Modelo   : LogisticRegression
Acurácia : 54.08%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 49.24%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 37.76%
--------------------
Modelo   : MLPClassifier
Acurácia : 51.96%
--------------------
Modelo   : LinearSVC
Acurácia : 53.78%
--------------------
Modelo   : SVC
Acurácia : 39.27%
--------------------


### LSA (usando Count)

In [52]:
# LSA on the count matrix: truncated SVD down to 100 components, then MinMaxScaler.
# The pipeline is named `lsa` (it wraps an SVD, not an LDA model) so it no longer
# overwrites the `lda` variable with an unrelated object.
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_count)

# Split once outside the loop: split_data() uses a fixed random_state, so every
# classifier is evaluated on exactly the same train/test partition.
data_svd = split_data(X_svd, asentimentos)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **data_svd)
    except Exception as e:
        # Keep evaluating the remaining models, but report the failure instead of
        # hiding it (a bare `except:` would also swallow a kernel interrupt).
        print(f'{classifier.__class__.__name__}: {e}')

Modelo   : MultinomialNB
Acurácia : 39.27%
--------------------
Modelo   : ComplementNB
Acurácia : 45.32%
--------------------
Modelo   : LogisticRegression
Acurácia : 49.85%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 48.04%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 40.18%
--------------------
Modelo   : MLPClassifier
Acurácia : 48.04%
--------------------
Modelo   : LinearSVC
Acurácia : 47.43%
--------------------
Modelo   : SVC
Acurácia : 39.27%
--------------------


### LDA (usando Count)

In [53]:
# LDA on the count matrix: weights over 100 topics per sentence, then MinMaxScaler.
# The estimator gets its own name so `lda` is bound only to the fitted pipeline.
lda_model = LatentDirichletAllocation(n_components=100, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda_model, normalizer)
X_lda = lda.fit_transform(X_count)

# Split once outside the loop: split_data() uses a fixed random_state, so every
# classifier is evaluated on exactly the same train/test partition.
data_lda = split_data(X_lda, asentimentos)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **data_lda)
    except Exception as e:
        # Keep evaluating the remaining models, but report the failure instead of
        # hiding it (a bare `except:` would also swallow a kernel interrupt).
        print(f'{classifier.__class__.__name__}: {e}')

Modelo   : MultinomialNB
Acurácia : 36.56%
--------------------
Modelo   : ComplementNB
Acurácia : 29.0%
--------------------
Modelo   : LogisticRegression
Acurácia : 36.86%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 35.05%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 41.39%
--------------------
Modelo   : MLPClassifier
Acurácia : 35.95%
--------------------
Modelo   : LinearSVC
Acurácia : 36.56%
--------------------
Modelo   : SVC
Acurácia : 39.27%
--------------------


### Count + TF-IDF + Word2Vec

In [54]:
# Count: unigram term counts, stored as one row per vocabulary term and one column
# per sentence (hence the transpose).
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
count_by_term = np.round(X_count.toarray().T, 8)
weights_count = pd.DataFrame(count_by_term, index=vec_count.get_feature_names())

# TF-IDF: same layout, built with the default (unigram) TfidfVectorizer.
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
tfidf_by_term = np.round(X_tfidf.toarray().T, 8)
weights_tfidf = pd.DataFrame(tfidf_by_term, index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing: one list of tokens per sentence.
# The earlier loop only copied each token into a new list; its look-ahead for the
# next word fed a bigram experiment that was commented out, so it has been dropped.
frases_w2v = [word_tokenize(frase) for frase in afrases]

# Word2Vec
# size=1 yields a one-dimensional embedding, i.e. a single number per word; the
# next cell multiplies that value into the word's count/TF-IDF row.
# min_count=2 presumably drops words seen only once from the vocabulary — those
# words then have no vector in `model.wv` (confirm against the gensim docs).
# NOTE(review): seed=0 is set together with workers=10; check the gensim docs on
# whether multi-worker training is reproducible with a fixed seed.
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=15,
    min_count=2,
    seed=0,
    workers=10)
# NOTE(review): `sentences` was already given to the constructor; confirm that this
# extra 1000-epoch pass on the same corpus is intended rather than a leftover.
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(15706965, 18842000)

In [55]:
# Per vocabulary word, combine the three signals: (TF-IDF row + count row) scaled by
# the word's Word2Vec value. Each row holds one entry per sentence.
# NOTE(review): `idx` is looked up in `weights_count` and reused positionally on
# `weights_tfidf`, which assumes both vectorizers produced the same vocabulary order.
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    try:
        embedding = model.wv[word]
    except KeyError:
        # The word has no Word2Vec vector (not in the trained vocabulary): leave it
        # out. Any other error is a real problem and is allowed to surface.
        continue
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * embedding

lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
# Rows are words at this point; transpose so each row is a sentence, matching the
# order of `asentimentos`.
X = X.T

# Scale every sentence vector with sklearn's Normalizer (in place, copy=False).
normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

In [56]:
# Split once outside the loop: split_data() uses a fixed random_state, so every
# classifier is evaluated on exactly the same train/test partition.
data_combined = split_data(X, asentimentos)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **data_combined)
    except Exception as e:
        # Keep evaluating the remaining models, but report the failure instead of
        # hiding it (a bare `except:` would also swallow a kernel interrupt).
        print(f'{classifier.__class__.__name__}: {e}')

Modelo   : MultinomialNB
Acurácia : 53.47%
--------------------
Modelo   : ComplementNB
Acurácia : 47.73%
--------------------
Modelo   : LogisticRegression
Acurácia : 54.68%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 51.36%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 49.85%
--------------------
Modelo   : MLPClassifier
Acurácia : 48.94%
--------------------
Modelo   : LinearSVC
Acurácia : 53.17%
--------------------
Modelo   : SVC
Acurácia : 39.27%
--------------------
