In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from utils import tokenizer, load_six_emotions, load_3_emotions

### Dataset

Para validar, serão utilizados dois datasets.

O primeiro deles foi fornecido a nós pela Barbara Martinazzo, e contém cerca de 1000 frases categorizadas entre as 6 emoções do Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

O segundo dataset, contém também cerca de 1000 frases, também categorizados usando as 6 emoções conforme o anterior.

In [2]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter=';')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/politica.csv')
frases += carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/g1_v1.csv')

shuffle(frases)

print(frases[:5])

[('TRISTEZA', 'subprocur df det desacat após acident carr segund testemunh subprocur bat carr frent fug embriaguez atest mei exam clínic afirm deleg'), ('MEDO', 'manifest faz ato contr reuniã itál ministr financ maior econom rúss têm reuniã manifest distribu eur fals lec este sáb'), ('TRISTEZA', 'britân morr diet sop águ helen anderson anos perd kg pod ter sofr reaçã químic própri corp'), ('DESGOSTO', 'colecion inset aguc inter ciênc diz pesquis especial question leis ríg proíb prátic brasil colet inset permit apen fins científ'), ('DESGOSTO', 'médic guantánam viol étic profissional diz relatóri relatóri critic pentágon cont ajud médic interrogatóri')]


In [3]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

In [4]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {np.round(model.score(X_test, y_test) * 100, 2)}%')
    print('-' * 20)

def split_data(X, y):
    test_size = .3
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [5]:
classifiers = (
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=20),
    LinearSVC(max_iter=1500),
    MultinomialNB(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=500),
    SVC(gamma='auto', max_iter=1500),
)

### TF-IDF

In [6]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [7]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 55.48%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 59.8%
--------------------
Modelo   : LinearSVC
Acurácia : 58.97%
--------------------
Modelo   : MultinomialNB
Acurácia : 55.81%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 47.18%
--------------------
Modelo   : MLPClassifier
Acurácia : 58.8%
--------------------
Modelo   : SVC
Acurácia : 37.38%
--------------------


### LSA (usando TF-IDF)

In [8]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 46.01%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 57.97%
--------------------
Modelo   : LinearSVC
Acurácia : 45.02%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 40.03%
--------------------
Modelo   : MLPClassifier
Acurácia : 45.02%
--------------------
Modelo   : SVC
Acurácia : 38.87%
--------------------


### LDA (usando TF-IDF)

In [9]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 36.88%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 56.15%
--------------------
Modelo   : LinearSVC
Acurácia : 36.88%
--------------------
Modelo   : MultinomialNB
Acurácia : 36.88%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 34.72%
--------------------
Modelo   : MLPClassifier
Acurácia : 37.38%
--------------------
Modelo   : SVC
Acurácia : 36.88%
--------------------


### Count

In [10]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [11]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, asentimentos))
    except:
        pass



Modelo   : LogisticRegression
Acurácia : 58.8%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 59.14%
--------------------
Modelo   : LinearSVC
Acurácia : 59.97%
--------------------
Modelo   : MultinomialNB
Acurácia : 59.8%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 7.14%
--------------------
Modelo   : MLPClassifier
Acurácia : 57.14%
--------------------
Modelo   : SVC
Acurácia : 37.38%
--------------------


### LSA (usando Count)

In [12]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 41.86%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 58.31%
--------------------
Modelo   : LinearSVC
Acurácia : 42.52%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 44.52%
--------------------
Modelo   : MLPClassifier
Acurácia : 43.19%
--------------------
Modelo   : SVC
Acurácia : 41.36%
--------------------




### LDA (usando Count)

In [13]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 37.38%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 55.48%
--------------------
Modelo   : LinearSVC
Acurácia : 37.38%
--------------------
Modelo   : MultinomialNB
Acurácia : 37.38%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 40.2%
--------------------
Modelo   : MLPClassifier
Acurácia : 37.38%
--------------------
Modelo   : SVC
Acurácia : 37.38%
--------------------
