In [14]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

### Dataset

Para validar, serão utilizados dois datasets.

O primeiro deles foi fornecido a nós pela Barbara Martinazzo, e contém cerca de 1000 frases categorizadas entre as 6 emoções do Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

O segundo dataset, contém também cerca de 1000 frases, também categorizados usando as 6 emoções conforme o anterior.

In [15]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter=';')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/politica.csv')
frases += carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/g1_v1.csv')

shuffle(frases)

print(frases[:5])

[('ALEGRIA', 'encomend indústr eua caem mais previst marc baix sétim desvaloriz oit mes dad fevereir revis alta'), ('TRISTEZA', 'alco registr prejuíz milhõ trimestr fabric alumíni inaugur tempor balanc eua prejuíz açã entretant fic abaix esper anal'), ('ALEGRIA', 'fisc destro forn carvã ileg par agent aind flagr florest send desmat imagens satélit der iníci açã'), ('TRISTEZA', 'estud indic chimpanzés capaz aprec músic pesquis filhot revel animal prefer ouv músic harmoni'), ('DESGOSTO', 'rom tum diz nã deix corregedor sen afast funçã ped alguns senador diretor acus tum envolv fraud contrat')]


In [16]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

In [17]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [18]:
classifiers = (
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=25),
    LinearSVC(max_iter=1500),
    MultinomialNB(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=1500),
    SVC(gamma='auto', max_iter=1500),
)

### TF-IDF

In [19]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [20]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 55.14%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 58.16%
--------------------
Modelo   : LinearSVC
Acurácia : 59.97%
--------------------
Modelo   : MultinomialNB
Acurácia : 54.98%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 44.41%
--------------------
Modelo   : MLPClassifier
Acurácia : 60.73%
--------------------
Modelo   : SVC
Acurácia : 34.14%
--------------------


### LSA (usando TF-IDF)

In [21]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 42.75%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 54.83%
--------------------
Modelo   : LinearSVC
Acurácia : 44.71%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 41.99%
--------------------
Modelo   : MLPClassifier
Acurácia : 46.68%
--------------------
Modelo   : SVC
Acurácia : 35.2%
--------------------


### LDA (usando TF-IDF)

In [22]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 34.14%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 52.11%
--------------------
Modelo   : LinearSVC
Acurácia : 34.14%
--------------------
Modelo   : MultinomialNB
Acurácia : 34.14%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 33.38%
--------------------
Modelo   : MLPClassifier
Acurácia : 34.14%
--------------------
Modelo   : SVC
Acurácia : 34.14%
--------------------


### Count

In [23]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [24]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 59.67%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 59.67%
--------------------
Modelo   : LinearSVC
Acurácia : 58.61%
--------------------
Modelo   : MultinomialNB
Acurácia : 56.19%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 6.8%
--------------------
Modelo   : MLPClassifier
Acurácia : 59.52%
--------------------
Modelo   : SVC
Acurácia : 34.14%
--------------------


### LSA (usando Count)

In [25]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 41.24%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 52.87%
--------------------
Modelo   : LinearSVC
Acurácia : 41.39%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 36.86%
--------------------
Modelo   : MLPClassifier
Acurácia : 42.45%
--------------------
Modelo   : SVC
Acurácia : 38.97%
--------------------


### LDA (usando Count)

In [26]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 34.14%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 54.68%
--------------------
Modelo   : LinearSVC
Acurácia : 34.14%
--------------------
Modelo   : MultinomialNB
Acurácia : 34.14%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 36.71%
--------------------
Modelo   : MLPClassifier
Acurácia : 34.14%
--------------------
Modelo   : SVC
Acurácia : 34.14%
--------------------
