In [11]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

### Dataset

Para validar, serão utilizados dois datasets.

O primeiro deles foi fornecido a nós pela Barbara Martinazzo, e contém cerca de 1000 frases categorizadas entre as 6 emoções do Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

O segundo dataset, contém também cerca de 1000 frases, também categorizados usando as 6 emoções conforme o anterior.

In [2]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter=';')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/politica.csv')
frases += carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/g1_v1.csv')

shuffle(frases)

print(frases[:5])

[('ALEGRIA', 'moviment lgbt faz manifest ramp congress açã acontec durant seminári sobr tem particip ped criminaliz homofob'), ('ALEGRIA', 'bid dar empréstim us bilhõ méxic grip cris emprést quas tripl valor oferec objet ameniz impact grip cris econôm'), ('DESGOSTO', 'berzoin cham empresári demit precis ladrõ president pt particip fest dia trabalh cut jos genoín particip event central'), ('TRISTEZA', 'grup troc tir políc após roub banc rs segund inform brig milit cofr lev ladrõ nenhum suspeit pres esta terc feir'), ('TRISTEZA', 'rendiment fix poupanc pod imped reduçã jur cas própr cadernet rendiment fix garant lei ano percentual serv jur mínim crédit imobiliári')]


In [5]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

In [23]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [24]:
classifiers = (
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=20),
    LinearSVC(max_iter=1500),
    MultinomialNB(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=500),
    SVC(gamma='auto', max_iter=1500),
)

### TF-IDF

In [25]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [26]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 56.65%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 60.57%
--------------------
Modelo   : LinearSVC
Acurácia : 60.42%
--------------------
Modelo   : MultinomialNB
Acurácia : 55.59%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 48.04%
--------------------
Modelo   : MLPClassifier
Acurácia : 59.52%
--------------------
Modelo   : SVC
Acurácia : 35.2%
--------------------


### LSA (usando TF-IDF)

In [27]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 42.45%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 57.7%
--------------------
Modelo   : LinearSVC
Acurácia : 43.66%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 36.4%
--------------------
Modelo   : MLPClassifier
Acurácia : 42.75%
--------------------
Modelo   : SVC
Acurácia : 36.25%
--------------------


### LDA (usando TF-IDF)

In [28]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 34.14%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 53.78%
--------------------
Modelo   : LinearSVC
Acurácia : 34.29%
--------------------
Modelo   : MultinomialNB
Acurácia : 35.2%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 35.95%
--------------------
Modelo   : MLPClassifier
Acurácia : 35.2%
--------------------
Modelo   : SVC
Acurácia : 35.2%
--------------------


### Count

In [29]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [30]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 61.78%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 58.46%
--------------------
Modelo   : LinearSVC
Acurácia : 61.78%
--------------------
Modelo   : MultinomialNB
Acurácia : 57.7%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 30.36%
--------------------
Modelo   : MLPClassifier
Acurácia : 59.06%
--------------------
Modelo   : SVC
Acurácia : 35.2%
--------------------


### LSA (usando Count)

In [31]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 39.58%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 56.65%
--------------------
Modelo   : LinearSVC
Acurácia : 39.88%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 41.99%
--------------------
Modelo   : MLPClassifier
Acurácia : 41.99%
--------------------
Modelo   : SVC
Acurácia : 41.24%
--------------------


### LDA (usando Count)

In [32]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 35.2%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 54.23%
--------------------
Modelo   : LinearSVC
Acurácia : 35.2%
--------------------
Modelo   : MultinomialNB
Acurácia : 35.2%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 37.46%
--------------------
Modelo   : MLPClassifier
Acurácia : 35.2%
--------------------
Modelo   : SVC
Acurácia : 35.2%
--------------------
