In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

### Dataset

Para validar, serão utilizados dois datasets.

O primeiro deles foi fornecido a nós pela Barbara Martinazzo, e contém cerca de 1000 frases categorizadas entre as 6 emoções do Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

O segundo dataset, contém também cerca de 1000 frases, também categorizados usando as 6 emoções conforme o anterior.

In [2]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter=';')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/politica.csv')
frases += carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/g1_v1.csv')

shuffle(frases)

print(frases[:5])

[('ALEGRIA', 'barrag piau transbord primeir vez nort piau sofr maior volum chuv últim anos forc águ cort estrad abriu crat mais quilômetr'), ('TRISTEZA', 'políc encontr beb pert sacol lix crianc lev pront socorr mã nã localiz'), ('DESGOSTO', 'nov grip faz univers federal paran suspend aul dia suspensã abrang tod camp instituiçã motiv reduz possibil transmissã nov grip'), ('TRISTEZA', 'famíl alend nã coment romanc net salvador filh chávez pabl sepúlved alend médic net lendári president marí filh casament president venezuelan'), ('TRISTEZA', 'cinc imóv demol recif estabelec áre públic nã alvar proprietári diss aguard decisã justic')]


In [3]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

In [4]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [5]:
classifiers = (
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=25),
    LinearSVC(max_iter=1500),
    MultinomialNB(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=1500),
    SVC(gamma='auto', max_iter=1500),
)

### TF-IDF

In [6]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [7]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 55.59%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 59.06%
--------------------
Modelo   : LinearSVC
Acurácia : 59.21%
--------------------
Modelo   : MultinomialNB
Acurácia : 56.04%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 45.92%
--------------------
Modelo   : MLPClassifier
Acurácia : 58.76%
--------------------
Modelo   : SVC
Acurácia : 33.69%
--------------------


### LSA (usando TF-IDF)

In [8]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 42.15%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 58.31%
--------------------
Modelo   : LinearSVC
Acurácia : 44.41%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 37.76%
--------------------
Modelo   : MLPClassifier
Acurácia : 46.83%
--------------------
Modelo   : SVC
Acurácia : 34.74%
--------------------


### LDA (usando TF-IDF)

In [9]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 33.69%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 52.87%
--------------------
Modelo   : LinearSVC
Acurácia : 33.69%
--------------------
Modelo   : MultinomialNB
Acurácia : 33.69%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 36.4%
--------------------
Modelo   : MLPClassifier
Acurácia : 33.69%
--------------------
Modelo   : SVC
Acurácia : 33.69%
--------------------


### Count

In [10]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [11]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 58.91%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 60.73%
--------------------
Modelo   : LinearSVC
Acurácia : 59.21%
--------------------
Modelo   : MultinomialNB
Acurácia : 58.76%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 36.4%
--------------------
Modelo   : MLPClassifier
Acurácia : 59.67%
--------------------
Modelo   : SVC
Acurácia : 33.69%
--------------------


### LSA (usando Count)

In [12]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 40.63%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 57.25%
--------------------
Modelo   : LinearSVC
Acurácia : 40.79%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 37.92%
--------------------
Modelo   : MLPClassifier
Acurácia : 44.56%
--------------------
Modelo   : SVC
Acurácia : 40.03%
--------------------


### LDA (usando Count)

In [13]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 33.69%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 52.57%
--------------------
Modelo   : LinearSVC
Acurácia : 33.69%
--------------------
Modelo   : MultinomialNB
Acurácia : 33.69%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 38.37%
--------------------
Modelo   : MLPClassifier
Acurácia : 34.29%
--------------------
Modelo   : SVC
Acurácia : 33.69%
--------------------
