In [1]:
import sys
sys.path.append('../..')

import csv
import codecs
import copy
import re
from random import shuffle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

from utils import tokenizer, load_six_emotions, load_3_emotions

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

### Dataset

Fornecido pela equipe da PUC-PR através do site [EMOÇÕES.BR](http://www.ppgia.pucpr.br/~paraiso/mineracaodeemocoes/index.php), contém cerca de 1000 frases, categorizadas usando as 6 emoções de Ekman (alegria, surpresa, tristeza, medo, raiva, desgosto).

In [15]:
def carregar(filename):
    frases = []
    with open(filename, 'r') as h:
        reader = csv.reader(h, delimiter=';')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            sentimento = row[1].upper()
            if len(frase) > 5:
                frases.append((sentimento, frase))
    return frases

frases = carregar('/home/rdenadai/vagrant/python-dev/sentiment-analysis-2018-president-election/dataset/puc-pr/g1_v1.csv')

shuffle(frases)

print(frases[:5])

[('DESGOSTO', 'númer cas grip eua sob rein unid divulg três nov cas nest doming organiz mundial saúd cont mort méxic eua'), ('TRISTEZA', 'cas desab centr histór luís outr ameac caus chuv seis rodov feder têm problem tráfeg'), ('ALEGRIA', 'imagens mostr flagrant violênc policial quatr polic agred grup jovens port alegr brig milit identific afast polic particip açã'), ('MEDO', 'vacin contr tuberculos perig dem bebês vírus aids conclusã estud feit oms áfric sul onde hiv comum imuniz nã nã traz proteçã pod mat crianc'), ('ALEGRIA', 'manteg admit nov brasil entrar recessã técnic recessã caracteriz dois trimestr segu qued pib pib caiu últim trimestr ministr nov recu trimestr')]


In [16]:
afrases = []
asentimentos =[]
for sentimento, frase in frases:
    afrases.append(frase)
    asentimentos.append(sentimento)

In [17]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = np.round(model.score(X_test, y_test) * 100, 2)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {accuracy}%')
    print('-' * 20)
    return accuracy

def split_data(X, y):
    test_size = .33
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

### Classificadores

In [18]:
classifiers = (
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=25),
    LinearSVC(max_iter=1500),
    MultinomialNB(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=1500),
    SVC(gamma='auto', max_iter=1500),
)

### TF-IDF

In [19]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [20]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 41.69%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 42.3%
--------------------
Modelo   : LinearSVC
Acurácia : 44.41%
--------------------
Modelo   : MultinomialNB
Acurácia : 42.9%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 39.27%
--------------------
Modelo   : MLPClassifier
Acurácia : 45.32%
--------------------
Modelo   : SVC
Acurácia : 33.84%
--------------------


### LSA (usando TF-IDF)

In [21]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 41.39%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 38.07%
--------------------
Modelo   : LinearSVC
Acurácia : 42.6%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 35.35%
--------------------
Modelo   : MLPClassifier
Acurácia : 42.6%
--------------------
Modelo   : SVC
Acurácia : 33.84%
--------------------


### LDA (usando TF-IDF)

In [22]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 30.82%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 28.4%
--------------------
Modelo   : LinearSVC
Acurácia : 30.82%
--------------------
Modelo   : MultinomialNB
Acurácia : 33.84%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 29.0%
--------------------
Modelo   : MLPClassifier
Acurácia : 30.82%
--------------------
Modelo   : SVC
Acurácia : 33.84%
--------------------


### Count

In [23]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [24]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 44.11%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 41.69%
--------------------
Modelo   : LinearSVC
Acurácia : 45.32%
--------------------
Modelo   : MultinomialNB
Acurácia : 44.11%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 9.37%
--------------------
Modelo   : MLPClassifier
Acurácia : 44.71%
--------------------
Modelo   : SVC
Acurácia : 33.84%
--------------------


### LSA (usando Count)

In [25]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 37.46%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 32.63%
--------------------
Modelo   : LinearSVC
Acurácia : 38.07%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 29.91%
--------------------
Modelo   : MLPClassifier
Acurácia : 36.86%
--------------------
Modelo   : SVC
Acurácia : 40.18%
--------------------


### LDA (usando Count)

In [26]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, asentimentos))
    except:
        pass

Modelo   : LogisticRegression
Acurácia : 33.84%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 29.0%
--------------------
Modelo   : LinearSVC
Acurácia : 33.84%
--------------------
Modelo   : MultinomialNB
Acurácia : 33.84%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 28.4%
--------------------
Modelo   : MLPClassifier
Acurácia : 33.84%
--------------------
Modelo   : SVC
Acurácia : 33.84%
--------------------
