<a href="https://colab.research.google.com/github/rdenadai/sentiment-analysis-2018-president-election/blob/edgarbanhesse/testes_valencia_projeto_final_ia369y_2sem_2018.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Testes de valência para o projeto final de IA369Y 2 Semestre 2018

Passos para tratar os dados com valência, testar e escolher um classificador para utilizar no projeto final de IA369Y.

1) Remover espaços duplos, quebras de linha, números e links do dataset e das frases a serem testadas.

2) Remover stopwords e aplicar o stemmer.

3) Treinar os classificadores.

4) Realizar as predições com os classificadores.

5) Avaliar as medidas obtidas com os classificadores.

In [0]:
#Importação das bibliotecas para testes
import csv
import codecs
import copy
import re
from random import shuffle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [7]:
#Instalação e download
!pip install emoji
!python -m spacy download pt

import time
import concurrent.futures
from unicodedata import normalize
from string import punctuation
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import emoji

nltk.download('stopwords')

def remover_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')


def _load_emotion_file_content(emotion, path='dataset/emocoes'):
    with open(f'{path}/{emotion}', 'r') as h:
        words = h.readlines()
        for i, word in enumerate(words):
            word = word.replace('\n', '').lower().strip()
            words[i] = STEMMER.stem(word)
            # words[i] = [w.lemma_ for w in NLP(word, disable=['parser'])][0]
    return sorted(list(set(words)))


@lru_cache(maxsize=256)
def load_six_emotions(filepath):
    """Ekman, Friesen, and Ellsworth : anger, disgust, fear, joy, sadness, surprise."""
    emotion_words = {
        'ALEGRIA': _load_emotion_file_content('alegria', filepath),
        'DESGOSTO': _load_emotion_file_content('desgosto', filepath),
        'MEDO': _load_emotion_file_content('medo', filepath),
        'RAIVA': _load_emotion_file_content('raiva', filepath),
        'SURPRESA': _load_emotion_file_content('surpresa', filepath),
        'TRISTEZA': _load_emotion_file_content('tristeza', filepath),
    }
    return emotion_words


@lru_cache(maxsize=256)
def load_3_emotions(filepath):
    """Ekman, Friesen, and Ellsworth : anger, disgust, fear, joy, sadness, surprise."""
    emotion_words = {
        'POSITIVO': _load_emotion_file_content('positivo', filepath),
        'NEGATIVO': _load_emotion_file_content('negativo', filepath),
        'NEUTRO': _load_emotion_file_content('neutro', filepath),
    }
    return emotion_words


@lru_cache(maxsize=256)
def load_valence_emotions(filename_oplexicon, filename_sentilex):
    data = {
        'POSITIVO': [],
        'NEGATIVO': [],
        'NEUTRO': [],
    }

    oplexicon = load_valence_emotions_from_oplexicon(filename_oplexicon)
    sentilex = load_valence_emotions_from_sentilex(filename_sentilex)

    data['POSITIVO'] = oplexicon['POSITIVO'] + sentilex['POSITIVO']
    data['NEGATIVO'] = oplexicon['NEGATIVO'] + sentilex['NEGATIVO']
    data['NEUTRO'] = oplexicon['NEUTRO'] + sentilex['NEUTRO']
    data['POSITIVO'] = sorted(list(set(data['POSITIVO'])))
    data['NEGATIVO'] = sorted(list(set(data['NEGATIVO'])))
    data['NEUTRO'] = sorted(list(set(data['NEUTRO'])))
    return data


@lru_cache(maxsize=256)
def load_valence_emotions_from_oplexicon(filename):
    """NEUTRAL | POSITIVE | NEGATIVE."""
    spacy_conv = {
        'adj': 'ADJ',
        'n': 'NOUN',
        'vb': 'VERB',
        'det': 'DET',
        'emot': 'EMOT',
        'htag': 'HTAG'
    }

    data = {
        'POSITIVO': [],
        'NEGATIVO': [],
        'NEUTRO': [],
    }
    with codecs.open(filename, 'r', 'UTF-8') as hf:
        lines = hf.readlines()
        for line in lines:
            info = line.lower().split(',')
            if len(info[0].split()) <= 1:
                info[1] = [spacy_conv.get(tag) for tag in info[1].split()]
                word, tags, sent = info[:3]
                if 'HTAG' not in tags and 'EMOT' not in tags:
                    word = STEMMER.stem(word.lower().strip())
                    # word = [w.lemma_ for w in NLP(word.lower().strip(), disable=['parser'])][0]
                    if len(word) > 2:
                        sent = int(sent)
                        if sent == 1:
                            data['POSITIVO'] += [word]
                        elif sent == -1:
                            data['NEGATIVO'] += [word]
                        else:
                            data['NEUTRO'] += [word]
    data['POSITIVO'] = sorted(list(set(data['POSITIVO'])))
    data['NEGATIVO'] = sorted(list(set(data['NEGATIVO'])))
    data['NEUTRO'] = sorted(list(set(data['NEUTRO'])))
    return data


@lru_cache(maxsize=256)
def load_valence_emotions_from_sentilex(filename):
    """NEUTRAL | POSITIVE | NEGATIVE."""
    data = {
        'POSITIVO': [],
        'NEGATIVO': [],
        'NEUTRO': [],
    }
    with codecs.open(filename, 'r', 'UTF-8') as hf:
        lines = hf.readlines()
        for line in lines:
            info = line.lower().split('.')
            words = [word.strip() for word in info[0].split(',')]
            for word in words:
                word = STEMMER.stem(word.lower().strip())
                # word = [w.lemma_ for w in NLP(word.lower().strip(), disable=['parser'])][0]
                if len(word) > 2:
                    cdata = info[1].split(';')
                    if len(cdata) > 0:
                        sent0 = [int(k.replace('pol:n0=', '')) if 'pol:n0=' in k else None for k in cdata]
                        sent1 = [int(k.replace('pol:n1=', '')) if 'pol:n1=' in k else None for k in cdata]
                        sent0 = list(filter(None.__ne__, sent0))
                        sent1 = list(filter(None.__ne__, sent1))
                        if len(sent0) >= 1 and len(sent1) <= 0:
                            sent = sent0[0]
                            if sent == 1:
                                data['POSITIVO'] += [word]
                            elif sent == -1:
                                data['NEGATIVO'] += [word]
                            else:
                                data['NEUTRO'] += [word]
    data['POSITIVO'] = sorted(list(set(data['POSITIVO'])))
    data['NEGATIVO'] = sorted(list(set(data['NEGATIVO'])))
    data['NEUTRO'] = sorted(list(set(data['NEUTRO'])))
    return data


@lru_cache(maxsize=256)
def is_number(s):
    try:
        complex(s) # for int, long, float and complex
    except ValueError:
        return False
    return True


@lru_cache(maxsize=256)
def _get_stopwords():
    stpwords = stopwords.words('portuguese')
    rms = ['um', 'não', 'mais', 'muito']
    for rm in rms:
        del stpwords[stpwords.index(rm)]
    return stpwords, punctuation


def generate_corpus(documents=None, debug=False):
    assert len(documents) > 0
    if debug: print('Iniciando processamento...')
    tokenized_docs = documents
    with concurrent.futures.ProcessPoolExecutor() as procs:
        if debug: print('Executando processo de remoção das stopwords...')
        tokenized_frases = procs.map(tokenizer, tokenized_docs, chunksize=25)
    if debug: print('Finalizado...')
    return list(tokenized_frases)


def tokenizer(phrase, clean=False):
    if not clean:
        phrase = clean_up(phrase)
    phrase = NLP(phrase, disable=['parser'])
    clean_frase = []
    clfa = clean_frase.append
    for palavra in phrase:
        if palavra.pos_ != 'PUNCT':
            word = palavra.text.strip()
            if not is_number(word) and len(word) > 1:
                clfa(STEMMER.stem(palavra.text))
    return ' '.join(clean_frase)


def clean_up(phrase):
    STOPWORDS, PUNCT = _get_stopwords()
    # Transforma as hashtags em palavras
    try:
        for group in re.findall(r'#\S+\b', phrase, re.DOTALL):
            g2 = re.sub(r'([A-Z])', r' \1', group, flags=re.MULTILINE)
            phrase = re.sub(r'{}\b'.format(group), g2, phrase, flags=re.MULTILINE)
    except Exception:
        pass
    # lowercase para fazer outros pré-processamentos
    phrase = phrase.lower()
    phrase = emoji.get_emoji_regexp().sub(r'', phrase)
    for stw in STOPWORDS:
        phrase = re.sub(r'\b{}\b'.format(stw), '', phrase, flags=re.MULTILINE)
    for punct in PUNCT:
        phrase = phrase.replace(punct, ' ')
    for o, r in RM:
        phrase = re.sub(o, r, phrase, flags=re.MULTILINE)
    return phrase


# GLOBALS
NLP = spacy.load('pt')
# STEMMER = nltk.stem.RSLPStemmer()
STEMMER = nltk.stem.SnowballStemmer('portuguese')
STOPWORDS, PUNCT = _get_stopwords()
RM = [
    (r'\n+', r' . '), (r'"', r' '), (r'\'', r' '),  (r'@', r''), (r'[…]', ' . '), (r'[0-9]*', r''),
    (r'#', r''), (r'(RT)', r''), (r'(http[s]*?:\/\/)+.*[\r\n]*', r''),
    (r'“', r''), (r'”', ''), (r'([aeiouqwtyupdfghjklçzxcvbnm|!@$%&\.\[\]\(\)+-_=<>,;:])\1+', r'\1'),
    (r'(ñ)', r'não'), (r'(nã)', r'não'), (r'\s+', r' '),
]


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/pt

    You can now load the model via spacy.load('pt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Datasets

Para validar, serão utilizados dois datasets.

Os dos datasets foram obtidos do site minerando dados.

O primeiro deles tem tweets de política de Minas Gerais com rótulos de valência: positivo, negativo e neutro. Foi feito um tratamento para eliminar tweets repetidos e dessa forma sobraram 3016 tweets.

O segundo contém 2123 títulos de notícias com rótulos de valência: positivo, negativo e neutro.

In [49]:
#Download dos datasets
!ls -la
!rm -f *.csv
!rm -f *.txt
!wget https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/tweets_mg_tratados.csv
!wget https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/titulo_noticias.txt
!wget https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/50_tweets_mg.csv
!ls -la

total 480
drwxr-xr-x 1 root root   4096 Nov  3 21:36 .
drwxr-xr-x 1 root root   4096 Nov  3 19:51 ..
-rw-r--r-- 1 root root   5060 Nov  3 21:36 50_tweets_mg.csv
drwxr-xr-x 4 root root   4096 Nov  1 16:29 .config
drwxr-xr-x 2 root root   4096 Nov  1 16:42 sample_data
-rw-r--r-- 1 root root 152016 Nov  3 21:36 titulo_noticias.txt
-rw-r--r-- 1 root root 295061 Nov  3 21:36 tweets_mg_tratados.csv
--2018-11-03 21:41:35--  https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/tweets_mg_tratados.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 295061 (288K) [text/plain]
Saving to: ‘tweets_mg_tratados.csv’


2018-11-03 21:41:35 (6.04 MB/s) - ‘tweets_mg_tratados.csv’ saved [295061/295061]

--2018-11-03 21:41:

In [52]:
#Carregando os datasets
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

frases = carregar('tweets_mg_tratados.csv')
frases += carregar('titulo_noticias.txt')

shuffle(frases)

print(frases[:15])

[('POSITIVO', 'par min jov anos pres tráfic drog'), ('NEGATIVO', 'govern min compr mais aeronav olha min fal verdad fal mim não turm'), ('POSITIVO', 'mês após assin petrobr receb contrat uniã merc folh paul'), ('NEGATIVO', 'ibovesp recu dia gir frac'), ('POSITIVO', 'eletrobr pod sub dilm perd eleiçõ vej estim petr val invest infomoney'), ('NEGATIVO', 'vereador caporã ped justic regulariz atend banc brasil portal araçag'), ('NEUTRO', 'govern vet mudanc códig étic milit'), ('POSITIVO', 'após denúnc cã encontr drog menor det mg'), ('NEGATIVO', 'folh polít petrobr eletrobr perd bilhõ govern dilm açõ caír men metad'), ('NEGATIVO', 'gigantesc barb mal destaqu cadern cultur estad min'), ('NEGATIVO', 'sabesp deix invest obras pun reajust menor saopaul saopaul estadã'), ('NEGATIVO', 'rt joseluisfreit vereador cheg algem câm carating tom poss carg polit estad min'), ('NEUTRO', 'rt julianaciprian secretári deix carg govern tem após defend chacin polit estad min'), ('POSITIVO', 'temp real val disp

In [53]:
afrases = []
avalencias = []
for valencia, frase in frases:
    afrases.append(frase)
    avalencias.append(valencia)
    
print(afrases)
print(avalencias)

['par min jov anos pres tráfic drog', 'govern min compr mais aeronav olha min fal verdad fal mim não turm', 'mês após assin petrobr receb contrat uniã merc folh paul', 'ibovesp recu dia gir frac', 'eletrobr pod sub dilm perd eleiçõ vej estim petr val invest infomoney', 'vereador caporã ped justic regulariz atend banc brasil portal araçag', 'govern vet mudanc códig étic milit', 'após denúnc cã encontr drog menor det mg', 'folh polít petrobr eletrobr perd bilhõ govern dilm açõ caír men metad', 'gigantesc barb mal destaqu cadern cultur estad min', 'sabesp deix invest obras pun reajust menor saopaul saopaul estadã', 'rt joseluisfreit vereador cheg algem câm carating tom poss carg polit estad min', 'rt julianaciprian secretári deix carg govern tem após defend chacin polit estad min', 'temp real val disp mais soment açõ caem ibovesp', 'govern retom pent fin benefíci inss próxim seman estad min', 'ambev fic dividend amanhã vej val pen compr açã ambev fic', 'estoy ignor tod program dond hay es

In [0]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {np.round(model.score(X_test, y_test) * 100, 2)}%')
    print('-' * 20)

def split_data(X, y):
    test_size = .3
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

## Classificadores

In [0]:
classifiers = (
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=20),
    LinearSVC(max_iter=1500),
    MultinomialNB(),
    KNeighborsClassifier(),
    MLPClassifier(max_iter=500),
    SVC(gamma='auto', max_iter=1500),
)

## TF-IDF

In [0]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

In [57]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, avalencias))
    except:
        pass

Modelo   : RandomForestClassifier
Acurácia : 62.94%
--------------------
Modelo   : LinearSVC
Acurácia : 64.11%
--------------------
Modelo   : MultinomialNB
Acurácia : 58.39%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 60.92%
--------------------
Modelo   : MLPClassifier
Acurácia : 62.42%
--------------------
Modelo   : SVC
Acurácia : 41.35%
--------------------


## LSA (usando TF-IDF)

In [59]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, avalencias))
    except:
        pass

Modelo   : RandomForestClassifier
Acurácia : 48.76%
--------------------
Modelo   : LinearSVC
Acurácia : 48.31%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 47.2%
--------------------
Modelo   : MLPClassifier
Acurácia : 50.07%
--------------------
Modelo   : SVC
Acurácia : 43.95%
--------------------


## LDA (usando TF-IDF)¶

In [60]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_tfidf)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, avalencias))
    except:
        pass



Modelo   : RandomForestClassifier
Acurácia : 44.28%
--------------------
Modelo   : LinearSVC
Acurácia : 42.13%
--------------------
Modelo   : MultinomialNB
Acurácia : 41.35%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 44.67%
--------------------
Modelo   : MLPClassifier
Acurácia : 42.52%
--------------------
Modelo   : SVC
Acurácia : 42.39%
--------------------




## Count

In [0]:
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

In [62]:
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, avalencias))
    except:
        pass

Modelo   : RandomForestClassifier
Acurácia : 62.61%
--------------------
Modelo   : LinearSVC
Acurácia : 61.7%
--------------------
Modelo   : MultinomialNB
Acurácia : 63.26%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 46.03%
--------------------
Modelo   : MLPClassifier
Acurácia : 63.26%
--------------------
Modelo   : SVC
Acurácia : 41.35%
--------------------


## LSA (usando Count)

In [63]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, avalencias))
    except:
        pass

Modelo   : RandomForestClassifier
Acurácia : 49.61%
--------------------
Modelo   : LinearSVC
Acurácia : 51.5%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 49.15%
--------------------
Modelo   : MLPClassifier
Acurácia : 51.56%
--------------------
Modelo   : SVC
Acurácia : 52.54%
--------------------




## LDA (usando Count)¶

In [64]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
X_lda = lda.fit_transform(X_count)

for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, avalencias))
    except:
        pass



Modelo   : RandomForestClassifier
Acurácia : 47.85%
--------------------
Modelo   : LinearSVC
Acurácia : 50.0%
--------------------
Modelo   : MultinomialNB
Acurácia : 48.57%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 46.1%
--------------------
Modelo   : MLPClassifier
Acurácia : 50.0%
--------------------
Modelo   : SVC
Acurácia : 49.74%
--------------------
