<a href="https://colab.research.google.com/github/rdenadai/sentiment-analysis-2018-president-election/blob/edgarbanhesse/testes_valencia_projeto_final_ia369y_2sem_2018.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Testes de valência para o projeto final de IA369Y 2 Semestre 2018

Passos para tratar os dados com valência, testar e escolher um classificador para utilizar no projeto final de IA369Y.

1) Remover espaços duplos, quebras de linha, números e links do dataset e das frases a serem testadas.

2) Remover stopwords e aplicar o stemmer.

3) Treinar os classificadores.

4) Realizar as predições com os classificadores.

5) Avaliar as medidas obtidas com os classificadores.

In [0]:
#Instalação e download
#!pip uninstall scikit-learn -y
#!pip install git+git://github.com/scikit-learn/scikit-learn.git
!pip install -U scikit-learn
!pip install --upgrade scikit-learn
#!pip install --user --upgrade scikit-learn==0.20.dev0

!pip install scipy
!pip install -U gensim
!pip install emoji
!python -m spacy download pt

#Importação das bibliotecas para testes
import csv
import codecs
import copy
import re
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
#from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

%matplotlib inline

np.warnings.filterwarnings('ignore')
np.random.seed(12345)

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.20.0)
Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.20.0)
Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.6.0)

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/pt

    You can now load the model via spacy.load('pt')



In [0]:
#Importação das bibliotecas para testes
import time
import concurrent.futures
from unicodedata import normalize
from string import punctuation
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import emoji

nltk.download('stopwords')

def remover_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')


def _load_emotion_file_content(emotion, path='dataset/emocoes'):
    with open(f'{path}/{emotion}', 'r') as h:
        words = h.readlines()
        for i, word in enumerate(words):
            word = word.replace('\n', '').lower().strip()
            words[i] = STEMMER.stem(word)
            # words[i] = [w.lemma_ for w in NLP(word, disable=['parser'])][0]
    return sorted(list(set(words)))


@lru_cache(maxsize=256)
def load_six_emotions(filepath):
    """Ekman, Friesen, and Ellsworth : anger, disgust, fear, joy, sadness, surprise."""
    emotion_words = {
        'ALEGRIA': _load_emotion_file_content('alegria', filepath),
        'DESGOSTO': _load_emotion_file_content('desgosto', filepath),
        'MEDO': _load_emotion_file_content('medo', filepath),
        'RAIVA': _load_emotion_file_content('raiva', filepath),
        'SURPRESA': _load_emotion_file_content('surpresa', filepath),
        'TRISTEZA': _load_emotion_file_content('tristeza', filepath),
    }
    return emotion_words


@lru_cache(maxsize=256)
def load_3_emotions(filepath):
    """Ekman, Friesen, and Ellsworth : anger, disgust, fear, joy, sadness, surprise."""
    emotion_words = {
        'POSITIVO': _load_emotion_file_content('positivo', filepath),
        'NEGATIVO': _load_emotion_file_content('negativo', filepath),
        'NEUTRO': _load_emotion_file_content('neutro', filepath),
    }
    return emotion_words


@lru_cache(maxsize=256)
def load_valence_emotions(filename_oplexicon, filename_sentilex):
    data = {
        'POSITIVO': [],
        'NEGATIVO': [],
        'NEUTRO': [],
    }

    oplexicon = load_valence_emotions_from_oplexicon(filename_oplexicon)
    sentilex = load_valence_emotions_from_sentilex(filename_sentilex)

    data['POSITIVO'] = oplexicon['POSITIVO'] + sentilex['POSITIVO']
    data['NEGATIVO'] = oplexicon['NEGATIVO'] + sentilex['NEGATIVO']
    data['NEUTRO'] = oplexicon['NEUTRO'] + sentilex['NEUTRO']
    data['POSITIVO'] = sorted(list(set(data['POSITIVO'])))
    data['NEGATIVO'] = sorted(list(set(data['NEGATIVO'])))
    data['NEUTRO'] = sorted(list(set(data['NEUTRO'])))
    return data


@lru_cache(maxsize=256)
def load_valence_emotions_from_oplexicon(filename):
    """NEUTRAL | POSITIVE | NEGATIVE."""
    spacy_conv = {
        'adj': 'ADJ',
        'n': 'NOUN',
        'vb': 'VERB',
        'det': 'DET',
        'emot': 'EMOT',
        'htag': 'HTAG'
    }

    data = {
        'POSITIVO': [],
        'NEGATIVO': [],
        'NEUTRO': [],
    }
    with codecs.open(filename, 'r', 'UTF-8') as hf:
        lines = hf.readlines()
        for line in lines:
            info = line.lower().split(',')
            if len(info[0].split()) <= 1:
                info[1] = [spacy_conv.get(tag) for tag in info[1].split()]
                word, tags, sent = info[:3]
                if 'HTAG' not in tags and 'EMOT' not in tags:
                    word = STEMMER.stem(word.lower().strip())
                    # word = [w.lemma_ for w in NLP(word.lower().strip(), disable=['parser'])][0]
                    if len(word) > 2:
                        sent = int(sent)
                        if sent == 1:
                            data['POSITIVO'] += [word]
                        elif sent == -1:
                            data['NEGATIVO'] += [word]
                        else:
                            data['NEUTRO'] += [word]
    data['POSITIVO'] = sorted(list(set(data['POSITIVO'])))
    data['NEGATIVO'] = sorted(list(set(data['NEGATIVO'])))
    data['NEUTRO'] = sorted(list(set(data['NEUTRO'])))
    return data


@lru_cache(maxsize=256)
def load_valence_emotions_from_sentilex(filename):
    """NEUTRAL | POSITIVE | NEGATIVE."""
    data = {
        'POSITIVO': [],
        'NEGATIVO': [],
        'NEUTRO': [],
    }
    with codecs.open(filename, 'r', 'UTF-8') as hf:
        lines = hf.readlines()
        for line in lines:
            info = line.lower().split('.')
            words = [word.strip() for word in info[0].split(',')]
            for word in words:
                word = STEMMER.stem(word.lower().strip())
                # word = [w.lemma_ for w in NLP(word.lower().strip(), disable=['parser'])][0]
                if len(word) > 2:
                    cdata = info[1].split(';')
                    if len(cdata) > 0:
                        sent0 = [int(k.replace('pol:n0=', '')) if 'pol:n0=' in k else None for k in cdata]
                        sent1 = [int(k.replace('pol:n1=', '')) if 'pol:n1=' in k else None for k in cdata]
                        sent0 = list(filter(None.__ne__, sent0))
                        sent1 = list(filter(None.__ne__, sent1))
                        if len(sent0) >= 1 and len(sent1) <= 0:
                            sent = sent0[0]
                            if sent == 1:
                                data['POSITIVO'] += [word]
                            elif sent == -1:
                                data['NEGATIVO'] += [word]
                            else:
                                data['NEUTRO'] += [word]
    data['POSITIVO'] = sorted(list(set(data['POSITIVO'])))
    data['NEGATIVO'] = sorted(list(set(data['NEGATIVO'])))
    data['NEUTRO'] = sorted(list(set(data['NEUTRO'])))
    return data


@lru_cache(maxsize=256)
def is_number(s):
    try:
        complex(s) # for int, long, float and complex
    except ValueError:
        return False
    return True


@lru_cache(maxsize=256)
def _get_stopwords():
    stpwords = stopwords.words('portuguese')
    rms = ['um', 'não', 'mais', 'muito']
    for rm in rms:
        del stpwords[stpwords.index(rm)]
    return stpwords, punctuation


def generate_corpus(documents=None, debug=False):
    assert len(documents) > 0
    if debug: print('Iniciando processamento...')
    tokenized_docs = documents
    with concurrent.futures.ProcessPoolExecutor() as procs:
        if debug: print('Executando processo de remoção das stopwords...')
        tokenized_frases = procs.map(tokenizer, tokenized_docs, chunksize=25)
    if debug: print('Finalizado...')
    return list(tokenized_frases)


def tokenizer(phrase, clean=False):
    if not clean:
        phrase = clean_up(phrase)
    phrase = NLP(phrase, disable=['parser'])
    clean_frase = []
    clfa = clean_frase.append
    for palavra in phrase:
        if palavra.pos_ != 'PUNCT':
            word = palavra.text.strip()
            if not is_number(word) and len(word) > 1:
                clfa(STEMMER.stem(palavra.text))
    return ' '.join(clean_frase)


def clean_up(phrase):
    STOPWORDS, PUNCT = _get_stopwords()
    # Transforma as hashtags em palavras
    try:
        for group in re.findall(r'#\S+\b', phrase, re.DOTALL):
            g2 = re.sub(r'([A-Z])', r' \1', group, flags=re.MULTILINE)
            phrase = re.sub(r'{}\b'.format(group), g2, phrase, flags=re.MULTILINE)
    except Exception:
        pass
    # lowercase para fazer outros pré-processamentos
    phrase = phrase.lower()
    phrase = emoji.get_emoji_regexp().sub(r'', phrase)
    for stw in STOPWORDS:
        phrase = re.sub(r'\b{}\b'.format(stw), '', phrase, flags=re.MULTILINE)
    for punct in PUNCT:
        phrase = phrase.replace(punct, ' ')
    for o, r in RM:
        phrase = re.sub(o, r, phrase, flags=re.MULTILINE)
    return phrase


# GLOBALS
NLP = spacy.load('pt')
# STEMMER = nltk.stem.RSLPStemmer()
STEMMER = nltk.stem.SnowballStemmer('portuguese')
STOPWORDS, PUNCT = _get_stopwords()
RM = [
    (r'\n+', r' . '), (r'"', r' '), (r'\'', r' '),  (r'@', r''), (r'[…]', ' . '), (r'[0-9]*', r''),
    (r'#', r''), (r'(RT)', r''), (r'(http[s]*?:\/\/)+.*[\r\n]*', r''),
    (r'“', r''), (r'”', ''), (r'([aeiouqwtyupdfghjklçzxcvbnm|!@$%&\.\[\]\(\)+-_=<>,;:])\1+', r'\1'),
    (r'(ñ)', r'não'), (r'(nã)', r'não'), (r'\s+', r' '),
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Datasets

Para validar, serão utilizados dois datasets.

Os dos datasets foram obtidos do site minerando dados.

O primeiro deles tem tweets de política de Minas Gerais com rótulos de valência: positivo, negativo e neutro. Foi feito um tratamento para eliminar tweets repetidos e dessa forma sobraram 3016 tweets.

O segundo contém 2123 títulos de notícias com rótulos de valência: positivo, negativo e neutro.

In [0]:
#Download dos datasets
!ls -la
!rm -f *.csv
!rm -f *.txt
!wget https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/tweets_mg_tratados.csv
!wget https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/titulo_noticias.txt
!wget https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/50_tweets_mg.csv
!ls -la

total 480
drwxr-xr-x 1 root root   4096 Nov 18 18:24 .
drwxr-xr-x 1 root root   4096 Nov 18 18:18 ..
-rw-r--r-- 1 root root   4912 Nov 18 18:24 50_tweets_mg.csv
drwxr-xr-x 4 root root   4096 Nov 15 19:23 .config
drwxr-xr-x 2 root root   4096 Nov 15 19:33 sample_data
-rw-r--r-- 1 root root 152016 Nov 18 18:24 titulo_noticias.txt
-rw-r--r-- 1 root root 295061 Nov 18 18:24 tweets_mg_tratados.csv
--2018-11-18 19:33:08--  https://raw.githubusercontent.com/rdenadai/sentiment-analysis-2018-president-election/edgarbanhesse/material-apoio/tweets_mg_tratados.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 295061 (288K) [text/plain]
Saving to: ‘tweets_mg_tratados.csv’


2018-11-18 19:33:08 (7.64 MB/s) - ‘tweets_mg_tratados.csv’ saved [295061/295061]

--2018-11-18 19:33:

In [0]:
#Carregando os datasets
def carregar(filename):
    frases = []
    with open(filename, 'r', encoding='utf-8') as h:
        reader = csv.reader(h, delimiter='|')
        for row in reader:
            frase = tokenizer(row[0]).strip()
            valencia = row[1].upper()
            if len(frase) > 5:
                frases.append((valencia, frase))
    return frases

frases = carregar('tweets_mg_tratados.csv')
frases += carregar('titulo_noticias.txt')

shuffle(frases)

print(frases[:5])

[('NEUTRO', 'temp real ciel vir qued após result gerdau csn ambev reag'), ('NEUTRO', 'brasil bas material apresentaca'), ('NEUTRO', 'rt juniorpitangu biarant real cruzeirens cabertolaz destaqu cadern tv estad min este doming'), ('POSITIVO', 'starbucks quer triplic númer loj país inclu bh econom estad min'), ('POSITIVO', 'tem sartor entreg hoj ambulânc municípi estad quatr veícul apresent problem')]


In [0]:
#Carrega os datasets em separado
tweets_mg = []
titulo_noticias = []

tweets_mg = carregar('tweets_mg_tratados.csv')
titulo_noticias = carregar('titulo_noticias.txt')

print(tweets_mg)
print(titulo_noticias)

[('NEUTRO', 'bom band mort'), ('NEUTRO', 'fóruns region govern vã eleg nov prefeit vereador'), ('NEGATIVO', 'govern min ger compr mais dois helicópter'), ('POSITIVO', 'polic milit faz prisõ aprend armas fog drog bail funk'), ('POSITIVO', 'um cab políc milit pm anos folg imped um roub pad noit esta'), ('NEUTRO', 'medíocr'), ('NEGATIVO', 'pedr vc acha comunic govern govern'), ('NEGATIVO', 'sulfúr eh dirí soy estúp qué verguenz'), ('NEGATIVO', 'che conoc nov'), ('NEUTRO', 'mineir diz não torc tim nenhum dentr um estad atlét mg cruzeir pq'), ('POSITIVO', 'minsaud envi equip forc nacional sus auxili atend pacient suspeit febr amarel mg colet'), ('NEUTRO', 'fal sobr seman vem diz dor sobr tarif transport nacional estad min'), ('NEGATIVO', 'míd mund tod arranc vár cabec comemor funk facçã nacional estad min'), ('NEUTRO', 'míd mund tod arranc vár cabec comemor funk'), ('NEGATIVO', 'jeitinh kalil nov prefeit bh almoc marmitex elimin traj formal polit estad min'), ('NEUTRO', 'mg cultur dic cultu

In [0]:
#all_datasets
afrases = []
avalencias = []
for valencia, frase in frases:
    afrases.append(frase)
    avalencias.append(valencia)
    
print(afrases)
print(avalencias)


#tweets_mg
atweets_mg = []
aval_tweets_mg = []
for valencia, frase in tweets_mg:
    atweets_mg.append(frase)
    aval_tweets_mg.append(valencia)

print(atweets_mg)
print(aval_tweets_mg)


#titulo_noticias
atitulo_noticias = []
aval_titulo_noticias = []
for valencia, frase in titulo_noticias:
    atitulo_noticias.append(frase)
    aval_titulo_noticias.append(valencia)

print(atitulo_noticias)
print(aval_titulo_noticias)

['temp real ciel vir qued após result gerdau csn ambev reag', 'brasil bas material apresentaca', 'rt juniorpitangu biarant real cruzeirens cabertolaz destaqu cadern tv estad min este doming', 'starbucks quer triplic númer loj país inclu bh econom estad min', 'tem sartor entreg hoj ambulânc municípi estad quatr veícul apresent problem', 'vend lançament mrv cresc tri resilient econom econom estadã', 'pmg políc milit prend autor furt fios ituiutab', 'hav suspeit atuaçã organiz crimin petrobr folh paul', 'ameac não indic membr cpi mist petrobr folh paul', 'fa mg tien ese estad tir el hay diosss', 'govern min ger compr mais dois helicópter', 'min ger melhor estad brasil sil sil sil team', 'mody coloc ratings revisã possível rebaix', 'atras ating fornecedor áre naval petrobr', 'jov fic fer após ser tortur govern valad', 'sobral mais notic hor ambev vai reajust prec cervej', 'bovesp cai após ibop contrari expect segund turn valor econôm', 'lucr líqu ajust anhangu ating trimestr', 'mais empres

In [0]:
def run_ml_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    print(f'Modelo   : {model.__class__.__name__}')
    print(f'Acurácia : {np.round(model.score(X_test, y_test) * 100, 2)}%')
    print('-' * 20)

def split_data(X, y):
    test_size = .3
    random_state = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }

## Classificadores

In [0]:
classifiers = (
    MultinomialNB(),
    #ComplementNB(), #Não carregou no colaboratory
    LogisticRegression(multi_class='auto', solver='lbfgs'),
    RandomForestClassifier(n_estimators=50, min_samples_split=5, random_state=0),
    KNeighborsClassifier(n_neighbors=8, algorithm='auto'),
    MLPClassifier(hidden_layer_sizes=(250,), max_iter=1000),
    LinearSVC(max_iter=1000),
    SVC(gamma='auto', max_iter=1000),
)

## TF-IDF

In [0]:
vec_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vec_tfidf.fit_transform(afrases)

vec_tfidf_tmg = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tmg = vec_tfidf_tmg.fit_transform(atweets_mg)

vec_tfidf_tn = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_tn = vec_tfidf_tn.fit_transform(atitulo_noticias)

In [0]:
print("\nall_datasets")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf, avalencias))
    except:
        pass

print("\ntweets_mg")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf_tmg, aval_tweets_mg))
    except:
        pass

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_tfidf_tn, aval_titulo_noticias))
    except:
        pass      


all_datasets
Modelo   : MultinomialNB
Acurácia : 58.71%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 64.56%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 59.88%
--------------------
Modelo   : MLPClassifier
Acurácia : 64.82%
--------------------
Modelo   : LinearSVC
Acurácia : 65.8%
--------------------
Modelo   : SVC
Acurácia : 50.33%
--------------------

tweets_mg
Modelo   : MultinomialNB
Acurácia : 62.49%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 63.71%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 61.27%
--------------------
Modelo   : MLPClassifier
Acurácia : 62.15%
--------------------
Modelo   : LinearSVC
Acurácia : 63.37%
--------------------
Modelo   : SVC
Acurácia : 43.51%
--------------------

titulo_noticias
Modelo   : MultinomialNB
Acurácia : 61.85%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 61.54%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 60.91%


## LSA (usando TF-IDF)

In [0]:
#all_datasets
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=0)
normalizer = MinMaxScaler(copy=False)
lsa = make_pipeline(svd, normalizer)
X_svd = lsa.fit_transform(X_tfidf)

print("\nall_datasets")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, avalencias))
    except Exception as e:
        print(e)

#tweets_mg
X_svd = lsa.fit_transform(X_tfidf_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
    except:
        pass


#titulo_noticias
X_svd = lsa.fit_transform(X_tfidf_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
    except:
        pass


all_datasets
Modelo   : MultinomialNB
Acurácia : 42.13%
--------------------
multi_class should be either multinomial or ovr, got auto
Modelo   : RandomForestClassifier
Acurácia : 59.88%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 55.27%
--------------------
Modelo   : MLPClassifier
Acurácia : 57.22%
--------------------
Modelo   : LinearSVC
Acurácia : 61.44%
--------------------
Modelo   : SVC
Acurácia : 42.78%
--------------------

tweets_mg
Modelo   : MultinomialNB
Acurácia : 45.17%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 62.15%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 58.27%
--------------------
Modelo   : MLPClassifier
Acurácia : 62.71%
--------------------
Modelo   : LinearSVC
Acurácia : 63.82%
--------------------
Modelo   : SVC
Acurácia : 44.95%
--------------------

titulo_noticias
Modelo   : MultinomialNB
Acurácia : 46.47%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 61.54%
------------

## LDA (usando TF-IDF)

In [0]:
#all_datasets
lda = LatentDirichletAllocation(n_components=100, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_tfidf)

print("\nall_datasets")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, avalencias))
    except:
        pass


#tweets_mg
X_lda = lda.fit_transform(X_tfidf_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
    except:
        pass


#titulo_noticias
X_lda = lda.fit_transform(X_tfidf_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
    except:
        pass      


all_datasets
Modelo   : MultinomialNB
Acurácia : 54.36%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 50.26%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 50.78%
--------------------
Modelo   : MLPClassifier
Acurácia : 52.8%
--------------------
Modelo   : LinearSVC
Acurácia : 54.1%
--------------------
Modelo   : SVC
Acurácia : 51.82%
--------------------

tweets_mg
Modelo   : MultinomialNB
Acurácia : 56.6%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 54.38%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 55.94%
--------------------
Modelo   : MLPClassifier
Acurácia : 55.72%
--------------------
Modelo   : LinearSVC
Acurácia : 56.16%
--------------------
Modelo   : SVC
Acurácia : 55.72%
--------------------

titulo_noticias
Modelo   : MultinomialNB
Acurácia : 51.49%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 47.88%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 48.98%
--

## Count

In [0]:
#all_datasets
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count = vec_count.fit_transform(afrases)

print("\nall_datasets")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count, avalencias))
    except:
        pass

      
#tweets_mg
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tmg = vec_count.fit_transform(atweets_mg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count_tmg, aval_tweets_mg))
    except:
        pass
      
      
#titulo_noticias
vec_count = CountVectorizer(ngram_range=(1, 2))
X_count_tn = vec_count.fit_transform(atitulo_noticias)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_count_tn, aval_titulo_noticias))
    except:
        pass      


all_datasets
Modelo   : MultinomialNB
Acurácia : 64.37%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 65.02%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 41.68%
--------------------
Modelo   : MLPClassifier
Acurácia : 65.15%
--------------------
Modelo   : LinearSVC
Acurácia : 65.8%
--------------------
Modelo   : SVC
Acurácia : 43.37%
--------------------

tweets_mg
Modelo   : MultinomialNB
Acurácia : 62.15%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 63.26%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 49.28%
--------------------
Modelo   : MLPClassifier
Acurácia : 61.38%
--------------------
Modelo   : LinearSVC
Acurácia : 61.49%
--------------------
Modelo   : SVC
Acurácia : 43.51%
--------------------

titulo_noticias
Modelo   : MultinomialNB
Acurácia : 66.09%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 64.68%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 37.36%


## LSA (usando Count)

In [0]:
#all_datasets
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(svd, normalizer)
X_svd = lda.fit_transform(X_count)

print("\nall_datasets")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, avalencias))
    except:
        pass
      

#tweets_mg
X_svd = lda.fit_transform(X_count_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, aval_tweets_mg))
    except:
        pass
      
      
#titulos_noticias
X_svd = lda.fit_transform(X_count_tn)

print("\ntitulos_noticias")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_svd, aval_titulo_noticias))
    except:
        pass      


all_datasets
Modelo   : MultinomialNB
Acurácia : 42.39%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 59.49%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 55.66%
--------------------
Modelo   : MLPClassifier
Acurácia : 59.82%
--------------------
Modelo   : LinearSVC
Acurácia : 60.79%
--------------------
Modelo   : SVC
Acurácia : 46.88%
--------------------

tweets_mg
Modelo   : MultinomialNB
Acurácia : 49.94%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 62.71%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 60.16%
--------------------
Modelo   : MLPClassifier
Acurácia : 63.15%
--------------------
Modelo   : LinearSVC
Acurácia : 63.6%
--------------------
Modelo   : SVC
Acurácia : 58.6%
--------------------

titulos_noticias
Modelo   : MultinomialNB
Acurácia : 46.47%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 60.6%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 46.15%
-

## LDA (usando Count)

In [0]:
#all_datasets
lda = LatentDirichletAllocation(n_components=100, random_state=0)
normalizer = MinMaxScaler(copy=False)
lda = make_pipeline(lda, normalizer)
X_lda = lda.fit_transform(X_count)

print("\nall_datasets")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, avalencias))
    except:
        pass


#tweets_mg
X_lda = lda.fit_transform(X_count_tmg)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, aval_tweets_mg))
    except:
        pass
      
      
#titulo_noticias
X_lda = lda.fit_transform(X_count_tn)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X_lda, aval_titulo_noticias))
    except:
        pass


all_datasets
Modelo   : MultinomialNB
Acurácia : 54.81%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 53.32%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 52.41%
--------------------
Modelo   : MLPClassifier
Acurácia : 54.1%
--------------------
Modelo   : LinearSVC
Acurácia : 54.29%
--------------------
Modelo   : SVC
Acurácia : 48.63%
--------------------

tweets_mg
Modelo   : MultinomialNB
Acurácia : 61.71%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 59.93%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 56.71%
--------------------
Modelo   : MLPClassifier
Acurácia : 57.71%
--------------------
Modelo   : LinearSVC
Acurácia : 59.49%
--------------------
Modelo   : SVC
Acurácia : 57.05%
--------------------

titulo_noticias
Modelo   : MultinomialNB
Acurácia : 49.76%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 46.31%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 43.49%


## Count + TF-IDF + Word2Vec

In [0]:
#afrases
#atweets_mg
#atitulo_noticias
#avalencias
#aval_tweets_mg
#aval_titulo_noticias

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
#all_datasets
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(afrases)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(afrases)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in afrases:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(39902405, 45879000)

In [0]:
#all_datasets
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\nall_datasets")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X, avalencias))
    except:
        pass


all_datasets
Modelo   : RandomForestClassifier
Acurácia : 63.2%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 60.4%
--------------------
Modelo   : MLPClassifier
Acurácia : 62.35%
--------------------
Modelo   : LinearSVC
Acurácia : 65.08%
--------------------
Modelo   : SVC
Acurácia : 52.47%
--------------------


In [53]:
#tweets_mg
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atweets_mg)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atweets_mg)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atweets_mg:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(23296133, 29024000)

In [54]:
#tweets_mg
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntweets_mg")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X, aval_tweets_mg))
    except:
        pass


tweets_mg
Modelo   : RandomForestClassifier
Acurácia : 64.48%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 59.6%
--------------------
Modelo   : MLPClassifier
Acurácia : 61.04%
--------------------
Modelo   : LinearSVC
Acurácia : 62.38%
--------------------
Modelo   : SVC
Acurácia : 43.51%
--------------------


In [55]:
#titulo_noticias
# Count
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(atitulo_noticias)
weights_count = pd.DataFrame(np.round(X_count.toarray().T, 8), index=vec_count.get_feature_names())

# TF-IDF
vec_tfidf = TfidfVectorizer()
X_tfidf = vec_tfidf.fit_transform(atitulo_noticias)
weights_tfidf = pd.DataFrame(np.round(X_tfidf.toarray().T, 8), index=vec_tfidf.get_feature_names())

# Word2Vec preprocessing
frases_w2v = []
for frase in atitulo_noticias:
    bigram = []
    p_frase = word_tokenize(frase)
    for m, palavra in enumerate(p_frase):
        next_p = None
        try:
            next_p = p_frase[m+1]
        except:
            pass
        bigram += [f'{palavra}']
#         if next_p:
#             bigram += [f'{palavra} {next_p}']
    frases_w2v += [bigram]

# Word2Vec
model = gensim.models.Word2Vec(
    sentences=frases_w2v,
    sg=1,
    hs=1,
    size=1,
    window=25,
    min_count=1,
    seed=0,
    workers=10)
model.train(frases_w2v, total_examples=len(frases_w2v), epochs=1000)

(14144745, 16855000)

In [57]:
#titulo_noticias
r_words = {}
for word in vec_count.get_feature_names():
    idx = weights_count.index.get_loc(word)
    w2c_val = .1
    try:
        w2c_val = model.wjv[word]
    except:
        pass
    r_words[word] = (weights_tfidf.iloc[idx].values + weights_count.iloc[idx].values) * w2c_val
lwor = list(r_words.keys())
X = np.asarray(list(r_words.values()))
weights = pd.DataFrame(X, index=lwor)
X = X.T

normalizer = Normalizer(copy=False)
X = normalizer.fit_transform(X)

print("\ntitulo_noticias")
for classifier in classifiers:
    try:
        run_ml_model(classifier, **split_data(X, aval_titulo_noticias))
    except:
        pass


titulo_noticias
Modelo   : MultinomialNB
Acurácia : 61.22%
--------------------
Modelo   : RandomForestClassifier
Acurácia : 62.95%
--------------------
Modelo   : KNeighborsClassifier
Acurácia : 59.81%
--------------------
Modelo   : MLPClassifier
Acurácia : 64.36%
--------------------
Modelo   : LinearSVC
Acurácia : 67.5%
--------------------
Modelo   : SVC
Acurácia : 46.47%
--------------------
