[View in Colaboratory](https://colab.research.google.com/github/rdenadai/BolaoSimples/blob/master/notebooks/text_classification_example.ipynb)

## An√°lise e Valida√ß√£o de Textos em Portugu√™s


### Refer√™ncias:

 - [NLTK](http://www.nltk.org/howto/portuguese_en.html)
 - [spaCy](https://spacy.io/usage/spacy-101)
 - [Utilizando processamento de linguagem natural para criar uma sumariza√ß√£o autom√°tica de textos](https://medium.com/@viniljf/utilizando-processamento-de-linguagem-natural-para-criar-um-sumariza%C3%A7%C3%A3o-autom%C3%A1tica-de-textos-775cb428c84e)
 - [Latent Semantic Analysis (LSA) for Text Classification Tutorial](http://mccormickml.com/2016/03/25/lsa-for-text-classification-tutorial/)
 - [Topic Modeling with LSA, PLSA, LDA & lda2Vec](https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05)
 - [Unsupervised Emotion Detection from Text using Semantic and Syntactic Relations](http://www.cse.yorku.ca/~aan/research/paper/Emo_WI10.pdf)

### Instala√ß√£o

In [7]:
!pip install -U spacy
!python -m spacy download en
!python -m spacy download pt
# !pip install feedparser

Requirement already up-to-date: spacy in /usr/local/lib/python3.6/dist-packages (2.0.12)

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/pt

    You can now load the model via spacy.load('pt')



In [8]:
# Download Oplexicon
!rm -rf wget-log*
!rm -rf oplexicon_v3.0
!wget -O oplexicon_v3.0.zip https://github.com/rdenadai/sentiment-analysis-2018-president-election/blob/master/dataset/oplexicon_v3.0.zip?raw=true
!unzip oplexicon_v3.0.zip
!ls -lh


Redirecting output to ‚Äòwget-log‚Äô.
Archive:  oplexicon_v3.0.zip
  inflating: oplexicon_v3.0/lexico_v3.0.txt  
  inflating: oplexicon_v3.0/README   
total 120K
drwxr-xr-x 2 root root 4.0K Oct  5 20:06 oplexicon_v3.0
-rw-r--r-- 1 root root 102K Oct  5 20:06 oplexicon_v3.0.zip
drwxr-xr-x 2 root root 4.0K Sep 28 23:32 sample_data
-rw-r--r-- 1 root root 1.6K Oct  5 20:06 wget-log


### Imports

In [9]:
import nltk

nltk.download('rslp')
nltk.download('averaged_perceptron_tagger')
nltk.download('floresta')
nltk.download('mac_morpho')
nltk.download('machado')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

import concurrent.futures
import codecs
import re
import pprint
from random import shuffle
from string import punctuation
import copy

import numpy as np
from scipy.sparse.linalg import svds
from scipy.linalg import svd
import pandas as pd
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.utils.extmath import randomized_svd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import floresta as flt
from nltk.corpus import machado as mch
from nltk.corpus import mac_morpho as mcm


nlp = spacy.load('pt')
pp = pprint.PrettyPrinter(indent=4)
stemmer = nltk.stem.RSLPStemmer()

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package floresta to /root/nltk_data...
[nltk_data]   Package floresta is already up-to-date!
[nltk_data] Downloading package mac_morpho to /root/nltk_data...
[nltk_data]   Package mac_morpho is already up-to-date!
[nltk_data] Downloading package machado to /root/nltk_data...
[nltk_data]   Package machado is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk

### Functions

In [0]:
def load_oplexicon_data(filename):
    spacy_conv = {
        'adj': 'ADJ',
        'n': 'NOUN',
        'vb': 'VERB',
        'det': 'DET',
        'emot': 'EMOT',
        'htag': 'HTAG'
    }
    
    data = {}
    with codecs.open(filename, 'r', 'UTF-8') as hf:
        lines = hf.readlines()
        for line in lines:
            info = line.lower().split(',')
            if len(info[0].split()) <= 1:
                info[1] = [spacy_conv.get(tag) for tag in info[1].split()]
                word, tags, sent = info[:3]
                if 'HTAG' not in tags and 'EMOT' not in tags:
                    word = nlp(word.lower())[0].lemma_
                    # word = word.replace('-se', '')
                    # stem = stemmer.stem(word)
                    if word in data:
                        data[word] += [{
                            'word': [word],
                            'tags': tags,
                            'sentiment': sent
                        }]
                    else:
                        data[word] = [{
                            'word': [word],
                            'tags': tags,
                            'sentiment': sent
                        }]
    return data

### Usage

In [15]:
frase = u"Gostaria de saber mais informa√ß√µes sobre a Amazon. Uma excelente loja de produtos online!".lower()
doc = nlp(frase)
pp.pprint([(w.text, w.pos_) for w in doc])

# for dc in doc:
#     if dc.pos_ == 'VERB':
#         print(dc.lemma_)
#     else:
#         print(dc)

[   ('gostaria', 'VERB'),
    ('de', 'ADP'),
    ('saber', 'VERB'),
    ('mais', 'DET'),
    ('informa√ß√µes', 'NOUN'),
    ('sobre', 'ADP'),
    ('a', 'DET'),
    ('amazon', 'NOUN'),
    ('.', 'PUNCT'),
    ('uma', 'DET'),
    ('excelente', 'ADJ'),
    ('loja', 'NOUN'),
    ('de', 'ADP'),
    ('produtos', 'NOUN'),
    ('online', 'ADJ'),
    ('!', 'PUNCT')]


In [16]:
opx = load_oplexicon_data('oplexicon_v3.0/lexico_v3.0.txt')
print('Oplexicon size: ', len(opx))
print('Examples: ')

view = opx.items()
pp.pprint(list(view)[:7])

KeyboardInterrupt: ignored

In [0]:
ALEGRIA = ['abundante', 'acalmar', 'aceit√°vel', 'aclamar', 'aconchego', 'ades√£o', 'admirar', 'adorar', 'af√°vel', 'afei√ß√£o', 'afeto', 'afortunado', 'agradar', 'ajeitar', 'al√≠vio', 'amabilidade', 'amado', 'amar', 'am√°vel', 'amenizar', 'ameno', 'amig√°vel', 'amistoso', ' amizade', ' amor', ' anima√ß√£o', ' √¢nimo', 'anseio', '√¢nsia', 'ansioso', 'apaixonado', 'apaziguar', 'aplausos', 'apoiar', 'aprazer', 'apreciar', 'aprova√ß√£o', 'aproveitar', 'ardor', 'armirar', 'arrumar', 'atra√ß√£o', 'atraente', 'atrair', 'avidamente', 'avidez', '√°vido', 'belo', 'bem-estar', 'benefic√™ncia', 'beneficiador', 'benef√≠cio', 'ben√©fico', 'benevoc√™ncia', 'benignamente', 'ben√≠gno', 'bom', 'bondade', 'bondoso', 'bonito', 'brilhante', 'brincadeira', 'calma', 'calor', 'caridade', 'caridoso', 'carinho', 'cativar', 'charme', 'cheery', 'clamar', 'cofortar', 'coleguismo', 'com√©dia', 'c√¥mico', 'comover', 'compaix√£o', 'companheirismo', 'compatibilidade', 'compat√≠vel', 'complac√™ncia', 'completar', 'compreens√£o', 'conclus√£o', 'concretiza√ß√£o', 'condescend√™ncia', 'confian√ßa', 'confortante', 'congratula√ß√£o', 'conquistar', 'consentir', 'considera√ß√£o', 'consola√ß√£o', 'contentamento', 'coragem', 'cordial', 'considerar', 'consolo', 'contente', 'cuidadoso', 'cumplicidade', 'dedica√ß√£o', 'deleitado', 'delicadamente', 'delicadeza', 'delicado', 'desejar', 'despreocupa√ß√£o', 'devo√ß√£o', 'devoto', 'divers√£o', 'divertido', 'encantar', 'elogiado', 'emo√ß√£o', 'emocionante', 'emotivo', 'empatia', 'emp√°tico', 'empolga√ß√£o', 'enamorar', 'encantado', 'encorajado', 'enfeitar', 'engra√ßado', 'entendimento', 'entusiasmadamente', 'entusi√°stico', 'esperan√ßa', 'esplendor', 'estima', 'estimar', 'estimulante', 'euforia', 'euf√≥rico', 'euforizante', 'exaltar', 'excelente', 'excitar', 'expansivo', 'extasiar', 'exuberante', 'exultar', 'f√£', 'facilitar', 'familiaridade', 'fascina√ß√£o', 'fasc√≠nio', 'favor', 'favorecer', 'favorito', 'felicidade', 'feliz', 'festa', 'festejar', 'festivo', 'fidelidade', 'fiel', 'filantropia', 'filantr√≥pico', 'fraterno', 'ganhar', 'generosidade', 'generoso', 'gentil', 'gl√≥ria', 'glorificar', 'gostar', 'gostoso', 'gozar', 'gratificante', 'grato', 'hilariante', 'honra', 'humor', 'impressionar', 'incentivar', 'incentivo', 'inclina√ß√£o', 'incr√≠vel', 'inspirar', 'interessar', 'interesse', 'irmandade', 'jovial', 'jubilante', 'j√∫bilo', 'lealdade', 'leg√≠timo', 'leveza', 'louvar', 'louv√°vel', 'louvavelmente', 'lucrativo', 'lucro', 'maravilhoso', 'melhor', 'obter', 'obteve', 'ode', 'orgulho', 'paix√£o', 'parabenizar', 'paz', 'piedoso', 'positivo', 'prazenteiro', 'prazer', 'predile√ß√£o', 'preencher', 'prefer√™ncia', 'preferido', 'promissor', 'prosperidade', 'prote√ß√£o', 'proteger', 'revigorar', 'simp√°tico', 'vantajoso', 'protetor', 'risada', 'sobreviv√™ncia', 'vencedor', 'proveito', 'risonho', 'sobreviver', 'venera√ß√£o', 'provil√©gio', 'rom√¢ntico', 'sorte', 'ventura', 'querer', 'romantismo', 'sortudo', 'vida', 'radiante', 'saciar', 'sucesso', 'vigor', 'realizar', 'saci√°vel', 'surpreender', 'virtude', 'recomend√°vel', 'satisfa√ß√£o', 'tenro', 'virtuoso', 'reconhecer', 'satisfatoriamente', 'ternura', 'vit√≥ria', 'recompensa', 'satisfat√≥rio', 'torcer', 'vitorioso', 'recrear', 'satisfazer', 'tranquilo', 'viver', 'recreativo', 'satisfeito', 'tranquilo', 'vivo', 'recrea√ß√£o', 'sedu√ß√£o', 'triunfo', 'zelo', 'regozijar', 'seduzir', 'triunfal', 'zeloso', 'respeitar', 'sereno', 'triunfante', 'ressuscitar', 'simpaticamente', 'vantagem',]
DESGOSTO = ['abomin√°vel', 'adoentado', 'amargamente', 'antipatia', 'antip√°tico', 'asco', 'asqueroso', 'avers√£o', 'chatear', 'chatea√ß√£o', 'desagrado', 'desagrad√°vel', 'desprez√≠vel', 'detest√°vel', 'doente', 'doen√ßa', 'enfermidade', 'enjoativo', 'enjoo', 'enj√¥o', 'feio', 'f√©tido', 'golfar', 'grave', 'gravidade', 'grosseiro', 'grosso', 'horr√≠vel', 'ign√≥bil', 'ilegal', 'incomodar', 'inc√¥mdo', 'indecente', 'indisposi√ß√£o', 'indisposto', 'inescrupuloso', 'maldade', 'maldoso', 'malvado', 'mau', 'nauseabundo', 'nauseante', 'nausear', 'nauseoso', 'nojento', 'nojo', 'n√°usea', 'obsceno', 'obstruir', 'obstru√ß√£o', 'ofensivo', 'pat√©tico', 'perigoso', 'repelente', 'repelir', 'repugnante', 'repulsa', 'repulsivo', 'repuls√£o', 'rude', 'sujeira', 'sujo', 'terrivelmente', 'terr√≠vel', 'torpe', 'travesso', 'travessura', 'ultrajante', 'vil', 'vomitar', 'v√¥mito',]
MEDO = ['abomin√°vel', 'afugentar', 'alarmar', 'alerta', 'amea√ßa', 'amedrontar', 'angustia', 'ang√∫stia', 'angustiadamente', 'ansiedade', 'ansioso', 'apavorar', 'apreender', 'apreens√£o', 'apreensivo', 'arrepio', 'assombrado', 'assombro', 'assustado', 'assustadoramente', 'atemorizar', 'aterrorizante', 'brutal', 'calafrio', 'chocado', 'chocante', 'consternado', 'covarde', 'cruel', 'crueldade', 'cruelmente', 'cuidado', 'cuidadosamente', 'cuidadoso', 'defender', 'defensor', 'defesa', 'derrotar', 'desconfiado', 'desconfian√ßa', 'desencorajar', 'desespero', 'deter', 'envergonhado', 'escandalizado', 'escurid√£o', 'espantoso', 'estremecedor', 'estremecer', 'expulsar', 'feio', 'friamente', 'fugir', 'hesitar', 'horrendo', 'horripilante', 'horr√≠vel', 'horrivelmente', 'horror', 'horrorizar', 'impaci√™ncia', 'impaciente', 'impiedade', 'impiedoso', 'indecis√£o', 'inquieto', 'inseguran√ßa', 'inseguro', 'intimidar', 'medonho', 'medroso', 'monstruosamente', 'mortalha', 'nervoso', 'p√¢nico', 'pavor', 'premoni√ß√£o', 'preocupar', 'press√°gio', 'pressentimento', 'recear', 'receativamente', 'receio', 'receoso', 'ruim', 'suspeita', 'suspense', 'susto', 'temer', 'tenso', 'terror', 'tremor', 'temeroso', 'terrificar', 'timidamente', 'vigiar', 'temor', 'terr√≠vel', 'timidez', 'vigilante', 'tens√£o', 'terrivelmente', 't√≠mido',]
RAIVA = ['abomina√ß√£o', 'aborrecer', 'adredido', 'agredir', 'agress√£o', 'agressivo', 'amaldi√ßoado', 'amargor', 'amargura', 'amolar', 'ang√∫stia', 'animosidade', 'antipatia', 'antip√°tico', 'asco', 'assassinar', 'assassinato', 'assediar', 'ass√©dio', 'atormentar', 'avarento', 'avareza', 'avers√£o', 'beligerante', 'bravejar', 'chatea√ß√£o', 'chato', 'cobi√ßoso', 'c√≥lera', 'col√©rico', 'complicar', 'contraiedade', 'contrariar', 'corrup√ß√£o', 'corrupto', 'cruxificar', 'demon√≠aco', 'dem√¥nio', 'descaso', 'descontente', 'descontrole', 'desenganar', 'desgostar', 'desgra√ßa', 'desprazer', 'desprezar', 'destrui√ß√£o', 'destruir', 'detestar', 'diabo', 'diab√≥lico', 'doido', 'encolerizar', 'energicamente', 'enfurecido', 'enfuriante', 'enlouquecer', 'enraivecer', 'escandalizar', 'esc√¢ndalo', 'escoriar', 'exasperar', 'execra√ß√£o', 'ferir', 'frustra√ß√£o', 'frustrar', 'f√∫ria', 'furioso', 'furor', 'gan√¢ncia', 'ganancioso', 'guerra', 'guerreador', 'guerrilha', 'hostil', 'humilhar', 'implic√¢ncia', 'implicar', 'importunar', 'incomodar', 'inc√¥modo', 'indignar', 'infernizar', 'inimigo', 'inimizade', 'inj√∫ria', 'injuriado', 'injusti√ßa', 'insulto', 'mal√≠cia', 'odi√°vel', 'repulsivo', 'inveja', 'malicioso', '√≥dio', 'resmungar', 'ira', 'malignidade', 'odioso', 'ressentido', 'irado', 'mal√≠gno', 'ofendido', 'revolta', 'irascibilidade', 'maltratar', 'ofensa', 'rid√≠culo', 'irasc√≠vel', 'maluco', 'opress√£o', 'tempestuoso', 'irritar', 'malvadeza', 'opressivo', 'tirano', 'louco', 'malvado', 'oprimir', 'tormento', 'loucura', 'matar', 'persegui√ß√£o', 'torturar', 'magoar', 'mesquinho', 'perseguir', 'ultrage', 'mal', 'misantropia', 'perturbar', 'ultrajar', 'maldade', 'misantr√≥pico', 'perverso', 'vexat√≥rio', 'maldi√ß√£o', 'molestar', 'provocar', 'vigoroso', 'maldito', 'mol√©stia', 'rabugento', 'vingan√ßa', 'maldizer', 'mortal', 'raivoso', 'vingar', 'maldoso', 'morte', 'rancor', 'vingativo', 'malefic√™ncia', 'mort√≠fero', 'reclamar', 'viol√™ncia', 'mal√©fico', 'mortificar', 'repress√£o', 'violento', 'malevol√™ncia', 'nervoso', 'reprimir', 'zangar', 'mal√©volo', 'odiar', 'repulsa',]
SURPRESA = ['admirar', 'afei√ß√£o', 'apavorante', 'assombro', 'chocado', 'chocante', 'desconcertar', 'deslumbrar', 'embasbacar', 'emudecer', 'encantamento', 'enorme', 'espanto', 'estupefante', 'estupefato', 'estupefazer', 'expectativa', 'fantasticamente', 'fant√°stico', 'horripilante', 'imagin√°rio', 'imenso', 'impressionado', 'incr√≠vel', 'maravilha', 'milagre', 'mist√©rio', 'misterioso', '√≥timo', 'pasmo', 'perplexo', 'prod√≠gio', 'sensacional', 'surpreendente', 'surpreender', 'suspense', 'susto', 'temor', 'tremendo',]
TRISTEZA = ['abandonar', 'abatido', 'abomin√°vel', 'aborrecer', 'abortar', 'afligir', 'aflito', 'afli√ß√£o', 'agoniar', 'amargo', 'amargor', 'amargura', 'ansiedade', 'arrepender', 'arrependidamente', 'atrito', 'azar', 'cabisbaixo', 'choro', 'choroso', 'chor√£o', 'coitado', 'compassivo', 'compun√ß√£o', 'contristador', 'contrito', 'contri√ß√£o', 'culpa', 'defeituoso', 'degradante', 'deplor√°vel', 'deposi√ß√£o', 'depravado', 'depressivo', 'depress√£o', 'deprimente', 'deprimir', 'derrota', 'derrubar', 'desalentar', 'desamparo', 'desanimar', 'desapontar', 'desconsolo', 'descontente', 'desculpas', 'desencorajar', 'desespero', 'desgaste', 'desgosto', 'desgra√ßa', 'desistir', 'desist√™ncia', 'deslocado', 'desmoralizar', 'desolar', 'desonra', 'despojado', 'desprazer', 'desprezo', 'desumano', 'des√¢nimo', 'discriminar', 'disforia', 'disf√≥rico', 'dissuadir', 'doloroso', 'dor', 'd√≥', 'enfadado', 'enlutar', 'entediado', 'entristecedor', 'entristecer', 'envergonhar', 'errante', 'erro', 'err√¥neo', 'escurecer', 'escurid√£o', 'escuro', 'esquecido', 'estragado', 'execr√°vel', 'extirpar', 'falsidade', 'falso', 'falta', 'fraco', 'fraqueza', 'fric√ß√£o', 'frieza', 'frio', 'funesto', 'f√∫nebre', 'grave', 'horror', 'humilhar', 'inconsol√°vel', 'indefeso', 'infelicidade', 'infeliz', 'infort√∫nio', 'isolar', 'lacrimejante', 'lacrimoso', 'lamentar', 'lastimoso', 'luto', 'lutoso', 'l√°grima', 'l√°stima', 'l√∫gubre', 'magoar', 'martirizar', 'mart√≠rio', 'mau', 'melancolia', 'melanc√≥lico', 'menosprezar', 'miseravelmente', 'misterioso', 'mist√©rio', 'mis√©ria', 'morre', 'morte', 'mortificante', 'm√°goa', 'negligentemente', 'nocivo', 'obscuro', 'opressivo', 'opress√£o', 'oprimir', 'pena', 'penalizar', 'penitente', 'penoso', 'penumbra', 'perder', 'perturbado', 'perverso', 'pervertar', 'pesaroso', 'pessimamente', 'piedade', 'pobre', 'porcamente', 'prejudicado', 'prejudicial', 'preju√≠zo', 'pressionar', 'press√£o', 'quebrar', 'queda', 'queixoso', 'recha√ßar', 'remorso', 'repressivo', 'repress√£o', 'reprimir', 'ruim', 'secreto', 'servil', 'sobrecarga', 'sobrecarregado', 'sofrer', 'sofrimento', 'solid√£o', 'sombrio', 'soturno', 'sujo', 'suplicar', 'supl√≠cio', 's√≥', 'timidez', 'torturar', 'trevas', 'triste', 'tristemente', 't√©dio', 't√≠mido', 'vazio',]

emotion_words = {
    'ALEGRIA': ALEGRIA,
    'DESGOSTO': DESGOSTO,
    'MEDO': MEDO,
    'RAIVA': RAIVA,
    'SURPRESA': SURPRESA,
    'TRISTEZA': TRISTEZA,
}
for key, values in words.items():
    for i, word in enumerate(values):
        emotion_words[key][i] = ''.join([p.lemma_ for p in nlp(word.lower())])

In [162]:
stpwords = set(stopwords.words('portuguese') + list(punctuation))
# stpwords = set(list(punctuation))

def tokenize_frases(frase):
    return word_tokenize(frase.lower())

def rm_stop_words_tokenized(frase):
    frase = nlp(frase.lower())
    clean_frase = []
    for palavra in frase:
        if palavra.pos_ != 'PUNCT':
            palavra = palavra.lemma_
            if palavra not in stpwords and not palavra.isdigit():
                clean_frase.append(palavra)
    return ' '.join(filter(None, clean_frase))

def generate_corpus(frases, tokenize=False):
    print('Iniciando processamento...')
    tokenized_frases = frases
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as procs:
        if tokenize:
            print('Executando processo de tokeniza√ß√£o das frases...')
            tokenized_frases = procs.map(tokenize_frases, frases, chunksize=25)
        print('Executando processo de remo√ß√£o das stopwords...')
        tokenized_frases = procs.map(rm_stop_words_tokenized, tokenized_frases, chunksize=25)
    print('Filtro e finaliza√ß√£o...')
    return tokenized_frases


frases = [
    'Bom dia SENADOR, agora est√° claro porque o ped√°gio n√£o baixava,o judici√°rio n√£o se manifestava quando era provocado e as CPIs s√≥ serviram pr√° corrup√ß√£o,deu no que deu üôÑ',
    'N√£o basta apenas retirar o candidato preferencial da maioria dos eleitores brasileiros. Tem que impedir tamb√©m que esses mesmos eleitores possam comparecer √†s urnas. Que democracia √© essa, minha gente? Poder judici√°rio comprometido at√© os cabelos com o golpe de destr√≥i o pa√≠s.',
    'Deus aben√ßoe o dia de todos voc√™, tenham um bom trabalho e bom estudo a todos. E pra aqueles que n√£o trabalha e nem estuda, boa curti√ß√£o em sua cama üôÇ',
    'Aprenda a ter amor pr√≥prio que nem essa banana q fez uma tatuagem dela mesma.',
    'Estou muito feliz hoje',
    'Dias chuvosos me deixam triste',
    'Hoje o dia esta excelente',
]

N = 10000
# frases = flt.sents()[:N] + mch.sents()[:N] + mcm.sents()[:N]

frases = list(generate_corpus(frases, tokenize=False))
print(frases)

ldocs = [f'D{i}' for i in range(len(frases))]

Iniciando processamento...
Executando processo de remo√ß√£o das stopwords...
Filtro e finaliza√ß√£o...
['bom dia senador agora estar claro porque ped√°gio baixar judici√°rio manifestar ser provocar cpis servir pr√° corrup√ß√£o dar dar üôÑ', 'basto apenas retirar candidatar preferencial maioria eleitor brasileiro ter impedir eleitor poder comparecer s urna democracia ser gente poder judici√°rio comprometer cabelo golpe destruir pa√≠s', 'deus aben√ßoar dia todo ter bom trabalhar bom estudar todo pra trabalhar estudar bom curti√ß√£o suar cama üôÇ', 'aprender ter amor pr√≥prio banana q fazer umar tatuagem d', 'estar feliz hoje', 'dia chuvoso deixar triste', 'hoje dia excelente']


In [214]:
print('Tf-Idf:')
vectorizer = TfidfVectorizer(max_df=1, sublinear_tf=True, use_idf=True, ngram_range=(1, 1))
X_tfidf = vectorizer.fit_transform(frases)
print("   Actual number of tfidf features: %d" % X_tfidf.get_shape()[1])
weights_df = pd.DataFrame(np.round(X_tfidf.toarray().T, 3), index=vectorizer.get_feature_names(), columns=ldocs)
display(weights_df.head(15))

Tf-Idf:
   Actual number of tfidf features: 53


Unnamed: 0,D0,D1,D2,D3,D4,D5,D6
aben√ßoar,0.0,0.0,0.262,0.0,0.0,0.0,0.0
agora,0.259,0.0,0.0,0.0,0.0,0.0,0.0
amor,0.0,0.0,0.0,0.378,0.0,0.0,0.0
apenas,0.0,0.21,0.0,0.0,0.0,0.0,0.0
aprender,0.0,0.0,0.0,0.378,0.0,0.0,0.0
baixar,0.259,0.0,0.0,0.0,0.0,0.0,0.0
banana,0.0,0.0,0.0,0.378,0.0,0.0,0.0
basto,0.0,0.21,0.0,0.0,0.0,0.0,0.0
brasileiro,0.0,0.21,0.0,0.0,0.0,0.0,0.0
cabelo,0.0,0.21,0.0,0.0,0.0,0.0,0.0


In [215]:
print('Count:')
vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(frases)
print("   Actual number of tfidf features: %d" % X_count.get_shape()[1])
weights_df = pd.DataFrame(X_count.toarray().T, index=vectorizer.get_feature_names(), columns=ldocs)
display(weights_df.head(15))

Count:
   Actual number of tfidf features: 60


Unnamed: 0,D0,D1,D2,D3,D4,D5,D6
aben√ßoar,0,0,1,0,0,0,0
agora,1,0,0,0,0,0,0
amor,0,0,0,1,0,0,0
apenas,0,1,0,0,0,0,0
aprender,0,0,0,1,0,0,0
baixar,1,0,0,0,0,0,0
banana,0,0,0,1,0,0,0
basto,0,1,0,0,0,0,0
bom,1,0,3,0,0,0,0
brasileiro,0,1,0,0,0,0,0


In [223]:
print('SVD: ')
AC = copy.deepcopy(X_count.toarray().T)
u, s, v = np.linalg.svd(AC, full_matrices=False)
print('Original and SVD equals: ', np.allclose(AC, np.dot(u, np.dot(np.diag(s), v))))

# print(AC)
# print(u.astype(np.float16))
# print('-' * 20)
# print(np.diag(s.astype(np.float16)))
# print('-' * 20)
# print(v.astype(np.float16))

SVD: 
Original and SVD equals:  True


In [0]:
# hmm-lda
# https://ieeexplore.ieee.org/document/7363382
# https://link.springer.com/chapter/10.1007/978-3-642-21802-6_57
# http://www.ppgia.pucpr.br/~paraiso/Projects/Emocoes/Emocoes.html

emotion_weigths = {}
for k, items in enumerate(emotion_words.items()):
    key, values = items
    emotion_weigths[key] = []
    for i, word in enumerate(values):
        idx_val = 0
        w = 0
        try:
            index = weights_df.index.get_loc(word)
            idx_val = u[index]
            w = weights_df.iloc[index].values
        except:
            w = np.zeros((len(ldocs, )))
        emotion_weigths[key].append(idx_val * w)
# pp.pprint(emotion_weigths)

In [3]:
dtframe = {d: np.zeros(len(ldocs)) for d in emotion_weigths}
for k, item in enumerate(emotion_weigths.items()):
    sent = np.array(item[1])
    for i, m in enumerate(cosine_distances(v.T, sent)):
        print(m)
        dtframe[item[0]][i] = np.sum(m)

# pp.pprint(dtframe)
df = pd.DataFrame(list(dtframe.values()), index=dtframe.keys(), columns=ldocs)
display(df.head(15))

NameError: ignored

In [0]:
print("LSA using TruncatedSVD:")

# Project the tfidf vectors onto the first N principal components.
# Though this is significantly fewer features than the original tfidf vector,
# they are stronger features, and the accuracy is higher.
svd = TruncatedSVD(50)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_lsa = lsa.fit_transform(X_count)

explained_variance = svd.explained_variance_ratio_.sum()
print("   Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

print(svd.explained_variance_.shape)
print(svd.singular_values_.shape) # S
print(svd.components_.shape) # VT

LSA using TruncatedSVD:
   Explained variance of the SVD step: 100%
(4,)
(4,)
(4, 61)


In [0]:
print('LSA using numpy:')
u, s, v = np.linalg.svd(X_tfidf.toarray(), full_matrices=False)
print(u)
print(s.shape)
print(v.shape)

LSA using numpy:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
(4,)
(4, 58)


In [0]:
print('LSA using scikit-learn randomized_svd:')
U, Sigma, VT = randomized_svd(X_count, 
                              n_components=50,
                              n_iter=5,
                              random_state=None)
print(U.shape)
print(Sigma.shape)
print(VT.shape)

print(U)
# print(VT)

LSA using scikit-learn randomized_svd:
(4, 4)
(4,)
(4, 61)
[[ 8.23887410e-02  4.64153487e-01  8.81914756e-01  0.00000000e+00]
 [ 9.96228666e-01 -6.25206048e-02 -6.01632624e-02 -3.70560802e-16]
 [ 2.72128558e-02  8.83545536e-01 -4.67554003e-01  2.07365235e-16]
 [ 3.63520293e-16 -2.06384313e-16  7.46602991e-17  1.00000000e+00]]


In [195]:
# define a matrix
# A = array([[1, 2], [3, 4], [5, 6]])
A = np.array([
    [1, 1, 1, 0, 0],
    [3, 3, 3, 0, 0],
    [4, 4, 4, 0, 0],
    [5, 5, 5, 0, 0],
    [0, 2, 0, 4, 4],
    [0, 0, 0, 5, 5],
    [0, 1, 0, 2, 2],
])
print(A.shape)


u, s, v = np.linalg.svd(copy.deepcopy(A), full_matrices=False)

print(u.astype(np.float16))
print('-' * 20)
print(np.diag(s.astype(np.float16)))
print('-' * 20)
print(v.astype(np.float16))

(7, 5)
[[-0.1376  -0.0236  -0.01081  0.56    -0.3757 ]
 [-0.4128  -0.07086 -0.03244  0.2064   0.756  ]
 [-0.5503  -0.0944  -0.04324 -0.7246  -0.1846 ]
 [-0.688   -0.11804 -0.05405  0.344   -0.2307 ]
 [-0.1528   0.5913   0.654    0.       0.2    ]
 [-0.0722   0.7314  -0.678    0.       0.     ]
 [-0.0764   0.2957   0.327    0.      -0.4    ]]
--------------------
[[12.484  0.     0.     0.     0.   ]
 [ 0.     9.51   0.     0.     0.   ]
 [ 0.     0.     1.346  0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]]
--------------------
[[-0.5625  -0.593   -0.5625  -0.09015 -0.09015]
 [-0.1266   0.02878 -0.1266   0.6953   0.6953 ]
 [-0.4097   0.8047  -0.4097  -0.09125 -0.09125]
 [-0.707    0.       0.707    0.       0.     ]
 [ 0.      -0.       0.      -0.707    0.707  ]]


In [90]:
u, s, v = randomized_svd(copy.deepcopy(A), 
                          power_iteration_normalizer='auto',
                          flip_sign=True,
                          n_components=100,
                          n_iter=1,
                          random_state=None)
print(u.astype(np.float16))
print('-' * 20)
print(np.diag(s.astype(np.float16)))
print('-' * 20)
print(v.astype(np.float16))

[[ 1.3757e-01 -2.3605e-02  1.0811e-02  9.3652e-01  2.8711e-01]
 [ 4.1284e-01 -7.0862e-02  3.2440e-02 -3.1152e-01  8.4912e-01]
 [ 5.5029e-01 -9.4421e-02  4.3243e-02  1.2622e-01 -2.9834e-01]
 [ 6.8799e-01 -1.1804e-01  5.4047e-02 -1.0132e-01 -3.2812e-01]
 [ 1.5283e-01  5.9131e-01 -6.5381e-01 -1.5199e-04 -6.2406e-05]
 [ 7.2205e-02  7.3145e-01  6.7822e-01  0.0000e+00  0.0000e+00]
 [ 7.6416e-02  2.9565e-01 -3.2690e-01  3.0398e-04  1.2481e-04]]
--------------------
[[12.484  0.     0.     0.     0.   ]
 [ 0.     9.51   0.     0.     0.   ]
 [ 0.     0.     1.346  0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]]
--------------------
[[ 0.5625   0.593    0.5625   0.09015  0.09015]
 [-0.1266   0.02878 -0.1266   0.6953   0.6953 ]
 [ 0.4097  -0.8047   0.4097   0.09125  0.09125]
 [ 0.707    0.      -0.707   -0.      -0.     ]
 [-0.      -0.       0.       0.707   -0.707  ]]


In [87]:
u, s, v = svd(copy.deepcopy(A))
print(u.astype(np.float16))
print('-' * 20)
print(np.diag(s.astype(np.float16)))
print('-' * 20)
print(v.astype(np.float16))

[[-0.1376  -0.0236  -0.01081  0.56    -0.3757  -0.7     -0.1879 ]
 [-0.4128  -0.07086 -0.03244  0.2064   0.756   -0.258    0.378  ]
 [-0.5503  -0.0944  -0.04324 -0.7246  -0.1846  -0.344   -0.0923 ]
 [-0.688   -0.11804 -0.05405  0.344   -0.2307   0.57    -0.11536]
 [-0.1528   0.5913   0.654    0.       0.2      0.      -0.4    ]
 [-0.0722   0.7314  -0.678    0.       0.       0.       0.     ]
 [-0.0764   0.2957   0.327    0.      -0.4      0.       0.8    ]]
--------------------
[[12.484  0.     0.     0.     0.   ]
 [ 0.     9.51   0.     0.     0.   ]
 [ 0.     0.     1.346  0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]]
--------------------
[[-0.5625  -0.593   -0.5625  -0.09015 -0.09015]
 [-0.1266   0.02878 -0.1266   0.6953   0.6953 ]
 [-0.4097   0.8047  -0.4097  -0.09125 -0.09125]
 [-0.707    0.       0.707    0.       0.     ]
 [ 0.      -0.       0.      -0.707    0.707  ]]


In [91]:
# SVD
U, s, VT = svd(A)
print(U.astype(np.float16))
print(np.diag(s.astype(np.float16)))
print(VT.astype(np.float16))

[[-0.1376  -0.0236  -0.01081  0.56    -0.3757  -0.7     -0.1879 ]
 [-0.4128  -0.07086 -0.03244  0.2064   0.756   -0.258    0.378  ]
 [-0.5503  -0.0944  -0.04324 -0.7246  -0.1846  -0.344   -0.0923 ]
 [-0.688   -0.11804 -0.05405  0.344   -0.2307   0.57    -0.11536]
 [-0.1528   0.5913   0.654    0.       0.2      0.      -0.4    ]
 [-0.0722   0.7314  -0.678    0.       0.       0.       0.     ]
 [-0.0764   0.2957   0.327    0.      -0.4      0.       0.8    ]]
[[12.484  0.     0.     0.     0.   ]
 [ 0.     9.51   0.     0.     0.   ]
 [ 0.     0.     1.346  0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]
 [ 0.     0.     0.     0.     0.   ]]
[[-0.5625  -0.593   -0.5625  -0.09015 -0.09015]
 [-0.1266   0.02878 -0.1266   0.6953   0.6953 ]
 [-0.4097   0.8047  -0.4097  -0.09125 -0.09125]
 [-0.707    0.       0.707    0.       0.     ]
 [ 0.      -0.       0.      -0.707    0.707  ]]


In [100]:
A = [
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8]
]

U, s, VT = svd(A)
print(U.astype(np.float16))
print(np.diag(s.astype(np.float16)))
print(VT.T.astype(np.float16))

[[-0.1525  -0.8228  -0.3945  -0.38   ]
 [-0.3499  -0.4214   0.2428   0.801  ]
 [-0.5474  -0.0201   0.6978  -0.4614 ]
 [-0.7446   0.381   -0.5464   0.04074]]
[[14.266  0.   ]
 [ 0.     0.627]]
[[-0.6416  0.767 ]
 [-0.767  -0.6416]]


In [131]:
A = [
    [2, 4],
    [1, 3],
    [0, 0],
    [0, 0]
    # [0, -1],
    # [-2, 1],
    # [1, 0]
]

U, s, VT = svd(A, full_matrices=False)
print(U.astype(np.float16))
print(np.diag(s.astype(np.float16)))
print(VT.T.astype(np.float16))

print(np.dot(U, U.T).astype(np.float16))
print(np.dot(VT, VT.T).astype(np.float16))


print(np.allclose(A, np.dot(U, np.dot(np.diag(s), VT))))

print(np.dot(U, np.dot(np.diag(s), VT)).astype(np.float16))

[[-0.8174 -0.576 ]
 [-0.576   0.8174]
 [ 0.      0.    ]
 [ 0.      0.    ]]
[[5.465 0.   ]
 [0.    0.366]]
[[-0.4045 -0.9146]
 [-0.9146  0.4045]]
[[ 1. -0.  0.  0.]
 [-0.  1.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
[[ 1. -0.]
 [-0.  1.]]
True
[[2. 4.]
 [1. 3.]
 [0. 0.]
 [0. 0.]]
