In [1]:
#!-*- coding: utf8 -*-
import re
import csv
import nltk
import string
import numpy as np
import pandas as pd
import unicodedata
import warnings

from nltk import tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.naive_bayes import MultinomialNB      
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

warnings.filterwarnings('ignore')

In [None]:
""" 
Necessário utilizar esse bloco de código somente se nunca tiver instalado o nltk
"""

# Download the three NLTK resources requested by this cell:
# 'stopwords', 'rslp' and 'punkt'.
for recurso in ('stopwords', 'rslp'):
    nltk.download(recurso)
# Kept as the last expression so the cell still displays its return value.
nltk.download('punkt')


In [2]:
"""
Lendo o arquivo csv com o pandas
"""

# Load the e-mail dataset (UTF-8 encoded CSV) into a DataFrame.
CAMINHO_EMAILS = './dados/emails.csv'
emails_dados = pd.read_csv(CAMINHO_EMAILS, encoding="utf-8")

# Single RSLP stemmer instance; the text helpers read it as a module-level name.
stemmer = nltk.stem.RSLPStemmer()

In [None]:
emails_dados.head(10)

In [6]:
"""Removendo as pontuações e as acentuações das palavras"""
# Run every raw e-mail through remove_stopwords and store the cleaned text
# in a new 'dados_tratados' column.
lista = [
    remove_stopwords(emails_dados['email'][posicao])
    for posicao in range(len(emails_dados['email']))
]

emails_dados['dados_tratados'] = lista

In [None]:
emails_dados.head(10)

In [7]:
# TF-IDF over the cleaned e-mails, keeping at most 200 features.
# lowercase=False tells the vectoriser not to lowercase the input itself.
tratamento_tfidf = TfidfVectorizer(
    max_features=200,
    lowercase=False,
)

# Sparse document-term matrix, one row per e-mail.
caracteristicas = tratamento_tfidf.fit_transform(emails_dados['dados_tratados'])

In [8]:
""" Transformando dados esparços em um Dataframe """

# Show the sparse TF-IDF matrix as a dense DataFrame, one column per feature.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the vectoriser has it and fall back otherwise.
nomes_caracteristicas = (
    tratamento_tfidf.get_feature_names_out()
    if hasattr(tratamento_tfidf, "get_feature_names_out")
    else tratamento_tfidf.get_feature_names()
)
pd.DataFrame(caracteristicas.toarray(), columns=nomes_caracteristicas)

Unnamed: 0,absorv,acess,ach,agradec,aind,ajud,alem,alguem,algum,almej,...,ver,vers,vez,via,vide,virtu,vist,vontad,vou,web
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343506,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405007,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.414627,0.0,0.344377,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Vectorise the cleaned e-mails and hold out a test split (seeded for repeatability).
# NOTE(review): the vectoriser is fitted on the whole dataset before the split,
# so the test rows contribute to the TF-IDF statistics.
tf_idf = tratamento_tfidf.fit_transform(emails_dados['dados_tratados'])
particoes = train_test_split(
    tf_idf,
    emails_dados["classificacao"],
    random_state = 42,
)
treino, teste, classe_treino, classe_teste = particoes

# Fit a logistic regression on the training split and score it on the test split.
regressao_logistica = LogisticRegression(solver = "lbfgs").fit(treino, classe_treino)
acuracia_tf_idf = regressao_logistica.score(teste, classe_teste)
print("Média da taxa de acertos para o algoritmo {}, é de: {:.2f}" .format("Regressão Logística", 100 * acuracia_tf_idf))

Média da taxa de acertos para o algoritmo Regressão Logística, é de: 72.73


In [10]:
""" Atenção no uso de Ngrams pois dobra a quantidade de dimenssões """

# Same experiment with unigrams and bigrams, still capped at 200 features.
# This rebinds tratamento_tfidf to the n-gram vectoriser.
tratamento_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=200,
    lowercase=False,
)

tf_idf_ngrams = tratamento_tfidf.fit_transform(emails_dados['dados_tratados'])
particoes_ngrams = train_test_split(
    tf_idf_ngrams,
    emails_dados["classificacao"],
    random_state = 42,
)
treino_ngrams, teste_ngrams, classe_treino_ngrams, classe_teste_ngrams = particoes_ngrams

# The existing regressao_logistica object is refitted on the n-gram features.
acuracia_tf_idf_ngrams = (
    regressao_logistica
    .fit(treino_ngrams, classe_treino_ngrams)
    .score(teste_ngrams, classe_teste_ngrams)
)
print("Média da taxa de acertos para o algoritmo {}, é de: {:.2f}" .format("Ngrams", 100 * acuracia_tf_idf_ngrams))

Média da taxa de acertos para o algoritmo Ngrams, é de: 72.73


In [11]:
# Table of the first row of the fitted coefficients, indexed by feature name.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the vectoriser has it and fall back otherwise.
nomes_pesos = (
    tratamento_tfidf.get_feature_names_out()
    if hasattr(tratamento_tfidf, "get_feature_names_out")
    else tratamento_tfidf.get_feature_names()
)
# coef_[0] is already one-dimensional, so no transpose is needed.
pesos = pd.DataFrame(regressao_logistica.coef_[0], index=nomes_pesos)


# Ten largest (most positive) weights
pesos.nlargest(10, 0)

Unnamed: 0,0
pag,0.567486
plan,0.429406
compr,0.390916
payp,0.389707
poss,0.377552
cadastr,0.362924
certific,0.357435
poss imprim,0.353641
plan premium,0.320879
premium,0.320879


In [None]:
# Ten smallest (most negative) weights
pesos.nsmallest(n=10, columns=0)

In [None]:
"""
Separando as colunas do csv entre:
X = Características do dados
Y = Classificação das características 
"""

# X holds the cleaned e-mail texts, Y the matching labels, both as NumPy arrays.
X, Y = (
    np.array(emails_dados["dados_tratados"]),
    np.array(emails_dados["classificacao"].tolist()),
)

In [None]:
"""
Transformando o Dataframe EMAILS_DADOS em listas de palavras e adicionando todas uma lista unica,
com todas as palavras em caixa baixa
"""

# Tokenise each cleaned e-mail on whitespace.
# split() with no separator (instead of split(' ')) collapses runs of spaces and
# yields an empty list for an empty text, so no empty-string token is produced.
array_palavras = emails_dados['dados_tratados'].str.split()

In [None]:
"""
Criando um atributo SET e percorrendo a lista de listas de palavras, adicionando somente palavras não repetidas
"""
# Vocabulary: every distinct token found in any of the tokenised e-mails.
dicionario = {
    palavra
    for palavras_email in array_palavras
    for palavra in palavras_email
}

In [None]:
"""
Transformando o atributo DICIONARIO em uma lista de TUPLAS e depois transformando em uma MAP de palavras:posicao.
"""
# Map every vocabulary word to a column position.
# Iteration order of a set of strings is not stable across interpreter sessions
# (string hashing is randomised), so the words are sorted first to give each one
# the same index on every run.
dict_palavras = {
    palavra: indice
    for indice, palavra in enumerate(sorted(dicionario))
}

In [None]:
# Turn each tokenised e-mail into a count vector over the vocabulary.
X = []
for palavras_email in array_palavras:
    X.append(vetorizar_texto(palavras_email, dict_palavras))

# Labels stay as the original DataFrame column.
Y = emails_dados["classificacao"]

In [None]:
"""
Separando do total de dados os dados que serão usados para TREINO e TESTE dos dados de VALIDAÇÃO

"""
# Positional hold-out: the leading 80% of the rows are used for training and
# cross-validation, the remaining tail is kept aside for validation.
porcentagem_treino = 0.8
tamanho_treino = int(porcentagem_treino * len(X))

treino_dados, validacao_dados = X[:tamanho_treino], X[tamanho_treino:]
treino_marcacoes, validacao_marcacoes = Y[:tamanho_treino], Y[tamanho_treino:]

In [None]:
"""Verificando a acuracia do modelo, comparando com um algoritmo burro"""

# Baseline: the accuracy obtained by always answering with the most frequent
# label found in the validation set.
contador = Counter(validacao_marcacoes)
frequencia_maior_classe = max(contador.values())
taxa_acerto_base = 100.0 * frequencia_maior_classe / len(validacao_marcacoes)

print("Contador: ", contador)
print("Taxa de acertos base: {:.2f}" .format(taxa_acerto_base))

In [None]:
"""
Criando quatro modelos: MultinomialNB, AdaBoostClassifier, OneVersusRest, OneVersusOne

Obs.: De maneira geral a previsão dos algoritmos funcionam identificando e classificando os elementos por meio
das características que foram passadas no treinamento.
Obs.1: Se a performance não for problema o algoritmo OneVersusOne deve ser executado.
"""
# Cross-validate four candidate classifiers on the training portion, pick the
# one with the highest mean score and evaluate it on the validation portion.
#
# `resultados` maps mean score -> model. Because the score is the key, two
# models with exactly the same score share one entry and the later one wins.
resultados = {}
print("Taxa de acertos base: {:.2f}" .format(taxa_acerto_base))

modelo_multinomial = MultinomialNB()
resultado_multinomial = fit_and_predict(modelo_multinomial, "MultinomialNB", treino_dados, treino_marcacoes)
resultados[resultado_multinomial] = modelo_multinomial

# Seeded like the LinearSVC-based models below so repeated runs of the
# notebook give the same scores for this candidate.
modelo_adaBoostClassifier = AdaBoostClassifier(random_state=0)
resultado_adaboost = fit_and_predict(modelo_adaBoostClassifier, "AdaBoostClassifier", treino_dados, treino_marcacoes)
resultados[resultado_adaboost] = modelo_adaBoostClassifier

modelo_oneVersusRest = OneVsRestClassifier(LinearSVC(random_state=0))
resultado_oneVersusRest = fit_and_predict(modelo_oneVersusRest, "OneVersusRest", treino_dados, treino_marcacoes)
resultados[resultado_oneVersusRest] = modelo_oneVersusRest

modelo_OneVersusOne = OneVsOneClassifier(LinearSVC(random_state=0))
resultado_OneVersusOne = fit_and_predict(modelo_OneVersusOne, "OneVersusOne", treino_dados, treino_marcacoes)
resultados[resultado_OneVersusOne] = modelo_OneVersusOne

# Highest cross-validated score wins; the winner is then fitted on the full
# training portion and scored on the held-out validation portion.
vencedor = resultados[max(resultados)]
fit_and_predict_dados_reais(vencedor, treino_dados, treino_marcacoes, validacao_dados, validacao_marcacoes)

In [None]:
def vetorizar_texto(texto, dict_palavras):
    """
        Build a count vector for one tokenised text.

        Each token is stemmed with the module-level ``stemmer``; when the stem
        is a key of ``dict_palavras`` the counter at the mapped position is
        incremented.

        Parameters:
            texto: iterable of string tokens.
            dict_palavras: dict mapping a stem to its position; positions are
                used as indices into a list of length ``len(dict_palavras)``.

        Returns:
            list of int with ``len(dict_palavras)`` entries.

        NOTE(review): the visible caller passes tokens taken from
        'dados_tratados', which remove_stopwords has already stemmed, so they
        are stemmed a second time here - confirm this is intended.
    """

    vetor = [0] * len(dict_palavras)

    for palavra in texto:
        # Empty tokens (e.g. from splitting on consecutive spaces) are not
        # words. NOTE(review): NLTK's RSLPStemmer appears to index the last
        # character of its input, so it would fail on '' - confirm.
        if not palavra:
            continue

        # Stem once and reuse it for both the membership test and the lookup.
        radical = stemmer.stem(palavra)
        posicao = dict_palavras.get(radical)
        if posicao is not None:
            vetor[posicao] += 1
    return vetor

In [None]:
def fit_and_predict(modelo, nome_algoritmo, treino_dados, treino_marcacoes, k=10):
    """
        Estimate a model's score with k-fold cross-validation and print it.

        The data is scored over ``k`` folds via ``cross_val_score`` and the
        mean of the fold scores is reported, so the estimate does not depend
        on a single arrangement of the rows.

        Parameters:
            modelo: estimator handed to ``cross_val_score``.
            nome_algoritmo: label used in the printed message.
            treino_dados: feature rows.
            treino_marcacoes: labels matching ``treino_dados``.
            k: value passed as ``cv`` (defaults to 10, the previous
                hard-coded value).

        Returns:
            The mean of the fold scores (a fraction, not a percentage).

        NOTE(review): per the scikit-learn docs cross_val_score works on
        clones, so ``modelo`` itself is presumably left unfitted - confirm.
    """
    scores = cross_val_score(modelo, treino_dados, treino_marcacoes, cv=k)

    taxa_media_acerto = np.mean(scores)

    print("Média da taxa de acertos para o algoritmo {}, é de: {:.2f}" .format(nome_algoritmo, 100 * taxa_media_acerto))
    return taxa_media_acerto

In [None]:
def fit_and_predict_dados_reais(modelo, treino_dados, treino_marcacoes, validacao_dados, validacao_marcacoes):
    """
        Fit ``modelo`` on the training data and print its hit rate on the
        validation data.

        The model's predictions for ``validacao_dados`` are compared
        element-wise with ``validacao_marcacoes``; the share of matches is
        printed as a percentage with two decimals. Nothing is returned.
    """
    modelo.fit(treino_dados, treino_marcacoes)

    # Element-wise comparison; summing the booleans counts the correct answers.
    previsoes = modelo.predict(validacao_dados)
    total_acertos = sum(previsoes == validacao_marcacoes)

    taxa = 100 * (total_acertos / len(validacao_dados))
    print("Taxa de acertos para algoritmo real, é de: {:.2f}" .format(taxa))

In [4]:
def remove_stopwords(text):
    """
        Clean one raw text and return it as a single string.

        Steps, in order: split on whitespace; strip the characters of
        ``string.punctuation`` from every word and drop words that become
        empty; discard Portuguese stop words (NLTK list, compared against the
        lowercased, stripped word); stem the remaining words with the
        module-level ``stemmer``; pass the stems to
        ``remove_acentuacao_emoticons`` and return its result.
    """
    pontuacao = re.compile('[%s]' % re.escape(string.punctuation))

    # Punctuation-free words; anything that was only punctuation is dropped.
    sem_pontuacao = [pontuacao.sub(u'', palavra) for palavra in text.split()]
    palavras_filtradas = [token for token in sem_pontuacao if token != u'']

    stop_words = set(stopwords.words('portuguese'))

    # Stop words are filtered before stemming, on the original word form.
    radicais = []
    for elemento in palavras_filtradas:
        if elemento.lower().strip() in stop_words:
            continue
        radicais.append(stemmer.stem(elemento))

    return remove_acentuacao_emoticons(radicais)

In [5]:
def remove_acentuacao_emoticons(text):
    """
        Strip accents and non-alphanumeric symbols (emoticons included) from a
        sequence of words and join the survivors into one string.

        For every word: decompose it with NFKD and drop the combining marks
        (removing accents), replace each character that is not an ASCII
        letter, a digit, a space or a backslash with a space, then lowercase
        and strip the result. Words that end up with two characters or fewer,
        or that consist only of digits, are discarded.

        Parameters:
            text: iterable of strings (one word per item).

        Returns:
            The kept words joined by single spaces ('' when none are kept).
    """
    palavras_limpas = []

    for row in text:
        decomposta = unicodedata.normalize('NFKD', row)
        sem_acento = u"".join(c for c in decomposta if not unicodedata.combining(c))

        # Raw string with an explicit 0-9 range. The previous pattern spelled
        # the class as 'a-zA-Z-9', which kept only '-' and '9' and blanked the
        # other digits, and it relied on the invalid escape sequence '\]'.
        apenas_ascii = re.sub(r'[^a-zA-Z0-9 \\]', " ", sem_acento)
        palavras_limpas.append(apenas_ascii.lower().strip())

    # Ignore words of length two or less and purely numeric words.
    tokens = [t for t in palavras_limpas if len(t) > 2 and not t.isdigit()]

    return ' '.join(tokens)

In [None]:
def remove_hashtag(text):
    """
        Remove hashtags and mentions from a text.

        The text is split on whitespace, every word starting with '#' or '@'
        is dropped, and the remaining words are joined with single spaces.

        Parameters:
            text: the string to clean.

        Returns:
            The text without '#...' and '@...' words.
    """
    # Build a new list instead of removing items from the list being iterated:
    # removing during iteration skips the element right after each removal, so
    # consecutive hashtags/mentions were left behind.
    palavras_mantidas = [
        palavra for palavra in text.split()
        if not palavra.startswith(('#', '@'))
    ]

    return ' '.join(palavras_mantidas)

In [None]:
def remove_URL(text):
    """
        Delete URLs from a text.

        Every run of non-whitespace characters that begins with 'http' is
        replaced by an empty string; the surrounding whitespace is left as is.
    """
    padrao_url = re.compile(r"http\S+")
    return padrao_url.sub("", text)