In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer

# **Pipeline** 

- **Tokenizer** (eg. word tokenizer)
- **Stopword** (eg. portuguese)
- **Count Vectorize**
- **Tf-idf**
- **Modeling**

## **Tokenizer**

In [2]:
"""
Maiusculo e minusculo importa para nosso problema?
"""

document1_s = "Estudar machine Learning é muito legal"
document2_s = "Aplicações de machine learning em texto são bem interessantes"
document3_n = "Eu nunca consegui aplicar ml em texto, mas já consegui aplicar em imagens."
document4_n = "Nunca entendi machine learning, não acho muito interessante."

classes = np.array([1,1,0,0])

documents = [document1_s,document2_s,document3_n,document4_n]
documents_lower = [document.lower() for document in documents]
print(documents_lower[0])

estudar machine learning é muito legal


In [3]:
"""
Perceba que é diferente do método split das strings.
"""
documents_tok = [word_tokenize(document) for document in documents_lower]
documents_tok[3]

['nunca',
 'entendi',
 'machine',
 'learning',
 ',',
 'não',
 'acho',
 'muito',
 'interessante',
 '.']

## **Stopword** 

In [4]:
"""
O que consideramos tokens?
- Não estão na lista de stopwords.
- O método isalnum() retorna True caso a string seja alphanumeric (a-z e 0-9).
"""

stopwords_portuguese = stopwords.words('portuguese')
documents = [" ".join([word for word in document if not word in stopwords_portuguese and word.isalnum()]) 
             for document in documents_tok]
    
documents

['estudar machine learning legal',
 'aplicações machine learning texto bem interessantes',
 'nunca consegui aplicar ml texto consegui aplicar imagens',
 'nunca entendi machine learning acho interessante']

## **Stemming**

In [5]:
"""
Outro preprocessamento comum é lemmarizar as palavras.
Perceba que algumas palavras se tornaram a mesma em documentos diferentes.
"""

stemmer = RSLPStemmer()
stopwords_portuguese = stopwords.words('portuguese')


documents = [" ".join([stemmer.stem(word) for word in document if not word in stopwords_portuguese and word.isalnum()]) 
             for document in documents_tok]
    
documents

['estud machin learning legal',
 'aplic machin learning text bem interess',
 'nunc consegu aplic ml text consegu aplic imag',
 'nunc entend machin learning ach interess']

## **Count Vectorizer** 

In [6]:
vectorizer = CountVectorizer()

In [7]:
"""
O mesmo que:

X = vectorizer.fit_transform(documents)
"""
vectorizer.fit(documents) # Learn a vocabulary dictionary of all tokens in the raw documents.
X=vectorizer.transform(documents) # Learn the vocabulary dictionary and return document-term matrix.

In [8]:
data = np.hstack([X.toarray(),classes.reshape(-1,1)])
columns = vectorizer.get_feature_names() + ["classes"]
df = pd.DataFrame(data,columns=columns)
df

Unnamed: 0,ach,aplic,bem,consegu,entend,estud,imag,interess,learning,legal,machin,ml,nunc,text,classes
0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,1
1,0,1,1,0,0,0,0,1,1,0,1,0,0,1,1
2,0,2,0,2,0,0,1,0,0,0,0,1,1,1,0
3,1,0,0,0,1,0,0,1,1,0,1,0,1,0,0


## **Modelagem** 

Agora podemos calcular a probabilidade de cada documento pertencer a uma certa classe:

 - $$
P(y|x_1, ..., x_n) \propto P(y)\prod_{i=1}^{n}P(x_i|y) \\
$$

Como vamos trabalhar com números pequenos utilizamos um truque matemático para previnir underflow. Basicamente aplicamos a operação logaritimica, assim as multiplicações viram somas:
 - $$
logP(y|x_1, ..., x_n) \propto logP(y) + \sum_{i=1}^{n}logP(x_i|y) \\
$$


In [9]:
"""
Probabilidade a priori - Py
"""
py=df["classes"].value_counts(normalize=True)
py

0    0.5
1    0.5
Name: classes, dtype: float64

$$
P(x_i|y) = \frac{N_{yi} + \alpha}{N_y + \alpha n} 
$$

onde,
- $N_{yi}$ é o numero de vezes que a feature $i$ aparece na classe $y$.
- $N_y = \sum_{i=1}^{n} N_{yi}$, isto é, total de ocorrência das features para a classe $y$.
- $n$ é o número de features.
- $\alpha$ é um parâmetro.
> The smoothing priors $\alpha \geq 0$  accounts for features not present in the learning samples and prevents zero probabilities in further computations. Setting $\alpha = 1$ is called Laplace smoothing, while $\alpha \leq 1$ is called Lidstone smoothing.

In [10]:
"""
Nyi
"""
nyi=df.groupby("classes").agg(sum)
nyi

Unnamed: 0_level_0,ach,aplic,bem,consegu,entend,estud,imag,interess,learning,legal,machin,ml,nunc,text
classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1,2,0,2,1,0,1,1,1,0,1,1,2,1
1,0,1,1,0,0,1,0,1,2,1,2,0,0,1


In [11]:
"""
Ny
"""
ny=nyi.apply(sum,axis=1)
ny

classes
0    14
1    10
dtype: int64

In [12]:
n = len(df.columns)-1
aux=pd.DataFrame
alpha = 0.1
p_xy = (nyi+alpha).div((ny + alpha*n),axis=0)

In [13]:
p_xy

Unnamed: 0_level_0,ach,aplic,bem,consegu,entend,estud,imag,interess,learning,legal,machin,ml,nunc,text
classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.071429,0.136364,0.006494,0.136364,0.071429,0.006494,0.071429,0.071429,0.071429,0.006494,0.071429,0.071429,0.136364,0.071429
1,0.008772,0.096491,0.096491,0.008772,0.008772,0.096491,0.008772,0.096491,0.184211,0.096491,0.184211,0.008772,0.008772,0.096491


O maior valor representa a classe predita!

In [14]:
def processing(document):
    # lower case
    document=document.lower()
    # tokenizing
    tokens = word_tokenize(document)
    # stemming, only words, not stopwords
    return [stemmer.stem(word) for word in tokens if not word in stopwords_portuguese and word.isalnum()]

In [15]:
document1 = "Aplicações de machine learning são bem legais!"

filtered_tokens = processing(document1)
filtered_tokens

['aplic', 'machin', 'learning', 'bem', 'legal']

In [16]:
p_xy.filter(filtered_tokens)

Unnamed: 0_level_0,aplic,machin,learning,bem,legal
classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.136364,0.071429,0.071429,0.006494,0.006494
1,0.096491,0.184211,0.184211,0.096491,0.096491


In [17]:
p_xy.filter(filtered_tokens).applymap(np.log).sum(axis=1) + py

classes
0   -16.844450
1    -9.898262
dtype: float64