In [1]:
#Este código procura implementar o algoritmo de naive-bayes utilizando técnicas de pré-processamento de texto
#para melhorar os resultados

In [2]:
import re
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from contractions import CONTRACTION_MAP

In [3]:
#Text Pre-processing
def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|!|.|;|:|,|#|&|@|$|%|¨|*|+|-|_|-|=]'
        filtered_sentence = re.sub(PATTERN,'', sentence)
    else:
        PATTERN = r'[^a-zA-Z0-9 ]'
        filtered_sentence = re.sub(PATTERN,'', sentence)
    return filtered_sentence

def expand_contractions(sentence, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence

def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
    return word_tokens

def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens

In [4]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [5]:
cleaned_corpus_train = [remove_characters_before_tokenization(sentence, keep_apostrophes=True) 
                  for sentence in newsgroups_train['data']]

In [6]:
expanded_corpus_train = [expand_contractions(sentence, CONTRACTION_MAP) 
                    for sentence in cleaned_corpus_train]   

In [7]:
expanded_corpus_tokens_train = [tokenize_text(text) for text in expanded_corpus_train]    

In [8]:
filtered_list_train =  [[remove_stopwords(tokens) for tokens in sentence_tokens] 
                         for sentence_tokens in expanded_corpus_tokens_train]

In [9]:
documents_train = []
for sublist in filtered_list_train:
    for item in sublist:
        document_train = ' '.join(item)
        documents_train.append(document_train)

In [10]:
vect = TfidfVectorizer()
X_train = vect.fit_transform(documents_train)
X_train.shape

(11314, 150375)

In [11]:
mnb = MultinomialNB()
mnb.fit(X_train, newsgroups_train['target'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
cleaned_corpus_test = [remove_characters_before_tokenization(sentence, keep_apostrophes=True) 
                       for sentence in newsgroups_test['data']]

In [13]:
expanded_corpus_test = [expand_contractions(sentence, CONTRACTION_MAP) for sentence in cleaned_corpus_test]   

In [14]:
expanded_corpus_tokens_test = [tokenize_text(text) for text in expanded_corpus_test]    

In [15]:
filtered_list_test =  [[remove_stopwords(tokens) for tokens in sentence_tokens] 
                  for sentence_tokens in expanded_corpus_tokens_test]

In [16]:
documents_test = []
for sublist in filtered_list_test:
    for item in sublist:
        document_test = ' '.join(item)
        documents_test.append(document_test)

In [17]:
X_test = vect.transform(documents_test) 
y_pred = mnb.predict(X_test) 
X_test.shape

(7532, 150375)

In [18]:
print('Accuracy score: ', accuracy_score(newsgroups_test['target'], y_pred))

Accuracy score:  0.817578332448221


In [20]:
#Saving the Model
import pickle
with open('MT_Multinomial_Naive_Bayes', 'wb') as picklefile:  
    pickle.dump(mnb,picklefile)