# NLP Naive Bayes

[Disaster Tweets Dataset](https://www.kaggle.com/c/nlp-getting-started)

Naive Bayes computes the probability of each word appearing under each label, based on token frequency, then sums the log-likelihood of all words in a tweet and adds the log prior (to account for an unbalanced dataset).

# Imports

In [None]:
import numpy  as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import pydash
import math
import os
import time
from pydash import flatten
from collections import Counter, OrderedDict
from humanize import intcomma
from operator import itemgetter
from typing import *
from sklearn.model_selection import train_test_split

# import spacy
# nlp = spacy.load('en')

# CSV Data

In [None]:
# Load the competition train/test CSVs; index_col=0 uses the first CSV column as the DataFrame index.
df_train = pd.read_csv('../input/nlp-getting-started/train.csv', index_col=0)
df_test  = pd.read_csv('../input/nlp-getting-started/test.csv', index_col=0)
# Bare expression: shown as the cell output when this runs as a notebook cell.
df_train

# Tokenization and Word Frequencies

Here we tokenize the text using nltk.TweetTokenizer, with optional lowercasing, tweet preprocessing (handle stripping, length reduction), stopword removal, stemming, and n-gram generation.

Then we compute a dictionary of word counts for each label.

In [None]:
def tokenize_df(
    dfs: Union[pd.DataFrame, List[pd.DataFrame]], 
    keys          = ('text', 'keyword', 'location'), 
    stemmer       = False, 
    ngrams        = 1,
    preserve_case = True, 
    reduce_len    = False, 
    strip_handles = True,
    use_stopwords = True,
    **kwargs,
) -> List[List[str]]:
    """
    Tokenize every row of one or more DataFrames into a list of string tokens.

    For each row, the cells named by `keys` are tokenized with nltk.TweetTokenizer
    and concatenated in `keys` order. The optional steps then run in this order:
    stopword filtering, stemming, n-gram expansion.

    Args:
        dfs:           a single DataFrame or a list/tuple of DataFrames; rows are processed in order.
        keys:          column names whose cells are tokenized for each row.
        stemmer:       if truthy, every token is passed through nltk.PorterStemmer.
        ngrams:        if truthy, tokens are replaced by all 1..ngrams word n-grams
                       (joined with a single space); if falsy, tokens are left as-is.
        preserve_case: forwarded to nltk.TweetTokenizer.
        reduce_len:    forwarded to nltk.TweetTokenizer.
        strip_handles: forwarded to nltk.TweetTokenizer.
        use_stopwords: if truthy, drop english stopwords (case-insensitive match),
                       the literal token 'nan', and tokens shorter than 2 characters.
        **kwargs:      accepted and ignored, so callers can share one kwargs dict.

    Returns:
        One list of tokens per input row, across all frames, in input order.
    """
    tokenizer = nltk.TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles) 
    # Only build the helpers that are actually requested (the stopword list needs the nltk corpus).
    porter    = nltk.PorterStemmer() if stemmer else None
    # 'nan' is kept in the stopword list so text that literally contains "nan" is filtered as before.
    stopwords = set(nltk.corpus.stopwords.words('english') + [ 'nan' ]) if use_stopwords else set()

    frames  = list(dfs) if isinstance(dfs, (list, tuple)) else [ dfs ]
    columns = list(keys)

    output = []
    for df in frames:
        # itertuples(name=None) yields plain tuples of the selected cells, avoiding a Series per row.
        for cells in df[columns].itertuples(index=False, name=None):
            tokens = [
                token
                for cell in cells
                # Missing cells (NaN/None) and empty values contribute no tokens,
                # rather than being stringified into a literal "nan" token.
                if not (pd.isna(cell) or not cell)
                for token in tokenizer.tokenize(str(cell))
            ]
            if use_stopwords:
                tokens = [ 
                    token 
                    for token in tokens 
                    if token.lower() not in stopwords
                    and len(token) >= 2
                ]        
            if stemmer:
                tokens = [ porter.stem(token) for token in tokens ]
            if ngrams:
                # Emit every n-gram of size 1..ngrams: all unigrams first, then bigrams, and so on.
                tokens = [
                    " ".join(tokens[i:i+n])
                    for n in range(1, ngrams+1)
                    for i in range(len(tokens)-n+1)
                ]            
            output.append(tokens)

    return output


def get_labeled_tokens(df, **kwargs) -> Dict[int, List[str]]:
    """
    Collect every token of `df` into one flat list per target label.

    Rows are selected by df['target'] == 0 and df['target'] == 1, tokenized with
    tokenize_df(**kwargs), and the per-row token lists are flattened into one list.
    Returns a dict with keys 0 and 1.
    """
    return {
        label: flatten(tokenize_df( df[df['target'] == label], **kwargs ))
        for label in (0, 1)
    }


def get_word_frequencies(df, **kwargs) -> Dict[int, Counter]:
    """
    Count token occurrences separately for label 0 and label 1.

    Each returned Counter is rebuilt from most_common(), so its entries are
    inserted in descending order of count.
    """
    labeled = get_labeled_tokens(df, **kwargs)
    freqs = {}
    for label in (0, 1):
        ranked       = Counter(labeled[label]).most_common()  # (token, count) pairs, most frequent first
        freqs[label] = Counter(dict(ranked))                  # cast back to Counter, keeping that order
    return freqs


def get_log_likelihood(df, vocab_df, **kwargs):
    """
    Compute the Laplace-smoothed log-likelihood ratio log(P(token|1) / P(token|0)) per token.

    Args:
        df:       labeled DataFrame (needs a 'target' column) used to count tokens per label.
        vocab_df: DataFrame or list of DataFrames whose tokens define the vocabulary.
        **kwargs: forwarded to tokenize_df.

    Returns:
        Dict mapping every vocabulary token to its log-likelihood ratio;
        positive values mean the token is relatively more frequent under label 1.
    """
    vocab  = set(flatten(tokenize_df(vocab_df, **kwargs)))
    # Per-label token lists: tokenize_df() alone returns one list per ROW, so indexing
    # its result with [0]/[1] would measure the first two tweets, not the two classes.
    tokens = get_labeled_tokens(df, **kwargs)
    freqs  = { label: Counter(tokens[label]) for label in (0, 1) }

    # Laplacian smoothing denominators: total tokens in the class + vocabulary size
    total_false = len(tokens[0]) + len(vocab)  # [0] == False 
    total_true  = len(tokens[1]) + len(vocab)  # [1] == True

    log_likelihood = {}
    for token in vocab:
        p_false = (freqs[0].get(token, 0) + 1) / total_false
        p_true  = (freqs[1].get(token, 0) + 1) / total_true
        log_likelihood[token] = np.log( p_true / p_false )
    return log_likelihood
    
    
def get_logprior(df, **kwargs):
    """
    Return log( #tokens with label 0 / #tokens with label 1 ) for the labeled DataFrame `df`.

    Token totals come from get_labeled_tokens(df, **kwargs). Returns 0 when either
    label has no tokens, since there is then no usable ratio.
    """
    # Per-label token lists: tokenize_df() alone returns one list per ROW, so indexing
    # its result with [0]/[1] would compare the first two tweets, not the two classes.
    tokens  = get_labeled_tokens(df, **kwargs)
    n_false = len(tokens[0])
    n_true  = len(tokens[1])
    if not n_false or not n_true:
        return 0
    # NOTE(review): the ratio is label-0 over label-1, while get_log_likelihood uses
    # log(p_true / p_false) and the classifier predicts 1 for a positive total.
    # A textbook prior would be log(P(1) / P(0)) over document counts - confirm this is intended.
    return np.log( n_false / n_true )

# Logprior

`exp()` undoes `log()` and `** -1` inverts the ratio. 

This shows we have a nearly balanced dataset with about 15% more tokens in the disaster category

In [None]:
def print_logprior():
    """Print the per-label token counts of df_train and its logprior, raw, exponentiated and inverted."""
    tokens   = get_labeled_tokens(df_train)
    logprior = get_logprior(df_train)

    # Labels name df_train because that is the frame the values are computed from.
    print('len(tokens[0])                     =', len(tokens[0]))
    print('len(tokens[1])                     =', len(tokens[1]))
    print('logprior(df_train)                 =', logprior)
    print('math.exp(logprior(df_train))       =', math.exp(logprior))
    print('math.exp(logprior(df_train)) ** -1 =', math.exp(logprior)**-1)

print_logprior()

# Naive Bayes Solver

Simply sum the log-likelihood ratio of every word in a tweet, add the logprior once, and predict a disaster (1) when the total is positive.

In [None]:
def naive_bayes_classifier( df_train, df_test, **kwargs ) -> np.array:
    """
    Predict a 0/1 label for every row of df_test from token statistics of df_train.

    A row's score is the sum of the log-likelihood ratios of its tokens plus the logprior.
    Tokens without an entry contribute 0. The prediction is 1 when the score is
    strictly positive, otherwise 0.

    Returns:
        numpy array with one prediction per row of df_test, in row order.
    """
    # The vocabulary is built from both frames; the per-label counts come from df_train only.
    word_scores = get_log_likelihood( df_train, [ df_train, df_test ], **kwargs )
    prior       = get_logprior(df_train, **kwargs)

    def classify(tweet_tokens) -> int:
        score = np.sum([ word_scores.get(token, 0) for token in tweet_tokens ]) + prior
        return int(score > 0)

    return np.array([
        classify(tweet_tokens)
        for tweet_tokens in tokenize_df(df_test, **kwargs)
    ])

In [None]:
def test_accuracy(splits=3, **kwargs):
    """
    Print the mean hold-out accuracy of naive_bayes_classifier on df_train.

    Runs `splits` independent train_test_split() calls, each holding out 1/splits
    of df_train, and averages the accuracy and the wall-clock time per split.

    Args:
        splits:   number of random train/test splits to average over.
        **kwargs: forwarded to naive_bayes_classifier (e.g. ngrams=2).
    """
    # Report the n-gram setting actually passed in, not a global of the same name;
    # the fallback of 1 mirrors tokenize_df's default.
    ngrams      = kwargs.get('ngrams', 1)
    time_start  = time.perf_counter()

    accuracy = 0
    for _ in range(splits):
        train, test = train_test_split(df_train, test_size=1/splits)      
        predictions = naive_bayes_classifier(train, test, **kwargs)
        accuracy   += np.sum( test['target'] == predictions ) / len(predictions) / splits

    time_taken  = time.perf_counter() - time_start
    time_taken /= splits
    print(f'ngrams = {ngrams} | accuracy = {accuracy*100:.2f}% | time = {time_taken:.1f}s')

# Compare accuracy across maximum n-gram sizes 1..5
for ngrams in [1,2,3,4,5]:
    test_accuracy( splits=3, ngrams=ngrams )

# Submission

In [None]:
kwargs = { "ngrams": 3 }
df_submission = pd.DataFrame({
    "id":     df_test.index,
    "target": naive_bayes_classifier(df_train, df_test, **kwargs)
})
df_submission.to_csv('submission.csv', index=False)
! head submission.csv

# Further Reading

This notebook is part of a series exploring Natural Language Processing:
- 0.77536 - [NLP TF-IDF Classifier](https://www.kaggle.com/jamesmcguigan/disaster-tweets-tf-idf-classifier)
- 0.74164 - [NLP Logistic Regression](https://www.kaggle.com/jamesmcguigan/disaster-tweets-logistic-regression/)
- 0.79773 - [NLP Naive Bayes](https://www.kaggle.com/jamesmcguigan/nlp-naive-bayes)