# TEXT CLASSIFICATION USING NAIVE-BAYES WITH LAPLACE SMOOTHING

We're going to train a naive-bayes model to predict the label (language) given a sequence of text.

### Import libraries

In [22]:
# We are going to use the Naive Bayes algorithm for language modeling.
import io
import math
import operator
from typing import List, Tuple, Dict
from collections import defaultdict

### Load data

The `read_data` function takes in the text file and split the sentence and the corresponding label into a list of tuples

In [4]:
#First let create a function to load the dataset
def read_data(filename:str) -> List[Tuple]: 
  files = io.open(filename, 'r', encoding='utf-8')
  data = []
  for lines in files:
    tokens = lines.split() # by default this splits each line by tab, spaces and newline character
    data.append((tokens[0], tokens[1:]))

  return data

In [16]:
data = read_data('data/train1.txt') # load the dataset 
print(data[:3]) # print first 3 tuples in the list [label, sentence]

[('__label__de', ['Ich', 'würde', 'alles', 'tun,', 'um', 'dich', 'zu', 'beschützen.']), ('__label__de', ['Tom', 'ist', 'an', 'Kunst', 'völlig', 'uninteressiert.']), ('__label__hu', ['Végeztem', 'Tomival.'])]


### Count number of words

`n_examples` is the total number of examples (number of sentences).

`n_words_per_label` is the total number of words for a given label. Given a particular label, what is the total number of words.

`label_counts` is the number of times a given label appears in the training data. ie. how many times does a given class label appears in the dataset.

`word_counts` is the number of times a word appears with a given label. ie. Given a particular class, how many times does a particular word occur.

It will be convinient to save the results in a dictionary.

In [15]:
def word_count(data:str) -> Dict:
  # initialize counts
  n_examples = 0
  n_words_per_label = defaultdict(lambda: 0)
  label_counts = defaultdict(lambda: 0)
  word_counts = defaultdict(lambda: defaultdict(lambda: 0.0))

  for example in data:
    label, sentence = example
    n_examples += 1
    label_counts[label] += 1
    n_words_per_label[label] += len(sentence)

    for words in sentence:
      word_counts[label][words] += 1

  return {'n_examples': n_examples, 'n_words_per_label': n_words_per_label, 'label_counts': label_counts, 'word_counts': word_counts}

In [17]:
counts = word_count(data) # Load the data for counting 

Show all counts

In [20]:
counts['n_examples'] # show the total number of examples (size of dataset)

100000

In [27]:
sorted(counts['label_counts'].items(), key=operator.itemgetter(1), reverse=True) # Get the number of times a particular label appears in the dataset.
# We can see that english words dominates the examples

[('__label__en', 21352),
 ('__label__it', 12946),
 ('__label__ru', 12293),
 ('__label__tr', 12130),
 ('__label__eo', 10741),
 ('__label__de', 8147),
 ('__label__fr', 6890),
 ('__label__pt', 5838),
 ('__label__es', 5390),
 ('__label__hu', 4273)]

In [28]:
sorted(counts['n_words_per_label'].items(), key=operator.itemgetter(1), reverse=True) # Get the number of words for a given label

#again we have a lot of english words followed by italian etc.

[('__label__en', 164223),
 ('__label__it', 76489),
 ('__label__eo', 76302),
 ('__label__ru', 70468),
 ('__label__de', 64636),
 ('__label__tr', 60013),
 ('__label__fr', 52233),
 ('__label__pt', 39808),
 ('__label__es', 37741),
 ('__label__hu', 22400)]

### Preprocessing of labels

As we can see the label is not really readable hence, we need some preprocessing. eg. __label__en -> english, __label__it -> italian etc). This will be meaningful for the classification problems.


In [30]:
# define a dictionary that maps the original label to the new.
label_mapping = {
    '__label__en': 'english',
    '__label__it': 'italian',
    '__label__eo': 'esperanto',
    '__label__ru': 'russian',
    '__label__de': 'german',
    '__label__tr': 'turkish',
    '__label__fr': 'french',
    '__label__pt': 'portuguese',
    '__label__es': 'spanish',
    '__label__hu': 'hungarian'
}


# define the the label preprocessing function
def rename_label(data:List[Tuple], label_mapping:Dict) -> List[Tuple]:
    """
    parameter:
    data (list of tuples): The dataset where each tuple is (label, sentence)
    label_mapping (dictionary): The dictionary that maps old labels to new labels

    Returns: 
    renamed_data (list of tuples): dataset with renamed labels
    
    """
    preprocesed_data = []
    for label, sentence in data:
        if label in label_mapping:
            new_label = label_mapping[label]
            preprocesed_data.append((new_label, sentence))

        else: 
            preprocesed_data.append((label, sentence))

    return preprocesed_data

In [39]:
preprocesed_data = rename_label(data, label_mapping)

In [40]:
print(preprocesed_data[:3]) # looks great now.

[('german', ['Ich', 'würde', 'alles', 'tun,', 'um', 'dich', 'zu', 'beschützen.']), ('german', ['Tom', 'ist', 'an', 'Kunst', 'völlig', 'uninteressiert.']), ('hungarian', ['Végeztem', 'Tomival.'])]


In [42]:
# Great, let's now pass this through the count function!
preprocesed_count = word_count(preprocesed_data)

In [43]:
sorted(preprocesed_count['label_counts'].items(), key=operator.itemgetter(1), reverse=True) # Get the number of times a particular label appears in the dataset.
# This looks great!

[('english', 21352),
 ('italian', 12946),
 ('russian', 12293),
 ('turkish', 12130),
 ('esperanto', 10741),
 ('german', 8147),
 ('french', 6890),
 ('portuguese', 5838),
 ('spanish', 5390),
 ('hungarian', 4273)]

### Predict function

In [45]:
def predict(sentence:List, mu:float, label_counts:Dict, word_counts:Dict, n_examples:int, n_words_per_label:Dict)->str:
  best_label = None
  best_score = float('-inf')

  for label in word_counts.keys():
    score = 0
    prior = label_counts[label] / sum(label_counts.values()) # calculate the prior

    score += math.log(prior)

    for word in sentence:
      word_count = word_counts[label][word]
      total_words = n_words_per_label[label]
      vocab_size = len(word_counts[label])

      word_probability = (mu + word_count) / (mu * vocab_size + total_words)  # likelihood with laplacian smoothing
      score += math.log(word_probability) # log-likelihood.

    # update the scores
    if score > best_score:
      best_score = score
      best_label = label

  return best_label

In [46]:
# Check results with simple example

sentence = ['whats up', 'guys', 'how', 'far']
mu = 1.0  # Laplace smoothing parameter
counts = word_count(preprocesed_data)
predicted_label = predict(sentence, mu, counts['label_counts'], counts['word_counts'], counts['n_examples'], counts['n_words_per_label'])
print(predicted_label)


english


In [48]:
# Check results with simple example 

sentence = ['Questo', 'è', 'un', 'esempio', 'di', 'frase']
mu = 1.0  # Laplace smoothing parameter
counts = word_count(preprocesed_data)
predicted_label = predict(sentence, mu, counts['label_counts'], counts['word_counts'], counts['n_examples'], counts['n_words_per_label'])
print(predicted_label)


italian


### Accuracy Function

In [49]:
# Let's compute the accuracy.
def compute_accuracy(valid_data:List[Tuple[str, List[str]]], mu:float, counts:Dict):
  correct_predictions = 0
  for label, sentence in valid_data:
    predicted_label = predict(sentence, mu, counts['label_counts'], counts['word_counts'], counts['n_examples'], counts['n_words_per_label'])
    if predicted_label == label:
      correct_predictions += 1

  accuracy = correct_predictions / len(valid_data)
  return accuracy

In [51]:
# compute accuracy on validation set
mu=1.0 # ininitialize laplacian smoothing
train_data = read_data('data/train1.txt') # load the dataset 
valid_data = read_data('data/valid1.txt') # load the dataset 

# pass through label renaming function.
preprocesed_train = rename_label(train_data, label_mapping) # get new labels 
preprocesed_valid = rename_label(valid_data, label_mapping)

In [52]:
# counts
counts = word_count(preprocesed_train)

In [53]:
# now compute accuracy.
compute_accuracy(preprocesed_valid, mu, counts)


0.941