In [1]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [2]:
def load_data(filename):
    '''
    Parameters:
    filename (string): path to file to be read
    
    Return: 
    List of tuples (explained in first question)
    '''
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    for line in fin:
        tokens = line.split()
        data.append((tokens[0], tokens[1:]))
    return data

In [3]:
data = load_data("train1.txt")
data
# Tuple 

[('__label__deu',
  ['Ich', 'würde', 'alles', 'tun,', 'um', 'dich', 'zu', 'beschützen.']),
 ('__label__deu', ['Tom', 'ist', 'an', 'Kunst', 'völlig', 'uninteressiert.']),
 ('__label__hun', ['Végeztem', 'Tomival.']),
 ('__label__deu',
  ['„Wird',
   'das',
   'in',
   'der',
   'Werkstatt',
   'gemacht?“',
   '–',
   '„Nein,',
   'das',
   'muss',
   'an',
   'Ort',
   'und',
   'Stelle',
   'erledigt',
   'werden.“']),
 ('__label__rus', ['У', 'меня', 'есть', 'яблоко.']),
 ('__label__ita', ['Non', 'possiamo', 'lasciarle', 'lì.']),
 ('__label__rus',
  ['Том',
   'считает,',
   'что',
   'школа',
   '—',
   'это',
   'пустая',
   'трата',
   'времени.']),
 ('__label__eng', ['My', 'fathers', "don't", 'speak', 'Dutch.']),
 ('__label__spa', ['El', 'niño', 'no', 'sabe', 'cómo', 'comportarse.']),
 ('__label__rus', ['Она', 'думала,', 'что', 'он', 'переночует', 'у', 'неё.']),
 ('__label__tur', ['Helikopter', 'neden', 'kentin', 'üstünde', 'uçuyor?']),
 ('__label__ita', ['Lo', 'prenda!', 'È', 'un',

In [4]:
def count_words(data):
    '''
    Parameters:
    
    data is  list of [(label, words), (label, worlds), ......]
    list of tuples in the shape (string, [list of strings]) )
    
    Returns: 
    
    This function should return a dictionary containing the following:
    { 
    # label_counts (python dictionary): 
         {label:  no. of times the label appeared },
    # word_counts  (dictionary of dictionaries): 
         {label: {word: no. of times this word appeared with this label }},
    # label_total (int): 
        total number of labels. (size of train data),
    # word_total  (python dictionary) total number of words (from the entire corupus) of the particular label:
          {label: no.of words}
          
          }
    
    '''
    label_total = 0
    word_total = defaultdict(lambda: 0)
    label_counts = defaultdict(lambda: 0)
    word_counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    for example in data:
        label, sentence = example
        ## FILL CODE
        label_counts[label]+=1.0
        label_total+=1.0
        
        for word in sentence:
            word_counts[label][word]+=1.0
            word_total[word]+=1.0
            
        
    
        
    return {'label_counts': label_counts, 
            'word_counts': word_counts, 
            'label_total': label_total, 
            'word_total': word_total}

In [5]:
def predict(sentence, mu, label_counts, word_counts, label_total, word_total):
    '''
     Parameters: 
        sentence (string): sentence to be classified
        mu (positive real number): Laplace Smoothing hyperparameter
        ** The other parameters introduced in the count_words function
    
    Returns:
    best_label (string): the label that has the highest score. 
    
    Implement the function to predict the best label for the given sentence using Naive Bayes algorithm 
    
    '''
    best_label = None
    best_score = float('-inf')

    for label in word_counts.keys():
        score = 0.0
        ## FILL CODE
        vocab_size=len(word_counts[label])
        for word in sentence:
            wc=word_counts[label][word]+mu
            tc=word_total[label]+mu*vocab_size
            score+=math.log(wc/tc)
        prior=label_counts[label]/sum(label_counts.values())
        score=score+math.log(prior)
        
        if best_score<score:
            best_label=label
            best_score=score
    return best_label


In [6]:
def compute_accuracy(valid_data, mu, counts):
    '''
    Parameters:
    valid_data (list of tuples): returned value of load_data function 
    mu (positive real): Laplace smoothing hyper-parameter
    counts (dictionary of dictionaries): return value of count_words_function
    
    Returns: 
    accuracy (float): the accuracy of the Naive Bayes classifier
    '''
    accuracy = 0.0
    for label, sentence in valid_data:
         ## FILL CODE
            if predict(sentence, mu, **counts)==label:
                accuracy+=1
                
            
            

    return accuracy/len(valid_data)


In [7]:
print("")
print("** Naive Bayes **")
print("")

mu = 1.0
train_data = load_data("train1.txt")
valid_data = load_data("valid1.txt")
counts = count_words(train_data)

print("Validation accuracy: %.3f" % compute_accuracy(valid_data, mu, counts))
print("")



** Naive Bayes **

Validation accuracy: 0.936

