In [1]:
import math, random
from collections import defaultdict
import os

In [2]:
################################################################################
# Part 0: Utility Functions
################################################################################

COUNTRY_CODES = ['af', 'cn', 'de', 'fi', 'fr', 'in', 'ir', 'pk', 'za']

def start_pad(n):
    ''' Returns a padding string of length n to append to the front of text
        as a pre-processing step to building n-grams '''
    return '~' * n

def ngrams(n, text):
    ''' Returns the ngrams of the text as tuples where the first element is
        the length-n context and the second is the character '''
    text = start_pad(n) + text
    grams = []
    for j in range(len(text)-n):
        context = text[j:j+n]
        char = text[j+n]
        grams.append((context, char))
    return grams

def create_ngram_model(model_class, path, n = 2, k = 0):
    ''' Creates and returns a new n-gram model trained on the city names
        found in the path file '''
    model = model_class(n, k)
    with open(path, encoding = 'utf-8', errors = 'ignore') as f:
        model.update(f.read())
    return model

def create_ngram_model_lines(model_class, path, n = 2, k = 0):
    ''' Creates and returns a new n-gram model trained on the city names
        found in the path file '''
    model = model_class(n, k)
    with open(path, encoding = 'utf-8', errors = 'ignore') as f:
        for line in f:
            model.update(line.strip())
    return model

In [3]:
################################################################################
# Part 1: Basic N-Gram Model
################################################################################

class NgramModel(object):
    ''' A basic n-gram model using add-k smoothing '''

    def __init__(self, n, k):
        self.n = n # order of n-gram model
        self.vocab = set() # initialize vocabulary
        self.context_counts = defaultdict(lambda:0) # frequency of contexts
        self.sequence_counts = defaultdict(lambda:0) # frequency of (context, char) sequences
        self.k = k # smoothing parameter

    def get_vocab(self):
        ''' Returns the set of characters in the vocab '''
        return self.vocab

    def update(self, text):
        ''' Updates the model n-grams based on text '''
        all_ngrams = ngrams(self.n, text)
        for (context, char) in all_ngrams:
            self.vocab.add(char)
            self.context_counts[context] += 1 # increment the context count
            self.sequence_counts[(context, char)] += 1 # increment the (context, character) sequence count

    def prob(self, context, char):
        ''' Returns the probability of char appearing after context '''
        # print(self.n)
        if context in self.context_counts.keys():
            denominator = self.context_counts[context] # frequency of context followed by any token
            numerator = self.sequence_counts[(context, char)] # frequency of exact (context, character) sequence
            prob = (numerator + self.k)/(denominator + (self.k * len(self.vocab)))
            return prob
        else:
            return 1/len(self.vocab)
                
    def random_char(self, context):
        ''' Returns a random character based on the given context and the 
            n-grams learned by this model '''
        r = random.random()
        pre_sum = 0
        for i, char in enumerate(sorted(self.vocab)):
            # pre_sum is sum of probabilities up to, but excluding, the current token
            post_sum = pre_sum + self.prob(context, char)
            # post_sum also includes the probability of the current token
            if pre_sum <= r < post_sum:
                return char
            pre_sum = post_sum
                
    def random_text(self, length):
        output_text = ''
        all_context = start_pad(self.n) # keep a running context list initialized with '~'s
        for i in range(length):
            curr_context = all_context[len(all_context)-self.n:] # extract context from running context list
            next_char = self.random_char(curr_context)
            output_text += next_char
            all_context += next_char
        return output_text

    def perplexity(self, text):
        ''' Returns the perplexity of text based on the n-grams learned by
            this model '''
        m = len(text)
        all_ngrams = ngrams(self.n, text)
        log_prob_sum = 0
        for (context, char) in all_ngrams:
            prob = self.prob(context, char)
            if prob == 0:
                return float('inf')
            log_prob_sum += math.log(prob)
        perplexity = math.exp(-1/(m) * log_prob_sum)
        return perplexity

In [4]:
################################################################################
# Part 2: N-Gram Model with Interpolation
################################################################################

class NgramModelWithInterpolation(NgramModel):
    ''' An n-gram model with interpolation '''

    def __init__(self, n, k):
        self.n = n # highest order n-gram model
        self.models = {} # initialize empty dictionary for NgramModels
        self.weights = {} # lambdas corresponding to each NgramModel
        for order in range(n + 1): # extra model accounts for zeroth order
            self.models[order] = NgramModel(order, k)
            self.weights[order] = 1/(n + 1)
        self.k = k # smoothing parameter
            
    def get_vocab(self):
        vocab = set()
        for order in range(self.n + 1):
            model = self.models[order]
            vocab = vocab.union(model.get_vocab()) # merge vocabularies 
        return vocab

    def update(self, text):
        for order in range(self.n + 1):
            model = self.models[order]
            model.update(text)

    def prob(self, context, char):
        prob = 0
        for order in range(self.n + 1):
            model = self.models[order]
            weight = self.weights[order]
            if model.n == 0:
                sliced_context = ''
            else:
                sliced_context = context[-model.n:]
            prob += weight * model.prob(sliced_context, char)
        return prob
    
    def perplexity(self, text):
        

In [5]:
# m = create_ngram_model(NgramModel, 'shakespeare_input.txt', 2)
# m.random_text(250)

In [6]:
# m = create_ngram_model(NgramModel, 'shakespeare_input.txt', 3)
# m.random_text(250)

In [7]:
# m = create_ngram_model(NgramModel, 'shakespeare_input.txt', 4)
# m.random_text(250)

In [8]:
# m = create_ngram_model(NgramModel, 'shakespeare_input.txt', 7)
# m.random_text(250)

In [53]:
################################################################################
# Part 3: Your N-Gram Model Experimentation
################################################################################

def accuracy(all_pred_labels, all_true_labels):
    num_correct = 0
    for i, pred in enumerate(all_pred_labels):
        if pred == all_true_labels[i]:
            num_correct += 1
    accuracy = num_correct/len(all_true_labels)
    return accuracy

if __name__ == '__main__':
    
    # training data
    all_training_data = '/Users/sppatankar/Desktop/CIS 530/Homework 3/train'
    models = {}
    n = 5
    k = 0
    for filename in os.listdir(all_training_data):
        filepath = os.path.join(all_training_data, filename) # of the form 'country_code.txt'
        country = filename.split('.')[0] # split on the period to isolate country code 
        models[country] = create_ngram_model_lines(NgramModel, filepath, n, k)
    
    # validation data
    all_validation_data = '/Users/sppatankar/Desktop/CIS 530/Homework 3/val'
    all_true_labels = []
    all_pred_labels = []
    for filename in os.listdir(all_validation_data):
        filepath = os.path.join(all_validation_data, filename) 
        true_country = filename.split('.')[0]
        with open(filepath, encoding = 'utf-8', errors = 'ignore') as f:
            for line in f:
                perplexities = {}
                for country in models.keys(): # loop over the models for all countries
                    perplexities[country] = models[country].perplexity(line)
                # find the model that has the lowest perplexity and assign prediction
                print(line, perplexities)
                pred_country = min(perplexities, key = perplexities.get)
                all_true_labels.append(true_country)
                all_pred_labels.append(pred_country)

    print(accuracy(all_pred_labels, all_true_labels))

torce
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
saint-georges-nigremont
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
kerbouzard
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
la giraudiere
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
lepanges
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
montauban-de-luchon
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
wanquetin
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
dommartin
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf, 'ir': inf, 'de': inf}
chalautre
 {'in': inf, 'pk': inf, 'fr': inf, 'af': inf, 'cn': inf, 'za': inf, 'fi': inf,

In [57]:
test_dict = {}
test_dict['in'] = 0.25
test_dict['fr'] = 0.24
test_dict['cn'] = 10

In [58]:
pred_country = min(test_dict, key = test_dict.get)
pred_country

'fr'

In [42]:
models['in'].perplexity('xinma')

7.937104728247185

In [43]:
models['cn'].perplexity('xinma')

inf

In [44]:
models['fr'].perplexity('xinma')

inf

In [41]:
models['fr'].sequence_counts

defaultdict(<function __main__.NgramModel.__init__.<locals>.<lambda>()>,
            {('~~~', 'p'): 138,
             ('~~p', 'r'): 21,
             ('~pr', 'e'): 8,
             ('pre', 'm'): 2,
             ('rem', 'e'): 3,
             ('eme', 'r'): 2,
             ('mer', 'y'): 3,
             ('~~~', 'b'): 230,
             ('~~b', 'o'): 56,
             ('~bo', 'u'): 30,
             ('bou', 's'): 4,
             ('ous', 's'): 17,
             ('uss', 'o'): 10,
             ('sso', 'n'): 12,
             ('son', '-'): 5,
             ('on-', 'l'): 8,
             ('n-l', 'e'): 10,
             ('-le', ' '): 2,
             ('le ', 'b'): 15,
             ('e b', 'a'): 4,
             (' ba', 's'): 6,
             ('~bo', 'i'): 8,
             ('boi', 's'): 25,
             ('ois', ' '): 1,
             ('is ', 'l'): 1,
             ('s l', 'e'): 1,
             (' le', 's'): 2,
             ('les', 's'): 7,
             ('ess', 'u'): 2,
             ('ssu', 's'): 1,
             (

In [24]:
city = 'fontaine des gazelles'
models['in'].perplexity(city)

inf

In [25]:
models['pk'].perplexity(city)

inf

In [26]:
models['fr'].perplexity(city)

inf

In [27]:
models['af'].perplexity(city)

inf

In [28]:
models['cn'].perplexity(city)

inf

In [29]:
models['za'].perplexity(city)

inf

In [30]:
models['fi'].perplexity(city)

inf

In [31]:
models['ir'].perplexity(city)

inf

In [32]:
models['de'].perplexity(city)

inf

In [10]:
all_true_labels

['in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',
 'pk',

In [11]:
all_pred_labels

['in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'pk',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'pk',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'af',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',
 'in',