# Language Model
Implementation of a character-level bigram (n-gram) language model with add-one smoothing, used here for language identification and text generation.

In [1]:
import random as r
import os
import math
import pandas as pd
import numpy as np
import nltk

In [4]:
class BigramLM():
    """Character-level bigram language model with add-one (Laplace) smoothing.

    The model counts how often each character follows each other character in
    a training text, converts the counts into conditional probabilities and
    can then score text (``perplexity``) or generate text (``sample``).

    Attributes:
        vocabulary: Indexable sequence of the characters the model knows. It
            must contain ' ', which is used as the context of the first
            character of every text.
        n: Size of the vocabulary.
        numpyTable: ``(n, n)`` integer array of bigram counts, initialised to
            1 (the add-one smoothing). Rows are the previous character,
            columns the next character.
        dataTable: Empty until training. Holds the counts after
            ``getBigramCounts`` and the natural-log conditional probabilities
            after ``getLogProbsFromCounts``.
        probsTable: Conditional probabilities; after
            ``getCumulatedProbsTable`` each row holds the running (cumulative)
            sum of its probabilities, which is what ``sample`` draws from.
        charsCount: Maps a character to its number of occurrences in the
            training text.
        charToInt: Maps a character to its row/column index in the tables.
        language: Label of the language this model was trained on.
    """

    def __init__(self, vocabulary, language):
        """Create an untrained model for ``vocabulary`` labelled ``language``."""
        self.vocabulary = vocabulary
        self.n = len(vocabulary)
        self.dataTable = pd.DataFrame()
        # Starting every count at 1 implements add-one smoothing.
        self.numpyTable = np.ones((self.n, self.n), dtype=int)
        self.charsCount = dict()
        self.charToInt = dict()
        self.language = language
        self.probsTable = pd.DataFrame(0.0, index=self.vocabulary, columns=self.vocabulary)
        self.getIntsForChars()

    def train(self, trainSet):
        """Train the bigram model on the text ``trainSet``.

        Counts accumulate across calls, so calling ``train`` again continues
        training on additional text. Raises ``KeyError`` if ``trainSet``
        contains a character that is not in the vocabulary.
        """
        self.getTokensCount(trainSet)  # Per-character counts
        self.getBigramCounts(trainSet)  # Bigram counts
        self.getLogProbsFromCounts()  # Probabilities and log-probabilities
        self.getCumulatedProbsTable()  # Cumulative rows used for sampling

    def getTokensCount(self, trainSet):
        """Add the occurrences of every character of ``trainSet`` to ``charsCount``."""
        for char in trainSet:
            self.charsCount[char] = self.charsCount.get(char, 0) + 1

    def getBigramCounts(self, trainset):
        """Add the bigram counts of ``trainset`` to ``numpyTable``.

        The first character is counted as following ' '. Afterwards
        ``dataTable`` holds the (smoothed) counts as a labelled table.
        Raises ``KeyError`` for characters missing from the vocabulary.
        """
        startIndex = self.charToInt[' ']
        currIndices = np.fromiter((self.charToInt[char] for char in trainset), dtype=np.intp)
        if currIndices.size:
            # prevIndices[i] is the context (preceding character) of currIndices[i].
            prevIndices = np.empty_like(currIndices)
            prevIndices[0] = startIndex
            prevIndices[1:] = currIndices[:-1]
            # add.at accumulates correctly when the same bigram occurs many times.
            np.add.at(self.numpyTable, (prevIndices, currIndices), 1)
        # Expose the counts as a labelled pandas table
        self.dataTable = pd.DataFrame(self.numpyTable, index=self.vocabulary, columns=self.vocabulary, dtype=float)

    def getLogProbsFromCounts(self):
        """Compute the conditional probabilities and their natural logs.

        Each row of the smoothed count table is divided by its own total, so
        every row is a proper distribution that sums to 1. Dividing by the
        unigram count of the row character instead is off by one for the
        start context ' ' and for the last training character, and fails for
        vocabulary characters that never occur in the training text.
        """
        counts = self.numpyTable.astype(float)
        # Row total = number of bigrams starting with that character + n (smoothing).
        probs = counts / counts.sum(axis=1, keepdims=True)
        self.probsTable = pd.DataFrame(probs, index=self.vocabulary, columns=self.vocabulary)
        self.dataTable = pd.DataFrame(np.log(probs), index=self.vocabulary, columns=self.vocabulary)

    def getIntsForChars(self):
        """Map every vocabulary character to its position in the vocabulary."""
        self.charToInt.update((char, index) for index, char in enumerate(self.vocabulary))

    def _requireTrained(self):
        """Raise ``RuntimeError`` if no training data has been processed yet."""
        if self.dataTable.empty:
            raise RuntimeError("BigramLM must be trained with train() before use")

    def perplexity(self, testSet):
        """Return the per-character perplexity of ``testSet`` under the model.

        This is ``exp`` of the negative mean natural-log probability of each
        character given its predecessor; the first character is conditioned
        on ' '.

        A character outside the vocabulary is scored like a never-seen bigram
        (a smoothed count of 1 over the context's row total) and the context
        is reset to ' ' afterwards, so foreign text yields a high perplexity
        rather than a ``KeyError``. Empty text carries no evidence and gets
        ``float('inf')``.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        self._requireTrained()
        if len(testSet) == 0:
            return float('inf')

        logProbs = self.dataTable.to_numpy()
        startIndex = self.charToInt[' ']
        prevIndex = startIndex
        logProbSum = 0.0
        for char in testSet:
            currIndex = self.charToInt.get(char)
            if currIndex is None:
                # Unknown character: probability of an unseen event in this context.
                logProbSum -= math.log(self.numpyTable[prevIndex].sum())
                prevIndex = startIndex
            else:
                logProbSum += logProbs[prevIndex, currIndex]
                prevIndex = currIndex

        return math.exp(-logProbSum / len(testSet))

    def getCumulatedProbsTable(self):
        """Replace every row of ``probsTable`` by its running sum.

        After this call the entry in a column is the probability of that
        column plus the probabilities of all columns before it.
        """
        self.probsTable = self.probsTable.cumsum(axis=1)

    def sample(self, length=100):
        """Generate ``length`` characters of text and print them.

        The first character is drawn uniformly from the vocabulary; every
        following character is drawn from the distribution conditioned on the
        previous one by inverting the cumulative probability row.

        Args:
            length: Number of characters to generate (at least 1).

        Raises:
            ValueError: If ``length`` is smaller than 1.
            RuntimeError: If the model has not been trained.
        """
        if length < 1:
            raise ValueError("length must be at least 1")
        self._requireTrained()

        cumulative = self.probsTable.to_numpy()
        lastColumn = self.n - 1
        rowIndex = self.charToInt[r.choice(self.vocabulary)]
        outputSentence = [self.vocabulary[rowIndex]]
        for _ in range(length - 1):
            # r.random() lies in [0, 1); pick the first column whose cumulative
            # probability exceeds it.
            colIndex = int(np.searchsorted(cumulative[rowIndex], r.random(), side='right'))
            # Rounding can leave the last cumulative value marginally below 1;
            # clamp so that a character is emitted on every step.
            colIndex = min(colIndex, lastColumn)
            outputSentence.append(self.vocabulary[colIndex])
            rowIndex = colIndex

        result = ''.join(outputSentence)
        print(result)

In [None]:
# Read the corpora, derive each language's character vocabulary and train one model per language
def _read_corpus(path):
    """Return the complete text of the UTF-8 encoded corpus file at ``path``."""
    # The context manager closes the file handle as soon as the text is read.
    with open(path, 'r', encoding='utf-8') as corpus:
        return corpus.read()


dutchFile = _read_corpus('corpora/corpora/nld_news_2020-sentences.txt')
dutchVocabulary = sorted(set(dutchFile))  # Every distinct character, in a stable order

basqueFile = _read_corpus('corpora/corpora/eus_news_2020-sentences.txt')
basqueVocabulary = sorted(set(basqueFile))

turkishFile = _read_corpus('corpora/corpora/tur_news_2020-sentences.txt')
turkishVocabulary = sorted(set(turkishFile))

dutch_model = BigramLM(dutchVocabulary, 'dutch')
dutch_model.train(dutchFile)

basque_model = BigramLM(basqueVocabulary, 'basque')
basque_model.train(basqueFile)

turkish_model = BigramLM(turkishVocabulary, 'turkish')
turkish_model.train(turkishFile)

In [None]:
# All trained models that take part in the comparison
listModels = [dutch_model, basque_model, turkish_model]


def identify_language(sentence):
    """Return the language label of the model that is least surprised by ``sentence``.

    Every model in ``listModels`` scores the sentence with its perplexity; the
    label of the first model with the strictly lowest score wins. The string
    "None" is returned when no model scores below infinity (for example when
    ``listModels`` is empty).
    """
    bestLanguage, bestScore = "None", float('inf')
    for candidate in listModels:
        score = candidate.perplexity(sentence)
        if not score < bestScore:
            continue  # Not an improvement; keep the current best
        bestLanguage, bestScore = candidate.language, score
    return bestLanguage


identify_language("kaixo")

'basque'

In [13]:
# Try out language identification: each sentence is scored by every model in
# listModels and the label of the lowest-perplexity model is printed.
print(identify_language("kaixo"))
print(identify_language("Wiskunde is moeilijk"))

basque
dutch


In [8]:
# Generate Basque-like text: sample() prints up to 100 characters drawn from the bigram model
basque_model.sample()

n emurreniagikozurtxupo ezerrribieneko enguz heko le a ma lar e beakonda Jrasti duk Osta COKe ekaizh


In [9]:
# Generate Dutch-like text: sample() prints up to 100 characters drawn from the bigram model
dutch_model.sample()

Pen he t eroe eeetenovarelachoprgooger.
Iket be von mogs slaudjkun, om.
VDar.
Alie jntene vaabe bete


In [10]:
# Generate Turkish-like text: sample() prints up to 100 characters drawn from the bigram model
turkish_model.sample()

Uzati bike göncergöyelanera kumığran ön Avaririzdünisi.
Olev'din ve 3.
Araparosiliçosır biti"
Özadek
