# Importing Libraries

In [1]:
import nltk

# Import Word Tokenizer
import re
from nltk.tokenize import word_tokenize

# Import Laplace Language Model
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace

# Reading Files

In [2]:
eng = None
lat = None

with open('./eng.txt', 'r') as f:
    eng = f.read()

with open('./lat.txt', 'r') as f:
    lat = f.read()

In [3]:
eng = eng.split('\n')
lat = lat.split('\n')

In [4]:
eng[:3]

['Approval of the Minutes of the previous sitting .',
 "The Minutes of yesterday 's sitting have been distributed . ",
 'Are there any comments ? ']

Normalisation of eng

In [5]:
eng_normalized = []
for line in eng:
    line = line.lower()
    line = re.sub(r'[^\w\s]','',line)
    line = word_tokenize(line)
    eng_normalized.append(line)

In [6]:
eng_normalized[:3]

[['approval', 'of', 'the', 'minutes', 'of', 'the', 'previous', 'sitting'],
 ['the',
  'minutes',
  'of',
  'yesterday',
  's',
  'sitting',
  'have',
  'been',
  'distributed'],
 ['are', 'there', 'any', 'comments']]

In [7]:
lat_normalized = [line.split() for line in lat]

In [8]:
lat_normalized = lat_normalized[0]

In [9]:
# lat_normalized is a list of words
# form sentences by splitting by '.'
lat_normalized = ' '.join(lat_normalized).split('.')

In [10]:
lat_normalized = [line.split() for line in lat_normalized]

In [11]:
lat_normalized[:3]

[['Approbatio', 'scrupulorum', 'prioris', 'sedentis'],
 ['Minuta', 'hesterna', 'sedentis', 'distributa', 'sunt'],
 ['Esne',
  'ulla',
  'commenta?',
  'Mr',
  'Praeses,',
  'die',
  'Lunae',
  'punctum',
  'ordinis',
  'feci',
  'de',
  'Praeside',
  'Nicole',
  'Fontaine',
  'commentarias',
  'relatas',
  'in',
  'torculari',
  'Britannico',
  'de',
  'recenti',
  'visitatione',
  'cum',
  'Hera',
  'Maiestate',
  'Regina',
  'Elizabeth',
  'II']]

## Creating Models

In [13]:
Model_English = Laplace(3)
Model_Latin = Laplace(3)

In [14]:
# Create Training Data
train_data_English, padded_sents_English = padded_everygram_pipeline(3, eng_normalized)
Model_English.fit(train_data_English, padded_sents_English)

train_data_Latin, padded_sents_Latin = padded_everygram_pipeline(3, lat_normalized)
Model_Latin.fit(train_data_Latin, padded_sents_Latin)

In [15]:
sent1 = 'Petitions are filed'
sent2 = 'Praeses ad omnes aspectus'

In [16]:
# Find Probabilities

# Normalise Sent1
sent1 = sent1.lower()
sent1 = re.sub(r'[^\w\s]','',sent1)
sent1 = word_tokenize(sent1)

# Normalise Sent2
sent2 = sent2.split()

In [17]:
sent1, sent2

(['petitions', 'are', 'filed'], ['Praeses', 'ad', 'omnes', 'aspectus'])

In [18]:
sent1 = list(pad_both_ends(sent1, n=3))
sent2 = list(pad_both_ends(sent2, n=3))

In [19]:
sent1, sent2

(['<s>', '<s>', 'petitions', 'are', 'filed', '</s>', '</s>'],
 ['<s>', '<s>', 'Praeses', 'ad', 'omnes', 'aspectus', '</s>', '</s>'])

In [20]:
# Find Probabilities
def find_prob(sent, model):
    prob = 1
    for i in range(len(sent)-2):
        prob *= model.score(sent[i+2], [sent[i], sent[i+1]])
    return prob

In [21]:
# Classify Sentences

# Sentence1
prob1 = find_prob(sent1, Model_English)
prob2 = find_prob(sent1, Model_Latin)

print('English Probability: ', prob1)
print('Latin Probability: ', prob2)

if prob1 > prob2:
    print('English')
else:
    print('Latin')

English Probability:  4.911300098200164e-16
Latin Probability:  5.778686910842287e-17
English


In [22]:
# Sentence2
prob1 = find_prob(sent2, Model_English)
prob2 = find_prob(sent2, Model_Latin)

print('English Probability: ', prob1)
print('Latin Probability: ', prob2)

if prob1 > prob2:
    print('English')
else:
    print('Latin')

English Probability:  4.412654354352257e-19
Latin Probability:  1.3322621120097489e-19
English
