In [54]:
#importing the necessary libraries along the way
from collections import Counter,defaultdict
#we need this to clean the data from punctuation and unwanted stuff
import re
from math import log as l

In [55]:
#loading the data files
#English
import urllib.request
en_url="https://drive.google.com/uc?export=download&id=1hOw7FCT5cmMjlWdrA88Dg6whe7_Blx7B"
en_file=urllib.request.urlopen(en_url)
#more and more cleaning
en=re.findall(r'\w+', en_file.read().decode("utf-8"))
count_en=Counter(en)

#German
de_file=open("europarl-v7.de-en.lc.de", "r", encoding="utf-8").read()
#more and more cleaning
de=re.findall(r'\w+', de_file)

#French
fr_file=open("europarl-v7.fr-en.lc.fr", "r", encoding="utf-8").read()
#more and more cleaning
fr=re.findall(r'\w+', fr_file)

#Swedish
sv_url="https://drive.google.com/uc?export=download&id=1b0CvmX80SLYcBkbKBz3DU0o2dBwfqFoI"
sv_file=urllib.request.urlopen(sv_url)
#more and more cleaning
sv=re.findall(r'\w+', sv_file.read().decode("utf-8"))
count_sv=Counter(sv)

#European Parliment Proceedings aka europe
europe=en+de+fr+sv

(a) Warmup:

In [56]:
#importing the library to count the common words
from collections import Counter

#the 10 most frequent words in each language
#English
print("The 10 most frequent words in English are:")
print(Counter(en).most_common(10))
print()

#German
print("The 10 most frequent words in German are:")
print(Counter(de).most_common(10))
print()

#French
print("The 10 most frequent words in French are:")
print(Counter(fr).most_common(10))
print()

#Swedish
print("The 10 most frequent words in Swedish are:")
print(Counter(sv).most_common(10))
print()

The 10 most frequent words in English are:
[('the', 19327), ('of', 9344), ('to', 8814), ('and', 6949), ('in', 6124), ('is', 4400), ('that', 4357), ('a', 4271), ('we', 3223), ('this', 3222)]

The 10 most frequent words in German are:
[('die', 10521), ('der', 9374), ('und', 7028), ('in', 4175), ('zu', 3169), ('den', 2976), ('wir', 2863), ('daß', 2738), ('ich', 2670), ('das', 2669)]

The 10 most frequent words in French are:
[('apos', 16729), ('de', 14528), ('la', 9746), ('et', 6620), ('l', 6536), ('le', 6177), ('à', 5588), ('les', 5587), ('des', 5232), ('que', 4797)]

The 10 most frequent words in Swedish are:
[('att', 9181), ('och', 7038), ('i', 5954), ('det', 5687), ('som', 5028), ('för', 4959), ('av', 4013), ('är', 3840), ('en', 3724), ('vi', 3211)]



In [57]:
#The probabilities are calculated as the frequency of a given word divide by the total number of words
count_europe=Counter(europe)

#speaker
print("The probability of the word speaker is", count_europe["speaker"]/sum(count_europe.values()))

#for a nicer output
print()

#Zebra
print("The probability of the word zebra is", count_europe["zebra"]/sum(count_europe.values()))

The probability of the word speaker is 9.740636083017494e-06

The probability of the word zebra is 0.0


(b) Language modeling

In [58]:
def frequency(url):
  frequencies={}
  file=urllib.request.urlopen(url)
  for line in file:
    words=re.findall(r'\w+', line.decode("utf-8"))
    frequencies=finding_bigrams(words, frequencies)
  return frequencies

def finding_bigrams(words, bigrams={}):
  for i in range(len(words)-1):
    bigram=(words[i], words[i+1])
    if not bigram in bigrams:
      bigrams[bigram]=1
    else:
      bigrams[bigram]+=1
  return bigrams

def probabilities(count_bigrams, count_words):
  probabilities={}
  for bigram, count in count_bigrams.items(): 
    word=bigram[0]
    probabilities[bigram]=count/count_words[word]
  return probabilities

def bigram_language_model(bigrams_probabilities, count_bigrams, laplace_smoothing):
  prob=1
  #prob=l(1)
  for bigram, count in count_bigrams.items():
    if bigram in bigrams_probabilities:
      prob=prob*bigrams_probabilities[bigram]
      #prob=prob+l(bigrams_probabilities[bigram])
    else:
      prob=prob*laplace_smoothing
      #prob=prob+l(laplace_smoothing)
  return prob

In [59]:
en_bigrams=frequency(en_url)
probs=probabilities(en_bigrams, count_en)
pseudocount=1/(sum(en_bigrams.values())+1)

In the previous part (a), we observed that the probability of a word which did not appear in the text was zero. To prevent any sequential troubles we used Adaptive Smoothing also known as Laplace Smoothing and added a pseudocount to the probabilities so it won't be zero! Furthermore, to compute the probabilities for individual long English sentences, we can use logarithmic probabilities instead.

In [60]:
#computing the probability of a short random sentence irrelevant to the training data
sentence="she was gone and yet she was more present than anyone else"
words=re.findall(r'\w+', sentence)
sentence_bigrams=finding_bigrams(words)
bigrams=finding_bigrams(words)
print(bigram_language_model(probs, bigrams, pseudocount))

1.736539063879752e-37


In [61]:
#this one is obtained from the training text
sentence="it is the case of alexander nikitin"
words=re.findall(r'\w+', sentence)
sentence_bigrams=finding_bigrams(words)
bigrams=finding_bigrams(words)
print(bigram_language_model(probs, bigrams, pseudocount))

4.881798402577779e-46


(c) Translation modeling

In [62]:
def split_sentences(url):
  file=urllib.request.urlopen(url)
  sentences=list()
  for line in file:
    sentences.append(line.decode("utf-8"))
  return sentences

In [63]:
#we need a flag or etc to point the beginning/ending of a sentence
flag="null"
en_sentences=split_sentences(en_url)
sv_sentences=split_sentences(sv_url)
en.append(flag)

In [64]:
transition_probabilities=dict()
#default value provided as uniform probability
for i in range(0, len(en_sentences)):
  en_words=re.findall(r'\w+', en_sentences[i])
  en_words.append(flag)
  sv_words=re.findall(r'\w+', sv_sentences[i])
  for j in sv_words:
    for k in en_words:
      transition_probabilities[(j,k)]=1.0/len(en_words)

In [65]:
number_of_iterations=10

for i in range(0, number_of_iterations):
  count=defaultdict(float)
  es_total=defaultdict(float)
  total=defaultdict(float)
  for j in range(len(sv_sentences)):
    es=re.findall(r'\w+', en_sentences[j])
    es.append(flag)
    fs=re.findall(r'\w+', sv_sentences[j])
    #compute normalization
    for f in fs:
      total[f]=0.0
      for e in es:
        total[f]+=transition_probabilities[(f, e)]
    for f in fs:
      for e in es:
        delta=transition_probabilities[(f, e)]/total[f]
        count[(f, e)]+=delta
        es_total[e]+=delta
  for j in range(len(sv_sentences)):
    es=re.findall(r'\w+', en_sentences[j])
    es.append(flag)
    fs=re.findall(r'\w+', sv_sentences[j])
    #estimate probability
    for f in fs:
      for e in es:
        transition_probabilities[(f, e)]=count[(f, e)]/es_total[e]

In [66]:
#we need this for the process of sorting the probabilities as we did it wrong previously
import heapq
from operator import itemgetter

testing={k:v for (k,v) in transition_probabilities.items() if k[1]=="european"}
print(dict(heapq.nlargest(10, testing.items(), key=itemgetter(1))))

{('europeiska', 'european'): 0.8450574393627105, ('europeisk', 'european'): 0.08375866478837797, ('den', 'european'): 0.01152108595074964, ('i', 'european'): 0.010893956404072248, ('att', 'european'): 0.00610113029186832, ('en', 'european'): 0.0055281168349736575, ('till', 'european'): 0.005247648979045568, ('och', 'european'): 0.004863528509948655, ('det', 'european'): 0.004710207166560048, ('för', 'european'): 0.00398211502502309}


(d) Decoding

To be able to decode the sentence from Swedish to English (or any other source and target language, though we only chose Swedish as we knew it) we first find the highest transition probability for each word in the sentence. Second, using the library itertools, take all the possible permutations into account. And at last, find the sentence with the highest probability among all.
The problem is for short sentences with almost 3 sentences it works quite well, but as the number of words increase and the sentence becomes longer the number of permutations also increase. This makes the decoding process to reqire more computational power. Therefore, one mighty solution is to consider n-most common translations to each word!
After all, decoding three sentences from the Swedish training data, the model translate those to English with mean accuracy around 85% which could be a good baseline for other advanced available models.

In [67]:
def translation(sv):
  t={k:v for (k,v) in transition_probabilities.items() if k[0]==sv}
  sorting=heapq.nlargest(1, t.items(), key=itemgetter(1))
  #return sorting[0][0][1:n]
  return sorting[0][0][1]

def permute(words):
  #we need this to check the possible permutations for translation
  from itertools import permutations
  return [list(p) for p in permutations(words)]

def max_prob(possible_orders):
  prob=0.0
  sentence={}
  for order in possible_orders:
    cb=finding_bigrams(order)
    cbp=bigram_language_model(probs, cb, pseudocount)
    if cbp>prob:
      prob=cbp
      sentence=order
  return sentence

def decode(sv_sentence):
  fs=re.findall(r'\w+', sv_sentence)
  translated=list()
  for f in fs:
    translated.append(translation(f))
  possible_orders=permute(translated)
  final_translation=max_prob(possible_orders)
  return ' '.join(final_translation)

In [68]:
#It is like a hello world! but in a different way :D
decode("men jag är bra")

'but i is good'

In [69]:
decode("ni känner till från media")

'you familiar returned from media'

In [70]:
decode("på den skamliga politik")

'based oz discredited politics'