**Tokenization**

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
import math

def tokenize(line, tokenizer=word_tokenize):
    utf_line = line.lower()
    return [token for token in tokenizer(utf_line)]

[nltk_data] Downloading package punkt to /Users/kalai-
[nltk_data]     pt6931/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/kalai-
[nltk_data]     pt6931/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Example for tokenization**


In [2]:
from functions import tokenize
tokenize("Nichts ist, wie es scheint.")

[nltk_data] Downloading package punkt to /Users/kalai-
[nltk_data]     pt6931/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kalai-
[nltk_data]     pt6931/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['nichts', 'ist', ',', 'wie', 'es', 'scheint', '.']

**Naming the path of the data files as variables**

In [4]:
DEVELOPMENT_DOCS = 'data/devel.docs' 

DEVELOPMENT_QUERIES = 'data/devel.queries' 

DEVELOPMENT_QREL = 'data/devel.qrel' 

BITEXT_ENG = 'data/bitext.en' 

BITEXT_DE = 'data/bitext.de' 

NEWSTEST_ENG = 'data/newstest.en' 

**Loading the devel.docs file, extracting and tokenizing the terms, and storing them in a python dictionary with the document ids as keys.**

In [5]:
import nltk
import re

stopwords = set(nltk.corpus.stopwords.words('english')) 
stemmer = nltk.stem.PorterStemmer() 

def extract_and_tokenize_terms(doc):
    terms = []
    for token in tokenize(doc):
        if token not in stopwords: 
            if not re.search(r'\d',token) and not re.search(r'[^A-Za-z-]',token): #Removing numbers and punctuations 
                terms.append(stemmer.stem(token.lower()))
    return terms

documents = {}

In [6]:
f = open(DEVELOPMENT_DOCS)

for line in f:
    print(line)
    doc = line.split("\t")
    print(doc)
    print(doc[1])
    print(doc[0])
    break
f.close()

12	anarchism anarchism is a political philosophy that advocates stateless societies based on non hierarchical free associations anarchism holds the state to be undesirable unnecessary or harmful the following sources cite anarchism as a political philosophy slevin carl anarchism the concise oxford dictionary of politics ed iain mclean and alistair mcmillan oxford university press 2003 while anti statism is central some argue that anarchism entails opposing authority or hierarchical organization in the conduct of human relations including but not limited to the state system as a subtle and anti dogmatic philosophy anarchism draws on many currents of thought and strategy anarchism does not offer a fixed body of doctrine from a single particular world view instead fluxing and flowing as a philosophy there are many types and traditions of anarchism not all of which are mutually exclusive anarchist schools of thought can differ fundamentally supporting anything from extreme individualism to

In [7]:
f = open(DEVELOPMENT_DOCS, encoding='utf-8')

for line in f:
    doc = line.split("\t")
    terms = extract_and_tokenize_terms(doc[1])
    documents[doc[0]] = terms
f.close()

KeyboardInterrupt: 

In [None]:
documents['308'][:65] 

**Building an inverted index for the documents**

In [24]:
from collections import defaultdict
    
inverted_index = defaultdict(set)

for docid, terms in documents.items():
    for term in terms:
        inverted_index[term].add(docid) 

In [None]:
sorted(inverted_index['state'])

**Building a TF-IDF representation using BM25**

In [26]:
NO_DOCS = len(documents) 

AVG_LEN_DOC = sum([len(doc) for doc in documents.values()])/len(documents) 

def tf_idf_score(k1,b,term,docid):  
    
    ft = len(inverted_index[term]) 
    term = stemmer.stem(term.lower())
    fdt =  documents[docid].count(term)
    
    idf_comp = math.log((NO_DOCS - ft + 0.5)/(ft+0.5))
    
    tf_comp = ((k1 + 1)*fdt)/(k1*((1-b) + b*(len(documents[docid])/AVG_LEN_DOC))+fdt)
    
    return idf_comp * tf_comp

def create_tf_idf(k1,b):
    tf_idf = defaultdict(dict)
    for term in set(inverted_index.keys()):
        for docid in inverted_index[term]:
            tf_idf[term][docid] = tf_idf_score(k1,b,term,docid)
    return tf_idf

In [27]:
tf_idf = create_tf_idf(1.6,0.875)

In [28]:
def get_qtf_comp(k3,term,fqt):
    return ((k3+1)*fqt[term])/(k3 + fqt[term])


#Function to retrieve documents || Returns a set of documents and their relevance scores. 
def retr_docs(query,result_count):
    q_terms = [stemmer.stem(term.lower()) for term in query.split() if term not in stopwords] 
    fqt = {}
    for term in q_terms:
        fqt[term] = fqt.get(term,0) + 1
    
    scores = {}
    
    for word in fqt.keys():
        for document in inverted_index[word]:
            scores[document] = scores.get(document,0) + (tf_idf[word][document]*get_qtf_comp(1.5,word,fqt)) 
    
    return sorted(scores.items(),key = lambda x : x[1] , reverse=True)[:result_count] 

In [None]:
retr_docs("which is the tallest building in the world?",5)

In [None]:
documents['34080'][:30]

In [None]:
f = open(DEVELOPMENT_DOCS)

for line in f:
    doc = line.split("\t")
    if doc[0]=='34080':
      print(doc[1])
      break
f.close()

In [35]:
#Calculating the unigram, bigram and trigram counts. 

f = open(BITEXT_ENG, encoding='utf-8')

train_sentences = []

for line in f:
    train_sentences.append(tokenize(line))

f.close()    

#Function to mark the first occurence of words as unknown, for training.
def check_for_unk_train(word,unigram_counts):
    if word in unigram_counts:
        return word
    else:
        unigram_counts[word] = 0
        return "UNK"

#Function to convert sentences for training the language model.    
def convert_sentence_train(sentence,unigram_counts):
    #<s1> and <s2> are sentinel tokens added to the start and end
    return ["<s1>"] + ["<s2>"] + [check_for_unk_train(token.lower(),unigram_counts) for token in sentence] + ["</s2>"]+ ["</s1>"]

#Function to obtain unigram, bigram and trigram counts.
def get_counts(sentences):
    trigram_counts = defaultdict(lambda: defaultdict(dict))
    bigram_counts = defaultdict(dict)
    unigram_counts = {}
    for sentence in sentences:
        sentence = convert_sentence_train(sentence, unigram_counts)
        for i in range(len(sentence) - 2):
            trigram_counts[sentence[i]][sentence[i+1]][sentence[i+2]] = trigram_counts[sentence[i]][sentence[i+1]].get(sentence[i+2],0) + 1
            bigram_counts[sentence[i]][sentence[i+1]] = bigram_counts[sentence[i]].get(sentence[i+1],0) + 1
            unigram_counts[sentence[i]] = unigram_counts.get(sentence[i],0) + 1
    unigram_counts["</s1>"] = unigram_counts["<s1>"]
    unigram_counts["</s2>"] = unigram_counts["<s2>"]
    bigram_counts["</s2>"]["</s1>"] = bigram_counts["<s1>"]["<s2>"]
    return unigram_counts, bigram_counts, trigram_counts

In [36]:
unigram_counts, bigram_counts,trigram_counts = get_counts(train_sentences)


In [37]:
#Constructing unigram model with 'add-k' smoothing
token_count = sum(unigram_counts.values())

#Function to convert unknown words for testing. 
def check_for_unk_test(word,unigram_counts):
    if word in unigram_counts and unigram_counts[word] > 0:
        return word
    else:
        return "UNK"


def convert_sentence_test(sentence,unigram_counts):
    return ["<s1>"] + ["<s2>"] + [check_for_unk_test(word.lower(),unigram_counts) for word in sentence] + ["</s2>"]  + ["</s1>"]

#Returns the log probability of a unigram, with add-k smoothing.
def get_log_prob_addk(word,unigram_counts,k):
    return math.log((unigram_counts[word] + k)/ \
                    (token_count + k*len(unigram_counts)))

#Returns the log probability of a sentence.
def get_sent_log_prob_addk(sentence, unigram_counts,k):
    sentence = convert_sentence_test(sentence, unigram_counts)
    return sum([get_log_prob_addk(word, unigram_counts,k) for word in sentence])


def calculate_perplexity_uni(sentences,unigram_counts, token_count, k):
    total_log_prob = 0
    test_token_count = 0
    for sentence in sentences:
        test_token_count += len(sentence) + 2 # have to consider the end token
        total_log_prob += get_sent_log_prob_addk(sentence,unigram_counts,k)
    return math.exp(-total_log_prob/test_token_count)


f = open(NEWSTEST_ENG)

test_sents = []
for line in f:
    test_sents.append(tokenize(line))
f.close()

In [38]:
#Calculating the perplexity for different ks
ks = [0.0001,0.01,0.1,1,10]

for k in ks:
    print(str(k) +": " + str(calculate_perplexity_uni(test_sents,unigram_counts,token_count,k)))
    

0.0001: 631.1001268265437
0.01: 631.1751762621166
0.1: 631.9125779449058
1: 642.807839891329
10: 814.3070354176195


In [39]:
#Calculating the N1/N paramaters for Trigrams/Bigrams/Unigrams in Katz-Backoff Smoothing

TRI_ONES = 0 
TRI_TOTAL = 0 

for twod in trigram_counts.values():
    for oned in twod.values():
        for val in oned.values():
            if val==1:
                TRI_ONES+=1 
            TRI_TOTAL += 1 

BI_ONES = 0 
BI_TOTAL = 0 

for oned in bigram_counts.values():
    for val in oned.values():
        if val==1:
            BI_ONES += 1 
        BI_TOTAL += 1 
        
UNI_ONES = list(unigram_counts.values()).count(1)
UNI_TOTAL = len(unigram_counts)

In [40]:
#Constructing trigram model with backoff smoothing

TRI_ALPHA = TRI_ONES/TRI_TOTAL #Alpha parameter for trigram counts
    
BI_ALPHA = BI_ONES/BI_TOTAL #Alpha parameter for bigram counts

UNI_ALPHA = UNI_ONES/UNI_TOTAL
    
def get_log_prob_back(sentence,i,unigram_counts,bigram_counts,trigram_counts,token_count):
    if trigram_counts[sentence[i-2]][sentence[i-1]].get(sentence[i],0) > 0:
        return math.log((1-TRI_ALPHA)*trigram_counts[sentence[i-2]][sentence[i-1]].get(sentence[i])/bigram_counts[sentence[i-2]][sentence[i-1]])
    else:
        if bigram_counts[sentence[i-1]].get(sentence[i],0)>0:
            return math.log(TRI_ALPHA*((1-BI_ALPHA)*bigram_counts[sentence[i-1]][sentence[i]]/unigram_counts[sentence[i-1]]))
        else:
            return math.log(TRI_ALPHA*BI_ALPHA*(1-UNI_ALPHA)*((unigram_counts[sentence[i]]+0.0001)/(token_count+(0.0001)*len(unigram_counts)))) 
        
        
def get_sent_log_prob_back(sentence, unigram_counts, bigram_counts,trigram_counts, token_count):
    sentence = convert_sentence_test(sentence, unigram_counts)
    return sum([get_log_prob_back(sentence,i, unigram_counts,bigram_counts,trigram_counts,token_count) for i in range(2,len(sentence))])


def calculate_perplexity_tri(sentences,unigram_counts,bigram_counts,trigram_counts, token_count):
    total_log_prob = 0
    test_token_count = 0
    for sentence in sentences:
        test_token_count += len(sentence) + 2 # have to consider the end token
        total_log_prob += get_sent_log_prob_back(sentence,unigram_counts,bigram_counts,trigram_counts,token_count)
    return math.exp(-total_log_prob/test_token_count)

In [41]:
#Calculating the perplexity 
calculate_perplexity_tri(test_sents,unigram_counts,bigram_counts,trigram_counts,token_count)

463.4051799314212

In [44]:
#Creating lists of English and German sentences from bitext.

from nltk.translate import IBMModel1
from nltk.translate import AlignedSent, Alignment

eng_sents = []
de_sents = []

f = open(BITEXT_ENG, encoding='utf-8')
for line in f:
    terms = tokenize(line)
    eng_sents.append(terms)
f.close()

f = open(BITEXT_DE, encoding = 'utf-8')
for line in f:
    terms = tokenize(line)
    de_sents.append(terms)
f.close()

In [45]:
#Zipping together the bitexts for easier access
paral_sents = list(zip(eng_sents,de_sents))

In [None]:
print(paral_sents[:20])

In [None]:
print(eng_sents[:20])

In [46]:
#Building English to German translation table for words (Forward alignment)
eng_de_bt = [AlignedSent(E,G) for E,G in paral_sents]
eng_de_m = IBMModel1(eng_de_bt, 5)

In [47]:
#Building German to English translation table for words (Backward alignment)
de_eng_bt = [AlignedSent(G,E) for E,G in paral_sents]
de_eng_m = IBMModel1(de_eng_bt, 5)

In [68]:
#Script below to combine alignments using set intersections
combined_align = []

for i in range(len(eng_de_bt)):

    forward = {x for x in eng_de_bt[i].alignment}
    back_reversed = {x[::-1] for x in de_eng_bt[i].alignment}
    
    combined_align.append(forward.intersection(back_reversed))

In [69]:
#Creating German to English dictionary with occurence count of word pairs
de_eng_count = defaultdict(dict)

for i in range(len(de_eng_bt)):
    for item in combined_align[i]:
        de_eng_count[de_eng_bt[i].words[item[1]]][de_eng_bt[i].mots[item[0]]] =  de_eng_count[de_eng_bt[i].words[item[1]]].get(de_eng_bt[i].mots[item[0]],0) + 1

#Creating a English to German dict with occ count of word pais
eng_de_count = defaultdict(dict)

for i in range(len(eng_de_bt)):
    for item in combined_align[i]:
        eng_de_count[eng_de_bt[i].words[item[0]]][eng_de_bt[i].mots[item[1]]] =  eng_de_count[eng_de_bt[i].words[item[0]]].get(eng_de_bt[i].mots[item[1]],0) + 1

In [57]:
#Creating German to English table with word translation probabilities          
de_eng_prob = defaultdict(dict)

for de in de_eng_count.keys():
    for eng in de_eng_count[de].keys():
        de_eng_prob[de][eng] = de_eng_count[de][eng]/sum(de_eng_count[de].values())

#Creating English to German dict with word translation probabilities 
eng_de_prob = defaultdict(dict)

for eng in eng_de_count.keys():
    for de in eng_de_count[eng].keys():
        eng_de_prob[eng][de] = eng_de_count[eng][de]/sum(eng_de_count[eng].values())

In [58]:
i=9
for de in de_eng_count.items():
  print(de)
  if i<1:
    break
  i=i-1
  

('10.000', {'10,000': 1, 'waited': 1})
('gold', {'gold': 5, 'ites': 1})
('?', {'?': 388})
('dollar', {'$': 4, 'dollar': 7})
('zu', {'to': 668, ',': 2, 'it': 2})
('san', {'san': 3, 'border': 1})
('.', {'.': 15769})
('den', {'the': 186, 's': 9, 'to': 29, 'in': 2})
('es', {'it': 281, 'is': 66, 'not': 25, 'to': 3, 'there': 2})
('nie', {'never': 22})


In [59]:
i=9
for de in eng_de_count.items():
  print(de)
  if i<1:
    break
  i=i-1
  

('10,000', {'10.000': 1, 'lebhafte': 1, 'festzustellen': 1})
('gold', {'gold': 5})
('?', {'?': 388, 'kim': 1, 'wie': 1})
('$', {'dollar': 4})
('to', {'zu': 668, 'von': 9, ',': 169, 'den': 29, 'für': 6, 'auf': 2, 'es': 3, 'werden': 4, 'sich': 4, 'um': 1, 'sie': 2})
('san', {'san': 3})
('.', {'.': 15769, 'unaufhörlich': 1, 'zurückgeben': 1, 'tripolis': 1})
('the', {'den': 186, 'die': 4601, 'der': 1236, 'das': 91, 'des': 14, 'dem': 10, 'im': 9})
('it', {'es': 281, 'sie': 59, 'hat': 1, 'ist': 9, 'nicht': 3, 'er': 1, 'dasselbe': 1, 'zu': 2, 'mehr': 1, 'wird': 1, 'mithalten': 1, 'zugewanderte': 1})
('never', {'nie': 22, 'gesund': 1})


In [60]:
#Examples of translating individual words from German to English
print(de_eng_prob['frage'])

print(de_eng_prob['nie'])

print(de_eng_prob['haus'])

{'question': 1.0}
{'never': 1.0}
{'house': 0.625, 'charity': 0.125, 'hospitalized': 0.125, 'offset': 0.125}


In [61]:
#Building noisy channel translation model
def de_eng_noisy(german):
    noisy={}
    for eng in de_eng_prob[german].keys():
        noisy[eng] = eng_de_prob[eng][german]+ get_log_prob_addk(eng,unigram_counts,0.0001)
    return noisy

In [62]:
#Test block to check alignments
print(de_eng_noisy('nie'))
print(de_eng_noisy('haus'))
print(de_eng_noisy('das'))
print(de_eng_noisy('entschuldigung'))

{'never': -7.034609059797315}
{'house': -8.153224281882322, 'charity': -11.032065847914977, 'hospitalized': -11.319739587276473, 'offset': -11.225188029412386}
{'this': -4.991400947456017, 'that': -4.857303935975769, 'is': -4.278625574868814, 'the': -3.0973206970208214, 'for': -5.074260930518287, 'of': -3.88129895540149}
{'pledging': -21.128725580698557}


In [63]:
eng_de_prob['father']

{}

In [64]:

#Function for direct translation
def de_eng_direct(query):
    query_english = [] 
    query_tokens = tokenize(query)
    
    for token in query_tokens:
        try:
            query_english.append(max(de_eng_prob[token], key=de_eng_prob[token].get))
        except:
            query_english.append(token) 
    return " ".join(query_english)

#Function for noisy channel translation
def de_eng_noisy_translate(query):  
    query_english = [] 
    query_tokens = tokenize(query)
    
    for token in query_tokens:
        try:
            query_english.append(max(de_eng_noisy(token), key=de_eng_noisy(token).get))
        except:
            query_english.append(token) 
    return " ".join(query_english)
            
f = open(DEVELOPMENT_QUERIES, encoding='utf-8')

lno = 0
plno = 0

#Also building a dictionary of query ids and query content 
german_qs = {}

test_query_trans_sents = [] 

for line in f:
    lno+=1
    query_id = line.split('\t')[0]
    query_german = line.split('\t')[1]  
    
    german_qs[query_id] = query_german.strip()
    
    translation = str(de_eng_noisy_translate(query_german))
 
    if plno<5:
        print(query_id + "\n" + "German: " + str(query_german) + "\n" + "English: " + translation +"\n\n")
        plno+=1
    test_query_trans_sents.append(translation)
    if lno==100:
        break

f.close()

82
German: der ( von engl . action : tat , handlung , bewegung ) ist ein filmgenre des unterhaltungskinos , in welchem der fortgang der äußeren handlung von zumeist spektakulär inszenierten kampf - und gewaltszenen vorangetrieben und illustriert wird .

English: the ( , leninism . action : rattling , side , movement ) is a filmgenre the unterhaltungskinos , in paulson the fortgang the trumpet side , zumeist spektakulär inszenierten fight - and gewaltszenen annan and illustriert is .


116
German: die ( einheitenzeichen : u für unified atomic mass unit , veraltet amu für atomic mass unit ) ist eine maßeinheit der masse .

English: the ( einheitenzeichen : u for unified atomic manipulation unit , regime amu for atomic manipulation unit ) is a befuddled the masse .


240
German: der von lateinisch actualis , " wirklich " , auch aktualitätsprinzip , uniformitäts - oder gleichförmigkeitsprinzip , englisch uniformitarianism , ist die grundlegende wissenschaftliche methode in der .

English: 

In [65]:
translation = str(de_eng_noisy_translate("welches ist das höchste gebäude der welt?"))
print(translation)
retr_docs(translation,5)

welches is the notions edifice the world ?


[('63341', 10.4405268930542),
 ('49702', 10.319217561251946),
 ('245990', 10.094533935342444),
 ('91031', 10.084863853266132),
 ('6042', 8.852462540069059)]

In [88]:
f = open(DEVELOPMENT_DOCS, encoding='utf-8')

for line in f:
    doc = line.split("\t")
    if doc[0]=='3788':
      print(doc[1])
      break
f.close()

body with regard to living things a body is the physical body of an individual body often is used in connection with appearance health issues and death the study of the workings of the body is physiology human body the human body mostly consists of a head neck torso two arms and two legs as well as numerous internal organ groups such as respiratory circulatory and a central nervous system variations the dead body of a human is referred to as a cadaver or corpse the dead bodies of vertebrate animals insects and humans are sometimes called carcasses the study of the structure of the body is called anatomy a carcase is the body of a slaughtered animal after the removal of offal that is to be used as meat antonymin the views emerging from the mind body dichotomy the body is considered in behavior and therefore considered as little valued the mind body problem by robert m. young and trivial in comparison to mind spirit or soul materialist philosophers of mind maintain that the mind is not s

In [None]:
translation = str(de_eng_direct("welches ist das höchste gebäude der welt?"))
print(translation)
retr_docs(translation,5)

In [67]:
import joblib

In [70]:
joblib.dump(documents,'Documents')

['Documents']

In [71]:
joblib.dump(tf_idf,'TF-IDF')

['TF-IDF']

In [72]:
joblib.dump(eng_sents, 'EngSents')

['EngSents']

In [73]:
joblib.dump(de_sents,'DeSents')

['DeSents']

In [75]:
joblib.dump(eng_de_m, 'E2D-Model')

PicklingError: Can't pickle <function IBMModel.reset_probabilities.<locals>.<lambda> at 0x0000023AE1AEA950>: it's not found as nltk.translate.ibm_model.IBMModel.reset_probabilities.<locals>.<lambda>

In [76]:
joblib.dump(de_eng_m, 'D2E-Model')

PicklingError: Can't pickle <function IBMModel.reset_probabilities.<locals>.<lambda> at 0x0000023AE5A20940>: it's not found as nltk.translate.ibm_model.IBMModel.reset_probabilities.<locals>.<lambda>

In [77]:
joblib.dump(unigram_counts, 'Unigram-Counts')

['Unigram-Counts']