## Part 1 (Morphology) - Obtain words in present tense and in past tense

In [None]:

import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize


# Load the English corpus.
with open('en.txt', 'r', encoding='utf8') as corpus_file:
    content_eng = corpus_file.read()

# Punctuation clean-up applied before tokenising; the substitutions run in this order.
substitutions = [
    (r'—', ' '),                  # '—' connects sentences in this file, so split on it
    (r'[`´]', "'"),               # replace uncommon apostrophe-like characters
    (r'[“”]', '"'),               # curly double quotes -> straight double quotes
    (r"(?<![a-zA-Z])\'", "' "),   # apostrophe not preceded by a letter gets a trailing space
]
filter_content = content_eng
for pattern, replacement in substitutions:
    filter_content = re.sub(pattern, replacement, filter_content)

tokens_eng = word_tokenize(filter_content)
tagged_eng = nltk.pos_tag(tokens_eng)

# Collect the distinct tokens tagged VBP/VBZ (treated as present tense) and VBD (past tense).
pres_eng = set()
past_eng = set()
for token, pos in tagged_eng:
    if pos == 'VBD':
        past_eng.add(token)
    elif pos in ('VBP', 'VBZ'):
        pres_eng.add(token)

print(f'Words in present tense:\n{pres_eng}')
print(f'Word in past tense:\n{past_eng}')

Words in present tense:
{'Brave', 'comforts', 'myself', 'informs', 'heareth', 'wastes', 'wounds', 'drain', 'hares', 'overlook', 'judge', 'heedful', 'triumph', 'disregard', 'patrician', 'ante-chamber', 'persuades', 'sounds', 'hangers-on', 'aggravates', 'disentangle', 'breakfast', 'Want', 'cried', 'lent', 'listens', 'care', 'aboard', 'Widows', 'edge', 'warier', 'bore', 'confound', 'Withers', 'repel', 'row', 'chastise', 'clothes', 'worthy', 'meet', 'circulates', 'presume', 'form', 'ask', 'strangle', 'fears', 'nigher', 'encompasses', 'yard', 'ejaculated.', 'haunt', 'knots', 'mine', 'pilgrimages', 'deserves', 'splendour', 'intend', 'snib', 'oppresses', 'confuse', 'agonistes', 'includes', 'Hope', 'Wishes', 'shot', 'cover', 'complains', 'registers', 'Catharines', 'spelt', 'slew', 'shrieks', 'situate', 'natures', 'tips', 'names', 'tell', 'responsibilities', 'dear', 'treasures', 'argue', 'cares', 'dries', 'glad', 'lead', 'races', 'wherewith', 'announces', 'overtake', 'paused.', 'drops', 'worser

In [None]:
import stanza
# Download the Spanish models for Stanza.
stanza.download('es')
# Spanish pipeline running the tokenize, mwt and pos processors.
spa_lmodel = stanza.Pipeline(lang='es',processors='tokenize,mwt,pos')

In [None]:
# Obtain words in present tense and in past tense from a Spanish corpus
with open('sp.txt', 'r', encoding='utf8') as f:
    content_spa = f.read()

# sp.txt is too large to tag in a reasonable time, so only the first 500,000 characters are used.
sub_spa = content_spa[:500000]
tagged_spa = spa_lmodel(sub_spa)

# Select the verbs once, then split them by the tense recorded in their morphological features.
verbs_spa = [word for sent in tagged_spa.sentences for word in sent.words if word.upos == 'VERB']
# `feats` may be None when a word has no morphological features; fall back to '' so the
# substring test cannot raise TypeError.
pres_spa = {word.text for word in verbs_spa if 'Pres' in (word.feats or '')}
past_spa = {word.text for word in verbs_spa if 'Past' in (word.feats or '')}
print(f'Words in present tense:\n{pres_spa}')
print(f'Word in past tense:\n{past_spa}')

In [None]:
# Download the Swedish models for Stanza.
stanza.download('sv')
# Swedish pipeline running the tokenize, lemma and pos processors.
swe_lmodel = stanza.Pipeline(lang='sv',processors='tokenize,lemma,pos')

In [None]:
# Obtain words in present tense and in past tense from a Swedish corpus
with open('swe.txt', 'r', encoding='utf8') as f:
    content_swe = f.read()

# swe.txt is too large to tag in a reasonable time, so only the first 500,000 characters are used.
sub_swe = content_swe[:500000]
tagged_swe = swe_lmodel(sub_swe)

# Select the verbs once, then split them by the tense recorded in their morphological features.
verbs_swe = [word for sent in tagged_swe.sentences for word in sent.words if word.upos == 'VERB']
# `feats` may be None when a word has no morphological features; fall back to '' so the
# substring test cannot raise TypeError.
pres_swe = {word.text for word in verbs_swe if 'Pres' in (word.feats or '')}
past_swe = {word.text for word in verbs_swe if 'Past' in (word.feats or '')}
print(f'Words in present tense:\n{pres_swe}')
print(f'Word in past tense:\n{past_swe}')

## Part 2 (Language Models) - Train a language recognizer using character bigrams (pairs of adjacent characters)

In [None]:
from math import prod
def model_gen(content, probdist_factory=None, *factory_args, **factory_kwargs):
    """Train a character-bigram model from raw text.

    The text is lower-cased, every pair of adjacent characters is counted, and the
    counts are turned into one probability distribution per leading character.

    Parameters
    ----------
    content : str
        Training text.
    probdist_factory : callable, optional
        Estimator used to turn each frequency distribution into a probability
        distribution. Defaults to ``nltk.MLEProbDist`` (unsmoothed relative
        frequencies), which gives probability 0 to bigrams never seen in training.
        Pass a smoothed estimator such as ``nltk.LaplaceProbDist`` to avoid that.
    *factory_args, **factory_kwargs
        Extra arguments forwarded to ``probdist_factory`` after the frequency
        distribution (for example the number of bins for a smoothed estimator).

    Returns
    -------
    nltk.ConditionalProbDist
        Maps a character to the distribution over the characters that follow it.
    """
    # Resolve the default lazily so the signature does not touch nltk at definition time.
    if probdist_factory is None:
        probdist_factory = nltk.MLEProbDist
    bigrams = nltk.bigrams(content.lower())
    cfd = nltk.ConditionalFreqDist(bigrams)
    return nltk.ConditionalProbDist(cfd, probdist_factory, *factory_args, **factory_kwargs)
# Train one character-bigram model per language on the full corpus text.
cpd_eng, cpd_spa, cpd_swe = (
    model_gen(corpus) for corpus in (content_eng, content_spa, content_swe)
)

In [12]:
def _bigram_probs(sentence, cpd):
    """Yield P(second | first) under `cpd` for each adjacent character pair of the lower-cased sentence."""
    text = sentence.lower()
    for first, second in zip(text, text[1:]):
        yield cpd[first].prob(second)


def _log_probability_lan(sentence, cpd):
    """Return the natural-log probability of `sentence` under `cpd`, or -inf if any bigram has probability 0."""
    import math  # local import keeps this helper independent of other cells' names

    total = 0.0
    for p in _bigram_probs(sentence, cpd):
        if p <= 0.0:
            return float('-inf')
        total += math.log(p)
    return total


def probability_lan(sentence, cpd):
    """Return the probability of `sentence` under the character-bigram model `cpd`.

    The result is the product of the conditional probabilities of all adjacent
    character pairs (case-insensitive). A sentence with fewer than two characters
    has no bigrams and yields 1. For long sentences the product can underflow to
    0.0; use `predict_lan` for classification, which works in log space.
    """
    return prod(_bigram_probs(sentence, cpd))


def predict_lan(sentence):
    """Return 'English', 'Spanish' or 'Swedish', whichever model scores `sentence` highest.

    Scores are compared as log-probabilities rather than raw products: the raw
    product underflows to 0.0 for long sentences, which made every language tie
    and the first one ('English') win regardless of the text. If the scores are
    exactly equal (for example every model assigns probability 0), the first
    language in the order English, Spanish, Swedish is returned.
    """
    scores = {
        'English': _log_probability_lan(sentence, cpd_eng),
        'Spanish': _log_probability_lan(sentence, cpd_spa),
        'Swedish': _log_probability_lan(sentence, cpd_swe),
    }
    return max(scores, key=scores.get)

sample = "If I seem to speak exultantly, it is only because my intellect enjoys the clear perception of a fact."

# Report the sentence probability under each language model, then the predicted language.
for lang_name, lang_cpd in (('English', cpd_eng), ('Spanish', cpd_spa), ('Swedish', cpd_swe)):
    print(f'Probability of being {lang_name}: {probability_lan(sample, lang_cpd)}')
print(f'The sentence is predicted as {predict_lan(sample)}')


Probability of being English: 8.198576282955312e-118
Probability of being Spanish: 2.7844844654332315e-159
Probability of being Swedish: 5.16785490688622e-151
The sentence is predicted as English


In [None]:
# Examples: one sentence per language
sent1 = 'When the servant came to clear the table, he strolled slowly away, humming a tune.'
sent2 = 'Parecía como si se hubiera frotado la cara con grana de coscoja.'
sent3 = 'Det var patron Kristersons. Hvad de voro vackra!'

language_models = (('English', cpd_eng), ('Spanish', cpd_spa), ('Swedish', cpd_swe))
for example in (sent1, sent2, sent3):
    # One "Language: probability" line per model, in the fixed order above.
    per_language = '\n'.join(
        f'{lang_name}: {probability_lan(example, lang_cpd)}' for lang_name, lang_cpd in language_models
    )
    print(f'The probaility of "{example}" being each language:\n{per_language}')
    print(f'Hence, this sentence is predicted as {predict_lan(example)}.\n')

# Limitation: the model does not handle unknown bigrams well; a sentence containing a
# bigram never seen in a language's training text gets probability 0.0 for that language.

The probaility of "When the servant came to clear the table, he strolled slowly away, humming a tune." being each language:
English: 2.656259539002828e-85
Spanish: 1.626768685798974e-113
Swedish: 4.480884354010982e-109
Hence, this sentence is predicted as English.

The probaility of "Parecía como si se hubiera frotado la cara con grana de coscoja." being each language:
English: 0.0
Spanish: 4.352887600804348e-62
Swedish: 0.0
Hence, this sentence is predicted as Spanish.

The probaility of "Det var patron Kristersons. Hvad de voro vackra!" being each language:
English: 1.287056093138343e-67
Spanish: 2.3824781792748216e-63
Swedish: 7.936278448034972e-51
Hence, this sentence is predicted as Swedish.



## Part 3 (Minimum Edit Distance)

In [None]:
# Implementation of Minimum Edit Distance.
ops=['i','d','s']

def MED(src_str, trg_str, ins_cost=1, del_cost=1, sub_cost=2):
    """Return the minimum edit distance needed to turn `src_str` into `trg_str`.

    Parameters
    ----------
    src_str, trg_str : str
        Source and target strings.
    ins_cost, del_cost, sub_cost : number, optional
        Cost of inserting a target character, deleting a source character and
        substituting one character for a different one. The defaults (1, 1, 2)
        reproduce the original fixed costs.

    Returns
    -------
    float
        The minimum total cost of the edits.
    """
    # Prefix both strings with a dummy symbol so that index 0 stands for the empty prefix.
    src_str = "#" + src_str
    trg_str = "#" + trg_str

    m = len(src_str)
    n = len(trg_str)

    # distance_matrix[i, j] = cost of turning the first j source chars into the first i target chars.
    distance_matrix = np.zeros((n, m))

    # Column 0: empty source prefix -> i insertions. Row 0: empty target prefix -> j deletions.
    # Scale by the cost instead of using it as the arange step, which is only correct for cost 1.
    distance_matrix[:, 0] = ins_cost * np.arange(n)
    distance_matrix[0, :] = del_cost * np.arange(m)

    for i in range(1, n):        # each target position (row)
        for j in range(1, m):    # each source position (column)
            insert = distance_matrix[i-1, j] + ins_cost
            delete = distance_matrix[i, j-1] + del_cost
            # Matching characters carry the diagonal value over at no cost.
            substi = distance_matrix[i-1, j-1] + (0 if src_str[j] == trg_str[i] else sub_cost)
            distance_matrix[i, j] = min(insert, delete, substi)

    # The bottom-right cell covers both complete strings.
    return float(distance_matrix[-1, -1])

MED("PRNAP","PAP")

2.0

In [None]:
#Return one valid alignment using MED
ops=['i','d','s']
def MED_alignment(src_str, trg_str, ins_cost=1, del_cost=1, sub_cost=2):
    """Return one minimum-cost alignment that turns `src_str` into `trg_str`.

    Parameters
    ----------
    src_str, trg_str : str
        Source and target strings.
    ins_cost, del_cost, sub_cost : number, optional
        Cost of an insertion, a deletion and a substitution of differing
        characters. The defaults (1, 1, 2) reproduce the original fixed costs.

    Returns
    -------
    list of str
        Edit operations in left-to-right order: 'i' (insert a target character),
        'd' (delete a source character), 's' (substitute) and '-' (characters
        match, nothing to do). When several operations tie, insertion is
        preferred over deletion, and deletion over substitution/match.
    """
    op_codes = ('i', 'd', 's')

    # Prefix both strings with a dummy symbol so that index 0 stands for the empty prefix.
    src_str = "#" + src_str
    trg_str = "#" + trg_str

    m = len(src_str)
    n = len(trg_str)

    # distance_matrix[i, j] = cost of turning the first j source chars into the first i target chars.
    distance_matrix = np.zeros((n, m))
    # Scale by the cost instead of using it as the arange step, which is only correct for cost 1.
    distance_matrix[:, 0] = ins_cost * np.arange(n)   # empty source prefix: insertions only
    distance_matrix[0, :] = del_cost * np.arange(m)   # empty target prefix: deletions only

    # Back-pointers are kept as plain Python strings so the returned list holds str, not numpy scalars.
    bck = [[''] * m for _ in range(n)]
    for i in range(1, n):
        bck[i][0] = 'i'
    for j in range(1, m):
        bck[0][j] = 'd'

    for i in range(1, n):        # each target position (row)
        for j in range(1, m):    # each source position (column)
            is_match = src_str[j] == trg_str[i]
            insert = distance_matrix[i-1, j] + ins_cost
            delete = distance_matrix[i, j-1] + del_cost
            substi = distance_matrix[i-1, j-1] + (0 if is_match else sub_cost)

            candidates = [insert, delete, substi]
            best = min(candidates)
            which_op = candidates.index(best)   # first minimum wins
            distance_matrix[i, j] = best

            op = op_codes[which_op]
            # A diagonal move over equal characters is a match, not a substitution.
            bck[i][j] = '-' if (op == 's' and is_match) else op

    # Walk the back-pointers from the bottom-right cell to the origin.
    alignment = []
    i, j = n - 1, m - 1
    while i > 0 or j > 0:
        op = bck[i][j]
        alignment.append(op)
        if op == 'i':
            i -= 1
        elif op == 'd':
            j -= 1
        else:
            i -= 1
            j -= 1
    alignment.reverse()

    return alignment

# Distance and one alignment for a single source/target pair.
source_word, target_word = "intention", "execution"
distance = MED(source_word, target_word)
alignment = MED_alignment(source_word, target_word)
print(f'Minimum Edit Distance: {distance}')
print(f'alignment: {alignment}')

Minimum Edit Distance: 8.0
alignment: ['d', 'd', 'd', '-', 'd', 'i', 'i', 'i', 'i', '-', '-', '-', '-']


In [None]:
# Use MED_alignment to see the difference between present tense and past tense
word_pairs = [['forget', 'forgot'], ['eat', 'ate'],['looks', 'looked'], ['changes', 'changed'], 
              ['como', 'comí'], ['viene', 'vino'], ['dicen', 'dijeron'], ['conducen', 'condujeron'],
              ['läser', 'läste'], ['säger', 'sade'], ['talar', 'talade'], ['stannar', 'stannade']]

print(f"Present -> Past{' '*8}MED{' '*3}Alignment")

# Iterate over the pairs themselves so the table stays correct if pairs are added or removed.
for present, past in word_pairs:
    distances = MED(present, past)
    alignment = MED_alignment(present, past)
    print(f"{present:<11}{past:<12}{distances:<6}{alignment}")


Present -> Past        MED   Alignment
forget     forgot      2.0   ['-', '-', '-', '-', 'd', 'i', '-']
eat        ate         2.0   ['d', '-', '-', 'i']
looks      looked      3.0   ['-', '-', '-', '-', 'd', 'i', 'i']
changes    changed     2.0   ['-', '-', '-', '-', '-', '-', 'd', 'i']
como       comí        2.0   ['-', '-', '-', 'd', 'i']
viene      vino        3.0   ['-', '-', 'd', '-', 'd', 'i']
dicen      dijeron     4.0   ['-', '-', 'd', 'i', '-', 'i', 'i', '-']
conducen   condujeron  4.0   ['-', '-', '-', '-', '-', 'd', 'i', '-', 'i', 'i', '-']
läser      läste       2.0   ['-', '-', '-', 'i', '-', 'd']
säger      sade        5.0   ['-', 'd', 'd', 'i', 'i', '-', 'd']
talar      talade      3.0   ['-', '-', '-', '-', 'd', 'i', 'i']
stannar    stannade    3.0   ['-', '-', '-', '-', '-', '-', 'd', 'i', 'i']


## Part 4 (Sentiment Analysis) - The Naive Bayes model


In [18]:
import os
import numpy as np
import random

In [19]:
def filetowordlist(path, sfx):
    """Read every file in `path` whose name ends with `sfx` into a list of word lists.

    Files are visited in sorted name order and decoded as ISO-8859-1. Each file
    contributes one list containing its whitespace-separated tokens, in order.

    Parameters
    ----------
    path : str
        Directory to scan; a trailing path separator is optional.
    sfx : str
        File-name suffix to select, e.g. ".txt".

    Returns
    -------
    list of list of str
        One token list per matching file.
    """
    words = []
    for item in sorted(os.listdir(path)):
        # Match on the end of the name so that e.g. "review.txt.bak" is not picked up for ".txt".
        if not item.endswith(sfx):
            continue
        # os.path.join works whether or not `path` ends with a separator;
        # the context manager closes the file even if reading fails.
        with open(os.path.join(path, item), encoding="iso8859-1") as f:
            words.append([token for line in f for token in line.split()])
    return words

def log(number):
    """Return the natural logarithm of `number`, computed with np.log."""
    natural_log = np.log(number)
    return natural_log

In [20]:
# Root of the downloaded corpus (the folder that contains "pos/" and "neg/").
# Change the default below, or set the REVIEW_CORPUS_DIR environment variable,
# to point at the directory where you downloaded the corpus.
CORPUS_DIR = os.environ.get(
    "REVIEW_CORPUS_DIR",
    "/Users/charlottexx/Downloads/Assigment 2/mix20_rand700_tokens_0211/tokens",
)
# The trailing "" makes os.path.join end each path with a separator.
posreviews_all = filetowordlist(os.path.join(CORPUS_DIR, "pos", ""), ".txt")
negreviews_all = filetowordlist(os.path.join(CORPUS_DIR, "neg", ""), ".txt")

In [None]:
# The first 550 reviews of each class are training data; the remainder is test data.
train_size = 550
posreviews_train, posreviews_test = posreviews_all[:train_size], posreviews_all[train_size:]
negreviews_train, negreviews_test = negreviews_all[:train_size], negreviews_all[train_size:]

In [None]:
# Number of reviews per split, in the order: positive train, negative train, positive test, negative test
tuple(len(split) for split in (posreviews_train, negreviews_train, posreviews_test, negreviews_test))

(550, 550, 144, 142)

In [None]:
# Flatten the training reviews of each class into one long list of words:
# poswords_train holds the words of all positive reviews concatenated, negwords_train the negative ones.
poswords_train = []
for review in posreviews_train:
    poswords_train.extend(review)

negwords_train = []
for review in negreviews_train:
    negwords_train.extend(review)

In [26]:
# Vocabularies (distinct words) of the positive reviews, the negative reviews, and both together
pos_vocab_train, neg_vocab_train = set(poswords_train), set(negwords_train)
vocab_train = pos_vocab_train | neg_vocab_train

# Number of types (vocabulary size)
pos_vocab_size_train, neg_vocab_size_train, vocab_size_train = map(
    len, (pos_vocab_train, neg_vocab_train, vocab_train)
)

# Number of words (tokens)
noposwords_train, nonegwords_train = len(poswords_train), len(negwords_train)

# Number of reviews
noposreviews_train, nonegreviews_train = len(posreviews_train), len(negreviews_train)

In [30]:
# Positive training set: vocabulary size, number of tokens, number of reviews
pos_vocab_size_train,noposwords_train,noposreviews_train

(27813, 445289, 550)

In [31]:
# Compare the log() helper (np.log) with np.log10 on the same input
log(5),np.log10(5)

(1.6094379124341003, 0.6989700043360189)

In [None]:
# Calculate the prior log-probability of each class from the number of training reviews
noreviews_train = noposreviews_train + nonegreviews_train
prior_probabiolity_pos_train = log(noposreviews_train / noreviews_train)
prior_probabiolity_neg_train = log(nonegreviews_train / noreviews_train)

for message, log_prior in (
    ("This is the log probability of a review to be positive:", prior_probabiolity_pos_train),
    ("This is the log probability of a review to be negative:", prior_probabiolity_neg_train),
):
    print(message)
    print(log_prior)

This is the log probability of a review to be positive:
-0.6931471805599453
This is the log probability of a review to be negative:
-0.6931471805599453


In [None]:
from nltk import FreqDist

# Word counts per class over the training tokens.
pos_frequencies = FreqDist(poswords_train)
neg_frequencies = FreqDist(negwords_train)

# Show the counts of two example words, separated by a blank line.
for position, word in enumerate(("nice", "bad")):
    if position:
        print()
    print(f"The word {word} appears {pos_frequencies[word]!s} times in the positive reviews")
    print(f"The word {word} appears {neg_frequencies[word]!s} times in the negative reviews")


The word nice appears 118 times in the positive reviews
The word nice appears 92 times in the negative reviews

The word bad appears 223 times in the positive reviews
The word bad appears 509 times in the negative reviews


In [None]:
# Estimate, for every training-vocabulary word, the add-one smoothed log probability of
# it appearing in a positive review and in a negative review.
pos_denominator = noposwords_train + vocab_size_train
neg_denominator = nonegwords_train + vocab_size_train

pos_logprobs = {
    word: log((pos_frequencies.get(word, 0) + 1) / pos_denominator) for word in vocab_train
}
neg_logprobs = {
    word: log((neg_frequencies.get(word, 0) + 1) / neg_denominator) for word in vocab_train
}

# Show the values for two example words, separated by a blank line.
for position, word in enumerate(("nice", "bad")):
    if position:
        print()
    print(f"The log probability of the word {word} to appear in the positive reviews is: {pos_logprobs[word]!s}")
    print(f"The log probability of the word {word} to appear in the negative reviews is: {neg_logprobs[word]!s}")


The log probability of the word nice to appear in the positive reviews is: -8.310737353530826
The log probability of the word nice to appear in the negative reviews is: -8.438475770037291

The log probability of the word bad to appear in the positive reviews is: -7.678214794787315
The log probability of the word bad to appear in the negative reviews is: -6.736664537472175


In [None]:
# Out-of-vocabulary words: a word never seen in training has count 0, so its smoothed
# log probability per class is log(1 / (class token count + vocabulary size)).
pos_oov_word_logprob, neg_oov_word_logprob = (
    log(1 / (class_tokens + vocab_size_train))
    for class_tokens in (noposwords_train, nonegwords_train)
)

for message, oov_logprob in (
    ("The log probability of an out of vocabulary word in the positive reviews is:", pos_oov_word_logprob),
    ("The log probability of an out of vocabulary word in the negative reviews is:", neg_oov_word_logprob),
):
    print(message)
    print(oov_logprob)

The log probability of an out of vocabulary word in the positive reviews is:
-13.089860846642354
The log probability of an out of vocabulary word in the negative reviews is:
-12.971075263190547


In [None]:
# Classify a tokenised review: True means positive, False means negative
def positive_or_not(s):
    """Return whether the review `s` (an iterable of words) scores higher as positive than as negative.

    Each class score starts at the class prior and adds the log probability of
    every word, using the out-of-vocabulary value for words missing from the
    training vocabulary. An exact tie is broken at random.
    """
    pos_score = prior_probabiolity_pos_train
    neg_score = prior_probabiolity_neg_train

    for token in s:
        pos_score = pos_score + pos_logprobs.get(token, pos_oov_word_logprob)
        neg_score = neg_score + neg_logprobs.get(token, neg_oov_word_logprob)

    if pos_score != neg_score:
        return pos_score > neg_score

    # Equal scores: pick a class at random.
    return np.random.choice([True, False])


In [43]:
review_test = "the movie is nice".split()
positive_or_not(review_test) # expected: True (a positive review)

True

In [44]:
review_test = "the movie is bad".split()
positive_or_not(review_test) # expected: False (a negative review)

False

In [45]:
review_test = "such an awful movie".split()
positive_or_not(review_test) # expected: False (a negative review)

False

In [46]:
review_test = "lovely experience".split()
positive_or_not(review_test) # expected: True (a positive review)

True