# Part 2.1

Reading training and parsing file

In [1]:
def read_file(file_path):
    file = open(file_path)
    parsed_file = []
    for line in file:
        if(line == ''): 
            break
        elif(line == '\n'):
            parsed_file.append((None, None))
            continue
        else: 
            temp = line.replace('\n','').split(' ')
            parsed_file.append(tuple(temp)) #return a tuple of word and its tag
      
    return parsed_file

# print(read_file("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train"))
'''
returns something like this:
[('We', 'O'), ('were', 'O'), ('then', 'O'), ('charged', 'O'), ('for', 'O')......]
'''
        
        

"\nreturns something like this:\n[('We', 'O'), ('were', 'O'), ('then', 'O'), ('charged', 'O'), ('for', 'O')......]\n"

A helper function, smartify() is a function that takes file_path as input and returns training data in a 'smart' format.
It returns a dictionary smart_data which is basically training data but in a smart format.

Smart format is defined as: {tag:{word:frequency}}
example: {'O': {'We': 83, 'were': 125, 'then': 15, 'charged': 2, 'for': 279}

Also, smart_data.keys() (and also smartify().keys()) is list of all the tags 

In [2]:
def smartify(file_path):
    training_data = read_file(file_path)
    smart_data = {} #smart_data is basically training data but in a smart format 
                    #smart format is {tag:{word:frequency}}
                    #example: {'O': {'We': 83, 'were': 125, 'then': 15, 'charged': 2, 'for': 279}
    for word_tag_pair in training_data:
        
        if word_tag_pair==(None,None): #annoying case -.-
            continue 
        
        #if tag not in smart_data yet, add it 
        if not word_tag_pair[1] in smart_data.keys():
            smart_data[word_tag_pair[1]]={}
        
        # if word not in smart_data[tag] yet, add it
        if not word_tag_pair[0] in smart_data[word_tag_pair[1]].keys():
            smart_data[word_tag_pair[1]][word_tag_pair[0]] = 0
        
        #increase frequency for each word-tag pair if seen
        smart_data[word_tag_pair[1]][word_tag_pair[0]] += 1
    return smart_data
    
# print (smartify("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train"))
    
    

A function to modify training set

In [3]:
def modify_training_set(training_data, k):
    #input: smart_data
    count = 0 #not sure if we will need number of words replaced by unk, but lets keep count anyway
    
    training_data = training_data
    for tag, word in training_data.items():
        for specific_word, f in list(word.items()):
            if (f<k):
                count +=f
#                 print(specific_word,f)
                #replace with #unk#
                word.pop(specific_word)
        word["#UNK#"] = count

    return training_data
# modify_training_set(smartify("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train"),3) 

A function to estimate emission parameters



In [4]:
import math
def emission(file_path):
    emissions = {}
    training_data = smartify(file_path)
    training_data = modify_training_set(training_data, k=3) #modify training data 
    
    for tag, word in training_data.items():
        
        count_y = sum(word.values())
#         print (count_y)
        for specific_word, f in word.items():
            temp_tup = (specific_word,tag)
#             print(specific_word, f)
            emissions[temp_tup] = float(f/(count_y))

    return emissions
        

# print(emission("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train"))

A function for reading test file

In [5]:
def read_test_file(file_path):
    file = open(file_path)
    parsed_file = []
    for line in file:
        if(line == ''):
            break
        if(line == '\n'):
            parsed_file.append(None)
            continue
        parsed_file.append(line.replace('\n',''))
    return parsed_file

# print(read_test_file("C:\\Users\\Lenovo\\Google Drive\\Courses\\Term 6\Machine Learning\\Project\\EN\\dev.in"))
    

A function to clean up the train file (replace words not in training set with '#UNK#')

In [6]:
def clean_up_test_data(test_data, emissions_dict):
    
    test_data = test_data
#     print("Original test data:",len(test_data))
    
    training_data = []
    list_of_replaced_words = []
    for tup in emissions_dict.keys():
        training_data.append(tup[0])
#     print(training_data)
    #compare training and test data and replace non-common words with #UNK#
    for word in test_data:
        if word =='START' or word=='STOP':
            continue
        elif word not in training_data:
            list_of_replaced_words.append(word) #we might need this for testing if the function is working
            index = test_data.index(word)
            test_data[index] = "#UNK#"
            
#     print("Number of replaced words:",len(list_of_replaced_words))        
#     print("Cleaned up test data:",len(test_data))

    return test_data
# print (clean_up_test_data( read_test_file("C:\\Users\\Lenovo\\Google Drive\\Courses\\Term 6\Machine Learning\\Project\\EN\\dev.in"),emission("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train")))

Simple sentiment analysis function that computes argmax

In [17]:
def sentiment_anal(train_file_path, test_file_path):
    test_data = read_test_file(test_file_path)
    emissions = emission(train_file_path)
    cleaned_up_test_data = clean_up_test_data(test_data,emissions)
    
#     print(cleaned_up_test_data)
    argmax = []
    for word in cleaned_up_test_data:
#         print(word)
        if word == None:
            argmax.append((None, None))
            continue
        temp = {}
        
        #filter out common words into temp 
        for key in emissions.keys():

            if(key[0] == word):
#                 print(emissions[key])
                temp[key] = emissions[key]
                

        if temp=={}:
            continue
        else:
            result = max(temp, key=temp.get)
        argmax.append(result)
    return argmax


# sentiment_anal("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train" ,"C:\\Users\\Lenovo\\Google Drive\\Courses\\Term 6\Machine Learning\\Project\\EN\\dev.in")
        

Write into dev.pn.out

In [8]:
def write_dev_out(input_data):
    path = "C:\\Users\\Lenovo\\Google Drive\\Courses\\Term 6\\Machine Learning\\Project\EN\\"
    filename = "dev.p2.out"
    with open(path+filename, 'w') as f:
        for tup in input_data:
            f.write(tup[0] + ' ' + tup[1]+'\n')
    f.close()

# write_dev_out(sentiment_anal("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train" ,"C:\\Users\\Lenovo\\Google Drive\\Courses\\Term 6\Machine Learning\\Project\\EN\\dev.in"))

# Part 3

Function to take training data as input

In [9]:
def prepare_input_data_for_transition_paramaters(file_path):
    input_data = (read_file(file_path))
    input_data = [("START",None)]+input_data
    
    #wherever (None, None) is seen, instert a STOP and START
    for tup in input_data:
        if (tup == (None,None)):
            index = input_data.index(tup)
            #add a stop
            input_data[index] = ("STOP",None)
            input_data = input_data[:index+1] + [("START", None)]+ input_data[index+1:]
            
    #take care of the ending of the file
    input_data.pop(-2)
    return input_data 
    
    
# prepare_input_data_for_transition_paramaters("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train")

In [10]:
def estimate_transition_parameters(file_path):
    input_data = prepare_input_data_for_transition_paramaters(file_path)
    
    emission_pairs = {}
    yi_1_count = {}
    
    
    for i in range (len(input_data)-1):
        yi_1 = input_data[i][1]
        yi = input_data[i+1][1]
        
        temp_tup = (yi_1,yi)
        if not temp_tup in emission_pairs.keys():
            emission_pairs[temp_tup] = 1
        else:
            emission_pairs[temp_tup]+=1
        
        if not yi_1 in yi_1_count.keys():
            yi_1_count[yi_1] = 1
        else: 
            yi_1_count[yi_1] += 1
            
            
#     print (emission_pairs)
#     print (yi_1_count)

    transition_params = emission_pairs #this is just for convenience
    
    for yi_1, count in yi_1_count.items():
        
        for tup in emission_pairs.keys():
            if (tup[0]==yi_1):
                transition_params[tup] = float(emission_pairs[tup]/count)
    
#     print(transition_params)
    return(transition_params)
           
    
# estimate_transition_parameters("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train")

VITERBBBEEEEEHHHHHHHHHHHHHHHHHHHHH

In [11]:
def construct_sentences(file_path):
    file = open(file_path)
    sentences = []
    sentence = ['START']
    for line in file:
        if(line == ''):
            break
        if(line == '\n'):
            sentence.append('STOP')
            sentences.append(sentence)
            sentence = ['START']
            continue
        else:
            sentence.append(line.replace('\n',''))
            
    return sentences 

# print(construct_sentences("C:\\Users\\Lenovo\\Google Drive\\Courses\\Term 6\Machine Learning\\Project\\EN\\dev.in"))
    

In [15]:
import math
def viterbi(train_file_path, test_file_path):
    
    tags = ['O','B-positive','B-neutral','B-negative','I-positive','I-negative','I-neutral']
    emission_p = emission(train_file_path) #{('We', 'O'): 0.003423668687868663, ('were', 'O'): 0.005156127541970878, ('then', 'O'): 0.0006187353050365054,}
    transition_p = estimate_transition_parameters(train_file_path) #{(None, 'O'): 0.47022696929238983, ('O', 'O'): 0.8601188020790363, ('O', None): 0.07623133404834585,}
    sentences = construct_sentences(test_file_path)

    for i in range(len(sentences)):
        sentences[i]=clean_up_test_data(sentences[i],emission_p)
    
    for sentence in sentences:
        dp_table = [{},{}]
        for tag in tags:
            if((None,tag) in transition_p.keys()):                
                a_i_j = transition_p[(None,tag)] #transition probability from 'START'to tag
            else:
                a_i_j = 0
                            
            if( (sentence[1],tag) in emission_p.keys()):                
                b_j = emission_p[(sentence[1],tag)] #probability that tag emits word
            else:
                b_j = 0
            dp_table[1][tag] = {'p': a_i_j*b_j, 'previous':None}

        for t in range(2,len(sentence)-1):
            dp_table.append({})
            transition_probabilities = {}

            for tag in tags:
                for prev_tag in tags:

                    if((prev_tag,tag) in transition_p.keys()):                           
                        a_i_j = transition_p[(prev_tag,tag)] #transition probability from 'START'to tag
                    else:
                        a_i_j = 0

                    if(prev_tag in dp_table[t-1].keys()):
                        t_p = dp_table[t-1][prev_tag]['p']*a_i_j
                        transition_probabilities[(prev_tag,tag)]=t_p

                max_t_p = max(transition_probabilities.values())
                
                for key in transition_probabilities.keys():
                    if(transition_probabilities[key]==max_t_p):
                        argmax_t_p = key
                
                if((sentence[t],tag) in emission_p.keys()):
                    b_j = emission_p[(sentence[t],tag)] #probability that tag emits word
                else:
                    b_j = 0
                dp_table[t][tag] = {'p':max_t_p*b_j, 'previous':argmax_t_p[0]}

        opt = []
        pmax = max(probabilities['p'] for probabilities in dp_table[-1].values())
        backward_previous = None
        for tag, data in dp_table[-1].items():
            if data["p"] == pmax:
                opt.append(tag)
                prev_tag = tag
                break
        
        for t in range(len(dp_table) - 2, -1, -1):
            opt.insert(0, dp_table[t + 1][prev_tag]["previous"])
            prev_tag = dp_table[t + 1][prev_tag]["previous"]
        
        print(opt)
        
        
viterbi("C:\\Users\\Lenovo\\Google Drive\\Courses\Term 6\\Machine Learning\\Project\\EN\\train" ,"C:\\Users\\Lenovo\\Google Drive\\Courses\\Term 6\Machine Learning\\Project\\EN\\dev.in")
    
    

['START', 'When', 'I', 'called', 'this', '#UNK#', ',', 'I', "didn't", 'think', 'I', 'would', 'be', 'able', 'to', 'get', 'in', 'at', '#UNK#', ',', 'but', 'I', 'was', 'able', 'to', 'get', 'in', ',', '#UNK#', 'with', 'four', 'other', 'guests', '.', 'STOP']
[None, 'O', 'O', 'O', 'O', 'B-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-neutral', 'O', 'O', 'O', 'O', 'O']
['START', 'The', '#UNK#', 'and', '#UNK#', '#UNK#', 'are', 'excellent', '.', 'STOP']
[None, 'O', 'B-neutral', 'O', 'B-neutral', 'B-neutral', 'O', 'O', 'O']
['START', 'My', '#UNK#', 'pizza', 'joint', 'in', 'Seattle', 'STOP']
[None, 'O', 'B-neutral', 'B-positive', 'O', 'O', 'O']
['START', 'Food', '#UNK#', ',', 'its', 'ok', 'but', 'a', 'bit', 'pricey', 'for', 'what', 'you', 'get', 'considering', 'the', 'restaurant', '#UNK#', 'a', '#UNK#', 'place', '.', 'STOP']
[None, 'B-positive', 'B-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 