In [None]:
import re
import numpy as np

#read the three space-separated columns of a corpus file (path relative to the notebook)
def getData(filename):
    """Read a space-separated, three-column text file into parallel lists.

    Each line is stripped and split on single spaces:

    * exactly three fields -> one field is appended to each output list;
    * exactly one field (this includes a blank line) -> the placeholder
      string "-1" is appended to all three lists;
    * any other field count -> the line is skipped.

    The file is opened with the platform default encoding.

    Returns a tuple ``(first_column, second_column, third_column)`` of
    equal-length lists of strings.
    """
    columns = ([], [], [])
    placeholder = ("-1", "-1", "-1")

    with open(filename) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(" ")
            if len(fields) == 3:
                values = fields
            elif len(fields) == 1:
                values = placeholder
            else:
                # neither a data row nor a single-field line: ignore it
                continue
            for column, value in zip(columns, values):
                column.append(value)
    return columns

# Load both corpus splits. Each call returns three parallel lists, with "-1"
# placeholder entries wherever getData met a single-field (e.g. blank) line.
(train_words, train_POS_tags, train_chunking_tags) = getData('train.txt')
(test_words, test_POS_tags, test_chunking_tags) = getData('test.txt')

# The sizes printed here count the "-1" placeholder entries as well as tokens.
print('Train-Set size: ', len(train_words), sep='')
print('Test-Set size: ', len(test_words), sep='')

In [None]:
#dictionary of tags: integer class index -> POS tag string
# The position of a tag in the list below is its index (0 .. 43).
dic = dict(enumerate([
    # punctuation / symbol tags
    "#", "$", "''", "(", ")", ",", ".", ":",
    # word-class tags
    "CC", "CD", "DT", "EX", "FW", "IN",
    "JJ", "JJR", "JJS", "MD",
    "NN", "NNP", "NNPS", "NNS",
    "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP",
    "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
    # opening quote tag
    "``",
]))

In [None]:
#getting the accuracy of the model
def getAccuracy(prediction, gold_tags=None):
    """Return the fraction of predicted tags that equal the gold tags.

    Parameters
    ----------
    prediction : sequence
        One predicted tag per token, in corpus order. It must not contain
        entries for the "-1" placeholders, mirroring getFeatureMatrix, which
        produces no row for them.
    gold_tags : sequence, optional
        Reference tags, possibly interleaved with "-1" placeholder entries,
        which are ignored. Defaults to the notebook-level ``test_POS_tags``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(prediction)``;
        0.0 when there is nothing to score.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of
        non-placeholder gold tags, since the two cannot then be aligned.
    """
    if gold_tags is None:
        gold_tags = test_POS_tags

    # Drop the placeholders up front so predictions and gold tags line up
    # one-to-one; skipping them inside the comparison loop would also skip
    # a prediction and shift every later comparison.
    gold = [tag for tag in gold_tags if tag != "-1"]

    total = len(prediction)
    if total != len(gold):
        raise ValueError(
            "prediction has %d entries but there are %d gold tags"
            % (total, len(gold))
        )
    if total == 0:
        # avoid dividing by zero on an empty evaluation set
        return 0.0

    correct = sum(1 for guess, truth in zip(prediction, gold) if guess == truth)
    return correct/total

In [None]:
#for getting a feature vector
def getFeatures(words,index):
    """Build the boolean feature vector for the token ``words[index]``.

    Parameters
    ----------
    words : sequence of str
        The tokens of one sentence.
    index : int
        Position in ``words`` of the token to describe.

    Returns
    -------
    list of bool
        Seven entries, in this order:

        0. constant ``True`` (bias term)
        1. the first character is upper-case
        2. the token ends in "ing"
        3. the token ends in "ly"
        4. the preceding token is exactly "the" (``False`` for index 0)
        5. the token contains at least one digit
        6. the token contains a hyphen
    """
    word = words[index]

    # index 0 has no predecessor; the comparison is case-sensitive
    previous_is_the = index > 0 and words[index-1] == "the"

    return [
        True,
        # slice instead of word[0] so an empty token gives False, not IndexError
        word[:1].isupper(),
        word[-3:] == "ing",
        word[-2:] == "ly",
        previous_is_the,
        re.search(r'\d', word) is not None,
        "-" in word,
    ]

In [None]:
def getFeatureMatrix(words):
    """Return one getFeatures() row per token in ``words``.

    ``words`` is a flat token list in which the string "-1" separates
    sentences. Tokens are grouped into sentences at those separators so that
    position-dependent features are computed relative to the sentence, not
    the whole corpus. The separators themselves produce no row, so the
    result has as many rows as there are non-"-1" entries, in input order.
    """
    rows = []
    sentence = []

    for token in words:
        if token != "-1":
            sentence.append(token)
            continue
        # separator reached: emit the rows of the sentence collected so far
        rows.extend(getFeatures(sentence, pos) for pos in range(len(sentence)))
        sentence = []

    # the input may end without a trailing separator; flush what is left
    rows.extend(getFeatures(sentence, pos) for pos in range(len(sentence)))

    return rows

In [None]:
# Feature rows for the training tokens as a numpy array: one row per
# non-"-1" entry of train_words, one column per value returned by getFeatures.
feature_matrix = np.asarray(getFeatureMatrix(train_words))
feature_matrix

In [None]:
# One weight per (feature, tag) pair, all initialised to 1.
# The row count is taken from feature_matrix rather than hard-coded so the
# matrix product feature_matrix @ weight_matrix stays valid if getFeatures
# gains or loses a feature; the column count is the number of tags in dic.
weight_matrix = np.ones((feature_matrix.shape[1],len(dic)),dtype=int)
weight_matrix

In [None]:
# Raw class scores: one row per token, one column per tag in dic.
Z = np.matmul(feature_matrix,weight_matrix)

#softmax layer
# Subtracting each row's maximum leaves the softmax result unchanged but
# keeps np.exp from overflowing on large scores.
Z = np.exp(Z - np.max(Z, axis=1, keepdims=True))
# Normalise row by row so each token's scores sum to 1. Dividing by the
# grand total would make the whole matrix sum to 1 instead.
Z = Z/np.sum(Z, axis=1, keepdims=True)
Z