In [1]:
import re
import numpy as np

#get data from the fileName passed as argument (in the same folder)
def getData(filename):
    x = list()
    y = list()
    z = list()
    
    with open(filename) as file:
        line = file.readline()
        while line:
            line = line.strip()
            st = line.split(" ")
            if len(st) == 3:
                x.append(st[0])
                y.append(st[1])
                z.append(st[2])
            elif len(st) == 1:
                x.append("-1")
                y.append("-1")
                z.append("-1")
            line = file.readline()
    return x,y,z

train_words,train_POS_tags,train_chunking_tags = getData('train.txt')
test_words,test_POS_tags,test_chunking_tags = getData('test.txt')

print('Train-Set size: ' + str(len(train_words)))
print('Test-Set size: ' + str(len(test_words)))

Train-Set size: 220663
Test-Set size: 49389


In [2]:
#dictionary of tags
dic = {}
dic[0] = "#"
dic[1] = "$"
dic[2] = "''"
dic[3] = "("
dic[4] = ")"
dic[5] = ","
dic[6] = "."
dic[7] = ":"
dic[8] = "CC"
dic[9] = "CD"
dic[10] = "DT"
dic[11] = "EX"
dic[12] = "FW"
dic[13] = "IN"
dic[14] = "JJ"
dic[15] = "JJR"
dic[16] = "JJS"
dic[17] = "MD"
dic[18] = "NN"
dic[19] = "NNP"
dic[20] = "NNPS"
dic[21] = "NNS"
dic[22] = "PDT"
dic[23] = "POS"
dic[24] = "PRP"
dic[25] = "PRP$"
dic[26] = "RB"
dic[27] = "RBR"
dic[28] = "RBS"
dic[29] = "RP"
dic[30] = "SYM"
dic[31] = "TO"
dic[32] = "UH"
dic[33] = "VB"
dic[34] = "VBD"
dic[35] = "VBG"
dic[36] = "VBN"
dic[37] = "VBP"
dic[38] = "VBZ"
dic[39] = "WDT"
dic[40] = "WP"
dic[41] = "WP$"
dic[42] = "WRB"
dic[43] = "``"

In [3]:
#getting the accuracy of the model
def getAccuracy(prediction):
    cnt = 0
    j = 0
    for i in len(prediction):
        if(test_POS_tag[j] == "-1"):
            j = j+1
            continue
        elif prediction[i] == test_POS_tag[j]:
            j = j+1
            cnt = cnt+1
    return cnt/len(prediction)

In [4]:
#for getting a feature vector
def getFeatures(words,index):
    feature = list()
    
    #feature.append(True)
    #feature 1: is the first letter Capitalized?
    feature.append(words[index][0].isupper())
    
    #feature 2: is it ending in "ing"?
    feature.append(words[index][-3:] == "ing")
    
    #feature 3: is it ending in "ly"?
    feature.append(words[index][-2:] == "ly")
    
    #feature 4: is the previous word "the"?
    if index > 0:
        feature.append(words[index-1] == "the")
    else:
        feature.append(False)
    
    #feature 5: does it contain a number?
    pattern = re.compile(r'\d')
    feature.append(len(pattern.findall(words[index])) > 0)
    
    #featre 6: does it have a hyphen?
    pattern = re.compile(r'-')
    feature.append(len(pattern.findall(words[index])) > 0)
    
    return feature

In [5]:
def getFeatureMatrix(words):
    matrix = list()
    size = len(words)
    
    i=0
    while i<size:
        sentence = list()
        while i<size and words[i] != "-1":
            sentence.append(words[i])
            i = i+1
        j=0
        while j < len(sentence):
            matrix.append(getFeatures(sentence,j))
            j=j+1
        i = i+1
        
    return matrix

In [6]:
feature_matrix = np.asarray(getFeatureMatrix(train_words))
print(feature_matrix.shape)
feature_matrix[0]

(211727, 6)


array([ True, False, False, False, False, False], dtype=bool)

In [7]:
train_POS_tags_np = np.array(train_POS_tags)

In [8]:
train_POS_tags_np

array(['NN', 'IN', 'DT', ..., 'RB', '.', '-1'],
      dtype='<U4')

In [9]:
tag_list = np.unique(train_POS_tags_np)
index = np.argwhere(tag_list=='-1')
tag_list = np.delete(tag_list,index)
tag_list

array(['#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX',
       'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS',
       'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO',
       'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$',
       'WRB', '``'],
      dtype='<U4')

In [10]:
tag_to_int = dict((c,i) for i,c in enumerate(tag_list))
int_to_tag = dict((i,c) for i,c in enumerate(tag_list))
print(train_POS_tags[0])
print(tag_to_int)
print(int_to_tag)

NN
{'#': 0, '$': 1, "''": 2, '(': 3, ')': 4, ',': 5, '.': 6, ':': 7, 'CC': 8, 'CD': 9, 'DT': 10, 'EX': 11, 'FW': 12, 'IN': 13, 'JJ': 14, 'JJR': 15, 'JJS': 16, 'MD': 17, 'NN': 18, 'NNP': 19, 'NNPS': 20, 'NNS': 21, 'PDT': 22, 'POS': 23, 'PRP': 24, 'PRP$': 25, 'RB': 26, 'RBR': 27, 'RBS': 28, 'RP': 29, 'SYM': 30, 'TO': 31, 'UH': 32, 'VB': 33, 'VBD': 34, 'VBG': 35, 'VBN': 36, 'VBP': 37, 'VBZ': 38, 'WDT': 39, 'WP': 40, 'WP$': 41, 'WRB': 42, '``': 43}
{0: '#', 1: '$', 2: "''", 3: '(', 4: ')', 5: ',', 6: '.', 7: ':', 8: 'CC', 9: 'CD', 10: 'DT', 11: 'EX', 12: 'FW', 13: 'IN', 14: 'JJ', 15: 'JJR', 16: 'JJS', 17: 'MD', 18: 'NN', 19: 'NNP', 20: 'NNPS', 21: 'NNS', 22: 'PDT', 23: 'POS', 24: 'PRP', 25: 'PRP$', 26: 'RB', 27: 'RBR', 28: 'RBS', 29: 'RP', 30: 'SYM', 31: 'TO', 32: 'UH', 33: 'VB', 34: 'VBD', 35: 'VBG', 36: 'VBN', 37: 'VBP', 38: 'VBZ', 39: 'WDT', 40: 'WP', 41: 'WP$', 42: 'WRB', 43: '``'}


In [11]:
def getLabelMatrix():
    label_matrix = np.zeros((feature_matrix.shape[0],len(tag_list)),dtype=int)
    k = 0
    for i in train_POS_tags_np:
        if i!='-1':
            label_matrix[k][tag_to_int[i]] = 1
            k = k+1
    return label_matrix

In [12]:
oneHot_LabelMatrix = getLabelMatrix()
oneHot_LabelMatrix[0]
print(oneHot_LabelMatrix.shape)

(211727, 44)


In [13]:
import sys
import numpy

numpy.seterr(all='ignore')
 

def softmax(x):
    e = numpy.exp(x - numpy.max(x))  # prevent overflow
    if e.ndim == 1:
        return e / numpy.sum(e, axis=0)
    else:  
        return e / numpy.array([numpy.sum(e, axis=1)]).T  # ndim = 2"""


class LogisticRegression(object):
    def __init__(self, input, label, n_in, n_out):
        self.x = input
        self.y = label
        self.W = numpy.zeros((n_in, n_out))  # initialize W 0
        self.b = numpy.zeros(n_out)          # initialize bias 0

        # self.params = [self.W, self.b]

    def train(self, lr, input=None, L2_reg=0.00):
        if input is not None:
            self.x = input

        # p_y_given_x = sigmoid(numpy.dot(self.x, self.W) + self.b)
        p_y_given_x = softmax(numpy.dot(self.x, self.W) + self.b)
        d_y = self.y - p_y_given_x
        
        self.W += lr * numpy.dot(self.x.T, d_y) - lr * L2_reg * self.W
        self.b += lr * numpy.mean(d_y, axis=0)
        
        # cost = self.negative_log_likelihood()
        # return cost

    def negative_log_likelihood(self):
       
        sigmoid_activation = softmax(numpy.dot(self.x, self.W) + self.b)

        cross_entropy = - numpy.mean(
            numpy.sum(self.y * numpy.log(sigmoid_activation) +
            (1 - self.y) * numpy.log(1 - sigmoid_activation),axis=1))

        return cross_entropy


    def predict(self, x):
       
        return softmax(numpy.dot(x, self.W) + self.b)


def test_lr(x , y , w_row, w_col, learning_rate=0.01, n_epochs=50):
   
    
    # construct LogisticRegression
    classifier = LogisticRegression(input=x, label=y, n_in=w_row, n_out=w_col)
    # train
    for epoch in range(n_epochs):
        classifier.train(lr=learning_rate)
        cost = classifier.negative_log_likelihood()
        print(sys.stderr, 'Training epoch %d, cost is ' % epoch, cost)
        learning_rate *= 0.95

    z = x.dot(classifier.W) + classifier.b
    normal = softmax(z)
    return normal
    

initial_softmax= test_lr(feature_matrix, oneHot_LabelMatrix, feature_matrix.shape[1], len(tag_list))


<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 0, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 1, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 2, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 3, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 4, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 5, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 6, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 7, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 8, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 9, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 10, cost is  10

In [14]:

train_POS_pair_tags = list()
train_POS_pair_tags.append(train_POS_tags[0])
for i in range(1,len(train_POS_tags)):
    if(train_POS_tags[i-1]=='-1'):
        train_POS_pair_tags.append(train_POS_tags[i])
    elif(train_POS_tags[i]!='-1'):
        train_POS_pair_tags.append(train_POS_tags[i-1]+train_POS_tags[i])
    
print(train_POS_pair_tags)

['NN', 'NNIN', 'INDT', 'DTNN', 'NNVBZ', 'VBZRB', 'RBVBN', 'VBNTO', 'TOVB', 'VBDT', 'DTJJ', 'JJNN', 'NNIN', 'INNN', 'NNNNS', 'NNSIN', 'INNNP', 'NNP,', ',JJ', 'JJIN', 'INNN', 'NNNN', 'NN,', ',VB', 'VBTO', 'TOVB', 'VBDT', 'DTJJ', 'JJNN', 'NNIN', 'INNNP', 'NNPCC', 'CCNNP', 'NNPPOS', 'POSJJ', 'JJNNS', 'NNS.', 'NNP', 'NNPIN', 'INDT', 'DTNNP', 'NNPNNP', 'NNPNNP', 'NNPPOS', 'POSVBN', 'VBNNN', 'NNTO', 'TODT', 'DTNN', 'NNJJ', 'JJNN', 'NNVBZ', 'VBZVBN', 'VBNTO', 'TOVB', 'VBDT', 'DTNN', 'NNIN', 'INNN', 'NNIN', 'INDT', 'DTJJ', 'JJNN', 'NN.', 'CC', 'CCNNS', 'NNSVBP', 'VBPVBG', 'VBGNN', 'NNIN', 'INNN', 'NNVBZ', 'VBZVBN', 'VBNVBN', 'VBNIN', 'INDT', 'DTNN', 'NNPOS', 'POSNN', 'NNTO', 'TOVB', 'VBDT', 'DTJJ', 'JJNN', 'NNNNS', 'NNSIN', 'INPRP$', 'PRP$NNP', 'NNPNNP', 'NNPNN', 'NNJJ', 'JJNNP', 'NNP.', 'DT', 'DTVBZ', 'VBZVBN', 'VBNDT', 'DTNN', 'NNIN', 'INDT', 'DTNN', 'NNVBG', 'VBGVBN', 'VBNTO', 'TOVB', 'VBNN', 'NNNNS', 'NNSTO', 'TOCD', 'CDNN', 'NNIN', 'INPRP$', 'PRP$JJ', 'JJCD', 'CDNN', 'NNNN', 'NNTO', 'TOVB'

In [15]:
train_POS_pair_tags_np = np.array(train_POS_pair_tags)
train_POS_pair_tags_unique = np.unique(train_POS_pair_tags_np)
pair_tag_to_int = dict((c,i) for i,c in enumerate(train_POS_pair_tags_unique))
int_to_tag_pair = dict((i,c) for i,c in enumerate(train_POS_pair_tags_unique))

#print(pair_tag_to_int)
#print(int_to_tag_pair)
len(train_POS_pair_tags_np)

211727

In [16]:
def getLabelMatrix1():
    label_matrix = np.zeros((feature_matrix.shape[0],len(train_POS_pair_tags_unique)),dtype=int)
    k = 0
    for i in train_POS_pair_tags_np:
        label_matrix[k][pair_tag_to_int[i]] = 1
        k = k+1
    return label_matrix

oneHot_pair_LabelMatrix = getLabelMatrix1()

In [17]:
second_softmax= test_lr(feature_matrix, oneHot_pair_LabelMatrix, feature_matrix.shape[1], len(train_POS_pair_tags_unique))
second_softmax

<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 0, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 1, cost is  20.6129552938
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 2, cost is  18.1954518465
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 3, cost is  17.9934973746
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 4, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 5, cost is  21.8902197094
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 6, cost is  21.3942015352
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 7, cost is  16.9991396154
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 8, cost is  nan
<ipykernel.iostream.OutStream object at 0x000001CB7792E630> Training epoch 9, cost is  17.6993295621
<ipykernel.iostream.

array([[  7.57736833e-05,   7.57531195e-05,   7.57740092e-05, ...,
          1.08966273e-04,   4.49272174e-04,   5.32368458e-04],
       [  8.87198365e-04,   8.87214988e-04,   8.87202233e-04, ...,
          8.87199887e-04,   8.87205049e-04,   8.87208854e-04],
       [  8.87198365e-04,   8.87214988e-04,   8.87202233e-04, ...,
          8.87199887e-04,   8.87205049e-04,   8.87208854e-04],
       ..., 
       [  7.57736833e-05,   7.57531195e-05,   7.57740092e-05, ...,
          1.08966273e-04,   4.49272174e-04,   5.32368458e-04],
       [  8.87198365e-04,   8.87214988e-04,   8.87202233e-04, ...,
          8.87199887e-04,   8.87205049e-04,   8.87208854e-04],
       [  8.87198365e-04,   8.87214988e-04,   8.87202233e-04, ...,
          8.87199887e-04,   8.87205049e-04,   8.87208854e-04]])