In [417]:
import numpy as np
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [418]:
all_data= './data/trainDataWithPOS.csv'
test_data = './data/testDataWithPOS.csv'
df = pd.read_csv(all_data, encoding = "ISO-8859-1", names=['sentence#', 'word', 'POS', 'tag'])
test_df = pd.read_csv(test_data, encoding = "ISO-8859-1", names=['sentence#', 'word', 'POS', 'tag'])

In [419]:
print(test_df)

          sentence#    word    POS  tag
0        Sentence #    Word    POS  Tag
1       Sentence: 1       I   PRON    O
2       Sentence: 1    just    ADV    O
3       Sentence: 1     did    AUX    O
4       Sentence: 1       a    DET    O
...             ...     ...    ...  ...
8676  Sentence: 600  breath   NOUN   IF
8677  Sentence: 600     the    DET    O
8678  Sentence: 600   whole    ADJ    O
8679  Sentence: 600    time   NOUN    O
8680  Sentence: 600       .  PUNCT    O

[8681 rows x 4 columns]


Loading Train Data

In [420]:
first = 1
end = 600
st = []
pos = []
test = []
for i, sentence in enumerate(test_df["sentence#"]):
    x = re.findall('[0-9]+', sentence)
    if len(x) == 0:
        pass
    else:
        x = int(x[0])
        if x != first:
            st = " ".join(st)
            test.append((st, pos))
            st = []
            pos = []
            first = first + 1
        s = (test_df[['word']].iloc[i]).to_string()
        s = list(s)
        s = s[8:]
        s = "".join(s)
        st.append(s)
        p = test_df[['tag']].iloc[i].to_string()
        p = list(p)
        p = p[7:]      
        p = "".join(p)
        pos.append(p)       

Loading Test Data

In [421]:
first = 1
end = 2400
st = []
pos = []
train = []
for i, sentence in enumerate(df["sentence#"]):
    x = re.findall('[0-9]+', sentence)
    if len(x) == 0:
        pass
    else:
        x = int(x[0])
        if x != first:
            st = " ".join(st)
            train.append((st, pos))
            st = []
            pos = []
            first = first + 1
        s = (df[['word']].iloc[i]).to_string()
        s = list(s)
        s = s[8:]
        s = "".join(s)
        st.append(s)
        p = df[['tag']].iloc[i].to_string()
        p = list(p)
        p = p[7:]      
        p = "".join(p)
        pos.append(p)  
        

Splitting into training and the validation set

In [422]:
print(len(train))
tr = int(len(train) * 0.8)
train_data = train[:tr]
test_data = train[tr:]
print(len(train_data))
print(len(test_data))
print(train[160])

2399
1919
480
('I walked 1.3 miles at a very brisk pace , and felt loosened up and less sore .', ['O', 'BE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'BF', 'IF', 'O', 'BF', 'IF', 'O'])


Logistic Regression Model

In [439]:
class LogRegClassifier():

    def __init__(self, train_corpus, test_corpus):
        super().__init__()
        print(len(train_corpus))
        self.train_words, self.train_labels = self.load_data(train_corpus)
        self.test_words, self.test_labels = self.load_data(test_corpus)
        self.vectorizer = CountVectorizer(ngram_range=(1, 1)) 
        self.vectorizer.fit_transform(self.train_words)
        self.classifier = LogisticRegression()
        self.train()

    def load_data(self, corpus):
        triplets = []
        labels = []
        for sample in corpus:
            words, tags = sample
            words = words.split(" ")
            for index in range(1, len(tags) - 1):
                prev_word = words[index - 1]
                word = words[index]
                next_word = words[index + 1]
                triplets.append(' '.join([prev_word, word, next_word]))
                labels.append(tags[index])
        return triplets, labels

    def train(self):
        X = self.vectorizer.transform(self.train_words)
        y = np.array(self.train_labels)
        self.classifier.fit(X, y)

    def predict(self, corpus):
        X = self.vectorizer.transform(corpus)
        return self.classifier.predict(X)

    def evaluate(self):      
        train_pred = self.predict(self.train_words)
        test_pred = self.predict(self.test_words)  
        print("Evaluating The Test Data")
        print("----------------------------------")
        print(classification_report(test_pred, self.test_labels))
        
    def on_held_out(self, test):
        test_w, test_l = self.load_data(test)
        y = self.predict(test_w)
        print("Evaluating The Held Out Test Data")
        print("---------------------------------")
        print(classification_report(y, test_l))

Evaluating The Model

In [440]:
model = LogRegClassifier(train_data, test_data)
model.evaluate()
model.on_held_out(test)

1919


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating The Test Data
----------------------------------
              precision    recall  f1-score   support

          BE       0.42      0.64      0.51       298
          BF       0.15      0.53      0.23       142
          IE       0.41      0.70      0.51       101
          IF       0.27      0.67      0.38        69
           O       0.96      0.83      0.89      5168

    accuracy                           0.81      5778
   macro avg       0.44      0.67      0.51      5778
weighted avg       0.90      0.81      0.84      5778

Evaluating The Held Out Test Data
---------------------------------
              precision    recall  f1-score   support

          BE       0.39      0.62      0.48       373
          BF       0.12      0.41      0.19       197
          IE       0.32      0.74      0.45       110
          IF       0.24      0.72      0.36        74
           O       0.96      0.82      0.88      6709

    accuracy                           0.80      7463
   

Model

In [475]:
class LSTM(nn.Module):
    def __init__(self, num_words, emb_dim, num_y, hidden_dim=32):
        super().__init__()
        self.emb = nn.Embedding(num_words, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=1, bidirectional=False)
        self.linear = nn.Linear(hidden_dim, num_y)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, text):
        embeds = self.emb(text)
        out, (last_hidden, last_cell) = self.lstm(embeds.view(len(text), 1, -1))
        tag_space = self.linear(out.view(len(text), -1)) 
        return self.softmax(tag_space)

Loading The Vocabulary

In [476]:
def load_vocab_tags(train_data):
    word_to_ix = {}
    tag_to_ix = {} 
    ix_to_tag = {} 
    for sent, tags in train_data:
        for word in sent.split(" "):
            if word != " ":
                word_to_ix.setdefault(word, len(word_to_ix))
        for tag in tags:
            tag_to_ix.setdefault(tag, len(tag_to_ix))
            ix_to_tag[tag_to_ix[tag]] = tag
    word_to_ix["UNK"] = len(word_to_ix) 
    return word_to_ix, tag_to_ix, ix_to_tag

In [477]:
tok_to_ix, tag_to_ix, ix_to_tag = load_vocab_tags(train_data)
print(tag_to_ix)

{'O': 0, 'BE': 1, 'BF': 2, 'IE': 3, 'IF': 4}


In [478]:
emb_dim = 50
learning_rate = 0.001
model = LSTM(len(tok_to_ix), emb_dim, len(tag_to_ix))
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
loss_fn = nn.NLLLoss()

Traning

In [479]:
n_epochs = 10
count = 0
for epoch in range(n_epochs):
    model.train()
    for text, tags in train_data:
        x = [tok_to_ix[tok] for tok in text.split()]
        y = [tag_to_ix[tag] for tag in tags]
        x_train_tensor = torch.LongTensor(x)
        y_train_tensor = torch.LongTensor(y)
        pred_y = model(x_train_tensor)
        if len(y_train_tensor) != len(pred_y):
            count = count + 1
        else:
            loss = loss_fn(pred_y, y_train_tensor)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    print("\nEpoch:", epoch)
    print("Training loss:", loss.item())


Epoch: 0
Training loss: 1.057151198387146

Epoch: 1
Training loss: 0.8587541580200195

Epoch: 2
Training loss: 0.7851794958114624

Epoch: 3
Training loss: 0.7491565346717834

Epoch: 4
Training loss: 0.7256338596343994

Epoch: 5
Training loss: 0.7069432735443115

Epoch: 6
Training loss: 0.690325140953064

Epoch: 7
Training loss: 0.6745810508728027

Epoch: 8
Training loss: 0.6590994596481323

Epoch: 9
Training loss: 0.6435345411300659


In [488]:
x_test = []
y_test = []
y_p = []
for sent, tags in test:
    x_test.append(sent)
    y_test.append(tags)
# print(len(x_test))
# print(len(y_test))
y_tes = []
for line in y_test:
    for i in line:
        y_tes.append(i)
# len(y_tes)

Testing

In [489]:
with torch.no_grad():
    model.eval()
    for sentence in x_test:
        x = []
        for tok in sentence.split():
            if tok in tok_to_ix:
                x.append(tok_to_ix[tok])
            else:
                x.append(tok_to_ix["UNK"])      
        x_test = torch.LongTensor(x)
        pred_y_test = model(x_test)
        k = [ix_to_tag[max_ix] for max_ix in pred_y_test.argmax(1).data.numpy()]
        for i in k:
            y_p.append(i)
            
# print(len(y_p))
y_tes = y_tes[:len(y_p)]
print("accuracy")
accuracy_score(y_tes, y_p)

accuracy


0.7740356601794662