# 1. Download Dataset



# 2. Preprocessing
This is just a very navie preprocessing.

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib

from google.colab import drive
drive.mount('/content/drive')
df_train = pd.read_csv('/content/drive/My Drive/train.csv')
df_test= pd.read_csv('/content/drive/My Drive/test.csv')
df_val= pd.read_csv('/content/drive/My Drive/val.csv')
df_train.head(10)

In [0]:
train_data=[sentence.split() for sentence in df_train.Sentence]
target_y_train=[sentence.split() for sentence in df_train.NER]

test_data=[sentence.split() for sentence in df_test.Sentence]
# target_y_test=[sentence.split() for sentence in df_test.NER]

validation_data=[sentence.split() for sentence in df_val.Sentence]
target_y_validation=[sentence.split() for sentence in df_val.NER]

In [77]:
word_to_ix = {}
for sentence in train_data+validation_data+test_data:
    for word in sentence:
        word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in target_y_train+target_y_validation:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

tag_list=list(tag_to_ix)
print('Word list length: ',len(word_list))
print(tag_to_ix)

Word list length:  13972
{'<START>': 0, '<STOP>': 1, 'O': 2, 'I-ORG': 3, 'I-MISC': 4, 'I-PER': 5, 'I-LOC': 6}


In [0]:
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

train_input_index =  to_index(train_data,word_to_ix)
train_output_index = to_index(target_y_train,tag_to_ix)
val_input_index = to_index(validation_data,word_to_ix)
val_output_index = to_index(target_y_validation,tag_to_ix)
test_input_index = to_index(test_data,word_to_ix)
# test_output_index = to_index(target_y_test,tag_to_ix)


In [80]:
MAX_LENGTH=max([len(s) for s in train_input_index+val_input_index+test_input_index])
MAX_LENGTH

124

# 3. Model

## 3.1 Encoder

In [0]:
import torch
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden) 
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)    

## 3.2 Decoder

In [0]:
class AttnDecoderRNN(nn.Module):
    ATTN_TYPE_DOT_PRODUCT = "Dot Product"
    ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"

    def __init__(self, hidden_size, output_size, embedding, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = embedding
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size*2, self.output_size)


    def cal_attention(self, hidden, encoder_hiddens, method):
        if method == AttnDecoderRNN.ATTN_TYPE_DOT_PRODUCT:
            # bmm: https://pytorch.org/docs/master/generated/torch.bmm.html
            attn_weights = F.softmax(torch.bmm(hidden, encoder_hiddens.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], hidden[0]), 1)

        return concat_output

    def forward(self, input, hidden, encoder_hiddens):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        _, hidden = self.gru(embedded, hidden)

        concat_output = self.cal_attention(hidden, encoder_hiddens, AttnDecoderRNN.ATTN_TYPE_DOT_PRODUCT)

        output = F.log_softmax(self.out(concat_output), dim=1)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

## 3.3 Train Function

In [0]:

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_hiddens = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        encoder_hiddens[i] = encoder_hidden[0, 0]

    decoder_input = torch.tensor([[0]], device=device)

    decoder_hidden = encoder_hidden

    # Teacher forcing: Feed the target as the next input
    for i in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_hiddens)
        loss += criterion(decoder_output, target_tensor[i])
        decoder_input = target_tensor[i]  # Teacher forcing

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

## 3.4 Train Iterations Function

In [0]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [0]:
import random
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        random_choice_ix = random.choice(range(n_data))
        input_index_r = [[ind] for ind in input_index[random_choice_ix]]
        target_index_r = [[ind] for ind in target_index[random_choice_ix]]
        
        input_tensor = torch.LongTensor(input_index_r).to(device)
        target_tensor = torch.LongTensor(target_index_r).to(device)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0


# 4. Training Process

In [0]:
input_index=train_input_index
target_index=train_output_index
n_data=200

In [107]:
hidden_size = 50
embedding = nn.Embedding(len(word_to_ix), hidden_size)
encoder1 = EncoderRNN(len(word_to_ix), hidden_size, embedding).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, len(tag_to_ix),embedding, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 1000, print_every=50)

0m 3s (- 1m 0s) (50 5%) 0.8685
0m 6s (- 0m 56s) (100 10%) 0.7176
0m 9s (- 0m 55s) (150 15%) 0.7396
0m 12s (- 0m 51s) (200 20%) 0.6211
0m 16s (- 0m 48s) (250 25%) 0.6798
0m 20s (- 0m 47s) (300 30%) 0.6007
0m 23s (- 0m 44s) (350 35%) 0.5925
0m 27s (- 0m 41s) (400 40%) 0.6154
0m 30s (- 0m 37s) (450 45%) 0.7323
0m 34s (- 0m 34s) (500 50%) 0.5793
0m 37s (- 0m 30s) (550 55%) 0.5794
0m 40s (- 0m 26s) (600 60%) 0.5144
0m 44s (- 0m 23s) (650 65%) 0.5938
0m 47s (- 0m 20s) (700 70%) 0.6983
0m 51s (- 0m 17s) (750 75%) 0.5938
0m 55s (- 0m 13s) (800 80%) 0.5360
0m 59s (- 0m 10s) (850 85%) 0.5748
1m 2s (- 0m 6s) (900 90%) 0.5755
1m 6s (- 0m 3s) (950 95%) 0.5620
1m 9s (- 0m 0s) (1000 100%) 0.5611


# 5. Evaluation

In [0]:
def evaluate(encoder, decoder, input_index, max_length=MAX_LENGTH):
    # with torch.no_grad():
        
        input_tensor = torch.LongTensor([[ind] for ind in input_index]).to(device)

        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_hiddens = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_hiddens[ei] += encoder_hidden[0, 0]

        decoder_input = torch.tensor([[0]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoded_index = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_hiddens)
            
            topv, topi = decoder_output.data.topk(1)
            # print(topi.item())
            decoded_index.append(topi.item())
            if topi.item() == 1:
                decoded_words.append('<STOP>')
                
                break
            else:
                decoded_words.append(tag_list[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words,decoded_index,decoder_output

In [0]:
y_pred=[]
for s in val_input_index:
  _,output_s,_=evaluate(encoder1, attn_decoder1, s, max_length=MAX_LENGTH)
  y_pred.append(output_s)



In [92]:
from sklearn.metrics import classification_report
print(classification_report(val_output_index,y_pred,digits=4))


ValueError: ignored