<a href="https://colab.research.google.com/github/sayanlearn/ML-Paper-Notes-and-Implementation/blob/master/Named_entity_recognition_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition

## Uploading the dataset

In [0]:
import pandas as pd

ds = pd.read_csv('/content/drive/My Drive/Named Entity Recognition/ner_dataset.csv', engine='python') # To remove the unicode error (stackoverflow), engine = 'python' was added

## Preparation of training data

In [0]:
tags = ds["Tag"].tolist()
# This list consists of all the tags corresponding to all the words in the dataset
tag = []
for t in tags:
  if t in tag:
    continue
  else:
    tag.append(t)
# tag consists of all the possible categories 

In [0]:
op = []
for i in range(len(tags)):
  for j in range(len(tag)):
    if tags[i] == tag[j]:
      op.append(j)
# We have now converted the output to label form

In [0]:
w = ds["Word"].tolist()

In [0]:
vocab = []
for m in w:
  if m in vocab:
    continue
  else:
    vocab.append(m)
# We will create a vocabulary consisting of all possible words

In [0]:
dic = {}
i = 0
for m in range(len(w)):
  if w[m] == '.':
    j = m+1
    ix = tuple(w[i:j]) # Tuples can be used as keys in dictionaries and not lists
    ox = op[i:j]
    dic[ix] = ox
    i = j
# We have now segregated the sentences with the help of full stops

In [0]:
l = list(dic.values())

In [0]:
maxx = 0
for i in l:
  j = len(i)
  if j > maxx:
    maxx = j 
# We will find the max length of any sentence so that we can add appropriate padding

In [0]:
dicn = {}
for ix, ox in dic.items():
  ixn = list(ix)
  for i in range(maxx + 2 - len(ixn)):
    ixn.append('a')
    ox.append(0) 
  ixnn = tuple(ixn)
  dicn[ixnn] = ox
# We have done with padding

In [0]:
import numpy as np

def w2v(w):
  a = []
  for j in range(len(vocab)):
    a.append(0)
  for i in range(len(vocab)):
    if w == vocab[i]:
      a[i] = 1
  return a
# We have created a function to convert a word as a one hot encoding form

In [0]:
def o2o(n):
  a = []
  for j in range(len(tag)):
    a.append(0)
  a[n] = 1
  return a
# To create one hot enoding for the ooutput

In [0]:
inp = []
oup = []
for ix, ox in dicn.items():
  ix = list(ix)
  inp.append(ix)
  oup.append(ox)

In [0]:
inpn = []
m = 0
for i in inp:
  x = []
  for j in i:
    x.append(w2v(j))
  a = x.copy()
  inpn.append(a)
  m = m + 1
  if m == 100:
    break
inpn = np.array(inpn)
r = inpn.shape
inpn = inpn.reshape((r[0], r[2], r[1]))

In [0]:
oupn = []
m = 0
for i in oup:
  x = []
  for j in i:
    x.append(o2o(j))
  a = x.copy()
  oupn.append(a)
  m = m + 1
  if m == 100:
    break
oupn = np.array(oupn)
r = oupn.shape
oupn = oupn.reshape((r[0], r[2], r[1]))

## Training DL Model

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torchvision
from torchvision import transforms
import torch.optim as optim
import matplotlib.pyplot as plt

In [0]:
transformx = transforms.Compose([ 
    transforms.ToTensor(),
])

from torch.utils.data import Dataset, DataLoader

class trainset(data.Dataset):

  def __init__(self, transform = None):
    self.ip = inpn
    self.label = oupn
    self.transform = transform
  
  def __len__(self):
    return m
  
  def __getitem__(self, index):
    ix = self.transform(self.ip[index])
    ox = self.transform(self.label[index])
    return ix, ox

traindataset = trainset(transformx)

In [0]:
train_loader = data.DataLoader(traindataset, batch_size = 2, shuffle = True, num_workers = 4)

In [0]:
for i, data in enumerate(train_loader):
  continue

In [99]:
data[1].size()

torch.Size([2, 1, 17, 106])

In [0]:
class RNN(nn.Module):

  def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
    super().__init__()
    # Hidden Dimensions
    self.hidden_dim = hidden_dim

    # Number of hidden layers
    self.layer_dim = layer_dim

    # Building your RNN
    # batch_first=True causes input/output tensors to be of shape
    # (batch_dim, seq_dim, input_dim)
    # batch_dim = number of samples per batch
    self.RN = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='relu')

    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    out, hn = self.RN(x, None)
    # out.shape() = (batch, seq_dim, hidden_size*input_dim)
    # hn is the hidden state of the last time step
    out = self.fc(out)
    return out

In [0]:
input_dim = 35165
hidden_dim = 30
layer_dim = 1
output_dim = 17

In [0]:
model = RNN(input_dim, hidden_dim, layer_dim, output_dim)

In [0]:
criterion = nn.CrossEntropyLoss();
optimizer = optim.Adam(model.parameters(), lr = 0.01, betas = (0.9, 0.999), eps = 1e-08, weight_decay=0, amsgrad=False);

In [97]:
for j in range(10):
  for i, data in enumerate(train_loader):
    inputs, label = data
    inputs = inputs.reshape(2, 106, input_dim)
    label = label.reshape(2, 106, output_dim)
    inputs = inputs.float()
    optimizer.zero_grad()
    output = model(inputs)
    loss = criterion(output, label.argmax(dim=2))
    loss.backward()
    optimizer.step()
    print(loss.item())

ValueError: ignored

In [96]:
label.argmax(dim = 2).size()

torch.Size([2, 106])