In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import torch
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
device = torch.device("mps")

In [2]:
df = pd.read_csv('/Users/aadityajoshi/Downloads/ner.csv',index_col=0)

In [3]:
df.index = range(len(df))
df.head()

Unnamed: 0,Sentence,POS,Tag
0,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47959 entries, 0 to 47958
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  47959 non-null  object
 1   POS       47959 non-null  object
 2   Tag       47959 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [5]:
df = df.sample(frac=0.1)

In [41]:
df['text'].iloc[893]

'The government announced it would eliminate 5,00,000 state jobs by March 2011 and has expanded opportunities for self-employment .'

In [6]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
blanks = []
for i,s,p,t in df.itertuples():
    if type(s) == str:
        if s.isspace()==True:
            blanks.append(i)
blanks

[]

In [7]:
df['text'] = df['Sentence']
df.drop('Sentence',axis=1,inplace=True)

### Create lists of everything

In [8]:
texts = df['text'].apply(lambda x : x.split()).tolist()
pos_tags = df['POS'].apply(eval).tolist()
ner_tags = df['Tag'].apply(eval).tolist()

In [9]:
len(ner_tags)

4796

### create vocabs

In [10]:
word_vocab = {'<PAD>':0,'<UNK>':1}
pos_vocab = {'<PAD>':0,'<UNK>':1}
tag_vocab = {'<PAD>':0,'<UNK>':1}
for sent in texts:
    for word in sent:
        if word not in word_vocab:
            word_vocab[word] = len(word_vocab)
for pos_seq in pos_tags:
    for pos in pos_seq:
        if pos not in pos_vocab:
            pos_vocab[pos] = len(pos_vocab)
for tag_seq in ner_tags:
    for ner in tag_seq:
        if ner not in tag_vocab:
            tag_vocab[ner] = len(tag_vocab)

In [11]:
len(tag_vocab)

19

### using vocab convert into indices

In [12]:
text_ind,pos_ind,tag_ind = [],[],[]
for i in range(len(texts)):
    sent_ind = [word_vocab.get(text,word_vocab['<UNK>']) for text in texts[i]]
    text_ind.append(sent_ind)
    pos_tag_idx = [pos_vocab.get(pos,pos_vocab['<UNK>']) for pos in pos_tags[i]]
    pos_ind.append(pos_tag_idx)
    ner_tag_idx = [tag_vocab.get(tag,tag_vocab['<UNK>']) for tag in ner_tags[i]]
    tag_ind.append(ner_tag_idx)

### padding

In [13]:
def pad_sequence(sequences):
    max_len = max([len(seq) for seq in sequences])
    padded_sequences = []
    for seq in sequences:  
        padded_seq = seq + [0] * (max_len - len(seq))
        padded_sequences.append(padded_seq)
    return torch.tensor(padded_sequences)

In [14]:
padded_text = pad_sequence(text_ind)
padded_pos = pad_sequence(pos_ind)
padded_ner = pad_sequence(tag_ind)

In [15]:
lengths = torch.tensor([len(seq) for seq in text_ind])

In [16]:
len(lengths)

4796

In [17]:
processed_data = {
    'texts':padded_text,
    'pos':padded_pos,
    'tags':padded_ner,
    'lengths':lengths,
    'vocabularies':{
        'word_vocab' : word_vocab,
        'pos_vocab' : pos_vocab,
        'tag_vocab': tag_vocab
    }
}

### Dataset

In [18]:
class NERDataset(Dataset):
    def __init__(self,processed_data):
        self.texts = processed_data['texts']
        self.pos = processed_data['pos']
        self.tags = processed_data['tags']
        self.lengths = processed_data['lengths']
    def __len__(self):
        return len(self.texts)
    def __getitem__(self,idx):
        return {'text':self.texts[idx],
               'pos':self.pos[idx],
               'tag':self.tags[idx],
               'length':self.lengths[idx]}

In [19]:
dataset = NERDataset(processed_data)

In [20]:
dataset

<__main__.NERDataset at 0x175842f50>

In [21]:
len(dataset)

4796

In [22]:
total_size = len(dataset)
train_size = int(0.8 * total_size)  # 80% for training
val_size = total_size - train_size  

In [23]:
from torch.utils.data import random_split
train_dataset, test_dataset = random_split(
    dataset, 
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42)  # For reproducibility
)

In [24]:
def collate_fn(batch):
    # Sort batch by sequence length
    batch = sorted(batch, key=lambda x: x['length'], reverse=True)
    
    # Stack all tensors
    texts = torch.stack([item['text'] for item in batch])
    pos = torch.stack([item['pos'] for item in batch])
    tags = torch.stack([item['tag'] for item in batch])
    lengths = torch.tensor([item['length'] for item in batch])
    
    return {
        'text': texts,
        'pos': pos,
        'tag': tags,
        'length': lengths
    }

In [25]:
train_loader = DataLoader(train_dataset,batch_size=128,shuffle=True,collate_fn=collate_fn)
test_loader = DataLoader(test_dataset,batch_size=128,shuffle=False,collate_fn=collate_fn)

In [26]:
class LSTM(nn.Module):
    def __init__(self,vocab_size,pos_size,num_tags,embedding_dim=100,pos_emb_dim=20,hidden_size=128,num_layers=1,dropout=0.4):
        super(LSTM,self).__init__()
        self.embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.pos_embeddings = nn.Embedding(pos_size,pos_emb_dim)
        total_dim = embedding_dim + pos_emb_dim
        self.lstm = nn.LSTM(total_dim,hidden_size,num_layers=num_layers,bidirectional=True,batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2*hidden_size,num_tags)
    def forward(self, text, pos, lengths):
        word_embeds = self.embeddings(text)
        pos_embeds = self.pos_embeddings(pos)
        embeds = torch.cat([word_embeds, pos_embeds], dim=2)
        packed_embeds = pack_padded_sequence(
            embeds, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.lstm(packed_embeds)
        output, _ = pad_packed_sequence(packed_output, batch_first=True, 
                                      total_length=text.size(1))  # Pad to original length
        tag_scores = self.fc(output)
        return tag_scores

In [27]:
model = LSTM(
    vocab_size=len(word_vocab),
    pos_size=len(pos_vocab),
    num_tags=len(tag_vocab)
).to(device)

In [28]:
model

LSTM(
  (embeddings): Embedding(12013, 100)
  (pos_embeddings): Embedding(43, 20)
  (lstm): LSTM(120, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=256, out_features=19, bias=True)
)

In [33]:
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [40]:
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    
    for batch in train_loader:
        # Get batch data
        text = batch['text'].to(device)
        pos = batch['pos'].to(device)
        tags = batch['tag'].to(device)
        lengths = batch['length']
        
        # Forward pass
        tag_scores = model(text, pos, lengths)
        
        # Reshape predictions and targets
        tag_scores = tag_scores.view(-1, tag_scores.size(-1))  # (batch_size * seq_len, num_tags)
        tags = tags.view(-1)  # (batch_size * seq_len)
        
        # Calculate loss
        loss = criterion(tag_scores, tags)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Print epoch statistics
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

Epoch: 1/10, Loss: 0.0066
Epoch: 2/10, Loss: 0.0055
Epoch: 3/10, Loss: 0.0047
Epoch: 4/10, Loss: 0.0041
Epoch: 5/10, Loss: 0.0035
Epoch: 6/10, Loss: 0.0031
Epoch: 7/10, Loss: 0.0027
Epoch: 8/10, Loss: 0.0024
Epoch: 9/10, Loss: 0.0021
Epoch: 10/10, Loss: 0.0019


In [42]:
model.eval()
correct = 0
total = 0
device = torch.device("mps")
with torch.no_grad():
    for batch in test_loader:
        # Move batch data to device
        text = batch['text'].to(device)
        pos = batch['pos'].to(device)
        tags = batch['tag'].to(device)
        lengths = batch['length']  # lengths stay on CPU
        
        # Get model predictions
        outputs = model(text, pos, lengths)
        predictions = torch.argmax(outputs, dim=2)
        
        # Calculate accuracy (ignoring padding)
        mask = tags != 0
        correct += (predictions[mask] == tags[mask]).sum().item()
        total += mask.sum().item()

# Calculate and print accuracy
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9466


In [44]:
def predict_ner(model, text, word_vocab, pos_vocab, tag_vocab, nlp, device='mps'):

    # Process the text
    doc = nlp(text)
    words = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]
    
    # Convert to indices
    word_indices = [word_vocab.get(word, word_vocab['<UNK>']) for word in words]
    pos_indices = [pos_vocab.get(pos, pos_vocab['<UNK>']) for pos in pos_tags]
    
    # Convert to tensors and add batch dimension
    text_tensor = torch.tensor([word_indices], device=device)
    pos_tensor = torch.tensor([pos_indices], device=device)
    lengths = torch.tensor([len(words)])
    
    # Get predictions
    model.eval()
    with torch.no_grad():
        outputs = model(text_tensor, pos_tensor, lengths)
        predictions = torch.argmax(outputs, dim=2)[0]
    
    # Convert indices back to tags
    rev_tag_vocab = {v: k for k, v in tag_vocab.items()}
    predicted_tags = [rev_tag_vocab[idx.item()] for idx in predictions]
    
    # Return word-tag pairs
    return list(zip(words, predicted_tags))

# Example usage:

import spacy
nlp = spacy.load('en_core_web_sm')

# Test a new sentence
test_sentence = "Harry potter is main character of the novel"
results = predict_ner(
    model, 
    test_sentence,
    processed_data['vocabularies']['word_vocab'],
    processed_data['vocabularies']['pos_vocab'],
    processed_data['vocabularies']['tag_vocab'],
    nlp,
    device
)

# Print results
for word, tag in results:
    print(f"{word}: {tag}")

Harry: O
potter: O
is: O
main: O
character: O
of: O
the: O
novel: O
