In [1]:
# Load all dependencies
import torch
import torch.nn as nn
import pickle
import torch
from transformers import BertTokenizerFast

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Tokenizer files are read from the local "bert-ner-final" directory (or hub id).
tokenizer = BertTokenizerFast.from_pretrained("bert-ner-final")


In [2]:

# Load vocabularies
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only open a ner_vocab.pkl that comes from a trusted source.
with open("ner_vocab.pkl", "rb") as f:
    vocab_data = pickle.load(f)

# The lookups do not need the file handle, so they live outside the `with`.
word2idx = vocab_data["word2idx"]
tag2idx = vocab_data["tag2idx"]
idx2tag = vocab_data["idx2tag"]

# Load embedding matrix
# map_location remaps storage onto `device`, so a tensor that was saved from a
# CUDA session still loads on a CPU-only machine.
embedding_matrix = torch.load("embedding_matrix.pt", map_location=device)
# Read the embedding width from the matrix itself so it cannot drift from the
# file on disk (nn.Embedding.from_pretrained requires a 2-D tensor).
embed_dim = embedding_matrix.shape[1]


In [3]:
# Fixed number of positions per model input: predict_sentence pads the token
# ids of a sentence up to this length before calling the model.
# NOTE(review): presumably equals the sequence length used at training time — confirm.
max_len = 900

# Re-declare the same model class
class CNN_NER(nn.Module):
    """Per-token tagger: frozen embeddings -> Conv1d -> ReLU -> dropout -> linear.

    Args:
        vocab_size: Accepted but not used; the embedding table is built
            directly from ``embedding_matrix``.
        embed_dim: Number of input channels of the convolution; it has to
            equal the width of ``embedding_matrix``.
        num_classes: Size of the last dimension of the logits.
        embedding_matrix: 2-D tensor of pretrained embeddings, kept frozen.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, embedding_matrix):
        super().__init__()
        # Attribute names are the state_dict keys of the saved checkpoint,
        # so they (and their registration order) are kept as-is.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1 = nn.Conv1d(
            in_channels=embed_dim,
            out_channels=128,
            kernel_size=3,
            padding=1,  # with kernel_size=3 this keeps the sequence length
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        """Return logits of shape (batch, seq, num_classes) for ids of shape (batch, seq)."""
        embedded = self.embedding(x)
        # Conv1d convolves over the last axis, so move channels to axis 1.
        channels_first = embedded.permute(0, 2, 1)
        activated = self.relu(self.conv1(channels_first))
        # Back to (batch, seq, channels) so the linear layer scores each position.
        channels_last = activated.permute(0, 2, 1)
        return self.classifier(self.dropout(channels_last))

# Reconstruct model and load weights
model = CNN_NER(len(word2idx), embed_dim, len(tag2idx), embedding_matrix).to(device)
# map_location remaps tensors saved on another device (e.g. CUDA) onto `device`,
# so the checkpoint also loads on CPU-only machines.
model.load_state_dict(torch.load("cnn_ner_model.pt", map_location=device))
# Inference mode: disables dropout. As the last expression of the cell this
# also displays the module summary.
model.eval()


CNN_NER(
  (embedding): Embedding(16028, 100)
  (conv1): Conv1d(100, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (classifier): Linear(in_features=128, out_features=13, bias=True)
)

In [None]:

def predict_sentence(raw_text):
    """Tag every token of ``raw_text`` with the label predicted by ``model``.

    The text is split with the module-level ``tokenizer``; each token is mapped
    to its id in ``word2idx`` (falling back to the ``"UNK"`` id) and the ids are
    fed to the model in windows of ``max_len`` positions, right-padded with the
    ``"PAD"`` id. Inputs longer than ``max_len`` tokens are split into several
    windows, so every token receives a tag instead of the tail being dropped.

    NOTE(review): this presumably requires ``word2idx`` to have been built from
    the same tokenizer's output; otherwise most tokens map to ``"UNK"`` — confirm.

    Args:
        raw_text: The text to tag.

    Returns:
        A list of ``(token, tag)`` tuples, one per token, in input order.
        An input that yields no tokens returns an empty list.

    Raises:
        KeyError: If ``word2idx`` lacks ``"UNK"``/``"PAD"`` or ``idx2tag`` lacks
            a predicted class id.
    """
    model.eval()

    # Tokenize input using BERT
    tokens = tokenizer.tokenize(raw_text)
    if not tokens:
        return []

    # Map to word2idx (fallback to UNK); look the special ids up only once.
    unk_id = word2idx["UNK"]
    pad_id = word2idx["PAD"]
    token_ids = [word2idx.get(tok, unk_id) for tok in tokens]

    # Cut the ids into max_len-sized windows. Only the last window can be
    # short, so padding always sits at the very end of the flattened batch.
    windows = []
    for start in range(0, len(token_ids), max_len):
        window = token_ids[start:start + max_len]
        windows.append(window + [pad_id] * (max_len - len(window)))

    # Build the batch directly on whatever device the model lives on.
    device = next(model.parameters()).device
    input_tensor = torch.tensor(windows, dtype=torch.long, device=device)

    with torch.no_grad():
        logits = model(input_tensor)
    # (windows, max_len) -> flat list in reading order.
    pred_ids = torch.argmax(logits, dim=-1).reshape(-1).tolist()

    # Discard predictions made for padding positions.
    pred_tags = [idx2tag[i] for i in pred_ids[:len(tokens)]]
    return list(zip(tokens, pred_tags))

In [5]:
# Try the tagger on a sample sentence and print the (token, tag) pairs.
text = "He graduated from Stanford University in 2015 with a degree in Computer Science."
demo_tags = predict_sentence(text)
print(demo_tags)

[('he', 'I-EDU'), (' ', 'B-EDU'), ('grad', 'I-EDU'), ('uat', 'I-HSK'), ('ed', 'I-HSK'), (' ', 'I-EDU'), ('from', 'I-EDU'), (' ', 'B-ORG'), ('s', 'I-EDU'), ('tanf', 'I-ORG'), ('or', 'I-HSK'), ('##d', 'B-ORG'), (' ', 'B-EDU'), ('university', 'I-EDU'), (' ', 'I-EDU'), ('in', 'I-EDU'), (' ', 'B-HSK'), ('2015', 'I-EDU'), (' ', 'B-ORG'), ('with', 'I-EDU'), (' ', 'B-ORG'), ('a', 'I-EDU'), (' ', 'B-HSK'), ('d', 'I-EDU'), ('eg', 'B-HSK'), ('re', 'B-JOB'), ('##e', 'B-ORG'), (' ', 'I-ORG'), ('in', 'I-EDU'), (' ', 'B-HSK'), ('comp', 'I-ORG'), ('ut', 'B-JOB'), ('##er', 'I-ORG'), (' ', 'B-ORG'), ('sci', 'I-HSK'), ('enc', 'I-ORG'), ('e', 'I-ORG'), ('.', 'I-EDU')]


In [None]:
# Replace the placeholder string with the text you want to tag, then run the cell.
text = "Your Text Here."
user_tags = predict_sentence(text)
print(user_tags)