# Named Entity Identification (NEI) using SVM


## Install Dependencies

In [2]:
import nltk
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\ARNAV
[nltk_data]     AGARWAL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\ARNAV
[nltk_data]     AGARWAL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Lets first put POS TAGGER into place

In [3]:
import nltk
import numpy as np
import pandas as pd
import random
from collections import defaultdict, Counter
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score
import time

In [4]:
nltk.download('brown')
nltk.download('universal_tagset')
nltk_data = list(nltk.corpus.brown.tagged_sents(tagset='universal'))

[nltk_data] Downloading package brown to C:\Users\ARNAV
[nltk_data]     AGARWAL\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\Users\ARNAV
[nltk_data]     AGARWAL\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [5]:
tagged_words = [tup for sent in nltk_data for tup in sent]
tags = {tag for word, tag in tagged_words}

word_tag_freq = defaultdict(lambda: defaultdict(int))
tag_count = defaultdict(int)
       
for word, tag in tagged_words:
    word_tag_freq[word][tag] += 1
    tag_count[tag] += 1

In [6]:
def t2_given_t1(t2, t1, tags_seq):
    transitions = defaultdict(int)
    tag_seq_len = len(tags_seq)

    for i in range(tag_seq_len - 1):
        if tags_seq[i] == t1 and tags_seq[i + 1] == t2:
            transitions[(t1, t2)] += 1

    count_t1 = tags_seq.count(t1)
    return transitions[(t1, t2)] / count_t1 if count_t1 > 0 else 1e-6

# Emission probabilities for word given tag
def word_given_tag(word, tag):
    count_tag = tag_count[tag]
    count_word_tag = word_tag_freq[word][tag] if word in word_tag_freq else 0
    return count_word_tag / count_tag if count_tag > 0 else 1e-6

In [7]:
# Optimized Viterbi Algorithm with smoothing for unknown words
def Viterbi_optimized(words, tags_df, train_tagged_words):
    state = []
    T = list(tag_count.keys())
    word_prob_cache = {}

    for key, word in enumerate(words):
        p = []
        if word in word_tag_freq:
            possible_tags = list(word_tag_freq[word].keys())
        else:
            possible_tags = T

        for tag in possible_tags:
            if key == 0:
                transition_p = tags_df.loc['.', tag] if '.' in tags_df.index else 1e-6
            else:
                transition_p = tags_df.loc[state[-1], tag]

            if (word, tag) not in word_prob_cache:
                emission_p = word_given_tag(word, tag)
                word_prob_cache[(word, tag)] = emission_p
            else:
                emission_p = word_prob_cache[(word, tag)]

            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        state_max = possible_tags[p.index(pmax)]
        state.append(state_max)

    return list(zip(words, state))

# Function to compute 5-fold cross-validation, confusion matrix, and per-POS accuracy
def evaluate_viterbi_with_cross_validation(tagged_sentences, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    all_true_tags = []
    all_pred_tags = []
    fold_accuracies = []
    
    # Prepare the transition matrix
    tags_seq = [pair[1] for pair in tagged_words]
    tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
    tag_list = list(tags)

    for i, t1 in enumerate(tag_list):
        for j, t2 in enumerate(tag_list):
            tags_matrix[i, j] = t2_given_t1(t2, t1, tags_seq)
    tags_df = pd.DataFrame(tags_matrix, columns=tag_list, index=tag_list)

    # Cross-validation loop
    for train_index, test_index in kf.split(tagged_sentences):
        # Split the data into train and test sets for this fold
        train_data = [tagged_sentences[i] for i in train_index]
        test_data = [tagged_sentences[i] for i in test_index]

        # Flatten the train data for transition probability calculation
        train_tagged_words = [tup for sent in train_data for tup in sent]

        # Run Viterbi on the test set
        for test_sent in test_data:
            words = [word for word, tag in test_sent]
            true_tags = [tag for word, tag in test_sent]
            predicted_tags = [tag for word, tag in Viterbi_optimized(words, tags_df, train_tagged_words)]

            all_true_tags.extend(true_tags)
            all_pred_tags.extend(predicted_tags)

        # Accuracy for the current fold
        fold_accuracy = accuracy_score(all_true_tags, all_pred_tags)
        fold_accuracies.append(fold_accuracy)

    # Average accuracy over the folds
    avg_accuracy = np.mean(fold_accuracies)
    print(f"5-Fold Cross-Validation Accuracy: {avg_accuracy * 100:.2f}%")

    # Confusion matrix
    conf_matrix = confusion_matrix(all_true_tags, all_pred_tags, labels=tag_list)
    conf_matrix_df = pd.DataFrame(conf_matrix, index=tag_list, columns=tag_list)
    print("\nConfusion Matrix:\n", conf_matrix_df)

    # Per POS accuracy
    per_tag_accuracy = {}
    true_tag_counter = Counter(all_true_tags)

    for tag in tag_list:
        correct_preds = conf_matrix_df.loc[tag, tag]
        total_true = true_tag_counter[tag]
        per_tag_accuracy[tag] = correct_preds / total_true if total_true > 0 else 0.0

    print("\nPer POS Tag Accuracy:")
    for tag, accuracy in per_tag_accuracy.items():
        print(f"{tag}: {accuracy * 100:.2f}%")



# Run the evaluation
evaluate_viterbi_with_cross_validation(nltk_data, k=5)


5-Fold Cross-Validation Accuracy: 96.10%

Confusion Matrix:
         ADJ     DET     ADP     X       .    NOUN   CONJ    VERB    PRT  \
ADJ   78935       0      80     3       0    1839      0     327    230   
DET       0  135566    1161     5       0       1     37       0      1   
ADP      95     201  135351     5      20      48    148      97   7681   
X         7      17      24  1158      24     109      4      27      2   
.         0       0       0     0  147565       0      0       0      0   
NOUN   2793     115      33    18       1  269008      1    2858     40   
CONJ      0     126       3     0       0       2  37965       0      0   
VERB    477       0     162     5       0    5300      0  176692     15   
PRT     292       2    5681     0       0      91      0      28  23618   
NUM       0       0       0     0       0     204      0       0      0   
PRON      0    1139    1776     0       0       0      0       0      2   
ADV    3212     373    2668     1      

In [8]:
# Prepare the transition matrix
tags_seq = [pair[1] for pair in tagged_words]
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
tag_list = list(tags)

for i, t1 in enumerate(tag_list):
    for j, t2 in enumerate(tag_list):
        tags_matrix[i, j] = t2_given_t1(t2, t1, tags_seq)
tags_df = pd.DataFrame(tags_matrix, columns=tag_list, index=tag_list)
        
def call_viterbi(sentence):
    words = sentence.split()
    predicted_tags_with_words = list(Viterbi_optimized(words, tags_df, tagged_words))
    return predicted_tags_with_words

In [9]:
from sklearn.svm import SVC
from string import punctuation
from tqdm.notebook import tqdm
from datasets import load_dataset



In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

## Constants

In [11]:

features_number = 9# number of features used
SW = stopwords.words("english")
PUNCT = list(punctuation)

## Functions

In [12]:
#{'.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X'}
#Lets store these unique pos tags in a list
POS_TAGS = ['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']
def convert_pos_tags(pos_tags):
    return [POS_TAGS.index(tag) for tag in pos_tags]

### Data

In [13]:
def data_manipulate(data):
    
    words = [] # stores the str
    features = [] # feature array, one vector per word in the corpus
    nei_labels = [] # labels (0/1)

    for d in tqdm(data):

        tokens = d["tokens"]
        #LETS GET THE POS TAG OF THIS TOKENS ARRAY USING CALL_VITREBI
        pos_tags = [tag for word, tag in call_viterbi(" ".join(tokens))]
        pos_tags = convert_pos_tags(pos_tags)
        
        tags = d["ner_tags"]

        l = len(tokens)
        for i in range(l):

            x = vectorize(w = tokens[i], scaled_position = (i/l), prev_tag = pos_tags[i-1] if i > 0 else -1, pos_tag = pos_tags[i], next_tag = pos_tags[i+1] if i < l-1 else -1)

            if tags[i] > 0:
                y = 1
            else:
                y = 0

            features.append(x)
            nei_labels.append(y)

        words += tokens

    words = np.asarray(words, dtype = "object")
    features = np.asarray(features, dtype = np.float32)
    nei_labels = np.asarray(nei_labels, dtype = np.float32)

    return words, features, nei_labels

### Model

#### Feature Engineering (word $w$ (`str`) $\to$ feature vector $x \in \mathbb{R}^d$)
- FirstWordCapital [`0/1`]
- Is all caps (eg., acronyms like 'USA') [`0/1`]
- Token Length [`int`]
- Is it a stopword (using NLTK's english stopword list, 179 stopwords) [`0/1`]
- Is it a punctuation [`0/1`]
- (Scaled) sentence position [`float`]
- PrevPOSTag ['0-11']
- POSTag ['0-11']
- NextPOSTag ['0-11']

In [14]:
def vectorize(w, scaled_position, prev_tag, pos_tag, next_tag):
    # w : str : a token

    v = np.zeros(features_number).astype(np.float32)

    # If first character in uppercase
    if w[0].isupper():
        title = 1
    else:
        title = 0

    # All characters in uppercase
    if w.isupper():
        allcaps = 1
    else:
        allcaps = 0

    # Is stopword
    if w.lower() in SW:
        sw = 1
    else:
        sw = 0

    # Is punctuation
    if w in PUNCT:
        punct = 1
    else:
        punct = 0

    
    # Build vector
    v[0] = title
    v[1] = allcaps
    v[2] = len(w)
    v[3] = sw
    v[4] = punct
    v[5] = scaled_position
    v[6] = prev_tag
    v[7] = pos_tag
    v[8] = next_tag

    return v

In [15]:
def infer(model, scaler, s):
    # s: sentence

    tokens = word_tokenize(s)
    pos_tags = [tag for word, tag in call_viterbi(" ".join(tokens))]
    pos_tags = convert_pos_tags(pos_tags)
    features = []

    l = len(tokens)
    for i in range(l):
        f = vectorize(w = tokens[i], scaled_position = (i/l), prev_tag = pos_tags[i-1] if i > 0 else -1, pos_tag = pos_tags[i], next_tag = pos_tags[i+1] if i < l-1 else -1)
        features.append(f)

    features = np.asarray(features, dtype = np.float32)

    scaled = scaler.transform(features)

    pred = model.predict(scaled)

    return pred, tokens, features

## Data (CoNLL 2003) 

In [16]:
data = load_dataset("conll2003", trust_remote_code=True)

In [17]:
data = load_dataset("conll2003", trust_remote_code=True) # of type datasets.dataset_dict.DatasetDict
train_data = data["train"] #This contains 14,041 rows (type: datasets.arrow_dataset.Dataset)
val_data = data["validation"] #This contains 3250 rows
test_data  = data["test"] # This contains 3453 rows

# columns: 'id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'

In [18]:
#Lets convert it into pandas df
data_train1 = train_data.to_pandas()
#Lets find the unique values in pos_tags
data_train1
#There seems to be array in each pos_tags row, lets get a unique list of all the pos_tags
pos_tags = set()
for i in data_train1["pos_tags"]:
    for pos_tag in i:
        pos_tags.add(pos_tag)

pos_tags

{0,
 1,
 3,
 4,
 5,
 6,
 7,
 8,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46}

In [19]:
trainWords, XTrain, yTrain = data_manipulate(train_data)
valWords, XVal, yVal       = data_manipulate(val_data)
testWords, XTest, yTest    = data_manipulate(test_data)

  0%|          | 0/14041 [00:00<?, ?it/s]

  0%|          | 0/3250 [00:00<?, ?it/s]

  0%|          | 0/3453 [00:00<?, ?it/s]

In [20]:
#We will print the shape of the train, val, test data
print(XTrain.shape)
print(XVal.shape)
print(XTest.shape)

(203621, 9)
(51362, 9)
(46435, 9)


In [21]:
# Standardize the features such that all features contribute equally to the distance metric computation of the SVM
scaler = StandardScaler()

# Fit only on the training data (i.e. compute mean and std)
scaler = scaler.fit(XTrain)

# Use the train data fit values to scale val and test
X_train = scaler.transform(XTrain)
X_val   = scaler.transform(XVal)
X_test  = scaler.transform(XTest)

In [22]:
model = SVC(C = 1.0, kernel = "rbf", class_weight = "balanced", random_state = 0, verbose = True)
model.fit(XTrain, yTrain )

[LibSVM]

In [23]:
y_pred_val = model.predict(X_val)

In [33]:
# Lets make the "confusion_matrix" for all words in the val data


In [24]:
print(classification_report(y_true = yVal, y_pred = y_pred_val))

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98     42759
         1.0       0.82      0.97      0.89      8603

    accuracy                           0.96     51362
   macro avg       0.91      0.96      0.93     51362
weighted avg       0.96      0.96      0.96     51362



In [39]:
#Lets get the confusion matrix WITH PROPER LABELS
conf_matrix = confusion_matrix(yVal,y_pred_val, labels=[0,1])
conf_matrix_df = pd.DataFrame(conf_matrix, index=[0,1], columns=[0,1])
conf_matrix_df


Unnamed: 0,0,1
0,40925,1834
1,261,8342


In [25]:
# A few examples

examples = [
    "Delhi is the capital of India.",
    "US Vice President Kamala Harris, PM Modi talk up Indo-US ties at 1st in-person meeting.",
    "Covid-19 India Live News: National Task Force drops Ivermectin, HCQ drugs from Covid-19 treatment protocol; India logs 31,382 new cases.",
    "US Rules Out Adding India Or Japan To Security Alliance With Australia And UK" # all words are capitalized,
    "Washington DC is the capital of United States of America"
]

for e in examples:
    pred, tokens, features = infer(model, scaler, e)
    annotated = []
    for w, p in zip(tokens, pred):
        annotated.append(f"{w}_{int(p)}")
    print(" ".join(annotated))
    print()

Delhi_1 is_0 the_0 capital_0 of_0 India_1 ._0

US_1 Vice_1 President_1 Kamala_1 Harris_1 ,_0 PM_1 Modi_1 talk_0 up_0 Indo-US_1 ties_0 at_0 1st_0 in-person_0 meeting_0 ._0

Covid-19_1 India_1 Live_1 News_1 :_0 National_1 Task_1 Force_1 drops_0 Ivermectin_1 ,_0 HCQ_1 drugs_0 from_0 Covid-19_1 treatment_0 protocol_0 ;_0 India_1 logs_0 31,382_0 new_0 cases_0 ._0

US_1 Rules_1 Out_0 Adding_1 India_1 Or_0 Japan_1 To_0 Security_1 Alliance_1 With_1 Australia_1 And_1 UKWashington_1 DC_1 is_0 the_0 capital_0 of_0 United_1 States_1 of_0 America_1



In [31]:
def final_func(sentence):
    pred, tokens, features = infer(model, scaler, sentence)
    annotated = []
    for w, p in zip(tokens, pred):
        annotated.append(f"{w}_{int(p)}")
    return " ".join(annotated)
    

In [32]:
import gradio as gr

# Define the Gradio interface
interface = gr.Interface(
    fn= final_func,                       # Function that processes input
    inputs="text",                       # Input type: Text field for the user
    outputs="text",                      
    title="NEI using SVM",       # Title of the UI
    description="Enter a sentence"  # Description
)

# Launch the UI in a browser
interface.launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.21.0, however version 4.44.1 is available, please upgrade.
--------
