## 1. Import the data:
## Note: replace variable 'path' to dataset :  [ner_dataset.csv](https://github.com/sriramrokkam/NLP_NER_BERT/blob/BITS/ner_dataset.csv)

In [1]:
import pandas as pd
# path = "/content/ner_dataset.csv"
path = "ner_dataset.csv"  # for local
data = pd.read_csv(path, encoding="latin1")
data.head(30)
# data = data.head(5000)  # change value for number of records.

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


## 2. Fill the missing values

In [2]:
data = data.fillna(method="ffill")
data.head(30)

  data = data.fillna(method="ffill")


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data["Sentence #"] = LabelEncoder().fit_transform(data["Sentence #"])
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O
5,0,through,IN,O
6,0,London,NNP,B-geo
7,0,to,TO,O
8,0,protest,VB,O
9,0,the,DT,O


In [4]:
data.rename(columns={"Sentence #": "sentence_id",
            "Word": "words", "Tag": "labels"}, inplace=True)

## 3. Split Data: Training and Testing

In [5]:
data["labels"] = data["labels"].str.upper()

X = data[["sentence_id", "words"]]
Y = data["labels"]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# building up train data and test data
train_data = pd.DataFrame(
    {"sentence_id": x_train["sentence_id"], "words": x_train["words"], "labels": y_train})
test_data = pd.DataFrame(
    {"sentence_id": x_test["sentence_id"], "words": x_test["words"], "labels": y_test})
train_data

Unnamed: 0,sentence_id,words,labels
458015,12161,the,O
488504,13734,a,O
346351,6495,making,O
4248,10223,Iraq,B-GEO
851840,32130,Wizard,B-ORG
...,...,...,...
759520,27459,than,O
172698,43384,U.N.,B-GEO
832301,31162,strengthens,O
213976,45514,rebound,O


## 4. Build and Train Model

In [6]:
from simpletransformers.ner import NERModel, NERArgs
label = data["labels"].unique().tolist()
label

['O',
 'B-GEO',
 'B-GPE',
 'B-PER',
 'I-GEO',
 'B-ORG',
 'I-ORG',
 'B-TIM',
 'B-ART',
 'I-ART',
 'I-PER',
 'I-GPE',
 'I-TIM',
 'B-NAT',
 'B-EVE',
 'I-EVE',
 'I-NAT']

In [7]:
args = NERArgs()
args.num_train_epochs = 2
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 16
args.eval_batch_size = 16

In [8]:
model = NERModel('bert', 'bert-base-cased',
                 labels=label, args=args, use_cuda=False)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
r = model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/2858 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/2858 [00:00<?, ?it/s]

In [10]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/2781 [00:00<?, ?it/s]

In [17]:
# Display Results
result

{'eval_loss': 0.17601415726351144,
 'precision': 0.8257638315441783,
 'recall': 0.7713461667449192,
 'f1_score': 0.7976279239132319}

## 5. Evaluation

In [18]:
validation_sentence = [
    "They marched from the Houses of Parliament to a rally in Hyde Park ."]
prediction, model_output = model.predict(validation_sentence)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
prediction

[[{'They': 'O'},
  {'marched': 'O'},
  {'from': 'O'},
  {'the': 'O'},
  {'Houses': 'I-ORG'},
  {'of': 'I-ORG'},
  {'Parliament': 'B-ORG'},
  {'to': 'O'},
  {'a': 'O'},
  {'rally': 'O'},
  {'in': 'O'},
  {'Hyde': 'B-GEO'},
  {'Park': 'I-GEO'},
  {'.': 'O'}]]

## 6. Save Tokenizer and Model

In [21]:
# Save the model and tokenizer after training
# model.save_model("ner_model_v1")
import shutil
import os

# Define the model folder path
model_dir = "ner_model"

# Check if the folder exists and remove it if necessary
if os.path.exists(model_dir):
    # This will delete the existing folder and its contents
    shutil.rmtree(model_dir)

# Save the model and tokenizer after deleting the existing folder
model.save_model(model_dir)

## 7. Load the Model
## Note: 1.
1.   use_cuda = True (for GPU) and False (for CPU)
2.   Check the 'outputs' folder.
##       

In [22]:
from simpletransformers.ner import NERModel

# Load the saved model
# Add the 'args' parameter to specify the configuration
model = NERModel(
    "bert",
    "outputs",
    use_cuda=False,
    # This allows you to overwrite the output directory if it already exists
    args={"overwrite_output_dir": True}
)

# 8. Compare the Results
## Note: Values should match with Section 5

In [23]:
#
predictions, raw_outputs = model.predict(validation_sentence)

# Print the predictions
print(predictions)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'They': 'O'}, {'marched': 'O'}, {'from': 'O'}, {'the': 'O'}, {'Houses': 'I-ORG'}, {'of': 'I-ORG'}, {'Parliament': 'B-ORG'}, {'to': 'O'}, {'a': 'O'}, {'rally': 'O'}, {'in': 'O'}, {'Hyde': 'B-GEO'}, {'Park': 'I-GEO'}, {'.': 'O'}]]


# 9. Build the Chat bot

In [71]:
from simpletransformers.ner import NERModel

# Load the saved model
model = NERModel("bert", "outputs", use_cuda=False)

# Function to get NER predictions from a passage


def get_ner_from_passage(passage):
    # The model expects a list of sentences
    sentences = [passage]

    # Predict named entities in the input passage
    predictions, raw_outputs = model.predict(sentences)

    # Return the predictions
    return predictions


# Chatbot loop
print("Welcome to the NER Chatbot! Enter your text or 'exit' to quit.")

while True:
    # Get input from the user
    user_input = input("You: ")

    # Exit condition
    if user_input.lower() == 'exit':
        print("Goodbye!")
        break

    # Get NER predictions
    ner_results = get_ner_from_passage(user_input)

    # Display the NER results
    print("NER Results:", ner_results)