In [None]:
# https://towardsdatascience.com/simple-transformers-named-entity-recognition-with-transformer-models-c04b9242a2a0

In [35]:
# !pip install simpletransformers


### Ref : https://github.com/karndeepsingh/Named-Entity-Recognition/blob/main/NAMED%20ENTITY%20RECOGNITION.ipynb

In [36]:

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
     

In [37]:
import pandas as pd
data = pd.read_csv("~/datasets/grocery_training_data.csv",encoding="latin1" )

In [38]:
data.head(30)


Unnamed: 0,Sentence #,Word,Tag
0,0,I,0
1,0,need,0
2,0,2KG,MEASUREMENT
3,0,of,0
4,0,samba,FOOD_ITEM
5,0,rice,FOOD_ITEM
6,0,and,0
7,0,2,MEASUREMENT
8,0,ltr,MEASUREMENT
9,0,coconut,FOOD_ITEM


In [39]:
data.rename(columns={"Sentence #":"sentence_id","Word":"words","Tag":"labels"}, inplace =True)


In [40]:
data["labels"] = data["labels"].str.upper()


In [41]:

X= data[["sentence_id","words"]]
Y =data["labels"]

In [42]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)


In [43]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [44]:
train_data

Unnamed: 0,sentence_id,words,labels
817,180,250,MEASUREMENT
605,138,and,0
66,17,want,0
701,160,2,MEASUREMENT
428,100,butter,FOOD_ITEM
...,...,...,...
681,156,pasta,FOOD_ITEM
783,174,and,0
834,182,1,MEASUREMENT
132,31,for,0


In [45]:
from simpletransformers.ner import NERModel,NERArgs
# https://simpletransformers.ai/docs/ner-model/

In [46]:
label = data["labels"].unique().tolist()
label

['0', 'MEASUREMENT', 'FOOD_ITEM']

In [47]:

args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 16
args.eval_batch_size = 16

In [48]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args,  use_cuda=False)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
model.train_model(
    train_data,
    eval_data = test_data,
    acc=accuracy_score,
    output_dir='grocery_model'
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 33.40it/s]
Epoch 1 of 3:   0%|                                                                                                                                                                  | 0/3 [00:00<?, ?it/s]
Running Epoch 0 of 3:   0%|                                                                                                                                                          | 0/6 [00:00<?, ?it/s][A
Epochs 0/3. Running Loss:    1.0947:   0%|                                                                                                                                           | 0/6 [00:01<?, ?it/s][A
Epochs 0/3. Running Loss:    1.0947:  17%|█████████████████████▊                                                                                                             | 1/6

(18, 0.39836522357331383)

In [50]:
result, model_outputs, preds_list = model.eval_model(test_data)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 53.27it/s]
Running Evaluation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.07s/it]


In [51]:
result

{'eval_loss': 0.14663984701037408,
 'precision': 0.8941176470588236,
 'recall': 0.8735632183908046,
 'f1_score': 0.8837209302325582}

In [52]:
prediction, model_output = model.predict(["What is the new name of Bangalore"])


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 90.59it/s]
Running Prediction: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.78it/s]


In [28]:
prediction


[[{'What': '0'},
  {'is': '0'},
  {'the': '0'},
  {'new': '0'},
  {'name': '0'},
  {'of': '0'},
  {'Bangalore': '0'}]]

In [29]:
# https://docs.google.com/spreadsheets/d/1JOK1mFmOZb1S3sf-6VDSkqlxZhc31OcAAXYv51823FE/edit#gid=0

In [30]:
prediction, model_output = model.predict(["I want a dozen eggs and a packet of rice."])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 98.45it/s]
Running Prediction: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.84it/s]


In [31]:
model_output

[[{'I': [[4.4133263, -0.8203807, -3.2260625]]},
  {'want': [[4.5140123, -0.92833894, -2.734099]]},
  {'a': [[2.8099782, 0.15122761, -2.0815275]]},
  {'dozen': [[-0.4349795, 0.7713961, -0.84838283]]},
  {'eggs': [[-0.71776915, -1.3596101, 2.7439942]]},
  {'and': [[4.3686996, -0.9077136, -2.774895]]},
  {'a': [[3.1165352, -0.096107274, -2.4683878]]},
  {'packet': [[1.0510769, -0.6539463, -0.16408028]]},
  {'of': [[3.6002073, -1.0197564, -2.2097425]]},
  {'rice.': [[-0.8100081, -1.5629466, 3.5673077],
    [4.102978, -0.842257, -2.69593]]}]]

In [32]:
prediction

[[{'I': '0'},
  {'want': '0'},
  {'a': '0'},
  {'dozen': 'MEASUREMENT'},
  {'eggs': 'FOOD_ITEM'},
  {'and': '0'},
  {'a': '0'},
  {'packet': '0'},
  {'of': '0'},
  {'rice.': 'FOOD_ITEM'}]]