In [1]:
!pip install simpletransformers


Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m[31m2.0 MB/s[0m eta [36m0:00:01[0m
Collecting tensorboard (from simpletransformers)
  Downloading tensorboard-2.13.0-py3-none-any.whl (5.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
Collecting wandb>=0.10.32 (from simpletransformers)
  Obtaining dependency information for wandb>=0.10.32 from https://files.pythonhosted.org/packages/e1/13/44dda105177622788af8f5da6f9358ecd6fa46e80caa3f4a01ba02cf63d3/wandb-0.15.7-py3-none-any.whl.metadata
  Downloading wandb-0.15.7-py3-none-any.whl.metadata (8.2 kB)
Collecting streamlit (from simpletransformers)
  Obtaining dependency information for streamlit from https://files.pythonhosted.org/pack

### Ref : https://github.com/karndeepsingh/Named-Entity-Recognition/blob/main/NAMED%20ENTITY%20RECOGNITION.ipynb

In [2]:

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
     

In [10]:
import pandas as pd
data = pd.read_csv("~/datasets/grocery_training_data.csv",encoding="latin1" )

In [11]:
data.head(30)


Unnamed: 0,Sentence #,Word,Tag
0,0,I,0
1,0,need,0
2,0,2KG,MEASUREMENT
3,0,of,0
4,0,samba,FOOD_ITEM
5,0,rice,FOOD_ITEM
6,0,and,0
7,0,2,MEASUREMENT
8,0,ltr,MEASUREMENT
9,0,coconut,FOOD_ITEM


In [12]:
data.rename(columns={"Sentence #":"sentence_id","Word":"words","Tag":"labels"}, inplace =True)


In [13]:
data["labels"] = data["labels"].str.upper()


In [15]:

X= data[["sentence_id","words"]]
Y =data["labels"]

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)


In [17]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [18]:
train_data

Unnamed: 0,sentence_id,words,labels
847,184,need,0
401,94,I,0
579,134,for,0
159,37,coffee,FOOD_ITEM
182,41,meals.,0
...,...,...,...
568,132,want,0
643,146,"apples,",FOOD_ITEM
619,142,and,0
133,31,my,0


In [19]:
from simpletransformers.ner import NERModel,NERArgs


In [20]:
label = data["labels"].unique().tolist()
label

['0', 'MEASUREMENT', 'FOOD_ITEM']

In [21]:

args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 16
args.eval_batch_size = 16

In [23]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args,  use_cuda=False)

Downloading (…)lve/main/config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 95.4kB/s]
Downloading model.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 436M/436M [00:28<00:00, 15.3MB/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 213k/213k [00:00<00:00, 534kB/s]
Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████████████████████

In [24]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 30.01it/s]
Epoch 1 of 3:   0%|                                                                                                                                                                  | 0/3 [00:00<?, ?it/s]
Running Epoch 0 of 3:   0%|                                                                                                                                                          | 0/6 [00:00<?, ?it/s][A
Epochs 0/3. Running Loss:    1.1523:   0%|                                                                                                                                           | 0/6 [00:02<?, ?it/s][A
Epochs 0/3. Running Loss:    1.1523:  17%|█████████████████████▊                                                                                                             | 1/6

(18, 0.4432292717198531)

In [25]:
result, model_outputs, preds_list = model.eval_model(test_data)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.63it/s]
Running Evaluation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.37s/it]


In [26]:
result

{'eval_loss': 0.17812437936663628,
 'precision': 0.9135802469135802,
 'recall': 0.8409090909090909,
 'f1_score': 0.8757396449704142}

In [27]:
prediction, model_output = model.predict(["What is the new name of Bangalore"])


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 99.10it/s]
Running Prediction: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.13it/s]


In [28]:
prediction


[[{'What': '0'},
  {'is': '0'},
  {'the': '0'},
  {'new': '0'},
  {'name': '0'},
  {'of': '0'},
  {'Bangalore': '0'}]]

In [29]:
# https://docs.google.com/spreadsheets/d/1JOK1mFmOZb1S3sf-6VDSkqlxZhc31OcAAXYv51823FE/edit#gid=0

In [30]:
prediction, model_output = model.predict(["I want a dozen eggs and a packet of rice."])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 98.45it/s]
Running Prediction: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.84it/s]


In [31]:
model_output

[[{'I': [[4.4133263, -0.8203807, -3.2260625]]},
  {'want': [[4.5140123, -0.92833894, -2.734099]]},
  {'a': [[2.8099782, 0.15122761, -2.0815275]]},
  {'dozen': [[-0.4349795, 0.7713961, -0.84838283]]},
  {'eggs': [[-0.71776915, -1.3596101, 2.7439942]]},
  {'and': [[4.3686996, -0.9077136, -2.774895]]},
  {'a': [[3.1165352, -0.096107274, -2.4683878]]},
  {'packet': [[1.0510769, -0.6539463, -0.16408028]]},
  {'of': [[3.6002073, -1.0197564, -2.2097425]]},
  {'rice.': [[-0.8100081, -1.5629466, 3.5673077],
    [4.102978, -0.842257, -2.69593]]}]]

In [32]:
prediction

[[{'I': '0'},
  {'want': '0'},
  {'a': '0'},
  {'dozen': 'MEASUREMENT'},
  {'eggs': 'FOOD_ITEM'},
  {'and': '0'},
  {'a': '0'},
  {'packet': '0'},
  {'of': '0'},
  {'rice.': 'FOOD_ITEM'}]]