# EXTRACTING NAME, LOCATION, EMAIL,DATE FROM TEXT USING BERT LARGE CASED MODEL FINETUNED ON CON1103 ENGLISH DATASET

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import pipeline

In [2]:
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

In [3]:
# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

text = """
My name is kunal singh and i am persuing PG in artificial intelligence and machine learning from 23 sept 2023 to 23 august 2024 and I live in singhatiya 
and my email id is kunal.jcdu@gmail.com and my company name will be shiv tandav.pvt.ltd
"""

In [13]:
token_words=tokenizer.tokenize(text)
token_words

['My',
 'name',
 'is',
 'k',
 '##una',
 '##l',
 'sing',
 '##h',
 'and',
 'i',
 'am',
 'per',
 '##sui',
 '##ng',
 'P',
 '##G',
 'in',
 'artificial',
 'intelligence',
 'and',
 'machine',
 'learning',
 'from',
 '23',
 'se',
 '##pt',
 '202',
 '##3',
 'to',
 '23',
 'au',
 '##gus',
 '##t',
 '202',
 '##4',
 'and',
 'I',
 'live',
 'in',
 'sing',
 '##hat',
 '##iya',
 'and',
 'my',
 'email',
 'id',
 'is',
 'k',
 '##una',
 '##l',
 '.',
 'j',
 '##c',
 '##du',
 '@',
 'g',
 '##mail',
 '.',
 'com',
 'and',
 'my',
 'company',
 'name',
 'will',
 'be',
 's',
 '##hi',
 '##v',
 'tan',
 '##da',
 '##v',
 '.',
 'p',
 '##v',
 '##t',
 '.',
 'l',
 '##t',
 '##d']

In [15]:
token_ids=tokenizer.convert_tokens_to_ids(token_words)
token_ids

[1422,
 1271,
 1110,
 180,
 9291,
 1233,
 6928,
 1324,
 1105,
 178,
 1821,
 1679,
 26841,
 2118,
 153,
 2349,
 1107,
 8246,
 4810,
 1105,
 3395,
 3776,
 1121,
 1695,
 14516,
 6451,
 17881,
 1495,
 1106,
 1695,
 12686,
 12909,
 1204,
 17881,
 1527,
 1105,
 146,
 1686,
 1107,
 6928,
 11220,
 9384,
 1105,
 1139,
 10632,
 25021,
 1110,
 180,
 9291,
 1233,
 119,
 179,
 1665,
 7641,
 137,
 176,
 14746,
 119,
 3254,
 1105,
 1139,
 1419,
 1271,
 1209,
 1129,
 188,
 3031,
 1964,
 15925,
 1810,
 1964,
 119,
 185,
 1964,
 1204,
 119,
 181,
 1204,
 1181]

In [6]:
# Get named entities
entities = ner_pipeline(text)

In [7]:
# Filter entities by type
def filter_entities(entities):
    filtered_entities = {
        "PERSON": [],
        "LOC": [],
        "ORG": [],
        "DATE": [],
        "EMAIL": []
    }
    for entity in entities:
        entity_type = entity['entity'].split('-')[-1]
        if entity_type in filtered_entities:
            filtered_entities[entity_type].append(entity['word'])
    
    return filtered_entities

In [8]:
filtered_entities = filter_entities(entities)

In [9]:
filtered_entities

{'PERSON': [],
 'LOC': ['sing', '##hat', '##iya'],
 'ORG': [],
 'DATE': [],
 'EMAIL': []}

In [11]:
# Extract emails using regex
import re

def extract_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    return emails

emails = extract_emails(text)
filtered_entities['EMAIL'] = emails

filtered_entities

{'PERSON': [],
 'LOC': ['sing', '##hat', '##iya'],
 'ORG': [],
 'DATE': [],
 'EMAIL': ['kunal.jcdu@gmail.com']}

# LETS TRY ANOTHER LLM MODEL FOR EXTRACTING NAMED ENTITY RECOGNITION(NAMES,PHONE NUMBERS, EMAI IDS, SKILLS FROM A RESUME)

In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Try a different NER model
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is kunal singh and i am pursuing PG in artificial intelligence and machine learning from 23 sept 2023 to 23 august 2024 and I live in singhatiya"

ner_results = nlp(example)

ner_results


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER',
  'score': 0.9868324,
  'index': 4,
  'word': 'k',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.9746304,
  'index': 5,
  'word': '##una',
  'start': 12,
  'end': 15},
 {'entity': 'I-PER',
  'score': 0.9898369,
  'index': 6,
  'word': '##l',
  'start': 15,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.9933996,
  'index': 7,
  'word': 'sing',
  'start': 17,
  'end': 21},
 {'entity': 'I-PER',
  'score': 0.9221179,
  'index': 8,
  'word': '##h',
  'start': 21,
  'end': 22},
 {'entity': 'I-LOC',
  'score': 0.53480405,
  'index': 38,
  'word': 'sing',
  'start': 142,
  'end': 146},
 {'entity': 'I-LOC',
  'score': 0.61715794,
  'index': 39,
  'word': '##hat',
  'start': 146,
  'end': 149},
 {'entity': 'I-LOC',
  'score': 0.6104713,
  'index': 40,
  'word': '##iya',
  'start': 149,
  'end': 152}]

# LETS EXTRACT 4 THINGS FROM A ANOTHER MODEL(PERSON NAMES, DATES, LOCATIONS, AND MIISCELLANOUS) FROM A TEXT USING BERT-LARGE-NER

In [2]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
#from transformers import AutoTokenizer, AutoModelForTokenClassification
#from transformers import pipeline

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is kunal singh and i am persuing PG in artificial intelligence and machine learning from 23 sept 2023 to 23 august 2024 and I live in singhatiya"

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.98376864, 'index': 4, 'word': 'k', 'start': 11, 'end': 12}, {'entity': 'B-PER', 'score': 0.32892758, 'index': 5, 'word': '##una', 'start': 12, 'end': 15}, {'entity': 'I-PER', 'score': 0.7284298, 'index': 6, 'word': '##l', 'start': 15, 'end': 16}, {'entity': 'I-PER', 'score': 0.7760353, 'index': 7, 'word': 'sing', 'start': 17, 'end': 21}, {'entity': 'I-PER', 'score': 0.8695711, 'index': 8, 'word': '##h', 'start': 21, 'end': 22}, {'entity': 'B-LOC', 'score': 0.76172465, 'index': 40, 'word': 'sing', 'start': 142, 'end': 146}, {'entity': 'B-LOC', 'score': 0.5201816, 'index': 41, 'word': '##hat', 'start': 146, 'end': 149}, {'entity': 'I-LOC', 'score': 0.5219515, 'index': 42, 'word': '##iya', 'start': 149, 'end': 152}]


In [4]:
ner_results

[{'entity': 'B-PER',
  'score': 0.98376864,
  'index': 4,
  'word': 'k',
  'start': 11,
  'end': 12},
 {'entity': 'B-PER',
  'score': 0.32892758,
  'index': 5,
  'word': '##una',
  'start': 12,
  'end': 15},
 {'entity': 'I-PER',
  'score': 0.7284298,
  'index': 6,
  'word': '##l',
  'start': 15,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.7760353,
  'index': 7,
  'word': 'sing',
  'start': 17,
  'end': 21},
 {'entity': 'I-PER',
  'score': 0.8695711,
  'index': 8,
  'word': '##h',
  'start': 21,
  'end': 22},
 {'entity': 'B-LOC',
  'score': 0.76172465,
  'index': 40,
  'word': 'sing',
  'start': 142,
  'end': 146},
 {'entity': 'B-LOC',
  'score': 0.5201816,
  'index': 41,
  'word': '##hat',
  'start': 146,
  'end': 149},
 {'entity': 'I-LOC',
  'score': 0.5219515,
  'index': 42,
  'word': '##iya',
  'start': 149,
  'end': 152}]

# LETS TRY SOME DIFFERENT SENTENCE AND TRY TO EXTRACT NAMED ENTITIES FROM IT

In [23]:
import fitz
text=fitz.get_text(r'E:\new_downloads_default_folder\RESUME_AS_OF_17_JUNE.pdf')
text

[' \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  \n \n1 Speech to Speech Chatbot for Airplane flying school  |   LINK \n \n\uf0be Performed Text Processing to create Question Answer pair Template  \n\uf0be Used these Tools and technologies Transformers, Llama-2-7b, NLP, PyTorch, Hugging \nFace, LoRA, QLoRA, Parameter efficient Fine Tuning(PEFT), Supervised Fine \nTuning(SFT) \n\uf0be Then Re-trained the Llama-2-7b model using hugging face API and dataset I took was,  \nAirplane Flying Handbook of US Federal Aviation Administration(DOT-FAA). \n \n\uf0be Took prompts in speech medium and then used Speech to text technology to convert speech \ninto text and then passed the text to our customized chatbot. The chat gave response in text \nform which I converted it into speech medium using Text to Speech technology. \n. \n \n2  Object detection model  |  LINK \n\uf0be Used Transfer Learning to get pretrained YOLOv8 model from huggingface. \n\uf0be Improvements done:  \n1. A

In [25]:
text2=" " + text
text2

TypeError: can only concatenate str (not "list") to str

In [40]:
text2=" ".join(text)                     # THIS IS HOW WE CONVERT LIST INTO A STRING 
text2

' \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  \n \n1 Speech to Speech Chatbot for Airplane flying school  |   LINK \n \n\uf0be Performed Text Processing to create Question Answer pair Template  \n\uf0be Used these Tools and technologies Transformers, Llama-2-7b, NLP, PyTorch, Hugging \nFace, LoRA, QLoRA, Parameter efficient Fine Tuning(PEFT), Supervised Fine \nTuning(SFT) \n\uf0be Then Re-trained the Llama-2-7b model using hugging face API and dataset I took was,  \nAirplane Flying Handbook of US Federal Aviation Administration(DOT-FAA). \n \n\uf0be Took prompts in speech medium and then used Speech to text technology to convert speech \ninto text and then passed the text to our customized chatbot. The chat gave response in text \nform which I converted it into speech medium using Text to Speech technology. \n. \n \n2  Object detection model  |  LINK \n\uf0be Used Transfer Learning to get pretrained YOLOv8 model from huggingface. \n\uf0be Improvements done:  \n1. Ad

In [39]:
cleaned_text=re.sub(r'\s+',' ',  text2)                   # re.sub(pattern,replacement,text)
cleaned_text

' 1 Speech to Speech Chatbot for Airplane flying school | LINK \uf0be Performed Text Processing to create Question Answer pair Template \uf0be Used these Tools and technologies Transformers, Llama-2-7b, NLP, PyTorch, Hugging Face, LoRA, QLoRA, Parameter efficient Fine Tuning(PEFT), Supervised Fine Tuning(SFT) \uf0be Then Re-trained the Llama-2-7b model using hugging face API and dataset I took was, Airplane Flying Handbook of US Federal Aviation Administration(DOT-FAA). \uf0be Took prompts in speech medium and then used Speech to text technology to convert speech into text and then passed the text to our customized chatbot. The chat gave response in text form which I converted it into speech medium using Text to Speech technology. . 2 Object detection model | LINK \uf0be Used Transfer Learning to get pretrained YOLOv8 model from huggingface. \uf0be Improvements done: 1. Added audio to the output which is not present in output of yolov8. 2. Calculated distance between objects in the vid

# NOW, FEED THE EXTRACTED AND CLEANED TEXT OF THE RESUME PDF TO THE BERT MODEL TO EXTRACT NAMED ENTITIES

In [42]:
tokenizer_resume = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model_resume = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
nlp = pipeline("ner", model=model_resume, tokenizer=tokenizer_resume)

ner_results = nlp(cleaned_text)
ner_results

[{'entity': 'B-MISC',
  'score': 0.62508166,
  'index': 2,
  'word': 'Speech',
  'start': 3,
  'end': 9},
 {'entity': 'B-MISC',
  'score': 0.96644306,
  'index': 9,
  'word': 'Air',
  'start': 32,
  'end': 35},
 {'entity': 'I-MISC',
  'score': 0.9739333,
  'index': 10,
  'word': '##plane',
  'start': 35,
  'end': 40},
 {'entity': 'B-ORG',
  'score': 0.70077974,
  'index': 14,
  'word': 'L',
  'start': 57,
  'end': 58},
 {'entity': 'B-MISC',
  'score': 0.94651216,
  'index': 36,
  'word': 'Transformers',
  'start': 166,
  'end': 178},
 {'entity': 'B-MISC',
  'score': 0.99461454,
  'index': 38,
  'word': 'L',
  'start': 180,
  'end': 181},
 {'entity': 'I-MISC',
  'score': 0.9825911,
  'index': 39,
  'word': '##lam',
  'start': 181,
  'end': 184},
 {'entity': 'I-MISC',
  'score': 0.98890775,
  'index': 40,
  'word': '##a',
  'start': 184,
  'end': 185},
 {'entity': 'I-MISC',
  'score': 0.77167714,
  'index': 41,
  'word': '-',
  'start': 185,
  'end': 186},
 {'entity': 'I-MISC',
  'score'

# so to extract each value corrosponding to word stored in the dictionary which is stored in a list , you need to use LIST COMPHRENSION

# LIST COMPHRESION TO EXTRACT VALUES FROM A LIST CONTAINING MANY DICTIONARIES

In [69]:
for i in ner_results:                         # THIS CODE MEANS: GO INTO THIS LIST WHICH HAS MANY SUB DICTIONARIES IN IT. 
                                              # GO INSIDE EACH DICTIONARY ONE BY ONE AND extract the VALUE stored corrosponding to the key 'WORD'
    print(i['word'])

Speech
Air
##plane
L
Transformers
L
##lam
##a
-
2
7
NL
##P
P
##T
Hu
Face
Lo
##RA
Q
##L
##RA
Para
L
##lam
##a
-
2
-
7
Air
##plane
Flying
Handbook
US
Federal
Aviation
Administration
D
##OT
FAA
Speech
Text
to
Speech
L
Learning
Y
##OL
##O
##8
hugging
K
K
Ku
##nal
Kumar
G
##ub
Sol
##ve
##K
L
Con
Dash
B
##J
Python
S
Mat
Sea
NL
Pan
Lea
B
Table
My
Visual
Ju
Speech
Text
Speech
Speech
****************************************


'Speech'

In [71]:
print(ner_results)

[{'entity': 'B-MISC', 'score': 0.62508166, 'index': 2, 'word': 'Speech', 'start': 3, 'end': 9}, {'entity': 'B-MISC', 'score': 0.96644306, 'index': 9, 'word': 'Air', 'start': 32, 'end': 35}, {'entity': 'I-MISC', 'score': 0.9739333, 'index': 10, 'word': '##plane', 'start': 35, 'end': 40}, {'entity': 'B-ORG', 'score': 0.70077974, 'index': 14, 'word': 'L', 'start': 57, 'end': 58}, {'entity': 'B-MISC', 'score': 0.94651216, 'index': 36, 'word': 'Transformers', 'start': 166, 'end': 178}, {'entity': 'B-MISC', 'score': 0.99461454, 'index': 38, 'word': 'L', 'start': 180, 'end': 181}, {'entity': 'I-MISC', 'score': 0.9825911, 'index': 39, 'word': '##lam', 'start': 181, 'end': 184}, {'entity': 'I-MISC', 'score': 0.98890775, 'index': 40, 'word': '##a', 'start': 184, 'end': 185}, {'entity': 'I-MISC', 'score': 0.77167714, 'index': 41, 'word': '-', 'start': 185, 'end': 186}, {'entity': 'I-MISC', 'score': 0.93122756, 'index': 42, 'word': '2', 'start': 186, 'end': 187}, {'entity': 'I-MISC', 'score': 0.70

In [52]:
ner_results=' '.join(ner_results)

for i in ner_results:
    print(ner_results(['word']))

TypeError: 'list' object is not callable

In [58]:
# Extract words from each dictionary
words = [entry['word'] for entry in ner_results]

# Convert list of words to a single string with space delimiter
words_string = " ".join(words)

print(words_string)

Speech Air ##plane L Transformers L ##lam ##a - 2 7 NL ##P P ##T Hu Face Lo ##RA Q ##L ##RA Para L ##lam ##a - 2 - 7 Air ##plane Flying Handbook US Federal Aviation Administration D ##OT FAA Speech Text to Speech L Learning Y ##OL ##O ##8 hugging K K Ku ##nal Kumar G ##ub Sol ##ve ##K L Con Dash B ##J Python S Mat Sea NL Pan Lea B Table My Visual Ju Speech Text Speech Speech


# HOW TO CONVERT DICTIONARY INTO LIST # woww

In [54]:
dict1={'name':'kunal','age':28 }
dict1

{'name': 'kunal', 'age': 28}

In [55]:
list1=list(dict1.keys())
list1

['name', 'age']

In [56]:
list2=list(dict1.values())
list2

['kunal', 28]

# LETS TRY WITH 3RD NER MODEL (huggingface/bert-base-cased-finetuned-mrpc)

In [20]:
tokenizer=AutoTokenizer.from_pretrained("huggingface/bert-base-cased-finetuned-mrpc")

OSError: huggingface/bert-base-cased-finetuned-mrpc is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

# THIS IS SHOWING ERRORBECAUSE, THIS MODEL IS NOT FINE TUNED FOR NAMED ENTITY RECOGNITION.