# Entity Recognition

In [1]:
# import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

import json

from wordcloud import WordCloud

import nltk

# Load the Dataset

In [2]:
# Read the intents file. JSON text is UTF-8, so the encoding is pinned here
# instead of relying on the platform's default locale encoding.
with open("Healthcare.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build the working DataFrame from the 'intents' entry of the loaded JSON.
df = pd.DataFrame(data['intents'])
df

Unnamed: 0,tag,patterns,responses
0,greeting,"[Hi, Hey, Is anyone there?, Hi there, Hello, H...",[Hello there. Tell me how are you feeling toda...
1,morning,[Good morning],[Good morning. I hope you had a good night's s...
2,afternoon,[Good afternoon],[Good afternoon. How is your day going?]
3,evening,[Good evening],[Good evening. How has your day been?]
4,night,[Good night],"[Good night. Get some proper sleep, Good night..."
...,...,...,...
75,fact-28,[What do I do if I'm worried about my mental h...,[The most important thing is to talk to someon...
76,fact-29,[How do I know if I'm unwell?],"[If your beliefs , thoughts , feelings or beha..."
77,fact-30,[How can I maintain social connections? What i...,"[A lot of people are alone right now, but we d..."
78,fact-31,[What's the difference between anxiety and str...,[Stress and anxiety are often used interchange...


In [3]:
# Flatten the intents table: one output row per individual pattern, each row
# carrying its intent's tag and that intent's full list of responses.
dic = {"tag": [], "patterns": [], "responses": []}

# Walk the three columns in lockstep instead of re-filtering the frame with
# `df[df.index == i]` for every field: that mask scans the whole frame on each
# lookup and only works when the index is exactly 0..n-1.
for tag, ptrns, rspns in zip(df['tag'], df['patterns'], df['responses']):
    for pattern in ptrns:
        dic['tag'].append(tag)
        dic['patterns'].append(pattern)
        dic['responses'].append(rspns)

# NOTE: `df` is rebound to the flattened frame here, so this cell must be
# preceded by the load cell whenever it is re-run.
df = pd.DataFrame.from_dict(dic)

df


Unnamed: 0,tag,patterns,responses
0,greeting,Hi,[Hello there. Tell me how are you feeling toda...
1,greeting,Hey,[Hello there. Tell me how are you feeling toda...
2,greeting,Is anyone there?,[Hello there. Tell me how are you feeling toda...
3,greeting,Hi there,[Hello there. Tell me how are you feeling toda...
4,greeting,Hello,[Hello there. Tell me how are you feeling toda...
...,...,...,...
227,fact-29,How do I know if I'm unwell?,"[If your beliefs , thoughts , feelings or beha..."
228,fact-30,How can I maintain social connections? What if...,"[A lot of people are alone right now, but we d..."
229,fact-31,What's the difference between anxiety and stress?,[Stress and anxiety are often used interchange...
230,fact-32,What's the difference between sadness and depr...,"[Sadness is a normal reaction to a loss, disap..."


<br/>    

## POS Tagging                

POS tagging (part-of-speech tagging) is the process of marking up each word in a text with its part of speech, based on both its definition and its context. It reads text in a given language and assigns a specific tag to each word. It is also called grammatical tagging.

In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [5]:
# Shared NLTK helpers, built once when the cell runs instead of on every call,
# so tagging many sentences does not repeat the stop-word lookup and the
# tokenizer/lemmatizer construction for each one.
_POS_TOKENIZER = RegexpTokenizer(r'\w+')
_POS_STOP_WORDS = frozenset(stopwords.words('english'))
_POS_LEMMATIZER = WordNetLemmatizer()


# Function to perform POS tagging
def pos_tagging(sentence):
    """Return NLTK POS tags for the non-stop-word tokens of ``sentence``.

    The sentence is split into runs of word characters (punctuation is
    dropped), English stop words are removed case-insensitively, the
    remaining tokens are lemmatized, and the result is passed to ``pos_tag``.

    Parameters
    ----------
    sentence : str
        Text to tokenize and tag.

    Returns
    -------
    list
        Whatever ``pos_tag`` returns for the lemmatized tokens; the notebook
        output shows a list of ``(token, tag)`` pairs. The list is empty when
        every token is a stop word.

    Notes
    -----
    NOTE(review): tagging runs *after* stop-word removal and lemmatization,
    so the tagger only sees the reduced token list, not the full sentence.
    Confirm this is intended before relying on the tags.
    """
    # Tokenize the sentence
    tokens = _POS_TOKENIZER.tokenize(sentence)

    # Remove stopwords (comparison is done on the lower-cased token)
    filtered_tokens = [w for w in tokens if w.lower() not in _POS_STOP_WORDS]

    # Lemmatize the tokens
    lemmatized_tokens = [_POS_LEMMATIZER.lemmatize(w) for w in filtered_tokens]

    # Perform POS tagging
    return pos_tag(lemmatized_tokens)

# Tag every pattern and keep the per-row result in a new 'Pos tag' column
df['Pos tag'] = df['patterns'].apply(pos_tagging)

In [6]:
# Preview the first rows to check the newly added 'Pos tag' column
df.head()

Unnamed: 0,tag,patterns,responses,Pos tag
0,greeting,Hi,[Hello there. Tell me how are you feeling toda...,"[(Hi, NN)]"
1,greeting,Hey,[Hello there. Tell me how are you feeling toda...,"[(Hey, NN)]"
2,greeting,Is anyone there?,[Hello there. Tell me how are you feeling toda...,"[(anyone, NN)]"
3,greeting,Hi there,[Hello there. Tell me how are you feeling toda...,"[(Hi, NN)]"
4,greeting,Hello,[Hello there. Tell me how are you feeling toda...,"[(Hello, NN)]"


## Entity 

In [7]:
import spacy

# Load the spaCy English language model; the resulting `nlp` object is shared
# by every spaCy-based helper and cell in this notebook.
# NOTE(review): assumes the `en_core_web_sm` model package is already installed
# in the kernel's environment.
nlp = spacy.load("en_core_web_sm")

# Named Entity Recognition (NER) helper built on the shared spaCy pipeline
def recognize_entities_spacy(sentence):
    """Return the label of every named entity spaCy finds in ``sentence``.

    Only the labels are kept, in the order of ``doc.ents``; the entity text
    itself is discarded. The list is empty when no entity is found.
    """
    labels = []
    for span in nlp(sentence).ents:
        labels.append(span.label_)
    return labels

# Run the spaCy NER helper over every pattern; the labels found for each row
# are stored in a new 'entity labels' column
df['entity labels'] = df['patterns'].apply(recognize_entities_spacy)

In [8]:
# Preview the first rows to check the newly added 'entity labels' column
df.head()

Unnamed: 0,tag,patterns,responses,Pos tag,entity labels
0,greeting,Hi,[Hello there. Tell me how are you feeling toda...,"[(Hi, NN)]",[]
1,greeting,Hey,[Hello there. Tell me how are you feeling toda...,"[(Hey, NN)]",[]
2,greeting,Is anyone there?,[Hello there. Tell me how are you feeling toda...,"[(anyone, NN)]",[]
3,greeting,Hi there,[Hello there. Tell me how are you feeling toda...,"[(Hi, NN)]",[]
4,greeting,Hello,[Hello there. Tell me how are you feeling toda...,"[(Hello, NN)]",[]


## Visualize the Named Entities

In [9]:
from spacy import displacy

# Process the pattern stored at index label 75 (not the first row)
doc = nlp(df.loc[75, 'patterns'])

# Visualize the named entities inline in the notebook
displacy.render(doc, style="ent", jupyter=True)

## Converting the POS and entity columns into lists

In [10]:
# Token-level POS tags and IOB entity tags for a single sentence
def extract_pos_and_entities(sentence):
    """Return ``(pos_tags, entities)`` for ``sentence``, one item per token.

    ``pos_tags`` holds each token's ``pos_`` value. ``entities`` holds
    ``'O'`` for tokens whose ``ent_iob_`` is ``'O'`` and otherwise the IOB
    prefix joined to the entity type with a dash (for example ``'B-DATE'``).
    Both lists have the same length as the spaCy document.
    """
    pos_tags = []
    entities = []
    for token in nlp(sentence):
        pos_tags.append(token.pos_)
        if token.ent_iob_ == 'O':
            entities.append('O')
        else:
            entities.append(token.ent_iob_ + "-" + token.ent_type_)
    return pos_tags, entities

# Analyse each pattern once. Iterating the 'patterns' column directly avoids
# `iterrows()`, which builds a full Series per row only to read one field.
analysed = [extract_pos_and_entities(sentence) for sentence in df['patterns']]
pos_tags_list = [pos_tags for pos_tags, _ in analysed]
entity_list = [entities for _, entities in analysed]

# Add new columns to the DataFrame (one list of tags per row)
df['POS'] = pos_tags_list
df['entity'] = entity_list

In [11]:
# Preview the first rows to check the newly added 'POS' and 'entity' columns
df.head()

Unnamed: 0,tag,patterns,responses,Pos tag,entity labels,POS,entity
0,greeting,Hi,[Hello there. Tell me how are you feeling toda...,"[(Hi, NN)]",[],[INTJ],[O]
1,greeting,Hey,[Hello there. Tell me how are you feeling toda...,"[(Hey, NN)]",[],[INTJ],[O]
2,greeting,Is anyone there?,[Hello there. Tell me how are you feeling toda...,"[(anyone, NN)]",[],"[AUX, PRON, ADV, PUNCT]","[O, O, O, O]"
3,greeting,Hi there,[Hello there. Tell me how are you feeling toda...,"[(Hi, NN)]",[],"[INTJ, ADV]","[O, O]"
4,greeting,Hello,[Hello there. Tell me how are you feeling toda...,"[(Hello, NN)]",[],[INTJ],[O]


In [12]:
# Delete the intermediate columns that are no longer needed
# (`columns=` already selects the axis, so no separate `axis` argument is passed)
df = df.drop(columns=['Pos tag', 'entity labels'])

In [13]:
# Collect every token-level entity tag across all rows
all_entities = [label for labels in df['entity'] for label in labels]

# Extract unique entities
unique_entities = set(all_entities)

# Print unique entities in sorted order: iterating the set directly yields an
# arbitrary order that can differ between kernel sessions, which makes the
# printed list hard to compare across runs.
print("Unique Entities:")
for entity in sorted(unique_entities):
    print(entity)

Unique Entities:
O
B-DATE
B-PERSON
B-TIME
B-ORG
I-ORG
I-DATE


In [14]:
# Integer code assigned to each IOB entity tag found in the 'entity' column
entity_mapping = {
    'O': 1,
    'I-ORG': 2,
    'B-DATE': 3,
    'B-PERSON': 4,
    'I-DATE': 5,
    'B-TIME': 6,
    'B-ORG': 7,
}

# Translate each row's list of tags into the matching list of integer codes;
# a tag that is missing from the dictionary raises KeyError.
df['entity_mapped'] = df['entity'].apply(
    lambda tags: [entity_mapping[tag] for tag in tags]
)
df.head()

Unnamed: 0,tag,patterns,responses,POS,entity,entity_mapped
0,greeting,Hi,[Hello there. Tell me how are you feeling toda...,[INTJ],[O],[1]
1,greeting,Hey,[Hello there. Tell me how are you feeling toda...,[INTJ],[O],[1]
2,greeting,Is anyone there?,[Hello there. Tell me how are you feeling toda...,"[AUX, PRON, ADV, PUNCT]","[O, O, O, O]","[1, 1, 1, 1]"
3,greeting,Hi there,[Hello there. Tell me how are you feeling toda...,"[INTJ, ADV]","[O, O]","[1, 1]"
4,greeting,Hello,[Hello there. Tell me how are you feeling toda...,[INTJ],[O],[1]


<br/>           

# NER 

Named Entity Recognition (NER) is a subtask of Natural Language Processing (NLP) that identifies named entities in a text and classifies them into predefined categories such as person names, organizations and locations. It can be used in many fields and industries, for example in question answering and sentiment analysis.

In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint used for both the tokenizer and the token-classification model
# https://huggingface.co/dslim/bert-base-NER
NER_CHECKPOINT = 'dslim/bert-base-NER'

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
import torch
from transformers import pipeline

# Run on the GPU when one is available, otherwise fall back to the CPU so the
# cell does not fail on machines without CUDA.
ner_device = 'cuda' if torch.cuda.is_available() else 'cpu'

pipe = pipeline(task='ner', model=model, tokenizer=tokenizer, device=ner_device, torch_dtype=torch.bfloat16)

# Show which device the pipeline ended up on
pipe.device

device(type='cuda')

In [18]:
# Sample sentence used to try out the NER pipeline
prompt="I am Aisuko and I am studying on RMIT university."

# Run the pipeline; each returned entry describes one tagged word-piece
# (entity label, score, token index, text and character offsets).
# NOTE(review): the output lists sub-word pieces such as '##su' separately;
# the pipeline's `aggregation_strategy` option may be worth checking if
# whole-entity spans are wanted.
result=pipe(prompt)
result

[{'entity': 'B-PER',
  'score': 0.9960129,
  'index': 3,
  'word': 'Ai',
  'start': 5,
  'end': 7},
 {'entity': 'B-PER',
  'score': 0.89774305,
  'index': 4,
  'word': '##su',
  'start': 7,
  'end': 9},
 {'entity': 'B-PER',
  'score': 0.78208786,
  'index': 5,
  'word': '##ko',
  'start': 9,
  'end': 11},
 {'entity': 'B-LOC',
  'score': 0.6383068,
  'index': 11,
  'word': 'R',
  'start': 33,
  'end': 34}]