### Defining a NER (Named Entity Recognition) model for token classification using BERT

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
NER_MODEL_ID = "dslim/bert-base-NER"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)

nlp = pipeline(
    task="ner", 
    model=model, 
    tokenizer=tokenizer,
    device='cpu',
    aggregation_strategy='max'
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Defining a model for Transforming sentence to Vectors

In [4]:
from sentence_transformers import SentenceTransformer

In [5]:
TRANSFORMER_MODEL_ID = "flax-sentence-embeddings/all_datasets_v3_mpnet-base"

In [6]:
retriever = SentenceTransformer(TRANSFORMER_MODEL_ID)

### Initilizing a Pine cone Index for storing the vectors

In [7]:
from pinecone import Pinecone, PodSpec
from dotenv import load_dotenv

In [8]:
load_dotenv(dotenv_path="../.env")

True

In [9]:
pc = Pinecone()

In [10]:
pc.create_index(
    name="ner-embedding",
    metric="cosine",
    dimension=768,
    spec=PodSpec(
        environment='gcp-starter'
    )
)

In [11]:
idx = pc.Index('ner-embedding')

### Loading data from Hugging Face Datasets

In [12]:
from datasets import load_dataset

In [13]:
DATASET_ID = "fabiochiu/medium-articles"
df = load_dataset(
    DATASET_ID,
    data_files="medium_articles.csv",
    split="train"
).to_pandas()

df = df.dropna().sample(100, random_state=45)

In [14]:
df['text_extended'] = df['title'] + "." + df['text'].str[0:1000]

In [15]:
def extract_entities(list_of_texts):
    entities = []
    for text in list_of_texts:
        extracted_ners = nlp(text)
        entities.append([entity['word'] for entity in extracted_ners])
    return entities

In [16]:
from tqdm.auto import tqdm

In [20]:
BATCH_SIZE = 10
for i in range(0, len(df), BATCH_SIZE):
    end = min(i + BATCH_SIZE, len(df))
    df_batch = df.iloc[i:end]
    # embed the text
    emb = retriever.encode(
        df_batch['text_extended'].tolist()
    ).tolist()
    # extract entities
    entities = extract_entities(df_batch['text_extended'].tolist())
    df_batch['named_entities'] = [list(set(e)) for e in entities]
    # removing text from df_batch
    df_batch = df_batch.drop(columns=['text'])
    # create metadata
    meta_data = df_batch.to_dict(orient='records')
    # creating the index
    ids = [str(i) for i in range(i, end)]

    vectors = list(zip(ids, emb, meta_data))

    idx.upsert(vectors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batch['named_entities'] = [list(set(e)) for e in entities]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batch['named_entities'] = [list(set(e)) for e in entities]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batch['named_entities'] = [list(set(e)) for e in entities]
A value is trying t

### Querying data

In [30]:
query = "ios keyboard not working."
qx = retriever.encode([query]).tolist()

ne = extract_entities([query])[0]

[]

In [32]:
idx.query(
    vector=qx,
    top_k=5,
    include_metadata=True,
    filter={
        "named_entities": {
            "$in": ne
        }
    }
)

{'matches': [{'id': '95',
              'metadata': {'authors': "['Federica Benacquista']",
                           'named_entities': ['iPad', 'iOS', 'iPhone'],
                           'tags': "['Swift', 'iPhone', 'iPad', 'iOS', "
                                   "'Keyboard']",
                           'text_extended': 'List of the official iOS '
                                            'keyboards’ heights (and how to '
                                            'calculate them).Recently I have '
                                            'been working on a custom keyboard '
                                            'extension.\n'
                                            '\n'
                                            'I wanted it to be of the same '
                                            'size of the original iOS '
                                            'keyboard. I noticed that almost '
                                            'all of the devices hav