# NER Powered Semantic Search Using Pinecone v5.0.0

### Setup Environment

In [None]:
import os

from dotenv import load_dotenv
from tqdm.autonotebook import tqdm
from pinecone import Pinecone, ServerlessSpec 
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

pinecone_key = os.getenv("PINCONE_API_KEY")

  from tqdm.autonotebook import tqdm


In [2]:
# init pinecone

# from pinecone import Pinecone, ServerlessSpec
# API_KEY = "YOUR API KEY"
pc = Pinecone(api_key = pinecone_key)

index = pc.Index("medium-data")

In [None]:
# clean up pinecone index, after deleting all vectors if you run it again you will get error

index.delete(delete_all=True)

In [4]:
# delete index , dimension no longer useful
pc.delete_index("medium-data")

In [5]:
# load libraries for NER 

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch


### NER Engine

In [6]:
# init NER engine

model_id = 'dslim/bert-base-NER'

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

# nlp pipeline

nlp = pipeline('ner',
              model=model,
              tokenizer=tokenizer,
              aggregation_strategy= 'max',
              device= 'cpu') 

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [7]:
nlp("Bill Gates is the founder of Microsoft")

[{'entity_group': 'PER',
  'score': 0.9997382,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.99829453,
  'word': 'Microsoft',
  'start': 29,
  'end': 38}]

### Retriever

In [8]:
# load libraries for retriever

from sentence_transformers import SentenceTransformer


# https://huggingface.co/flax-sentence-embeddings/all_datasets_v3_mpnet-base
retriever = SentenceTransformer(
    "flax-sentence-embeddings/all_datasets_v3_mpnet-base")

In [9]:
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

## 'word_embedding_dimension': 768 
refers to the size of the vector used to represent a single word (or token) in a word embedding space.for example, the word "apple" might be represented as:
ex: [0.25, -0.17, 0.38, ..., 0.91]  → (768 numbers total)

In [10]:
# Create Pinecone Index
pc.create_index("medium-data", dimension= 768, metric="cosine",
                     spec=ServerlessSpec(cloud="aws", region="us-east-1"))


{
    "name": "medium-data",
    "metric": "cosine",
    "host": "medium-data-rcv72uk.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [11]:
index= pc.Index("medium-data")

### Data Prep

In [12]:
from datasets import load_dataset

In [None]:
# Obtain Raw Data

# df = load_dataset(
#     "fabiochiu/medium-articles",
#     data_files="medium_articles.csv",
#     split="train"
# ).to_pandas()

# df = df.dropna().sample(10000, random_state=45) # might take 30mins to 1hr

# df['text_extended'] = df['title'] + '.' + df['text'].str[:1000]


In [13]:
# Incase your internet is slow and couldn't make "dataset" works, you can download the file I uploaded as "medium_articles_10k.csv"
# Source of data: https://www.kaggle.com/code/fabiochiusano/medium-articles-simple-data-analysis?select=medium_articles.csv
# it is the same underlying data

import pandas as pd
df = pd.read_csv("C:\\Users\\somashekar.inguva\\OneDrive - Telstra\\Documents\\GenAI_Projects\\Master-VectorDB_Udmy\\Proj2-NER\\medium_articles_10k\\medium_articles_10k.csv")


In [91]:
df = df.dropna().sample(100, random_state=45) # 
df['text_extended'] = df['title'] + '.' + df['text'].str[:1000]

In [None]:
# df.dropna().sum()

Unnamed: 0                                                 4898980
title            Why You’re Struggling with Innovation. And How...
text             The modern typewriter had a problem. When Chri...
url              https://jswilder16.medium.com/why-youre-strugg...
authors          ['Jake Wilder']['Kelvin Zhao']['Paul Newton'][...
timestamp        2019-04-22 01:46:42.469000+00:002020-03-24 23:...
tags             ['Management', 'Leadership', 'Innovation', 'Cr...
text_extended    Why You’re Struggling with Innovation. And How...
dtype: object

In [92]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,url,authors,timestamp,tags,text_extended
5884,5884,How to Do Cocos-BCX MainNet Token Mapping?,Attention\n\nPlease identify the official desi...,https://medium.com/bitpie/how-to-do-cocos-bcx-...,['Bitpie Wallet'],2020-04-08 14:04:17.837000+00:00,"['Blochchain', 'Bitpie', 'Games', 'Cocos Bcx',...",How to Do Cocos-BCX MainNet Token Mapping?.Att...
9165,9165,Decision Tree from Scratch in Python,Decision trees are among the most powerful Mac...,https://towardsdatascience.com/decision-tree-f...,['Joachim Valente'],2019-10-31 21:44:02.664000+00:00,"['Machine Learning', 'Python', 'Scikit Learn',...",Decision Tree from Scratch in Python.Decision ...


In [93]:
# len(nlp(df_batch)) # list of lst

In [94]:
df['text_extended']

5884    How to Do Cocos-BCX MainNet Token Mapping?.Att...
9165    Decision Tree from Scratch in Python.Decision ...
3313    You Don’t Need Therapy.In comes the vacuum.\n\...
5567    Design & the military: a love story.Collage by...
4866    From Published Author to Successful Ghostwrite...
                              ...                        
2221    If God Doesn’t Exist ….If God Doesn’t Exist …\...
3525    What makes COVID-19 so scary for some and not ...
3022    How to run Jupyter Notebook on Andriod?.Recent...
6496    Docker container as an executable to process i...
3536    Before We Had Google, There Was Googie Archite...
Name: text_extended, Length: 100, dtype: object

In [95]:
df_batch = df['text_extended'].iloc[0:10].tolist()
# type(nlp(df_batch))

In [96]:
df_batch

['How to Do Cocos-BCX MainNet Token Mapping?.Attention\n\nPlease identify the official designated mapping address of Cocos-BCX：0xAC1E002563E0945ad8F1c193171e3ce2617B269e\n\nDO NOT send your ERC20 tokens from exchanges to the above address!\n\nPlease beware of fraudulent websites and impersonating social media accounts.\n\nThis mapping can be done in two ways currently: Bitpie Wallet for mobile users Mapping and MetaMask Extension Mapping for PC users . Exchange Mapping will be available soon.\n\nIt is highly recommended that you use a safe ERC20 address to swap.\n\nThe mapping ratio is 1:1, i.e. 1 ERC20 COCOS = 1 MainNet COCOS\n\nPlease beware that a transcation hash can only be mapped once. It takes some time to complete the entire mapping, so do not repeat the operation.\n\nPlease beware that all mapping applications on the day will be processed in batches at 6 pm every day.\n\nBitpie Wallet Mapping for Mobile Users\n\nTransfer ERC20 COCOS to the official designated mapping address o

In [97]:
nlp(df_batch)

[[{'entity_group': 'MISC',
   'score': 0.930069,
   'word': 'Cocos',
   'start': 10,
   'end': 15},
  {'entity_group': 'LOC',
   'score': 0.8751905,
   'word': 'Cocos',
   'start': 113,
   'end': 118},
  {'entity_group': 'ORG',
   'score': 0.6547991,
   'word': 'Bitpie Wallet',
   'start': 362,
   'end': 375},
  {'entity_group': 'LOC',
   'score': 0.7768182,
   'word': 'Cocos',
   'start': 980,
   'end': 985}],
 [{'entity_group': 'MISC',
   'score': 0.9296101,
   'word': 'Python',
   'start': 30,
   'end': 36},
  {'entity_group': 'MISC',
   'score': 0.9792417,
   'word': 'Machine Learning',
   'start': 80,
   'end': 96},
  {'entity_group': 'MISC',
   'score': 0.8895789,
   'word': 'Ad',
   'start': 182,
   'end': 184},
  {'entity_group': 'MISC',
   'score': 0.951351,
   'word': 'Ranking of Airbnb',
   'start': 219,
   'end': 236},
  {'entity_group': 'MISC',
   'score': 0.98232424,
   'word': 'Python',
   'start': 398,
   'end': 404},
  {'entity_group': 'MISC',
   'score': 0.99536765,
 

In [98]:
type(nlp(df_batch))

list

In [64]:
nlp(df_batch[1])

[]

In [99]:
entities=[]
for doc in df_batch:
    entities.append([item['word'] for item in nlp(doc)])

In [100]:
entities

[['Cocos', 'Cocos', 'Bitpie Wallet', 'Cocos'],
 ['Python',
  'Machine Learning',
  'Ad',
  'Ranking of Airbnb',
  'Python',
  'Wireless Indoor Localization Dataset'],
 ['Bangor University', 'Paloma Mari', 'Beffa', 'University of Toronto'],
 ['Vittoria Casanova',
  'Vittoria Casanova',
  'DARPA',
  'Defense Advanced Research Projects Agency'],
 ['Author Academy Elite',
  'April Tribe',
  'Giauque',
  'Pinpoints of Light',
  'of Abuse',
  'Author Academy Elite'],
 ['Excel',
  'API',
  'Microsoft Excel',
  'Python',
  'Excel',
  'Python',
  'BitMEX Python'],
 ['Classification'],
 ['Timbre', 'Christian Tronhjem', 'Jingle Bells'],
 ['North Carolina'],
 []]

In [101]:
len(nlp(df_batch))

10

### NER Helper Function

In [102]:
# helper function for extracting entities of a batch of texts

def extract_entities(list_of_text):
    entities = []
    for doc in list_of_text: 
        entities.append([item['word'] for item in nlp(doc)])
        # list of entities for 1 doc
    return entities

In [22]:
# embedding

# len(retriever.encode(df_batch))
# len(retriever.encode(df_batch[0])) # try for one doc
# embedding for batch
# emb = retriever.encode(df_batch).tolist() # array to python list

In [103]:
len(retriever.encode(df_batch)),len(retriever.encode(df_batch[0]))

(10, 768)

In [104]:
type(retriever.encode(df_batch)) 
## its 2-dim arrey, we need to convert it into list so that it will becoem 1-dim arrey.

numpy.ndarray

In [105]:
retriever.encode(df_batch)

array([[ 0.0084957 , -0.10262306, -0.02961361, ..., -0.03555423,
        -0.05429437, -0.06234365],
       [-0.02391333,  0.03421886, -0.03572325, ...,  0.01053549,
        -0.00123093,  0.01541532],
       [ 0.03476762,  0.001085  ,  0.00582695, ..., -0.02344149,
         0.00610463, -0.01379733],
       ...,
       [-0.00229889, -0.0406497 ,  0.01282074, ...,  0.03404196,
        -0.00684608,  0.02599859],
       [ 0.02891026,  0.02182649, -0.01790925, ...,  0.0363367 ,
         0.00060563, -0.02878629],
       [ 0.02610229, -0.02072231, -0.0319979 , ...,  0.03341003,
         0.04339864,  0.03243241]], dtype=float32)

In [106]:
retriever.encode(df_batch).tolist()

[[0.008495700545608997,
  -0.10262306034564972,
  -0.029613612219691277,
  -0.009123475290834904,
  0.019854506477713585,
  0.04835822805762291,
  0.014488054439425468,
  -0.019423414021730423,
  -0.01720793917775154,
  0.01985754445195198,
  -0.021005695685744286,
  -0.0038130655884742737,
  0.03786206245422363,
  0.07902691513299942,
  0.008478165604174137,
  0.039862651377916336,
  0.03095141239464283,
  0.023398229852318764,
  -0.026505809277296066,
  -0.007136410567909479,
  -0.010985967703163624,
  0.0008891942561604083,
  0.02906854823231697,
  -0.005058735143393278,
  -0.044095247983932495,
  -0.031364742666482925,
  -0.03318008780479431,
  -0.010379874147474766,
  0.014900004491209984,
  -0.02142835222184658,
  0.07018176466226578,
  -0.005831895861774683,
  -8.950694609666243e-05,
  -0.012581727467477322,
  3.358860922730855e-08,
  -0.01074676588177681,
  -0.015935776755213737,
  0.040235187858343124,
  -0.024387052282691002,
  -0.04192738234996796,
  -0.03169703111052513,
  

In [107]:
len(retriever.encode(df_batch).tolist())

10

In [87]:
retriever.encode(df_batch).tolist()[1]

[-0.007231200113892555,
 0.037665657699108124,
 0.007895044051110744,
 -0.019433384761214256,
 0.01730756089091301,
 0.04458227381110191,
 0.05629997327923775,
 0.05285406485199928,
 -0.04146132245659828,
 0.0009877191623672843,
 -0.03006870299577713,
 -0.021460991352796555,
 -0.01987467147409916,
 -0.0022885994985699654,
 0.022464066743850708,
 0.012328406795859337,
 0.043552156537771225,
 -0.02033931575715542,
 -0.044524066150188446,
 -0.054861899465322495,
 -0.03169975429773331,
 -0.012820892035961151,
 -0.007092684507369995,
 -0.01011401042342186,
 -0.02738131955265999,
 0.024901574477553368,
 -0.030977219343185425,
 0.06288547813892365,
 -0.049061305820941925,
 -0.00025427923537790775,
 -0.03588167205452919,
 -0.032293517142534256,
 -0.008915229700505733,
 0.04212820157408714,
 3.2855606235671075e-08,
 0.014891455881297588,
 0.04272378236055374,
 -0.03556177020072937,
 -0.006164073012769222,
 0.039662107825279236,
 -0.011467867530882359,
 -0.013349625281989574,
 0.0088890362530946

In [None]:
type(retriever.encode(df_batch).tolist()) ### list of lists.

list

In [108]:
emb = retriever.encode(df_batch).tolist()
# emb

### Batch Upsert

In [None]:
# upsert data

from tqdm.auto import tqdm

batch_size = 10

for i in range(0, len(df), batch_size):
    i_end = min(i+batch_size, len(df))
    # print(i, i_end) # starting and ending index of each batch
    
    # get a batch of data
    df_batch = df.iloc[i: i_end].copy()
    
    # embedding
    emb = retriever.encode(df_batch['text_extended'].tolist()
                          ).tolist() # array to python list
    
    # ner extraction
    entities = extract_entities(df_batch['text_extended'].tolist())
    
    # [[]] --> [set1, set2, ], remove duplicate entities    
    df_batch['named_entity'] = [list(set(entity)) for entity in entities] # one list per document
    
    # create meta data
    df_batch = df_batch.drop('text', axis=1)
    
    meta_data = df_batch.to_dict(orient='records') # pd.df to dictionary
    
    # create ids
    
    ids = [f"{idx}" for idx in range(i, i_end)] #
    
    # upsert
    
    vectors_to_upsert = list(zip(ids, emb, meta_data))  # nd array to python list
    
    _ = index.upsert(vectors= vectors_to_upsert)  
    
    



In [111]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 576}},
 'total_vector_count': 576,
 'vector_type': 'dense'}

### Query data

In [112]:
query = "How to make a Wordpress website?" 
qx = retriever.encode(query)
qx

array([ 1.36483200e-02,  2.17883084e-02, -8.97393934e-03, -4.70352769e-02,
        1.56242093e-02,  5.21264412e-02, -4.19059508e-02,  4.88967495e-03,
       -4.24955413e-02,  3.32517326e-02,  5.87665942e-03,  1.89076643e-03,
        1.78067130e-03,  2.92297527e-02, -1.80451609e-02,  2.66481913e-03,
       -1.82629973e-02, -6.72399020e-03, -3.82122286e-02, -1.48898615e-02,
        3.40755396e-02,  3.29952762e-02, -4.95636798e-02, -3.09998821e-02,
       -2.69161025e-03, -2.06722487e-02, -3.71319195e-03, -3.36920354e-03,
       -9.61447358e-02, -1.74136534e-02, -5.42662106e-02,  1.29500907e-02,
        1.20961117e-02,  4.40454343e-03, -1.46143959e-08,  3.05563770e-03,
       -2.72159325e-03, -4.03201766e-02, -7.59574177e-04, -2.30948012e-02,
        5.08132577e-02,  6.32825717e-02,  2.26843990e-02,  4.63504978e-02,
       -8.99913348e-03, -5.69515266e-02,  4.65453416e-02,  6.64430931e-02,
        7.13941604e-02, -2.27958653e-02,  5.00145322e-03, -6.75273538e-02,
        7.66330073e-03, -

In [113]:
query = "How to make a Wordpress website?"  # Natural Language

emb_qx = retriever.encode(query).tolist() # embedded query vector

ne = extract_entities([query])[0] # Named entity as a search filter

In [119]:
# "Author Academy Elite"
query = "Tell me about Author Academy Elite"  # Natural Language

emb_qx = retriever.encode(query).tolist() # embedded query vector

ne = extract_entities([query])[0] # Named entity as a search filter

In [120]:
emb_qx

[0.0017449896549805999,
 0.022570950910449028,
 0.007740634027868509,
 0.030659187585115433,
 -0.03819498047232628,
 0.018273232504725456,
 0.012639783322811127,
 -0.004862742964178324,
 -0.07868913561105728,
 0.02615378051996231,
 0.006136585026979446,
 0.024742446839809418,
 0.011862334795296192,
 0.10371746867895126,
 0.001096630934625864,
 0.012420540675520897,
 -0.02005913481116295,
 -0.030816493555903435,
 0.06665236502885818,
 -0.026120267808437347,
 -0.0330062136054039,
 -0.007330159656703472,
 -0.0855514407157898,
 0.022471217438578606,
 0.061739467084407806,
 -0.050941865891218185,
 0.012104547582566738,
 0.0030882342252880335,
 0.023129351437091827,
 0.021357346326112747,
 -0.017829731106758118,
 -0.02637111395597458,
 -0.00857482198625803,
 0.006754905916750431,
 -6.8915695372595565e-09,
 -0.0300959050655365,
 -0.02600398100912571,
 -0.01912151277065277,
 -0.016218002885580063,
 0.028863050043582916,
 0.0450560562312603,
 0.13240943849086761,
 -0.002381929662078619,
 -0.027

In [121]:
ne

['Author Academy', 'Elite']

In [1]:
index.query?

Object `index.query` not found.


In [122]:
xc = index.query(vector=emb_qx, top_k= 5, include_metadata=True,
           filter = {"named_entity": {"$in" : ne}})

In [128]:
type(xc)

pinecone.core.openapi.db_data.model.query_response.QueryResponse

In [125]:
# you might not find any match if you are only upserting 1k data because of insufficient data there might not be good match, 
# try to load more data or tweak query based on data (glance over pinecone console and look for text_extended field in your vectors)
for result in xc['matches']:
    print(result['score'], " ", result['metadata']['named_entity'])

In [126]:
query = "How to learn NLP?"  # Natural Language

emb_qx = retriever.encode(query).tolist() # embedded query vector

ne = extract_entities([query])[0] # Named entity as a search filter

xc = index.query(vector=emb_qx, top_k= 5, include_metadata=True,
           filter = {"named_entity": {"$in" : ne}  })

In [127]:
for result in xc['matches']:
    print(result['score'], " ", result['metadata']['named_entity'])

0.284014463   ['NLP', 'PyTorch', 'Allen NLP', 'Allen']
0.242224589   ['Lambda Layer', 'NLP']
0.237039223   ['Stephen King', 'The Shining', 'NLP', 'King', 'The Dark Tower', 'Under the Dome', 'It', 'Text Mining', 'Carrie']
0.211129501   ['Your Weekly AI', 'NLP', 'AI', 'ML', 'Machine Learning and Data Science']


In [None]:
for result in xc['matches']: ## Auto Saved result of Udemy INstructor
    print(result['score'], " ", result['metadata']['named_entity'])

0.28366977   ['Allen', 'Allen NLP', 'NLP', 'PyTorch']
0.263163775   ['Python', 'Harnham', 'NLP', 'LDA', 'Science', 'London', 'Datatech Analytics', 'Data']
0.241720855   ['NLP', 'Lambda Layer']
0.236823067   ['The Dark Tower', 'It', 'NLP', 'Text Mining', 'Carrie', 'Stephen King', 'King', 'Under the Dome', 'The Shining']
0.211078629   ['AI', 'ML', 'NLP', 'Your Weekly AI', 'Machine Learning and Data Science']
