# NER Powered Semantic Search 

### Setup Environment

In [2]:
# init pinecone

import os

import pinecone
from tqdm.autonotebook import tqdm  # progress-bar helper; picks notebook or console rendering

# Credentials are read from the environment so the cell works on a fresh kernel
# and no secret is ever stored in the notebook. A missing variable raises a
# KeyError that names the variable to set.
API_KEY = os.environ["PINECONE_API_KEY"]
ENV = os.environ["PINECONE_ENVIRONMENT"]

pinecone.init(api_key=API_KEY, environment=ENV)

# Handle to the "medium-data" index used by the cells below.
index = pinecone.Index("medium-data")

  from tqdm.autonotebook import tqdm


In [3]:
# clean up pinecone index
# delete_all=True requests removal of every stored vector, so later upserts
# start from an empty index (presumably the default namespace only; confirm
# against the Pinecone client docs if namespaces are in use).
index.delete(delete_all=True)

In [5]:
# delete index; its dimension is no longer useful

In [9]:
# load libraries for NER 

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch


### NER Engine

In [None]:
# init NER engine

model_id = 'dslim/bert-base-NER'

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
# The integer form (0 = first GPU, -1 = CPU) is used for the pipeline's
# `device` argument instead of the original hard-coded 'cpu' string.
device = 0 if torch.cuda.is_available() else -1

# nlp pipeline
# aggregation_strategy='max' makes the pipeline return grouped entities rather
# than one result per word piece; each result dict carries a 'word' key.

nlp = pipeline('ner',
               model=model,
               tokenizer=tokenizer,
               aggregation_strategy='max',
               device=device)
# nlp("Bill Gates is the founder of Microsoft")

### Retriever

In [None]:
# load libraries for retriever

from sentence_transformers import SentenceTransformer


# Sentence-embedding model used for both documents and queries.
# Model card: https://huggingface.co/flax-sentence-embeddings/all_datasets_v3_mpnet-base
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base")

In [18]:
# Display the retriever's module stack; the Pooling step reports the embedding
# width, which the Pinecone index dimension has to match.
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:
# Create Pinecone Index
# create_index fails when the name is already taken, so only create the index
# when it is missing; this keeps the cell safe to re-run.
# dimension=768 matches the retriever's word_embedding_dimension.
# NOTE(review): an existing "medium-data" index with a different dimension is
# reused as-is and must be deleted manually before running this cell.
if "medium-data" not in pinecone.list_indexes():
    pinecone.create_index("medium-data", dimension=768, metric="cosine")
index = pinecone.Index("medium-data")

### Data Prep

In [22]:
from datasets import load_dataset

In [204]:
# Obtain Raw Data
#
# Pipeline: load the CSV split -> pandas -> drop rows with missing values ->
# fixed-seed sample of 10,000 rows -> add the text that gets embedded.
# The original author notes a 30 min to 1 hr runtime for this sample size.
# text_extended = title + '.' + first 1000 characters of the article body.
df = (
    load_dataset(
        "fabiochiu/medium-articles",
        data_files="medium_articles.csv",
        split="train",
    )
    .to_pandas()
    .dropna()
    .sample(n=10000, random_state=45)
    .assign(text_extended=lambda frame: frame['title'] + '.' + frame['text'].str[:1000])
)


Found cached dataset csv (/home/mohsin/.cache/huggingface/datasets/fabiochiu___csv/fabiochiu--medium-articles-96791ff68926910d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [208]:
# Preview the first rows of the sampled frame, including the derived
# text_extended column.
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags,text_extended
189059,How do you move a WordPress website to another...,Photo by Moritz Mentges on Unsplash\n\nMoving ...,https://medium.com/@dyderik/how-do-you-move-a-...,['Richard Detering'],2021-11-13 05:42:44.009000+00:00,"['Web Hosting Services', 'Web Hosting', 'Trans...",How do you move a WordPress website to another...
96618,A Long December,In my quest to find ways of engaging with the ...,https://medium.com/@keenekomeskleen/a-long-dec...,['Matt Keene'],2020-12-16 19:47:55.820000+00:00,"['Society', 'Politics', 'Poverty', 'Pandemic',...",A Long December.In my quest to find ways of en...
46027,I Have Decided to Stop Being the Michael Scott...,"Writing is, for me, a beloved pastime. I’ve do...",https://pisancantos43.medium.com/i-have-decide...,['Anthony Aycock'],2019-05-12 20:04:59.371000+00:00,"['Teaching', 'Television', 'College', 'Writing...",I Have Decided to Stop Being the Michael Scott...
145790,EU fully committed to sustainable development,European Commission Vice-President Jyrki Katai...,https://medium.com/ecajournal/eu-fully-committ...,['European Court Of Auditors'],2019-07-24 13:15:13.130000+00:00,['Sustainable Development'],EU fully committed to sustainable development....
132859,HD ▷..! เรื่องเต็ม 【M-Thai ดาบพิฆาตอสูร เดอะมู...,TAG::\n\nดาบพิฆาตอสูร เดอะมูฟวี่ ศึกรถไฟสู่นิร...,https://medium.com/@bangetanjay405/hd-%E0%B9%8...,[],2020-12-12 14:33:34.766000+00:00,"['Thailand', 'Japan', 'Taiwan', 'Hong Kong', '...",HD ▷..! เรื่องเต็ม 【M-Thai ดาบพิฆาตอสูร เดอะมู...


In [175]:
# len(nlp(df_batch)) # list of lst

### NER Helper Function

In [223]:
# helper function for extracting entities of a batch of texts

def extract_entities(list_of_text):
    """Run the NER pipeline over each text and collect the entity strings.

    Args:
        list_of_text: iterable of documents; each one is passed to ``nlp``
            on its own.

    Returns:
        A list with one inner list per input document, in input order. Each
        inner list holds the ``'word'`` value of every item ``nlp`` returned
        for that document (duplicates are kept).
    """
    return [[hit['word'] for hit in nlp(text)] for text in list_of_text]

In [178]:
# embedding

# len(retriever.encode(df_batch))
# len(retriever.encode(df_batch[0])) # try for one doc
# embedding for batch
# emb = retriever.encode(df_batch).tolist() # array to python list

### Batch Upsert

In [206]:
# upsert data
#
# For every batch of rows: embed the text, extract named entities, attach the
# remaining columns as metadata and upsert (id, vector, metadata) tuples.

from tqdm.auto import tqdm

batch_size = 64

# Wrapping the batch offsets in tqdm shows a progress bar for this long loop.
for i in tqdm(range(0, len(df), batch_size), desc="upserting"):
    i_end = min(i + batch_size, len(df))

    # get a batch of data (copy so adding a column does not touch df)
    df_batch = df.iloc[i:i_end].copy()

    # the same texts feed both the retriever and the NER pipeline
    texts = df_batch['text_extended'].tolist()

    # embedding: array -> plain python lists
    emb = retriever.encode(texts).tolist()

    # ner extraction: one list of entity strings per document
    entities = extract_entities(texts)

    # remove duplicate entities within each document
    df_batch['named_entity'] = [list(set(entity)) for entity in entities]

    # create meta data: every column except the full article text
    df_batch = df_batch.drop('text', axis=1)
    meta_data = df_batch.to_dict(orient='records')  # one dict per row

    # create ids: positional row numbers within df, as strings
    ids = [f"{idx}" for idx in range(i, i_end)]

    # upsert (id, vector, metadata) tuples
    vectors_to_upsert = list(zip(ids, emb, meta_data))

    _ = index.upsert(vectors=vectors_to_upsert)
    
    



In [207]:
# index.describe_index_stats()

### Query data

In [248]:
# Natural-language question to search for.
query = "How to make a Wordpress website?"

# Named entities found in the query; used as the metadata search filter.
ne = extract_entities([query])[0]

# Query embedding as a plain python list.
emb_qx = retriever.encode(query).tolist()

In [249]:
# Restrict matches to documents that share at least one named entity with the
# query. When the query has no recognised entity, the filter is left out: an
# empty "$in" list cannot match any document, so the search falls back to
# plain semantic similarity instead.
query_kwargs = {"top_k": 5, "include_metadata": True}
if ne:
    query_kwargs["filter"] = {"named_entity": {"$in": ne}}

xc = index.query(emb_qx, **query_kwargs)

In [253]:
# One line per match: similarity score followed by the document's entities.
for match in xc['matches']:
    score = match['score']
    entities = match['metadata']['named_entity']
    print(score, " ", entities)

0.294890732   ['Wordpress', 'Both Sides of the Table']
0.207520485   ['Wordpress', 'Opencart', 'Laraval', 'Magento 2', 'Squarespace', 'Wix', 'Joomla', 'Zencart Cakephp', 'CMS', 'Social Media Management', 'Shopify', 'Media', 'Magento', 'Prodigitaly', 'Angular', 'Banner', 'IOS', 'Drupal', 'Volusion', 'media', 'Codeignitor', 'Weebly', 'Technologies and Platform Wordpress', 'Woocomerce', 'Bigcommerce']
0.187706783   ['CMS', 'Wordpress', 'Industry', 'Readymag']
0.153713182   ['YouTube', 'Wordpress', 'HCS Raleigh Kickoff Major', 'Cloud9', 'Search', 'Medium', 'Engine', 'KeenGamer']
-0.00761373667   ['Wordpress', 'Godden & Baddeley', 'Abernethy']
