# Set up Pinecone

In [1]:
 # Read the Pinecone API key from the PINECONE_API_KEY environment variable so
 # the real secret never has to be typed into (and saved with) the notebook.
 # The original placeholder string is kept as the fallback value.
 import os
 MY_KEY = os.environ.get("PINECONE_API_KEY", "Your_pinecone_key")

In [4]:
# initiate pinecone: create the client and make sure the target index exists

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=MY_KEY)

index_name = "medium-data"

# create an index
# create_index is rejected when an index with this name already exists, so it
# is only called when the name is missing; this keeps the cell re-runnable.
# dimension=768 has to equal the size of the vectors that get upserted.
if index_name not in pc.list_indexes().names():
    pc.create_index(index_name, dimension=768, metric='cosine',
                    spec=ServerlessSpec(cloud="aws", region="us-east-1"))

# handle used by the later cells for upserts and queries
index = pc.Index(index_name)

In [5]:
# load libraries for NER 

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch


In [6]:
# Build the NER engine from a pretrained token-classification checkpoint.

model_id = 'dslim/bert-base-NER'

# Tokenizer and model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

# NER pipeline running on the CPU. With aggregation_strategy='max' the results
# come back as grouped entities (see 'entity_group' / 'Bill Gates' in the output).
nlp = pipeline(
    task='ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy='max',
    device='cpu',
)

# Smoke test: the cell output shows the entities found in this sentence.
nlp("Bill Gates is the founder of Microsoft")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity_group': 'PER',
  'score': 0.9997382,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.99829453,
  'word': 'Microsoft',
  'start': 29,
  'end': 38}]

In [7]:
# load libraries for retriever

from sentence_transformers import SentenceTransformer


# Sentence-embedding model used as the retriever.
# Model card: https://huggingface.co/flax-sentence-embeddings/all_datasets_v3_mpnet-base
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base")

In [8]:
# Inspect the loaded model. The Pooling layer in the output reports
# word_embedding_dimension=768, the same value used as the index dimension.
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'MPNetModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [9]:
# Obtain Raw Data
from datasets import load_dataset
import pandas as pd


# Load the "train" split of medium_articles.csv from the
# fabiochiu/medium-articles dataset and convert it to a pandas DataFrame.
df = load_dataset("fabiochiu/medium-articles", data_files="medium_articles.csv", split="train").to_pandas()

In [11]:
# Size of the raw dataset as (rows, columns).
df.shape

(192368, 6)

In [None]:
# Keep a reproducible 10,000-row sample of the complete rows:
#   1. drop every row that contains a missing value,
#   2. draw the sample with a fixed random_state so that df remains the same.
# The original note says this "might take 30mins to 1hr" - presumably that
# refers to processing the sample downstream, not to the sampling; TODO confirm.
df = df.dropna()
df = df.sample(n=10000, random_state=45)

In [86]:
# Add the 'text_extended' column: the title, a '.', and then the first
# 1000 characters of the article text.
df['text_extended'] = df['title'] + '.' + df['text'].str.slice(stop=1000)

In [62]:
# helper function for extracting entities of a batch of texts

def extract_entities(list_of_text):
    """Extract the entity strings of every text in a batch.

    Args:
        list_of_text: iterable of strings; each one is passed to ``nlp``
            on its own.

    Returns:
        A list with one entry per input text, in input order. Each entry is
        the list of ``'word'`` values of the items ``nlp`` returned for that
        text (duplicates are kept).
    """
    return [
        [hit['word'] for hit in nlp(text)]
        for text in list_of_text
    ]

In [107]:
# upsert data to pinecone

from tqdm.auto import tqdm

batch_size = 64

# Walk over df in chunks of batch_size rows; tqdm reports the progress.
for i in tqdm(range(0, len(df), batch_size), desc="upserting"):
    i_end = min(i + batch_size, len(df))  # exclusive end index of this batch

    # get a batch of data (a copy, so adding a column leaves df untouched)
    df_batch = df.iloc[i:i_end].copy()
    texts = df_batch['text_extended'].tolist()

    # embedding: convert the encoded array to plain Python lists for the upsert
    emb = retriever.encode(texts).tolist()

    # ner extraction: one list of entity strings per document
    entities = extract_entities(texts)

    # remove duplicate entities per document; dict.fromkeys keeps the
    # first-seen order, so the stored metadata is the same on every run
    df_batch['named_entity'] = [list(dict.fromkeys(entity)) for entity in entities]

    # create meta data: every remaining column except the full article text
    df_batch = df_batch.drop('text', axis=1)
    meta_data = df_batch.to_dict(orient='records')  # one dict per row

    # create ids: the row's position within df, as a string
    ids = [f"{idx}" for idx in range(i, i_end)]

    # upsert (id, values, metadata) tuples
    vectors_to_upsert = list(zip(ids, emb, meta_data))
    _ = index.upsert(vectors=vectors_to_upsert)
    
    



In [108]:
# Check the index after the upsert (dimension, metric, vector count).
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 2512}},
 'total_vector_count': 2512,
 'vector_type': 'dense'}

In [117]:
# Natural-language question to search for.
query = "How to make a Wordpress website?"

# Named entities found in the query; used as a search filter.
ne = extract_entities([query])[0]

# Embedded query vector, converted to a plain Python list.
emb_qx = retriever.encode(query).tolist()

In [118]:
# query from index
# Keep only vectors whose 'named_entity' metadata shares at least one entity
# with the query. When no entity was found in the query, an empty "$in" list
# could not match anything, so fall back to an unfiltered semantic search.
entity_filter = {"named_entity": {"$in": ne}} if ne else None
xc1 = index.query(vector=emb_qx, top_k=5, include_metadata=True,
                  filter=entity_filter)

In [119]:
# Show the raw query response; 'matches' holds the vectors that passed the named_entity filter.
xc1

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

In [112]:
# You might not find any match if only about 1k rows were upserted: with so
# little data there may be no good match. Try loading more data, or tweak the
# query to fit the data (glance over the Pinecone console and look at the
# 'text_extended' field of your vectors).
# Iterates xc1, the filtered query result; the name `xc` is never defined.
for result in xc1['matches']:
    print(result['score'], " ", result['metadata']['named_entity'])

In [137]:
# Keyword-style query.
query = "wordpress"

# Named entities of the query (the search below does not filter on them).
ne = extract_entities([query])[0]

# Embedded query vector, converted to a plain Python list.
emb_qx = retriever.encode(query).tolist()

# Semantic search only: top 5 matches with metadata, no metadata filter.
xc2 = index.query(vector=emb_qx, top_k=5, include_metadata=True)
           

In [141]:
# Print every match as "<score>   <metadata>".
for result in xc2['matches']:
    print(f"{result['score']}   {result['metadata']}")

0.140793815   {'authors': "['Jairam R Prabhu']", 'tags': "['Blogging', 'Médium', 'Journal', '2020', 'Writing']", 'timestamp': '2020-12-28 14:50:28.741000+00:00', 'title': 'Why 2020 was the best year for blogging?', 'url': 'https://medium.com/illumination/why-2020-was-the-best-year-for-blogging-c34a6d7ddcc8'}
0.140793815   {'authors': "['Bank Al Etihad']", 'tags': "['Lifestyle', 'Smart Banking', 'Banking Technology', 'Mobile Apps', 'Bank Al Etihad']", 'timestamp': '2020-12-07 12:24:14.705000+00:00', 'title': '5 smart ways to bank with your banking app', 'url': 'https://medium.com/bank-al-etihad/5-smart-ways-to-bank-with-your-banking-app-bff1bc409e4a'}
0.140793815   {'authors': "['Audrey Malone']", 'tags': "['Health', 'Healthy Lifestyle', 'Wellness', 'Exercise', 'Fitness']", 'timestamp': '2021-02-28 21:51:31+00:00', 'title': 'What I Learned While Laying on My Kitchen Floor Unable to Move', 'url': 'https://medium.com/in-fitness-and-in-health/what-i-learned-while-laying-on-my-kitchen-floor

In [139]:
# Show the raw response of the unfiltered query.
# NOTE(review): in the recorded output every match has the same score and the
# metadata has no 'text_extended' / 'named_entity' keys - possibly the index
# still held vectors from an earlier upsert; confirm before trusting results.
xc2

{'matches': [{'id': '7616',
              'metadata': {'authors': "['Jairam R Prabhu']",
                           'tags': "['Blogging', 'Médium', 'Journal', '2020', "
                                   "'Writing']",
                           'timestamp': '2020-12-28 14:50:28.741000+00:00',
                           'title': 'Why 2020 was the best year for blogging?',
                           'url': 'https://medium.com/illumination/why-2020-was-the-best-year-for-blogging-c34a6d7ddcc8'},
              'score': 0.140793815,
              'values': []},
             {'id': '7680',
              'metadata': {'authors': "['Bank Al Etihad']",
                           'tags': "['Lifestyle', 'Smart Banking', 'Banking "
                                   "Technology', 'Mobile Apps', 'Bank Al "
                                   "Etihad']",
                           'timestamp': '2020-12-07 12:24:14.705000+00:00',
                           'title': '5 smart ways to bank with your bankin