 # SEMANTIC SEARCH

# Load data

In [2]:
import pandas as pd

In [3]:
# create pandas df 
# take data from csv file "medium_post_titles.csv"

df=pd.read_csv("medium_post_titles.csv",nrows=10000)

In [6]:
 MY_KEY= "your_pinecone_key"

 # Data clean up

In [None]:
# check for truncated subtitles
df["subtitle_truncated_flag"].value_counts()

In [8]:
# drop null values
df=df.dropna()

In [9]:
# create new df without truncated subtitles
df = df[~df["subtitle_truncated_flag"]]

In [10]:
df["subtitle_truncated_flag"].value_counts()

subtitle_truncated_flag
False    6211
Name: count, dtype: int64

In [11]:
df.shape

(6211, 4)

In [12]:
df['title']

0       "21 Conversations" - A fun (and easy) game for...
1                            "Biblical Porn" at Mars Hill
2                       "CISGENDER?! Is That A Disease?!"
4                "Can I Train my Model on Your Computer?"
5       "Cypherpunks and Wall Street": The Security To...
                              ...                        
9994       America Lets Too Much Young Talent Go to Waste
9996    America Loves the Idea of Family Farms. That’s...
9997    America May Need to Adopt China’s Weapons to W...
9998    America May Outsmart China in 5G With AI and B...
9999                         America Needs Bernie Sanders
Name: title, Length: 6211, dtype: object

In [13]:
# create title extended for metadata
df['title_extended']=df['title']+df['subtitle']

In [14]:
df['title_extended'][1]

'"Biblical Porn" at Mars HillAuthor and UW lecturer Jessica Johnson talks about her new book on Mars Hill Church\'s and Mark Driscoll\'s evangelical masculinity'

In [15]:
## all text ,emoji cleanup can be done , do it yourself if you want

In [16]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect..."


In [17]:
# can check for unique categories
df['category'].nunique()

93

# Prep for upsert

In [20]:
from pinecone import Pinecone,ServerlessSpec
from tqdm import tqdm

pc=Pinecone(api_key=MY_KEY)

In [21]:
# create index
pc.create_index(name="medium-data",
                dimension=384,
                spec=ServerlessSpec(cloud='aws',region='us-east-1'))

{
    "name": "medium-data",
    "metric": "cosine",
    "host": "medium-data-rqv7mxm.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [2]:
# import these libraries for embedding

from sentence_transformers import SentenceTransformer
import torch

In [3]:
# using model from hugging face
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu') # cuda or cpu

In [26]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [29]:
# create embeddings for title_extended
df['values'] = df['title_extended'].map(
    lambda x: (model.encode(x)).tolist()) # python list as pinecone requires

In [31]:
# create ids for upserting
df['id'] = df.reset_index(drop = 'index').index

In [33]:
# create metadata
df['metadata'] = df.apply(lambda x: {
    'title' : x['title'],
    'subtitle': x['subtitle'],
    'category': x['category']
    
}, axis=1)

In [38]:
# create df to be upserted
df_upsert = df[['id', 'values', 'metadata']]

In [42]:
# ids should be strings
df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))


In [None]:
df_upsert = df[['id', 'values', 'metadata']]

In [41]:
# initiate the index
index =pc.Index('medium-data')

In [43]:
# upsert the data
index.upsert_from_dataframe(df_upsert) # 6k takes 1 min

sending upsert requests:   0%|          | 0/6211 [00:00<?, ?it/s]

{'upserted_count': 6211}

# Query

In [47]:
query_vector = model.encode("which city is the most beautiful").tolist()

results = index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True
)


In [48]:
results

{'matches': [{'id': '994',
              'metadata': {'category': 'photography',
                           'subtitle': 'If you are willing to look hard '
                                       'enough, eventually you will see beauty '
                                       'in the most difficult of places.',
                           'title': '3 Places Where You Can Find Beauty'},
              'score': 0.573749602,
              'values': []},
             {'id': '1931',
              'metadata': {'category': 'travel',
                           'subtitle': 'Pembrokeshire is as beautiful as the '
                                       'Italian Coast.',
                           'title': '6 Easy Reasons to Enjoy Exploring South '
                                    'Wales'},
              'score': 0.463108093,
              'values': []},
             {'id': '2778',
              'metadata': {'category': 'accessibility',
                           'subtitle': 'Complete parity with t

In [52]:
for result in results['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['subtitle']}: {result['metadata']['category']} ")

0.57: If you are willing to look hard enough, eventually you will see beauty in the most difficult of places.: photography 
0.46: Pembrokeshire is as beautiful as the Italian Coast.: travel 
0.45: Complete parity with the sighted may seem like an impossible goal, but maybe the only thing holding us back is a lack of imagination.: accessibility 
0.45: What does America stand for?: politics 
0.42: The World Cup gets advertising right: sports 
0.4: Combine your love for books and travel with these 6 literary cities.: travel 
0.4: Discover the city you are visting like a local: ux 
0.39: Bangalore Chapter: cities 
0.38: Choatic nature of order: design 
0.38: Examining life through a lens of beauty: spirituality 
