# Medium Article Semantic Search by Title+Subtitle

### Load Data

In [1]:
import pandas as pd

In [None]:
df = pd.read_csv("medium_post_titles.csv", nrows=10000)
# data source: https://www.kaggle.com/datasets/nulldata/medium-post-titles

In [3]:
df["subtitle_truncated_flag"].value_counts()

subtitle_truncated_flag
False    6318
True     3682
Name: count, dtype: int64

### Data Cleanup

In [4]:
# df.isna().sum()

df = df.dropna()
df = df[~df["subtitle_truncated_flag"]]
# df["subtitle_truncated_flag"].value_counts()

df['title_extended'] = df['title'] + df['subtitle']

### Prep for Upsert

In [5]:
import os

from dotenv import load_dotenv
from tqdm.autonotebook import tqdm
from pinecone import Pinecone, ServerlessSpec 
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

pinecone_key = os.getenv("PINCONE_API_KEY")

  from tqdm.autonotebook import tqdm


In [None]:
# from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key = pinecone_key)

In [8]:
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect..."
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!""Or, a primer ..."
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?""How we..."
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To..."


In [9]:
pc.create_index(name = "medium-data", 
                dimension=384, 
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1"
                )) # remember to use only us-east-1 in free tier

{
    "name": "medium-data",
    "metric": "cosine",
    "host": "medium-data-rcv72uk.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
# pinecone.create_index(name='medium-data', dimension=384, pod_type='s1', metric="cosine" )

In [10]:
from sentence_transformers import SentenceTransformer
import torch

In [12]:
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu') # cuda or cpu

In [13]:
df['values'] = df['title_extended'].map(
    lambda x: (model.encode(x)).tolist()) # python list, 6k rows 1 min

In [15]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.03107442706823349, -0.014303440228104591, ..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.034670304507017136, -0.018165184184908867,..."


In [16]:
df['id'] = df.reset_index(drop = 'index').index

In [17]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.03107442706823349, -0.014303440228104591, ...",0
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.034670304507017136, -0.018165184184908867,...",1


In [21]:
df['metadata'] = df.apply(lambda x: {
    'title' : x['title'],
    'subtitle': x['subtitle'],
    'category': x['category']
    
}, axis=1)

In [22]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id,metadata
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.03107442706823349, -0.014303440228104591, ...",0,"{'title': '""21 Conversations"" - A fun (and eas..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.034670304507017136, -0.018165184184908867,...",1,"{'title': '""Biblical Porn"" at Mars Hill', 'sub..."


In [23]:
df_upsert = df[['id', 'values', 'metadata']]

In [24]:
df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))


In [25]:
index =pc.Index('medium-data')

In [26]:
index.upsert_from_dataframe(df_upsert) # 6k takes 1 min

sending upsert requests:   0%|          | 0/6211 [00:00<?, ?it/s]

{'upserted_count': 6211}

### Query

In [28]:
index.query(vector=(model.encode("which city is the most beautiful")).tolist(), # python list
           top_k=10,
           include_metadata=True)

{'matches': [{'id': '994',
              'metadata': {'category': 'photography',
                           'subtitle': 'If you are willing to look hard '
                                       'enough, eventually you will see beauty '
                                       'in the most difficult of places.',
                           'title': '3 Places Where You Can Find Beauty'},
              'score': 0.573478758,
              'values': []},
             {'id': '1931',
              'metadata': {'category': 'travel',
                           'subtitle': 'Pembrokeshire is as beautiful as the '
                                       'Italian Coast.',
                           'title': '6 Easy Reasons to Enjoy Exploring South '
                                    'Wales'},
              'score': 0.463663071,
              'values': []},
             {'id': '2778',
              'metadata': {'category': 'accessibility',
                           'subtitle': 'Complete parity with t

In [27]:
xc = index.query(vector=(model.encode("which city is the most beautiful")).tolist(), # python list
           top_k=10,
           include_metadata=True) 

In [29]:
type(xc)

pinecone.core.openapi.db_data.model.query_response.QueryResponse

In [30]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['title']}: {result['metadata']['category']} ")

0.57: 3 Places Where You Can Find Beauty: photography 
0.46: 6 Easy Reasons to Enjoy Exploring South Wales: travel 
0.45: A City That’s Better for the Blind Is Better for Everyone: accessibility 
0.45: A Shining City on a Hill: politics 
0.42: A Most Beautiful Game: sports 
0.4: 6 Literary Cities for Book Lovers To Visit This Year: travel 
0.4: Ace Hotel: A UX Case Study: ux 
0.39: A city and its architecture: cities 
0.39: Adaptive urban design: design 
0.38: Aesthetics of Being: spirituality 


In [31]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['subtitle']}: {result['metadata']['category']} ")

0.57: If you are willing to look hard enough, eventually you will see beauty in the most difficult of places.: photography 
0.46: Pembrokeshire is as beautiful as the Italian Coast.: travel 
0.45: Complete parity with the sighted may seem like an impossible goal, but maybe the only thing holding us back is a lack of imagination.: accessibility 
0.45: What does America stand for?: politics 
0.42: The World Cup gets advertising right: sports 
0.4: Combine your love for books and travel with these 6 literary cities.: travel 
0.4: Discover the city you are visting like a local: ux 
0.39: Bangalore Chapter: cities 
0.39: Choatic nature of order: design 
0.38: Examining life through a lens of beauty: spirituality 


In [39]:
res = index.query(vector=(model.encode("Talk about databricks")).tolist(), # python list
           top_k=10,
           include_metadata=True)

In [40]:
for result in res['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['subtitle']}: {result['metadata']['category']} ")


0.61: Data Scraping and Data Integration with pandas library from Python.: data-science 
0.58: What is Data Science?: data-science 
0.54: Don’t worry — getting started is the hardest part: artificial-intelligence 
0.53: Use this guide to help you complete your data science projects.: data-science 
0.5: Use Jupyter Notebooks for interactive Data Science Projects: data-science 
0.49: Contents: Basic plots, 3D plots and widgets: data-science 
0.48: Start your self-learning journey into the world of data right now.: productivity 
0.47: How to incorporate Data Science in your projects without costs, risks or pain: business 
0.47: Game of Thrones, game of wines, and other games: artificial-intelligence 
0.47: What is Data Visualization?: data-science 
