In [2]:
import pandas as pd

In [None]:
files = pd.read_csv(r"filepath", encoding="ANSI")

In [4]:
def create_course_description(row):
    return f'''The course name is {row['course_name']}, the slug is {row['course_slug']},
          the technology is {row["course_technology"]} and the course topic is {row["course_topic"]}'''


In [5]:
files['course_description_new'] = files.apply(create_course_description, axis =1 )

In [6]:
print(files['course_description_new'])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
                             ...                        
101    The course name is Intro to NLP for AI, the sl...
102    The course name is Data Analysis with ChatGPT,...
103    The course name is ChatGPT for Data Science, t...
104    The course name is Intro to LLMs, the slug is ...
105    The course name is Growth Analysis with SQL, P...
Name: course_description_new, Length: 106, dtype: object


In [7]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv

  from tqdm.autonotebook import tqdm


In [8]:
%load_ext dotenv 
%dotenv

In [9]:
load_dotenv(find_dotenv(), override=True)

True

In [10]:
import pinecone
import os

In [11]:
pc= Pinecone(api_key = os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

In [12]:
index_name = 'my-index'
dimension = 384,
metric= "cosine"

In [14]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} succ deleted")
else:
    print(f"{index_name} not in index list.")

my-index succ deleted


In [16]:
pc.create_index(
    name = index_name,
    dimension = 384,
    metric = metric,
    spec= ServerlessSpec(
        cloud = "aws",
        region = "us-east-1"
    )
)

In [17]:
index= pc.Index(index_name)

In [19]:
#embedding algo

In [20]:
from sentence_transformers import SentenceTransformer

In [21]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [23]:
def create_embedding(row):
    combined_text = ' '.join([str(row[field]) for field in ['course_description', 'course_description_new','course_description_short']])
    embedding = model.encode(combined_text, show_progress_bar= False)
    return embedding
    

In [24]:
files["embedding"] = files.apply(create_embedding, axis = 1)

In [25]:
vectors_to_upsert = [(str(row["course_name"]),row["embedding"].tolist()) for _,row in files.iterrows()]
print("Data upserted to Pinecone Index")

Data upserted to Pinecone Index


In [26]:
#Semantic Search

In [37]:
query = "clustering"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [45]:
query_results = index.query(
    vector  = [query_embedding],
    top_k = 1,
    include_values= True
)

In [46]:
query_results

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

In [40]:
for match in query_results["matches"]:
    print(f"Matched item ID: {match['id']}, score: {match['score']}")