In [None]:
import getpass
import os
import pandas as pd
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [None]:
# Load the course catalogue. 'ANSI' is only a valid codec name on Windows
# (an alias of 'mbcs'), so it raises LookupError elsewhere; 'cp1252' works on
# every platform.
# NOTE(review): assumes the CSV was saved with the Western-European ANSI code
# page (cp1252) - confirm against the actual file.
files = pd.read_csv(
    "../../../data/course_descriptions.csv",
    encoding="cp1252",
)

In [None]:
def create_course_Description(row):
    """Compose a one-sentence summary of a course from its row.

    Args:
        row: Mapping-like row exposing the keys ``course_name``,
            ``course_slug``, ``course_technology`` and ``course_topic``.

    Returns:
        A string mentioning the course name, slug, technology and topic.
    """
    template = (
        "The course name {name}, the slug is : {slug}, "
        "and the technology is : {technology} "
        "and the course topic is : {topic}"
    )
    return template.format(
        name=row['course_name'],
        slug=row['course_slug'],
        technology=row['course_technology'],
        topic=row['course_topic'],
    )

In [None]:
# Allow up to 106 rows to be shown before pandas switches to a truncated view.
pd.options.display.max_rows = 106
# Build one descriptive sentence per course (row-wise apply) and keep it as a
# new column, then print that column.
files['course_description_new'] = files.apply(create_course_Description, axis="columns")
print(files['course_description_new'])

In [None]:
# Create the Pinecone client from environment variables; a variable that is
# not set is passed through as None.
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV"),
)

In [None]:
# Settings for the Pinecone index used throughout the notebook.
metric = "cosine"  # similarity measure used when comparing vectors
dimension = 384  # length of each stored vector; must match the embedding size
index_name = "my-index"  # name of the index to (re)create and query

In [None]:
# Start from a clean slate: remove the index if it already exists so the
# creation step that follows does not collide with it.
existing_index_names = [idx.name for idx in pc.list_indexes()]
if index_name not in existing_index_names:
    print(f"{index_name} not in the index list.")
else:
    pc.delete_index(index_name)
    print(f"Deleted existing index '{index_name}'.")

In [None]:
# Create the index as a serverless deployment on AWS (us-east-1), using the
# name, vector dimension and metric configured earlier.
serverless_spec = ServerlessSpec(region="us-east-1", cloud="aws")
pc.create_index(name=index_name, dimension=dimension, metric=metric, spec=serverless_spec)

In [None]:
# Handle to the index; used below for upserting and querying vectors.
index = pc.Index(index_name)

In [None]:
# Embedding the data

In [None]:
# Sentence-embedding model used for both the course texts and the search query.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def create_embedding(row):
    """Encode a course row's descriptive text into a single embedding.

    The values of ``course_description``, ``course_description_new`` and
    ``course_description_short`` are joined with spaces and passed to the
    module-level ``model``. Missing values (NaN/None) are skipped so that the
    literal text "nan"/"None" is never part of the embedded string.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) containing the three
            description fields.

    Returns:
        The value returned by ``model.encode`` for the combined text.

    Raises:
        KeyError: If any of the three fields is absent from ``row``.
    """
    text_fields = ('course_description', 'course_description_new', 'course_description_short')
    # str(NaN) would otherwise inject the word "nan" into the text being embedded.
    parts = [str(row[field]) for field in text_fields if pd.notna(row[field])]
    combined_text = ' '.join(parts)
    embedding = model.encode(combined_text, show_progress_bar=False)
    return embedding

In [None]:
# Compute one embedding per course (row-wise apply) and store it on the frame.
files["embedding"] = files.apply(create_embedding, axis="columns")

In [None]:
# Pair each course name (used as the vector id) with its embedding converted
# to a plain Python list, then send all pairs to the index in one upsert call.
vectors_to_upsert = [
    (str(course_name), course_embedding.tolist())
    for course_name, course_embedding in zip(files['course_name'], files['embedding'])
]
index.upsert(vectors=vectors_to_upsert)

print("Data upserted successfully.")

Semantic Search

In [None]:
# Encode the search phrase with the same model used for the course texts and
# convert the result to a plain Python list.
query = "Clustering"
query_vector = model.encode(query)
query_embedding = query_vector.tolist()

In [None]:
# Ask the index for the 12 nearest vectors to the query embedding.
# The embedding is passed as a flat list of floats: query_embedding already is
# one (it comes from .tolist()), and wrapping it in another list would produce
# a nested list rather than a single query vector.
query_results = index.query(vector=query_embedding, top_k=12, include_metadata=True)

In [None]:
# Show the raw query response as this cell's output.
query_results

In [None]:
# Report only the matches whose similarity score reaches the cut-off.
score_threshold = 0.3
for match in query_results['matches']:
    if not (match['score'] >= score_threshold):
        continue  # below the cut-off: skip this match
    print(f"Course Name: {match['id']}, Score: {match['score']}")