In [35]:
import getpass
import os
import pandas as pd
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [36]:
# Load the course catalogue.
# "ANSI" is only a valid codec name on Windows (alias of "mbcs"); on Linux/macOS
# it raises LookupError. "cp1252" is the portable name for the Western-European
# Windows code page.
# NOTE(review): assumes the CSV was saved on a Western-locale Windows machine — confirm.
files = pd.read_csv("../../../data/course_descriptions.csv", encoding="cp1252")

In [37]:
def create_course_Description(row):
    """Build a one-sentence summary of a course from its catalogue fields.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) supporting ``row[key]``
            for 'course_name', 'course_slug', 'course_technology' and
            'course_topic'.

    Returns:
        A string naming the course, its slug, its technology and its topic.
    """
    name = row['course_name']
    slug = row['course_slug']
    technology = row['course_technology']
    topic = row['course_topic']
    return f"The course name {name}, the slug is : {slug}, and the technology is : {technology} and the course topic is : {topic}"

In [38]:
# Let pandas print up to 106 rows before truncating the output.
pd.options.display.max_rows = 106
# Apply the summary builder row by row and store the result as a new column.
files['course_description_new'] = files.apply(create_course_Description, axis="columns")
print(files['course_description_new'])

0      The course name Introduction to Tableau, the s...
1      The course name The Complete Data Visualizatio...
2      The course name Introduction to R Programming,...
3      The course name Data Preprocessing with NumPy,...
4      The course name Introduction to Data and Data ...
5      The course name Data Cleaning and Preprocessin...
6      The course name Introduction to Business Analy...
7      The course name Data Analysis with Excel Pivot...
8      The course name SQL, the slug is : sql, and th...
9      The course name Credit Risk Modeling in Python...
10     The course name Python Programmer Bootcamp, th...
11     The course name SQL + Tableau + Python, the sl...
12     The course name Introduction to Jupyter, the s...
13     The course name Statistics, the slug is : stat...
14     The course name Mathematics, the slug is : mat...
15     The course name Introduction to Excel, the slu...
16     The course name Probability, the slug is : pro...
17     The course name Starting

In [39]:
# Create the Pinecone client; both settings are read from environment variables
# (each is None when the variable is not set).
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

In [40]:
# Name of the Pinecone index used throughout the notebook.
index_name = "my-index"
# Dimension of the embedding vectors stored in the index.
dimension = 384
# Similarity metric the index uses to compare vectors.
metric = "cosine"

In [41]:
# Delete the index if one with this name already exists; otherwise just report it.
if not any(existing.name == index_name for existing in pc.list_indexes()):
    print(f"{index_name} not in the index list.")
else:
    pc.delete_index(index_name)
    print(f"Deleted existing index '{index_name}'.")

Deleted existing index 'my-index'.


In [42]:
# Create a serverless index (AWS, us-east-1) with the configured name,
# dimension and metric. As the cell's last expression, the returned index
# description is displayed.
pc.create_index(
    name=index_name,
    metric=metric,
    dimension=dimension,
    spec=ServerlessSpec(region="us-east-1", cloud="aws"),
)

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-jo7crz3.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "access-control-allow-origin": "*",
            "vary": "origin,access-control-request-method,access-control-request-headers",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",
  

In [43]:
# Handle to the index, used below for upserts and queries.
index = pc.Index(index_name)

In [44]:
# Embed the course data

In [45]:
# Load the sentence-embedding model used for both the course texts and the query.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [46]:
def create_embedding(row):
    """Embed the combined description text of one course.

    Joins the 'course_description', 'course_description_new' and
    'course_description_short' values of ``row`` with single spaces and
    encodes the result with the notebook-level ``model``.

    Missing values (None / NaN) are skipped, so the literal text "nan" or
    "None" is never fed to the model.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) supporting
            ``row[field]`` for the three description fields.

    Returns:
        Whatever ``model.encode`` returns for the combined text.
    """
    text_fields = ('course_description', 'course_description_new', 'course_description_short')
    values = (row[field] for field in text_fields)
    # pd.notna is False for both None and float NaN (how pandas marks empty CSV cells).
    combined_text = ' '.join(str(value) for value in values if pd.notna(value))
    return model.encode(combined_text, show_progress_bar=False)

In [47]:
# Compute one embedding per course (row-wise apply) and store it in a new column.
files["embedding"] = files.apply(create_embedding, axis=1)

In [48]:
# Build (id, values) pairs: the course name is the vector ID and the embedding
# is converted to a plain Python list.
vectors_to_upsert = [
    (str(course_name), embedding.tolist())
    for course_name, embedding in zip(files['course_name'], files['embedding'])
]
index.upsert(vectors=vectors_to_upsert)

print("Data upserted successfully.")

Data upserted successfully.


Semantic Search

In [49]:
# Search text for the semantic query.
query = "Clustering"
# Encode the query with the same model and convert the result to a plain list.
query_embedding = model.encode(query).tolist()

In [50]:
# Fetch the 12 nearest vectors to the query embedding.
# `vector` must be the flat list of floats itself; wrapping it in another list
# sends a nested list, which is not the documented shape for this parameter.
query_results = index.query(vector=query_embedding, top_k=12, include_metadata=True)

In [51]:
# Display the raw query response (ids, scores, metadata).
query_results

QueryResponse(matches=[{'id': 'Machine Learning in Excel',
 'metadata': {},
 'score': 0.349885464,
 'values': []}, {'id': 'Growth Analysis with SQL, Python, and Tableau  ',
 'metadata': {},
 'score': 0.249427766,
 'values': []}, {'id': 'Machine Learning with Support Vector Machines',
 'metadata': {},
 'score': 0.225214019,
 'values': []}, {'id': 'Introduction to Industry and Company Analysis',
 'metadata': {},
 'score': 0.201858535,
 'values': []}, {'id': 'Machine Learning in Python',
 'metadata': {},
 'score': 0.275269508,
 'values': []}, {'id': 'Fashion Analytics with Tableau',
 'metadata': {},
 'score': 0.235293403,
 'values': []}, {'id': 'Customer Churn Analysis with SQL and Tableau',
 'metadata': {},
 'score': 0.2750597,
 'values': []}, {'id': 'Machine Learning with Naive Bayes',
 'metadata': {},
 'score': 0.211933136,
 'values': []}, {'id': 'Machine Learning with K-Nearest Neighbors',
 'metadata': {},
 'score': 0.288754463,
 'values': []}, {'id': 'Linear Algebra and Feature Selec

In [52]:
# Minimum similarity score a match needs in order to be reported.
score_threshold = 0.3
for match in query_results['matches']:
    # Skip anything that does not reach the threshold.
    if not (match['score'] >= score_threshold):
        continue
    print(f"Course Name: {match['id']}, Score: {match['score']}")

Course Name: Machine Learning in Excel, Score: 0.349885464
