In [1]:
# Standard library
import os

# Third-party
import pandas as pd
import pinecone
from dotenv import load_dotenv,find_dotenv
from pinecone import Pinecone,ServerlessSpec
from sentence_transformers import SentenceTransformer
# This import was indented at top level, which raises
# "IndentationError: unexpected indent" on a fresh kernel.
from tqdm.autonotebook import tqdm


In [2]:
# Load the python-dotenv IPython extension, then invoke its %dotenv line magic.
# NOTE(review): presumably this reads a local .env file into os.environ so that
# PINECONE_API_KEY (read in a later cell) is available — confirm where the .env lives.
%load_ext dotenv
%dotenv

In [18]:
# Sentence-embedding model used by the cells below to encode both the
# catalogue rows and the search query.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [4]:
# Load the course/section catalogue into a DataFrame.
# 'ANSI' is only a Windows-specific alias (for the 'mbcs' codec), so the
# previous call raised "LookupError: unknown encoding" on Linux/macOS.
# NOTE(review): cp1252 assumes the CSV was saved under a Western-European
# Windows code page, which matches the curly apostrophes visible in the
# preview — confirm if the file came from a different locale.
files = pd.read_csv('365 Courses and sections data.csv', encoding='cp1252')
files.head()

Unnamed: 0,course_id,course_name,course_slug,course_description,course_description_short,course_technology,course_topic,course_instructor_quote,section_id,section_name,section_description
0,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,9,Introduction to Tableau,While Tableau is an indispensable tool in the ...
1,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,10,Tableau Functionalities,"In this section, you will create your first Ta..."
2,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,11,The Tableau Exercise,This section is a practical example that will ...
3,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,12,Introduction,"In this section, you will learn about the impo..."
4,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,13,Setting Up the Environments,"Here, we set up different environments for the..."


In [6]:
# Per-section key of the form "<course_id>-<section_id>".
files['unique_id'] = files['course_id'].astype(str).str.cat(
    files['section_id'].astype(str),
    sep='-',
)

In [15]:
# Columns copied from each row into its metadata dict (key order is preserved).
METADATA_FIELDS = ('course_name', 'section_name', 'section_description')


def _row_metadata(row):
    """Return the metadata dict for a single catalogue row."""
    return {field: row[field] for field in METADATA_FIELDS}


files['metadata'] = files.apply(_row_metadata, axis=1)

In [16]:
def create_embeddings(row):
    """Encode one catalogue row with the notebook-level ``model``.

    The course name, course technology and section description are joined
    into a single string with one apostrophe character between consecutive
    fields, and that string is passed to ``model.encode``.

    Args:
        row: Mapping-like object providing the keys 'course_name',
            'course_technology' and 'section_description'.

    Returns:
        Whatever ``model.encode`` returns for the combined text.
    """
    # The apostrophes are the literal separators used between the fields.
    template = "{}'{}'{}"
    combined_text = template.format(
        row['course_name'],
        row['course_technology'],
        row['section_description'],
    )
    return model.encode(combined_text, show_progress_bar=False)


In [19]:
# Run create_embeddings once per row and keep the results in a new column.
files['embedding'] = files.apply(func=create_embeddings, axis='columns')

In [None]:
# --- Pinecone client and index setup ---------------------------------------
INDEX_NAME = 'semanticsearchindex2'
# Vector size of the index; it has to equal the length of the embeddings
# that are upserted into it.
EMBEDDING_DIMENSION = 384

api_key = os.environ.get('PINECONE_API_KEY')

# The previous call passed a misspelled `enviorment=` keyword, which the
# client silently ignored. A serverless index is located through the
# ServerlessSpec below, so the argument (and the PINECONE_ENV lookup that
# only fed it) has been dropped.
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet, so this cell can be
# re-run without failing on an "already exists" error.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

In [29]:
# Number of vectors sent per request. Sending the whole table in a single
# upsert can exceed Pinecone's per-request limits as the dataset grows.
UPSERT_BATCH_SIZE = 100

index = pc.Index('semanticsearchindex2')

# Build (id, values, metadata) tuples straight from the three columns.
# Zipping the columns avoids the per-row Series that iterrows() creates, and
# avoids a loop variable named `index` that reads like the Pinecone handle.
vectors_to_upsert = [
    (unique_id, embedding.tolist(), metadata)
    for unique_id, embedding, metadata in zip(
        files['unique_id'], files['embedding'], files['metadata']
    )
]

index.upsert(vectors=vectors_to_upsert, batch_size=UPSERT_BATCH_SIZE)
print("Data upserted")


Data upserted


In [43]:
# Querying
query = 'regression in python'

# Encode the query with the same model as the catalogue rows and convert the
# result to a plain Python list.
query_embedding = model.encode(query, show_progress_bar=False).tolist()

# `vector` takes one flat list of floats; the previous code wrapped the
# embedding in an extra list, producing a nested [[...]] payload.
query_result = index.query(
    vector=query_embedding,
    top_k=12,
    include_metadata=True,
)



In [44]:
score_threshold = 0.2  # Minimum similarity score to display; adjust as needed

for match in query_result['matches']:
    # Skip anything that does not reach the threshold.
    if not (match['score'] >= score_threshold):
        continue

    # Fall back to placeholders when a match carries no (or partial) metadata.
    course_details = match.get('metadata', {})
    course_name = course_details.get('course_name', 'N/A')
    section_name = course_details.get('section_name', 'N/A')
    section_description = course_details.get('section_description', 'No description available')

    print(f"Matched item id {match['id']}, Score : {match['score']}")
    print(
        f'Course: {course_name} ',
        f'Section:{section_name} ',
        f'Section Description:{section_description}',
        sep='\n',
    )




Matched item id 37-369, Score : 0.740024626
Course: Machine Learning in Python 
Section:Linear Regression with sklearn 
Section Description:While there are many libraries that can compute a regression model, the most numerically stable one is sklearn. It is also the preferred choice of many machine learning professionals. In this section, we implement all we know about regressions in this amazing library.
Matched item id 37-368, Score : 0.653421402
Course: Machine Learning in Python 
Section:Linear Regression 
Section Description:In this part of the course, we will discuss what the course covers, why you need to learn advanced statistics, what’s the differences are with machine learning, and how to get the most out of this training. In this section, you will also expand on what you learned in our statistics training with additional concepts and will apply all the theory in Python. This section serves two purposes: 1) a useful refresher of regression, and 2) a great way to reinforce wha