In [4]:
!pip install \
    "pinecone[grpc]" \
    sentence-transformers \
    pandas

Collecting pinecone[grpc]
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting lz4>=3.1.3 (from pinecone[grpc])
  Downloading lz4-4.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone[grpc])
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting protoc-gen-openapiv2<0.0.2,>=0.0.1 (from pinecone[grpc])
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Collecting packaging>=20.9 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading lz4-4.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (1.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.4/1.4 MB[0m [31m17.9 MB/s[0m eta [36m0:00

In [9]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import pinecone

df = pd.read_csv("/content/course_section_descriptions.csv", encoding="windows-1252")

print("‚úÖ Data loaded successfully!")
print("Columns:", df.columns)
df.head()

‚úÖ Data loaded successfully!
Columns: Index(['course_id', 'course_name', 'course_slug', 'course_description',
       'course_description_short', 'course_technology', 'course_topic',
       'course_instructor_quote', 'section_id', 'section_name',
       'section_description'],
      dtype='object')


Unnamed: 0,course_id,course_name,course_slug,course_description,course_description_short,course_technology,course_topic,course_instructor_quote,section_id,section_name,section_description
0,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don‚Äôt just need to deal with d...,9,Introduction to Tableau,While Tableau is an indispensable tool in the ...
1,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don‚Äôt just need to deal with d...,10,Tableau Functionalities,"In this section, you will create your first Ta..."
2,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don‚Äôt just need to deal with d...,11,The Tableau Exercise,This section is a practical example that will ...
3,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,12,Introduction,"In this section, you will learn about the impo..."
4,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,13,Setting Up the Environments,"Here, we set up different environments for the..."


In [None]:
import os
from pinecone import Pinecone, ServerlessSpec

# Set your API key and environment
API_KEY = ""
ENVIRONMENT = "us-east-1"  

# Initialize Pinecone client
pc = Pinecone(api_key=API_KEY)

# Define your index name
index_name = "courses-index"

# Create the index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # depends on your embedding model
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region=ENVIRONMENT
        )
    )
    print(f"‚úÖ Created new Pinecone index: {index_name}")
else:
    print(f"‚úÖ Using existing Pinecone index: {index_name}")

# Connect to the index
index = pc.Index(index_name)
print("‚úÖ Connected to Pinecone index:", index_name)

‚úÖ Created new Pinecone index: courses-index
‚úÖ Connected to Pinecone index: courses-index


In [13]:
# Load BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Choose the text column (update this after we confirm column name)
text_column = "section_description"

texts = df[text_column].astype(str).tolist()
embeddings = model.encode(texts).tolist()

print("‚úÖ Generated embeddings:", len(embeddings))

‚úÖ Generated embeddings: 680


In [14]:
# Prepare items (id, vector, metadata)
to_upsert = [
    (str(i), embeddings[i], {"text": texts[i]})
    for i in range(len(embeddings))
]

# Upsert to index
index.upsert(vectors=to_upsert)
print(f"‚úÖ Upserted {len(to_upsert)} items to Pinecone index.")

‚úÖ Upserted 680 items to Pinecone index.


In [16]:
query = "Data vizualization introduction"
query_vec = model.encode([query]).tolist()

results = index.query(vector=query_vec[0], top_k=5, include_metadata=True)
print("üîç Top 5 Similar Results:\n")

for match in results['matches']:
    print(f"Score: {match['score']:.3f}")
    print(f"Text: {match['metadata']['text'][:200]}...\n")

üîç Top 5 Similar Results:

Score: 0.653
Text: In this course section you will find resources for data visualization, including best practices for creating effective visualizations, identifying and eliminating chartjunk, selecting appropriate char...

Score: 0.618
Text: In this section, you will learn about the importance of data visualization, as well as some theoretical foundations for creating charts. We introduce popular frameworks for choosing an appropriate vis...

Score: 0.618
Text: In this section, you will learn about the importance of data visualization, as well as some theoretical foundations for creating charts. We introduce popular frameworks for choosing an appropriate vis...

Score: 0.582
Text: This section introduces you to the initial steps in crafting your data presentation. From identifying your key message to understanding your audience, these fundamental steps set the groundwork for a ...

Score: 0.532
Text: In this course, we will go through an entire practical e

In [17]:
# Update existing entry
index.upsert(vectors=[("1", model.encode(["updated text"]).tolist()[0])])
print("‚úÖ Updated vector with ID 1")

# Delete example
# index.delete(ids=["1"])

‚úÖ Updated vector with ID 1


In [22]:
def recommend_similar(item_id, top_k=5):
    # Fetch the stored vector for the given item_id
    fetch_response = index.fetch(ids=[item_id])

    # Extract the vector values correctly
    vector = fetch_response.vectors[item_id].values

    # Query Pinecone for similar vectors
    results = index.query(vector=vector, top_k=top_k + 1, include_metadata=True)

    print(f"üîó Recommendations for Item {item_id}:\n")
    for match in results.matches[1:]:  # skip the item itself
        text = match.metadata.get("text", "[No text available]")
        print(f"- {text[:120]} (Score: {match.score:.3f})")

recommend_similar("1")

üîó Recommendations for Item 1:

- Still on the subject of manipulating your data set, in this section we will focus on another DML Statement: the UPDATE S (Score: 0.335)
- Textual data in its raw form isn't suitable for machine learning algorithms. Discover techniques to transform text into  (Score: 0.329)
- As an analyst, you'll frequently handle text data‚Äîmaster text data preprocessing with ChatGPT's Advanced Data Analysis t (Score: 0.305)
- Every good project manager and business leader needs to learn how to deal with change requests. This is the topic of thi (Score: 0.289)
- Once accustomed to manipulating text data, doors will open for you to employ several tools that will optimize the use of (Score: 0.274)


In [25]:
!pip install -q sentence-transformers torch torchvision pillow

from sentence_transformers import SentenceTransformer
from PIL import Image
import requests

# Load CLIP model
clip_model = SentenceTransformer('clip-ViT-B-32')

In [None]:
from PIL import Image
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone client
API_KEY = ""
pc = Pinecone(api_key=API_KEY)

# Example images (local files)
img1 = Image.open("/content/dog.jpg")
img2 = Image.open("/content/cat.jpg")

# Encode images (assuming clip_model is already loaded)
image_embeddings = clip_model.encode([img1, img2]).tolist()
texts = ["dog photo", "cat photo"]

# Create Pinecone index for images if it doesn't exist
index_name = "images-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=512,   # depends on your model
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"‚úÖ Created new index: {index_name}")
else:
    print(f"‚úÖ Using existing index: {index_name}")

# Connect to the index
img_index = pc.Index(index_name)

# Upsert image embeddings
img_index.upsert(vectors=[
    ("img1", image_embeddings[0], {"label": texts[0]}),
    ("img2", image_embeddings[1], {"label": texts[1]})
])

print("‚úÖ Image embeddings uploaded to Pinecone!")

‚úÖ Created new index: images-index
‚úÖ Image embeddings uploaded to Pinecone!


In [34]:
# üîç Text-based image query
query = "a cute cat"
query_vec = clip_model.encode([query]).tolist()
results = img_index.query(vector=query_vec[0], top_k=2, include_metadata=True)
print(results)


{'matches': [{'id': 'img2',
              'metadata': {'label': 'cat photo'},
              'score': 0.284639388,
              'values': []},
             {'id': 'img1',
              'metadata': {'label': 'dog photo'},
              'score': 0.225837752,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 1}}


In [None]:
!pip install -q transformers torch

from transformers import AutoTokenizer, AutoModel
import torch
from pinecone import Pinecone, ServerlessSpec

# üîë Initialize Pinecone client
API_KEY = ""
pc = Pinecone(api_key=API_KEY)

# ‚öóÔ∏è Load BioBERT model
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
model_bio = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

# üî¨ Function to generate embeddings
def embed_biomedical(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model_bio(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.tolist()

# üß¨ Example biomedical data
bio_texts = [
    "The protein p53 regulates the cell cycle.",
    "COVID-19 is caused by the SARS-CoV-2 virus."
]
bio_vectors = embed_biomedical(bio_texts)

# üß© Create or connect to a Pinecone index
index_name = "biomed-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,       # BioBERT embedding size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"‚úÖ Created new index: {index_name}")
else:
    print(f"‚úÖ Using existing index: {index_name}")

bio_index = pc.Index(index_name)

# üì• Upsert biomedical embeddings
bio_index.upsert([
    (f"bio_{i}", bio_vectors[i], {"text": bio_texts[i]})
    for i in range(len(bio_texts))
])

# üîé Query biomedical data
query = "cell cycle regulation"
query_vec = embed_biomedical([query])[0]

results = bio_index.query(vector=query_vec, top_k=2, include_metadata=True)

# üß† Print results
print("üß¨ Biomedical Similarity Search Results:")
for match in results.matches:
    print(f"- {match.metadata['text']} (Score: {match.score:.3f})")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


‚úÖ Created new index: biomed-index
üß¨ Biomedical Similarity Search Results:
- COVID-19 is caused by the SARS-CoV-2 virus. (Score: 0.749)
- The protein p53 regulates the cell cycle. (Score: 0.854)
