# Overview
Convert course embeddings into json that pinecone vector database can understand. 

In [1]:
import ast
import json
import pandas as pd

In [2]:
df = pd.read_csv('course_embeddings_small.csv')

In [3]:
df.head(3)

Unnamed: 0,code,name,credits,description,prerequisites,embedding
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,"[0.022164037451148033, 0.0007370838429778814, ..."
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,"[-0.008988866582512856, -0.02638152614235878, ..."
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,"[0.011427514255046844, -0.02591511607170105, 0..."


In [4]:
f'Number of courses: {len(df)}'

'Number of courses: 10024'

In [5]:
# Convert the string embeddings to arrays
df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [7]:
dimensions = len(df["embedding"][0])
f'Embedding dimensions: {dimensions}'

'Embedding dimensions: 1536'

# Convert to pinecone vectors

In [62]:
def create_row_json(row):
    if (row["prerequisites"] == None): prerequisites = row["prerequisites"]
    else: prerequisites = ''
        
    row_json = {
        "id": f'item_{row["code"]}',
        "metadata": {
            "code": row["code"],
            "name": row["name"],
            "credits": row["credits"],
            "description": row["description"],
            "prerequisites": prerequisites
        },
        "values": row["embedding"]
    }
    return row_json


In [63]:
df["vectors"] = df.apply(create_row_json, axis=1)

In [64]:
# First entry
df["vectors"][0]

{'id': 'item_EAE 001',
 'metadata': {'code': 'EAE 001',
  'name': '—\xa0Introduction to Aerospace Science Engineering',
  'credits': '(1 unit)',
  'description': 'Course Description: Description of the field of aerospace engineering with examples from industry, government, and research. Aerospace engineering principles, ethics, and responsibilities.',
  'prerequisites': ''},
 'values': [0.022164037451148033,
  0.0007370838429778814,
  -0.012843707576394081,
  -0.01930544711649418,
  0.004035264253616333,
  -0.007419035769999027,
  -0.007605176419019699,
  0.00469340430572629,
  -0.02016967348754406,
  -0.02239006571471691,
  0.000772816245444119,
  0.012139031663537025,
  -0.008196838200092316,
  -0.00694038812071085,
  -0.02095412276685238,
  -0.007817909121513367,
  0.008702076971530914,
  -0.011966186575591564,
  -0.001048703328706324,
  -0.0005123023875057697,
  -0.0034070392139256,
  -0.004570418503135443,
  -0.019744208082556725,
  -0.019970236346125603,
  -0.000993027351796627,


## Insert vectors to pinecone index

In [108]:
import os
import numpy as np
import pinecone

from dotenv import load_dotenv
load_dotenv(override=True)

# Add to .env or replace below
api_key = os.getenv("PINECONE")
environment = os.getenv("PINECONE_ENV")
pinecone.init(api_key=api_key, environment=environment)
index = pinecone.Index("davis-course-search")

In [109]:
# Batch size
batch_size = 50

count = 1
# Iterate over the dataframe in batches
for i in range(0, len(df), batch_size):
    batch = df["vectors"].iloc[i:i+batch_size]
    batch = batch.to_numpy(batch)
    upsert_response = index.upsert(vectors=batch, namespace="primary")
    print(f'{count}: {upsert_response}')
    count += 1

1: {'upserted_count': 50}
2: {'upserted_count': 50}
3: {'upserted_count': 50}
4: {'upserted_count': 50}
5: {'upserted_count': 50}
6: {'upserted_count': 50}
7: {'upserted_count': 50}
8: {'upserted_count': 50}
9: {'upserted_count': 50}
10: {'upserted_count': 50}
11: {'upserted_count': 50}
12: {'upserted_count': 50}
13: {'upserted_count': 50}
14: {'upserted_count': 50}
15: {'upserted_count': 50}
16: {'upserted_count': 50}
17: {'upserted_count': 50}
18: {'upserted_count': 50}
19: {'upserted_count': 50}
20: {'upserted_count': 50}
21: {'upserted_count': 50}
22: {'upserted_count': 50}
23: {'upserted_count': 50}
24: {'upserted_count': 50}
25: {'upserted_count': 50}
26: {'upserted_count': 50}
27: {'upserted_count': 50}
28: {'upserted_count': 50}
29: {'upserted_count': 50}
30: {'upserted_count': 50}
31: {'upserted_count': 50}
32: {'upserted_count': 50}
33: {'upserted_count': 50}
34: {'upserted_count': 50}
35: {'upserted_count': 50}
36: {'upserted_count': 50}
37: {'upserted_count': 50}
38: {'upse

## Query database

In [110]:
import openai
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity

# Create .env file with your secret key 'OPENAI' or replace 'os.getenv('OPENAI')' with your secret key
openai.api_key = os.getenv('OPENAI')

In [143]:
# Enter your prompt
prompt = "Biology and Programming"

In [144]:
# Get embedding for prompt
search_vector = get_embedding(prompt, engine='text-embedding-ada-002')
search_vector

[0.010661432519555092,
 0.005337433889508247,
 0.0006860735593363643,
 -0.014483962208032608,
 -0.004349891096353531,
 0.02223650924861431,
 -0.02929038740694523,
 -0.0027174223214387894,
 -0.012508876621723175,
 -0.02047639898955822,
 0.005931975319981575,
 0.015357299707829952,
 0.007994394749403,
 -0.00931111816316843,
 0.003846042789518833,
 -0.004763046745210886,
 0.01727864146232605,
 0.004370045382529497,
 0.004894047509878874,
 0.009264092892408371,
 -0.016620280221104622,
 0.04189331457018852,
 0.028188638389110565,
 -0.011951284483075142,
 0.005572563502937555,
 0.008323575370013714,
 0.004894047509878874,
 -0.033079326152801514,
 0.004635405261069536,
 -0.00879383459687233,
 0.03187008947134018,
 -0.009505939669907093,
 -0.018756596371531487,
 -0.006301464047282934,
 -0.01472580898553133,
 0.004370045382529497,
 -0.0020976888481527567,
 0.019777730107307434,
 0.009835121221840382,
 -0.004507763776928186,
 0.028081150725483894,
 0.032058194279670715,
 -0.008531833067536354,
 

In [145]:
query_response = index.query(
    namespace="primary",
    top_k=10,
    include_values=True,
    include_metadata=True,
    vector=search_vector
)

In [146]:
print(query_response)

{'matches': [{'id': 'item_MCB 185',
              'metadata': {'code': 'MCB 185',
                           'credits': '(3 units)',
                           'description': 'Course Description: Introduction to '
                                          'computer programming specifically '
                                          'for biology majors. Programming '
                                          'projects have molecular biology and '
                                          'bioinformatic themes.',
                           'name': '—\xa0Computer Programming for Biologists',
                           'prerequisites': ''},
              'score': 0.879548728,
              'values': [0.0,
                         -0.0154346526,
                         -0.00514488434,
                         -0.00514488434,
                         0.00514488434,
                         0.0154346526,
                         -0.0257244203,
                         -0.0102897687,
       

In [147]:
# Print out matched courses
for course in query_response["matches"]:
    print(course["metadata"]["code"], course["metadata"]["name"], '\n')

MCB 185 — Computer Programming for Biologists 

MAT 124 — Mathematical Biology 

BIS 180L — Genomics Laboratory 

BIS 015L — Introduction to Data Science for Biologists 

ECS 124 — Theory & Practice of Bioinformatics 

BIS 185L — Systems & Synthetic Biology Lab 

BIT 150 — Applied Bioinformatics 

BIT 001Y — Introduction to Biotechnology 

BIS 010 — Everyday Biology 

BIS 020Q — Modeling in Biology 

