## Overview
Semantic search over UC Davis courses using OpenAI's embedding model (`text-embedding-ada-002`).

In [1]:
import json
import pandas as pd

### Convert course_data.json into a CSV file
(Skip this step if you already have the course data in CSV format.)

In [20]:
# Load json data
# The file is opened explicitly as UTF-8: course names contain non-ASCII
# characters (e.g. the em dash "—"), and the platform default encoding used by
# a bare open() is not UTF-8 everywhere.
with open("course_data.json", encoding="utf-8") as f:
    data = json.load(f)

# `data` holds one entry per subject, so its length is the subject count
print(f'Total Subjects = {len(data)}')

Total Subjects = 216


In [9]:
# Flatten course_data.json into a dataframe with one row per course.
# Each entry of `data` maps a key to a list of course records (dicts).
expected_columns = ["code", "name", "credits", "description", "prerequisites"]

# Collect every course record first and build the dataframe once. Growing the
# frame with pd.concat inside the loop copies all existing rows on every
# iteration, which is quadratic in the number of courses.
records = [
    course
    for subject in data
    for course_code in subject
    for course in subject[course_code]
]
df = pd.DataFrame(records)

# Keep the expected columns first and in a fixed order (also when `data` is
# empty); any additional keys found in the records follow after them.
extra_columns = [col for col in df.columns if col not in expected_columns]
df = df.reindex(columns=expected_columns + extra_columns)

In [16]:
# Preview the first rows to check that the course fields landed in the right columns
df.head()

Unnamed: 0,code,name,credits,description,prerequisites
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...
3,EAE 126,— Theoretical & Computational Aerodynamics,(4 units),Course Description: Development of general equ...,Prerequisite(s): ENG 103 C- or better; ENG 105...
4,EAE 127,— Applied Aircraft Aerodynamics,(4 units),"This version has ended; see updated course, be...",Prerequisite(s): EME 106 C- or better.


In [18]:
# The dataframe has one row per course, so its length is the number of courses
print(f'Total Courses = {len(df)}')

Total Courses = 10024


In [13]:
# Convert the DataFrame to a CSV file.
# index=False leaves the row index out of the file, so reading it back does
# not produce an extra 'Unnamed: 0' column.
df.to_csv('davis_courses.csv', index=False)

### Generate embeddings
Compute an embedding for every course using OpenAI's `get_embedding` helper function.

In [11]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file; override=True lets values from .env
# replace variables that are already set in the environment.
load_dotenv(override=True)

import openai
# NOTE(review): openai.embeddings_utils is presumably only available in
# pre-1.0 releases of the openai package — confirm the installed version.
from openai.embeddings_utils import get_embedding
# The API key is read from the environment variable 'OPENAI'. Create a .env
# file containing a line `OPENAI=<your secret key>`; prefer this over pasting
# the key into the notebook. os.getenv returns None when the variable is unset.
openai.api_key = os.getenv('OPENAI')

In [3]:
# Read course data from davis_courses.csv (replaces any `df` already in memory)
df = pd.read_csv('davis_courses.csv')

In [4]:
# Preview the first rows of the data read from the CSV file
df.head()

Unnamed: 0,code,name,credits,description,prerequisites
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...
3,EAE 126,— Theoretical & Computational Aerodynamics,(4 units),Course Description: Development of general equ...,Prerequisite(s): ENG 103 C- or better; ENG 105...
4,EAE 127,— Applied Aircraft Aerodynamics,(4 units),"This version has ended; see updated course, be...",Prerequisite(s): EME 106 C- or better.


In [5]:
# Create a combined column: one labelled text string per course, which is the
# input passed to the embedding model.
# Missing values are replaced with '' in every field, not only prerequisites:
# concatenating a string with NaN yields NaN, so a single missing field would
# otherwise wipe out the whole combined text for that course.
text_fields = ["code", "name", "credits", "description", "prerequisites"]
cleaned = {field: df[field].fillna('').str.strip() for field in text_fields}

df["combined"] = (
    "code: " + cleaned["code"] + "; name: " + cleaned["name"] +
    "; credits: " + cleaned["credits"] +
    "; description: " + cleaned["description"] +
    "; prerequisites: " + cleaned["prerequisites"]
)

df.head(3)

Unnamed: 0,code,name,credits,description,prerequisites,combined
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,code: EAE 001; name: — Introduction to Aerospa...
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,code: EAE 010; name: — From the Wright Brother...
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,code: EAE 099; name: — Special Study for Under...


In [6]:
# Get embeddings for all courses
# (This will take a long time: get_embedding is called once for every row of
# the `combined` column, and the results are stored in a new 'embedding' column.)
df['embedding'] = df.combined.apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))

In [9]:
# Save as csv
# NOTE: no index=False here, so the row index is written as an unnamed first
# column; it shows up as 'Unnamed: 0' when this file is read back later.
df.to_csv('course_embeddings.csv')

Unnamed: 0,code,name,credits,description,prerequisites,combined,embedding
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,code: EAE 001; name: — Introduction to Aerospa...,"[0.022164037451148033, 0.0007370838429778814, ..."
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,code: EAE 010; name: — From the Wright Brother...,"[-0.008988866582512856, -0.02638152614235878, ..."
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,code: EAE 099; name: — Special Study for Under...,"[0.011427514255046844, -0.02591511607170105, 0..."


In [10]:
# Preview three rows, including the new 'embedding' column
df.head(3)

Unnamed: 0,code,name,credits,description,prerequisites,combined,embedding
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,code: EAE 001; name: — Introduction to Aerospa...,"[0.022164037451148033, 0.0007370838429778814, ..."
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,code: EAE 010; name: — From the Wright Brother...,"[-0.008988866582512856, -0.02638152614235878, ..."
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,code: EAE 099; name: — Special Study for Under...,"[0.011427514255046844, -0.02591511607170105, 0..."


# Semantic searching

In [54]:
# Reload the saved embeddings. The 'embedding' column comes back from the CSV
# as text and is converted from string to numpy array in the next cell.
import numpy as np
df = pd.read_csv('course_embeddings.csv')

In [55]:
# Parse each stored embedding (text such as "[0.0221, 0.0007, ...]") into a
# numpy array. json.loads is used instead of eval: it reads the same
# list-of-floats text but cannot execute code that ends up in the CSV file.
df['embedding'] = df['embedding'].apply(json.loads).apply(np.array)

# Write the vectors out as JSON list strings rather than as numpy arrays.
# to_csv stringifies each array, and numpy abbreviates arrays with more than
# 1000 elements using "...", so saving the arrays directly would silently
# drop most of every vector.
df.assign(
    embedding=df['embedding'].apply(lambda vec: json.dumps(vec.tolist()))
).to_csv('course_embeddings_cleaned.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,code,name,credits,description,prerequisites,combined,embedding
0,0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,code: EAE 001; name: — Introduction to Aerospa...,"[0.022164037451148033, 0.0007370838429778814, ..."
1,1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,code: EAE 010; name: — From the Wright Brother...,"[-0.008988866582512856, -0.02638152614235878, ..."
2,2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,code: EAE 099; name: — Special Study for Under...,"[0.011427514255046844, -0.02591511607170105, 0..."


In [48]:
# Enter your prompt: the free-text query that the courses are ranked against
prompt = "computers and biology"

In [49]:
# Embed the prompt with the same engine used for the course texts, so the
# query vector can be compared with the course embeddings
search_vector = get_embedding(prompt, engine='text-embedding-ada-002')
search_vector

[0.005985757801681757,
 0.003160335822030902,
 0.011787641793489456,
 -0.017520572990179062,
 0.0037004658952355385,
 0.028132734820246696,
 -0.03391163423657417,
 -0.0003441482549533248,
 -0.009633688256144524,
 -0.01452605240046978,
 0.011853311210870743,
 0.021736543625593185,
 0.016417328268289566,
 0.004041946493089199,
 0.010553059168159962,
 0.017271028831601143,
 0.01909663714468479,
 -0.00012518212315626442,
 0.01115064974874258,
 -0.008051058277487755,
 -0.01438158005475998,
 0.030312955379486084,
 0.033648956567049026,
 -0.01093394123017788,
 0.0017550124321132898,
 0.003061831695958972,
 0.013094461522996426,
 -0.035093680024147034,
 -0.010605594143271446,
 -0.00022307045583147556,
 0.011295122094452381,
 0.004127316176891327,
 -0.04082004725933075,
 -0.0239298976957798,
 -0.010507090017199516,
 0.0007342650205828249,
 -0.00246752449311316,
 0.023207535967230797,
 0.016916414722800255,
 0.0013363705947995186,
 0.009016396477818489,
 0.01920170709490776,
 -0.0032177963294088

In [50]:
from openai.embeddings_utils import cosine_similarity

In [51]:
# Score every course against the prompt: the cosine similarity between the
# course embedding and search_vector is stored in a 'similarities' column.
def _similarity_to_prompt(course_vector):
    """Return the cosine similarity between one course embedding and the prompt."""
    return cosine_similarity(course_vector, search_vector)


df["similarities"] = df['embedding'].apply(_similarity_to_prompt)

In [52]:
# Preview three rows, including the new 'similarities' column
df.head(3)

Unnamed: 0.1,Unnamed: 0,code,name,credits,description,prerequisites,combined,embedding,similarities
0,0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,code: EAE 001; name: — Introduction to Aerospa...,"[0.022164037451148033, 0.0007370838429778814, ...",0.740785
1,1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,code: EAE 010; name: — From the Wright Brother...,"[-0.008988866582512856, -0.02638152614235878, ...",0.750972
2,2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,code: EAE 099; name: — Special Study for Under...,"[0.011427514255046844, -0.02591511607170105, 0...",0.735353


In [53]:
# Get top ten courses: sort by similarity, highest first, and keep the first ten rows
df.sort_values("similarities", ascending=False).head(10)

Unnamed: 0.1,Unnamed: 0,code,name,credits,description,prerequisites,combined,embedding,similarities
7068,7068,MCB 185,— Computer Programming for Biologists,(3 units),Course Description: Introduction to computer p...,Prerequisite(s): BIS 101 C- or better.,code: MCB 185; name: — Computer Programming fo...,"[-0.0021729422733187675, -0.013914714567363262...",0.857037
1066,1066,BIS 180L,— Genomics Laboratory,(5 units),Course Description: Computational approaches t...,Prerequisite(s): BIS 181 or BIS 183 or MCB 182...,code: BIS 180L; name: — Genomics Laboratory; c...,"[-0.024339810013771057, -0.00365097145549953, ...",0.841211
1065,1065,BIS 134,— Systems Biology: From Biological Circuits to...,(2 units),Course Description: Applying systems theory to...,Prerequisite(s): BIS 101; (MCB 121 or PLB 113)...,code: BIS 134; name: — Systems Biology: From B...,"[-0.009313995949923992, 0.00970319751650095, 0...",0.837456
2403,2403,ECS 221,— Computational Methods in Systems & Synthetic...,(4 units),Course Description: Computational methods rela...,,code: ECS 221; name: — Computational Methods i...,"[0.0011389408027753234, 0.007520836312323809, ...",0.837236
2339,2339,ECS 124,— Theory & Practice of Bioinformatics,(4 units),"Course Description: Fundamental biological, ma...",Prerequisite(s): (ECS 010 or ECS 032A or ECS 0...,code: ECS 124; name: — Theory & Practice of Bi...,"[0.006816255860030651, -0.012159635312855244, ...",0.836307
1069,1069,BIS 185L,— Systems & Synthetic Biology Lab,(5 units),Course Description: Principles & applications ...,Prerequisite(s): BIS 015L C- or better; BIS 13...,code: BIS 185L; name: — Systems & Synthetic Bi...,"[-0.01611376367509365, 0.0051537808030843735, ...",0.835844
1201,1201,BIM 202,— Cell & Molecular Biology for Engineers,(4 units),Course Description: Preparation for research a...,Prerequisite(s): BIS 104 or MCB 121.,code: BIM 202; name: — Cell & Molecular Biolog...,"[-0.02036133036017418, -0.0102991983294487, -0...",0.83322
6577,6577,MAT 124,— Mathematical Biology,(4 units),Course Description: Methods of mathematical mo...,Prerequisite(s): (MAT 022A or MAT 027A or MAT ...,code: MAT 124; name: — Mathematical Biology; c...,"[-0.0014017298817634583, 0.00796202290803194, ...",0.831788
1042,1042,BIS 015L,— Introduction to Data Science for Biologists,(2 units),Course Description: Introduction to data scien...,,code: BIS 015L; name: — Introduction to Data S...,"[-0.004503453616052866, 0.004033349920064211, ...",0.831729
2465,2465,ECS 289N,— Special Topics in Bioinformatics & Computati...,(1-5 units),Course Description: Special topic in Bioinform...,Prerequisite(s): Consent of instructor.,code: ECS 289N; name: — Special Topics in Bioi...,"[0.010042251087725163, -0.016161957755684853, ...",0.830957


In [63]:
# Vector size: shape of a single embedding (the one in the row labelled 2)
df["embedding"][2].shape

(1536,)

## Convert embeddings into smaller csv

In [2]:
# Start again from the saved embeddings file (replaces any `df` already in memory)
df = pd.read_csv("course_embeddings.csv")

In [3]:
# Preview three rows of the reloaded file
df.head(3)

Unnamed: 0.1,Unnamed: 0,code,name,credits,description,prerequisites,combined,embedding
0,0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,code: EAE 001; name: — Introduction to Aerospa...,"[0.022164037451148033, 0.0007370838429778814, ..."
1,1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,code: EAE 010; name: — From the Wright Brother...,"[-0.008988866582512856, -0.02638152614235878, ..."
2,2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,code: EAE 099; name: — Special Study for Under...,"[0.011427514255046844, -0.02591511607170105, 0..."


In [4]:
# Remove the two columns that are left out of the smaller file: the saved row
# index ('Unnamed: 0') and the combined text.
df = df.drop(columns=['Unnamed: 0', 'combined'])
df.head()

Unnamed: 0,code,name,credits,description,prerequisites,embedding
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,"[0.022164037451148033, 0.0007370838429778814, ..."
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,"[-0.008988866582512856, -0.02638152614235878, ..."
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,"[0.011427514255046844, -0.02591511607170105, 0..."
3,EAE 126,— Theoretical & Computational Aerodynamics,(4 units),Course Description: Development of general equ...,Prerequisite(s): ENG 103 C- or better; ENG 105...,"[-0.006947833579033613, -0.008508626371622086,..."
4,EAE 127,— Applied Aircraft Aerodynamics,(4 units),"This version has ended; see updated course, be...",Prerequisite(s): EME 106 C- or better.,"[-0.006218200549483299, -0.005346987396478653,..."


In [7]:
# Save smaller embeddings csv (index=False: the row index is not written to the file)
df.to_csv('course_embeddings_small.csv', index=False)