## Overview
Semantic search over UC Davis courses using OpenAI's embedding model.

In [1]:
import json
import pandas as pd

### Convert course_data.json into a CSV file
(Skip this step if you already have the course data in CSV format as `davis_courses.csv`.)

In [20]:
# Load json data
# The course text contains non-ASCII characters (e.g. the em dash in course names),
# so read the file as UTF-8 explicitly instead of relying on the platform's default encoding.
with open("course_data.json", encoding="utf-8") as f:
    data = json.load(f)

print(f'Total Subjects = {len(data)}')

Total Subjects = 216


In [9]:
# Column order of the course table (and of the CSV later written from it)
course_columns = ["code", "name", "credits", "description", "prerequisites"]

# `data` is a sequence of subjects; each subject maps a key to a list of course dicts.
# Gather every course dict first and build the dataframe once: calling pd.concat inside
# the loop copies the whole frame again for every single course.
course_records = [
    course
    for subject in data
    for course_list in subject.values()
    for course in course_list
]

# Fields missing from a course dict become NaN in the corresponding column.
df = pd.DataFrame(course_records, columns=course_columns)

In [16]:
# Preview the first rows of the assembled course dataframe
df.head()

Unnamed: 0,code,name,credits,description,prerequisites
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...
3,EAE 126,— Theoretical & Computational Aerodynamics,(4 units),Course Description: Development of general equ...,Prerequisite(s): ENG 103 C- or better; ENG 105...
4,EAE 127,— Applied Aircraft Aerodynamics,(4 units),"This version has ended; see updated course, be...",Prerequisite(s): EME 106 C- or better.


In [18]:
# Report how many rows (one per course) the dataframe holds
print(f'Total Courses = {len(df)}')

Total Courses = 10024


In [13]:
# Write the course table to davis_courses.csv, leaving out the row index
df.to_csv(path_or_buf='davis_courses.csv', index=False)

### Generate embeddings
Compute an embedding for every course with OpenAI's `get_embedding` helper (model `text-embedding-ada-002`).

In [2]:
import os
from dotenv import load_dotenv
load_dotenv(override=True)

import openai
# Keep the secret out of the notebook: define OPENAI=<your secret key> in a .env file
# (or in the environment) rather than pasting the key into this cell.
openai_key = os.getenv('OPENAI')
if not openai_key:
    # Stop here with a clear message instead of setting the key to None and
    # failing later, in the middle of the embedding requests.
    raise RuntimeError(
        "OpenAI API key not found: set the 'OPENAI' variable in your .env file or environment."
    )
openai.api_key = openai_key

In [3]:
# Load the course table from davis_courses.csv into a dataframe
df = pd.read_csv(filepath_or_buffer='davis_courses.csv')

In [4]:
# Preview the first rows of the dataframe read back from davis_courses.csv
df.head()

Unnamed: 0,code,name,credits,description,prerequisites
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...
3,EAE 126,— Theoretical & Computational Aerodynamics,(4 units),Course Description: Development of general equ...,Prerequisite(s): ENG 103 C- or better; ENG 105...
4,EAE 127,— Applied Aircraft Aerodynamics,(4 units),"This version has ended; see updated course, be...",Prerequisite(s): EME 106 C- or better.


In [5]:
# Create a combined column: one "field: value; ..." string per course.
# Every field is filled with '' before stripping, because a single NaN operand
# would otherwise turn the whole concatenated string for that row into NaN.
# Bracket access is used so the "name" column can never be mistaken for an attribute.
df["combined"] = (
    "code: " + df["code"].fillna('').str.strip()
    + "; name: " + df["name"].fillna('').str.strip()
    + "; credits: " + df["credits"].fillna('').str.strip()
    + "; description: " + df["description"].fillna('').str.strip()
    + "; prerequisites: " + df["prerequisites"].fillna('').str.strip()
)

df.head(3)

Unnamed: 0,code,name,credits,description,prerequisites,combined
0,EAE 001,— Introduction to Aerospace Science Engineering,(1 unit),Course Description: Description of the field o...,,code: EAE 001; name: — Introduction to Aerospa...
1,EAE 010,— From the Wright Brothers to Drones & Quadcop...,(2 units),Course Description: History of aircraft and it...,,code: EAE 010; name: — From the Wright Brother...
2,EAE 099,— Special Study for Undergraduates,(1-5 units),Course Description: Special study for undergra...,Prerequisite(s): Consent of instructor; lower ...,code: EAE 099; name: — Special Study for Under...


In [None]:
from openai.embeddings_utils import get_embedding

# One get_embedding call per course, in row order; the result of each call
# is stored in the new 'embedding' column of the same row.
df['embedding'] = [
    get_embedding(course_text, engine='text-embedding-ada-002')
    for course_text in df['combined']
]

In [None]:
# Preview the first rows, now including the 'embedding' column
df.head(3)