In [None]:
# clean data, create embeddings and upload to pinecone

In [177]:
import os
import pandas as pd
from dotenv import load_dotenv
import os
import openai
import pinecone
from tqdm import tqdm
import math
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

load_dotenv()

True

In [183]:
# set constants
EMBEDDINGS_MODEL = "text-embedding-ada-002"
EMBEDDINGS_DIMENSION = 1536
EMBEDDINGS_MAX_TOKENS = 8191
COMPLETION_MODEL = "text-davinci-003"
DAVINCI_MAX_TOKENS = 4096

PINECONE_BATCH_SIZE = 32
# pinecone index name and name for company we are building for
COMPANY_NAME = "ubc"

In [184]:
openai.api_key = os.getenv("OPENAI_API_KEY")
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment="us-west1-gcp"
)
pinecone_index = pinecone.Index(COMPANY_NAME)
DATABASE_URL = os.getenv("DATABASE_URL")

In [313]:
# 1000 tokens ~ 750 words; there is no way to get the number of tokens from the API for 2nd gen models for now
# 1 token ~ 4 characters; ceil to be safe
def token_estimate(text):
    return int(math.ceil(len(text) / 4))


# we know that openai ada model costs $0.0004 / 1K tokens
def cost_estimate(tokens):
    return tokens / 1000 * 0.0004

# get embeddings for text
def get_embedding(text: str) -> list[float]:
    result = openai.Embedding.create(
        model=EMBEDDINGS_MODEL,
        input=text
    )
    return result["data"][0]["embedding"]

In [189]:
%load_ext sql
%sql $DATABASE_URL
engine = create_engine(DATABASE_URL)

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [300]:
import re

df = pd.read_sql('SELECT * FROM crawl_data', engine)

# convert data column into list of p texts. data is a list of p tags but when u read from db, it is a string
df['data'] = df['data'].apply(lambda x: re.findall(r'\"(.*?)\"', x))
df.sample(5)

Unnamed: 0,id,url,title,data,company_id,embedded
16803,25367,https://communityengagement.ubc.ca/community-e...,Community Engagement at UBC - Community Engage...,"[Community engagement is not new at UBC, it ha...",1,0
11622,21087,https://courses.students.ubc.ca/cs/coursesched...,BA 563 - Course - UBC Student Services,[Comments],1,0
6521,14963,https://courses.students.ubc.ca/cs/coursesched...,PLAN - Courses - UBC Student Services,[The requested subject is either no longer off...,1,0
7963,20939,https://courses.students.ubc.ca/cs/coursesched...,BA 504 - Course - UBC Student Services,[Comments],1,0
9995,18296,https://courses.students.ubc.ca/cs/coursesched...,APSC 171 T1L - Engineering Drawing and CAD/CAM...,"[Orthographic projections, axonometric and per...",1,0


In [301]:
# remove duplicate urls
old_rows = df.shape[0]
df = df.drop_duplicates(subset=['url'], keep='first')
# print number of rows after removing duplicates
print('rows deleted duplicate urls', old_rows - df.shape[0])

rows deleted duplicate urls 652


In [302]:
# delete duplicate titles
old_rows = df.shape[0]
df = df.drop_duplicates(subset=['title'], keep='first')
# print number of rows after removing duplicates
print('rows deleted duplicate titles', old_rows - df.shape[0])

rows deleted duplicate titles 4660


In [303]:
# find most common words in data list
from collections import Counter
from itertools import chain

# flatten list of lists
data = list(chain.from_iterable(df['data']))
# get most common words, returns list of tuples
most_common = Counter(data).most_common(10)
# convert to list of words
most_common = [x[0] for x in most_common]

# remove 10 most common words from lists of data
df['data'] = df['data'].apply(lambda x: [word for word in x if word not in most_common])

In [304]:
df.sample(5)

Unnamed: 0,id,url,title,data,company_id,embedded
11478,19798,https://courses.students.ubc.ca/cs/coursesched...,ASIA 413 W01 - Section - UBC Student Services,[],1,0
3813,12347,https://courses.students.ubc.ca/cs/coursesched...,FRE 460 - Course - UBC Student Services,[],1,0
12860,21445,https://courses.students.ubc.ca/cs/coursesched...,BAMS 508 - Optimal Decision Making II - UBC St...,[Models and methods in discrete optimization; ...,1,0
586,9229,https://courses.students.ubc.ca/cs/coursesched...,ARCH 549 - Graduate Project Part II - UBC Stud...,"[Design component, completion, and defense of ...",1,0
4942,13495,https://courses.students.ubc.ca/cs/coursesched...,LIBR 596 - Professional Experience - UBC Stude...,[Completion of the MLIS Core. Courses directly...,1,0


In [305]:
# remove empty data
old_rows = df.shape[0]
df = df[df['data'].apply(lambda x: len(x) > 0)]
# print number of rows after removing duplicates
print('rows deleted empty data', old_rows - df.shape[0])

rows deleted empty data 4242


In [306]:
df.sample(5)

Unnamed: 0,id,url,title,data,company_id,embedded
9711,18025,https://courses.students.ubc.ca/cs/coursesched...,ANTH 200 001 - Public Anthropology: Engagement...,[Examines the range of approaches to public an...,1,0
6913,15317,https://courses.students.ubc.ca/cs/coursesched...,RES 509 - Advanced Conservation Science - UBC ...,"[Equivalents: ZOOL 509, For more information o...",1,0
5665,14191,https://courses.students.ubc.ca/cs/coursesched...,MTRL 594 - Advanced Composite Materials - UBC ...,[Processing and properties of advanced polymer...,1,0
4054,12575,https://courses.students.ubc.ca/cs/coursesched...,GEOG 410 - Environment and Society - UBC Stude...,[Geographical analysis of society-environment ...,1,0
4588,13068,https://courses.students.ubc.ca/cs/coursesched...,JRNL 555A - International Reporting - INTL REP...,[JRNL 555A - INTERNATIONAL REPORTING is 2nd YR...,1,0


In [307]:
# concat data into string
df['data'] = df['data'].apply(lambda x: ' '.join(x))

In [308]:
# add token estimate column
df['token_estimate'] = df['data'].apply(lambda x: token_estimate(x))
# davinci token limit prompt + completion is 4096
# so we will keep our request token to 2500 (excluding prompty suffix since its tiny)
# if there is anything above 2500 tokens we will get rid of it for now
# we will also get rid of anything below 20 tokens

# remove rows with token estimate above 2500
old_rows = df.shape[0]
df = df[df['token_estimate'] < 2500]
# print number of rows after removing duplicates
print('rows deleted token estimate above 2500', old_rows - df.shape[0])

# remove rows with token estimate below 20
old_rows = df.shape[0]
df = df[df['token_estimate'] > 20]
# print number of rows after removing duplicates
print('rows deleted token estimate below 20', old_rows - df.shape[0])

rows deleted token estimate above 2500 84
rows deleted token estimate below 20 849


In [339]:

# metadata size limit is 10KB for pinecone
# so we will remove rows with metadata size above 10310 bytes
import sys

df['metadata_size'] = df.apply(lambda x: sys.getsizeof(x.to_json()), axis=1)

# remove rows with metadata size above 10310
old_rows = df.shape[0]
df = df[df['metadata_size'] < 10310]
# print number of rows after removing duplicates
print('rows deleted metadata size above 10310', old_rows - df.shape[0])

rows deleted metadata size above 10310 3


In [309]:
# i want to summarize but it will take too long lmao, will summarize later

In [340]:
# get embeddings for each row and add to pinecone index
for i in tqdm(range(0, df.shape[0], PINECONE_BATCH_SIZE)):
    # set end position of batch
    i_end = min(i+PINECONE_BATCH_SIZE, df.shape[0])
    ids_batch = [str(n) for n in range(i, i_end)]
    metadata_batch = df.iloc[i: i_end].to_dict('records')
    # create embeddings for the data column in df
    embeds = [get_embedding(text) for text in df.iloc[i: i_end]['data']]
    to_upsert = zip(ids_batch, embeds, metadata_batch)
    pinecone_index.upsert(vectors=list(to_upsert))

100%|██████████| 29/29 [01:52<00:00,  3.87s/it]


In [341]:
# the rows that have been added to pinecone index, set their embedded status to 1 in database
df['embedded'] = 1
df.to_sql('crawl_data', engine, if_exists='replace', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embedded'] = 1


466

In [None]:
# lfggg