In [1]:
#import libs
from openai import OpenAI
import pandas as pd

In [None]:
# Create the OpenAI client that get_embedding() uses for its embedding requests.
# No credentials are passed here, so the SDK has to find them itself
# (presumably via the OPENAI_API_KEY environment variable -- TODO confirm).
client = OpenAI()


In [None]:
# Fetch the vector embedding of one text from OpenAI
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for a single piece of text.

    Args:
        text: String to embed. Newline characters are replaced with spaces
            before the request is made.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list.
    """
    single_line = text.replace("\n", " ")
    # The API receives a one-element batch, so only data[0] is relevant.
    response = client.embeddings.create(input=[single_line], model=model)
    first_entry = response.data[0]
    return first_entry.embedding

In [3]:
# Load the SIC industry-code table from sic.csv (path is relative to the
# notebook's working directory). Later cells read its industry_title column.
df = pd.read_csv("sic.csv")

In [4]:
# Clean up data: drop the literal 'SIC ' marker from every industry title.
# regex=False states explicitly that this is a plain-text match; the default for
# `regex` differs between pandas versions, so relying on it is fragile.
df["industry_title"] = df["industry_title"].str.replace('SIC ', '', regex=False)

In [8]:
# Skip the first 12 records and keep the rest.
# .copy() makes df3 an independent frame: a later cell adds a column to it, and
# doing that on a bare slice of df raises pandas' SettingWithCopyWarning.
df3 = df.iloc[12:].copy()
df3.head()

Unnamed: 0,csv_industry_code,ewb_industry_code,industry_title
12,SIC_0A01,0A01,01 Agricultural Production Crops
13,SIC_0A011,0A011,011 Cash Grains
14,SIC_0A0111,0A0111,0111 Wheat
15,SIC_0A0112,0A0112,0112 Rice
16,SIC_0A0115,0A0115,0115 Corn


In [None]:
# Compute an embedding for every industry title (one get_embedding call per row)
# and save the frame to CSV, so the embeddings can be reloaded from disk later
# and the embedding cost is not paid again.
# assign() returns a new frame instead of writing a column into df3 in place,
# which avoids SettingWithCopyWarning when df3 is a slice of another frame.
df3 = df3.assign(
    ada_embedding=df3["industry_title"].apply(
        lambda title: get_embedding(title, model='text-embedding-ada-002')
    )
)
df3.to_csv('embedded-code-full.csv', index=False)

In [11]:
# Reload the saved titles + embeddings from disk.
# Note: this rebinds `df`, replacing the raw SIC table loaded earlier.
# The ada_embedding column comes back as text and is parsed in a later cell.
df = pd.read_csv('embedded-code-full.csv')
df.head()

Unnamed: 0,csv_industry_code,ewb_industry_code,industry_title,ada_embedding
0,SIC_0A01,0A01,01 Agricultural Production Crops,"[-0.009058876894414425, -0.014253156259655952,..."
1,SIC_0A011,0A011,011 Cash Grains,"[-0.012378759682178497, 0.0057749575935304165,..."
2,SIC_0A0111,0A0111,0111 Wheat,"[-0.004349409602582455, -0.014054561965167522,..."
3,SIC_0A0112,0A0112,0112 Rice,"[-0.002359968377277255, -0.010370948351919651,..."
4,SIC_0A0115,0A0115,0115 Corn,"[-0.017421545460820198, -0.006935370620340109,..."


In [None]:
# Connect to the Pinecone index that will hold the embeddings.
# pc.Index() only returns a handle; nothing in this notebook calls create_index,
# so the index is presumably expected to exist already -- TODO confirm.
import os

from pinecone import Pinecone

index_name = 'index'
# Credentials come from the environment so they are never stored in the
# notebook. A missing PINECONE_API_KEY fails immediately with a KeyError.
pc = Pinecone(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", ""),
)
index = pc.Index(index_name)

In [5]:
# Turn the ada_embedding column back into numeric arrays: after the CSV
# round-trip each embedding is the text of a Python list.
import ast

import numpy as np

# ast.literal_eval accepts only Python literals, whereas eval() would execute
# any code that happened to be in the file. The isinstance guard leaves values
# that are already parsed untouched, so re-running this cell does not fail.
df['ada_embedding'] = (
    df['ada_embedding']
    .apply(lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)
    .apply(np.array)
)

In [6]:
# Preview the first ten rows to check the parsed embeddings.
df.head(10)

Unnamed: 0,csv_industry_code,ewb_industry_code,industry_title,ada_embedding
0,SIC_0A01,0A01,01 Agricultural Production Crops,"[-0.009058876894414425, -0.014253156259655952,..."
1,SIC_0A011,0A011,011 Cash Grains,"[-0.012378759682178497, 0.0057749575935304165,..."
2,SIC_0A0111,0A0111,0111 Wheat,"[-0.004349409602582455, -0.014054561965167522,..."
3,SIC_0A0112,0A0112,0112 Rice,"[-0.002359968377277255, -0.010370948351919651,..."
4,SIC_0A0115,0A0115,0115 Corn,"[-0.017421545460820198, -0.006935370620340109,..."
5,SIC_0A0116,0A0116,0116 Soybeans,"[-0.011305897496640682, -0.016329267993569374,..."
6,SIC_0A0119,0A0119,"0119 Cash grains, nec","[-0.011909620836377144, -0.013629973866045475,..."
7,SIC_0A013,0A013,"013 Field Crops, Except Cash Grains","[-0.017850518226623535, 8.419383084401488e-05,..."
8,SIC_0A0131,0A0131,0131 Cotton,"[-0.00759208295494318, -0.011167967692017555, ..."
9,SIC_0A0132,0A0132,0132 Tobacco,"[-0.006475711241364479, 0.003945466596633196, ..."


In [11]:
# Upsert each embedding to Pinecone, with the industry title as 'text' metadata.
from tqdm.auto import tqdm

batch_size = 32  # number of vectors sent per upsert request
total_rows = len(df)
for i in tqdm(range(0, total_rows, batch_size)):
    # Clamp the end of the final batch to the number of rows.
    i_end = min(i + batch_size, total_rows)
    # Vector IDs are the positional row numbers rendered as strings.
    # NOTE(review): these IDs depend on row order; a stable key such as
    # ewb_industry_code may be preferable -- confirm with the index's consumers.
    ids_batch = [str(n) for n in range(i, i_end)]
    # .iloc makes the positional slicing explicit.
    embeds = df['ada_embedding'].iloc[i:i_end]
    lines_batch = df['industry_title'].iloc[i:i_end]
    # Each vector carries its title as metadata under the 'text' key.
    meta = [{'text': line} for line in lines_batch]
    # Send the batch as (id, values, metadata) tuples.
    index.upsert(vectors=list(zip(ids_batch, embeds, meta)))

100%|███████████████████████████████████████████| 55/55 [01:00<00:00,  1.11s/it]
