# Iniciando com vetores e aplicações com IA Generativa

Após criar seu banco de dados no Astra (com suporte a vetores) mão na massa!

In [None]:
# Instalando os libraries para execução do notebook
!pip install openai pandas jupyter-datatables cassandra-driver

In [31]:
# Carregando variáveis de ambiente
# Crie um arquivo chamado ".env" com as seguintes variáveis
# OPENAI_API_KEY=""
# ASTRA_DB_SECURE_BUNDLE_PATH=""
# ASTRA_DB_CLIENT_ID=""
# ASTRA_DB_CLIENT_SECRET=""
# ASTRA_DB_APPLICATION_TOKEN=""
# ASTRA_DB_KEYSPACE=""

from dotenv import load_dotenv, find_dotenv
import os
load_dotenv(find_dotenv(), override=True)

True

In [32]:
# Importando as libraries

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import dict_factory
from cassandra.query import SimpleStatement
import openai
import numpy
import pandas as pd

keyspace = os.environ["ASTRA_DB_KEYSPACE"]

In [33]:
# Conexão com o banco de dados
cloud_config= {
  'secure_connect_bundle': os.environ["ASTRA_DB_SECURE_BUNDLE_PATH"]
}
auth_provider = PlainTextAuthProvider(os.environ["ASTRA_DB_CLIENT_ID"], os.environ["ASTRA_DB_CLIENT_SECRET"])
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()
session.set_keyspace(keyspace)
session

<cassandra.cluster.Session at 0x1174f6370>

In [None]:
# Limpando execuções anteriores
session.execute(f"""DROP INDEX IF EXISTS {keyspace}.openai_desc""")
session.execute(f"""DROP INDEX IF EXISTS {keyspace}.minilm_desc""")
session.execute(f"""DROP TABLE IF EXISTS {keyspace}.products_table""")

In [None]:
# Criando a tabela de vetores
session.execute(f"""CREATE TABLE IF NOT EXISTS {keyspace}.products_table
(product_id int,
 chunk_id int,

 product_name text,
 description text,
 price text,

 openai_description_embedding vector<float, 1536>,
 minilm_description_embedding vector<float, 384>,

 PRIMARY KEY (product_id, chunk_id))""")



In [None]:
# Criando os índices sobre as colunas de vetores
session.execute(f"""CREATE CUSTOM INDEX IF NOT EXISTS openai_desc 
  ON {keyspace}.products_table (openai_description_embedding) 
  USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")

session.execute(f"""CREATE CUSTOM INDEX IF NOT EXISTS minilm_desc 
  ON {keyspace}.products_table (minilm_description_embedding) 
  USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")


In [34]:
# Carregando os dados do arquivo CSV para o Pandas
products_list = pd.read_csv('./data/ProductDataset.csv')
products_list[:5]

Unnamed: 0,product_id,product_name,description,price
0,552,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,
1,580,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,$399.00
2,4696,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,$49.00
3,5644,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,
4,6284,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,$158.00


# Gerando os embeddings (vetores que representam numericamente um determinada conteúdo)

In [35]:
# Modelo de embedding
model_id = "text-embedding-ada-002"

In [36]:
# Gerando Chunks
chunk = products_list.at[ 0,"product_name" ] + " " +products_list.at[0,"description"]
print(f"Texto a ser usado para gerar o vetor: '{chunk}'")

Texto a ser usado para gerar o vetor: 'Sony Turntable - PSLX350H Sony Turntable - PSLX350H/ Belt Drive System/ 33-1/3 and 45 RPM Speeds/ Servo Speed Control/ Supplied Moving Magnet Phono Cartridge/ Bonded Diamond Stylus/ Static Balance Tonearm/ Pitch Control'


In [38]:
# embedding
embedding = openai.Embedding.create(input=chunk, model=model_id)['data'][0]['embedding']
print(f"Vetor/Embedding gerado: {embedding}")

Vetor/Embedding gerado: [0.004754268564283848, 0.00481609720736742, -0.013270299881696701, -0.008512777276337147, -0.031916406005620956, 0.02788129635155201, -0.012866788543760777, 0.006212114356458187, 0.0031255818903446198, -0.024262715131044388, -4.410857582115568e-05, 0.028792450204491615, -0.02920897677540779, -0.00037402036832645535, 0.0037357292603701353, 0.018379267305135727, 0.0028912853449583054, 0.005343264434486628, 0.0015554691199213266, -0.008096249774098396, -0.01629663072526455, -0.013745401054620743, -0.016348697245121002, -0.017676377668976784, -0.025603413581848145, -0.0050634099170565605, 0.004874671343713999, -0.0007935984176583588, 0.011447993107140064, 0.013654286041855812, 0.03082302026450634, 0.009599653072655201, -0.018079888075590134, 0.004067649599164724, 0.005857415031641722, -0.0318383052945137, -0.00946297962218523, -0.005730504635721445, 0.0037747786846011877, -0.006303229834884405, 0.015021015889942646, 0.017481129616498947, -0.0019345741020515561, 0.01

In [None]:
# Gerando os embedding para todos os produtos da base
  
for id, row in products_list.iterrows():
  text_chunk_length = 2500
  text_chunks = [row.description[i:i + text_chunk_length] for i in range(0, len(row.description), text_chunk_length)]
    
  for chunk_id, chunk in enumerate(text_chunks):
      
    pricevalue = row.price if isinstance(row.price, str) else ""
    full_chunk = f"{chunk} price: {pricevalue}"
      
    embedding = openai.Embedding.create(input=full_chunk, model=model_id)['data'][0]['embedding']
      
    query = SimpleStatement(
                f"""
                INSERT INTO {keyspace}.products_table
                (product_id, chunk_id, product_name, description, price, openai_description_embedding)
                VALUES (%s, %s, %s, %s, %s, %s)
                """
            )
    display(row)

    session.execute(query, (row.product_id, chunk_id, row.product_name, row.description, pricevalue, embedding))

# Procurando produtos a partir de questões

In [39]:
# iremos gerar um embedding/vetor a partir de uma pergunta
customer_input = "What equipement would you recommend for a computer workstation setup costing less than $2000?"

embedding = openai.Embedding.create(input=customer_input, model=model_id)['data'][0]['embedding']
display(embedding)

[0.008580240420997143,
 -0.00815189816057682,
 -0.004618070088326931,
 -0.009436925873160362,
 0.010527861304581165,
 0.010795575566589832,
 -0.027413934469223022,
 -0.018525823950767517,
 -0.0016916191671043634,
 0.007516076322644949,
 0.0118664326146245,
 0.022581692785024643,
 0.0013561397790908813,
 0.004581259563565254,
 0.004019059706479311,
 0.03879179060459137,
 0.010936125181615353,
 -0.0011837987694889307,
 0.009711332619190216,
 -0.01890062354505062,
 -0.04821532964706421,
 -0.012020368129014969,
 0.019382508471608162,
 -0.0011277460725978017,
 -0.01226131059229374,
 0.0025817689020186663,
 0.020948637276887894,
 -0.013379017822444439,
 -0.013934524729847908,
 0.019596680998802185,
 0.05081215873360634,
 -0.006850137375295162,
 0.002213661791756749,
 -0.029261162504553795,
 -0.03603433445096016,
 0.03316979110240936,
 0.008406226523220539,
 0.019917936995625496,
 0.0316438190639019,
 -0.0015234611928462982,
 0.029689505696296692,
 0.006518840789794922,
 0.010875890031456947,

In [40]:
# Selecionando or cinco produtos cujos vetores são mais similares à questão
query = SimpleStatement(
    f"""
    SELECT *
    FROM {keyspace}.products_table
    ORDER BY openai_description_embedding ANN OF {embedding} LIMIT 5;
    """
    )
display(query)

<SimpleStatement query="
    SELECT *
    FROM demo.products_table
    ORDER BY openai_description_embedding ANN OF [0.008580240420997143, -0.00815189816057682, -0.004618070088326931, -0.009436925873160362, 0.010527861304581165, 0.010795575566589832, -0.027413934469223022, -0.018525823950767517, -0.0016916191671043634, 0.007516076322644949, 0.0118664326146245, 0.022581692785024643, 0.0013561397790908813, 0.004581259563565254, 0.004019059706479311, 0.03879179060459137, 0.010936125181615353, -0.0011837987694889307, 0.009711332619190216, -0.01890062354505062, -0.04821532964706421, -0.012020368129014969, 0.019382508471608162, -0.0011277460725978017, -0.01226131059229374, 0.0025817689020186663, 0.020948637276887894, -0.013379017822444439, -0.013934524729847908, 0.019596680998802185, 0.05081215873360634, -0.006850137375295162, 0.002213661791756749, -0.029261162504553795, -0.03603433445096016, 0.03316979110240936, 0.008406226523220539, 0.019917936995625496, 0.0316438190639019, -0.001523461192

In [41]:
# Registros encontrados
results = session.execute(query)
top_5_products = results._current_rows

for row in top_5_products:
  print(f"""{row.product_id}, {row.product_name}, {row.description}, {row.price}\n""")

37812, Sony VAIO RT Series Black All-In-One Desktop Computer - VGCRT150Y, Sony VAIO RT Series Black All-In-One Desktop Computer - VGCRT150Y/ 2.66GHz Intel Core 2 Quad Q9400 Processor/ 8GB Memory/ Built-In CompactFlash Media Slot/ 1TB Serial ATA Hard Drive/ 25.5' XBRITE-Full HD LCD/ Integrated Stereo A2DP Bluetooth Technology/ 6MB L2 Cache/ Microsoft Windows Vista Ultimate 64-Bit/ Black Finish, $3,999.00

38400, Sony VAIO JS Series Black All-In-One Desktop Computer - VGCJS130JB, Sony VAIO JS Series Black All-In-One Desktop Computer - VGCJS130JB/ 2.5GHz Intel Pentium Dual-Core Processor E5200/ 20.1' (1680 x 1050) Widescreen WSXGA+ XBRITE-HiColor Technology Display/ 500GB Serial ATA 7200rpm Hard Drive/ Built-In 1.3 Megapixel MOTION EYE Camera And Microphone With Face-Tracking Technology/ 4GB PC2-6400 (2GBx2) Installed Memory/ 800MHz Front Side Bus Speed/ 2MB L2 Cache/ Genuine Microsoft Windows Vista Home Premium 64-Bit/ Black Finish, $1,099.00

37877, Apple MacBook Pro 2.4GHz Intel Core 2

In [42]:
# Enviando o prompt ao GPT para geração da resposta

message_objects = []
message_objects.append({"role":"system",
                        "content":"You're a chatbot helping customers with questions and helping them with product recommendations"})

message_objects.append({"role":"user",
                        "content": customer_input})

message_objects.append({"role":"user",
                        "content": "Please give me a detailed explanation of your recommendations"})

message_objects.append({"role":"user",
                        "content": "Please be friendly and talk to me like a person, don't just give me a list of recommendations"})

message_objects.append({"role": "assistant",
                        "content": "I found these 3 products I would recommend"})

products_list = []

for row in top_5_products:
    brand_dict = {'role': "assistant", "content": f"{row.description}"}
    products_list.append(brand_dict)

message_objects.extend(products_list)
message_objects.append({"role": "assistant", "content":"Here's my summarized recommendation of products, and why it would suit you:"})

display(message_objects)

[{'role': 'system',
  'content': "You're a chatbot helping customers with questions and helping them with product recommendations"},
 {'role': 'user',
  'content': 'What equipement would you recommend for a computer workstation setup costing less than $2000?'},
 {'role': 'user',
  'content': 'Please give me a detailed explanation of your recommendations'},
 {'role': 'user',
  'content': "Please be friendly and talk to me like a person, don't just give me a list of recommendations"},
 {'role': 'assistant',
  'content': 'I found these 3 products I would recommend'},
 {'role': 'assistant',
  'content': "Sony VAIO RT Series Black All-In-One Desktop Computer - VGCRT150Y/ 2.66GHz Intel Core 2 Quad Q9400 Processor/ 8GB Memory/ Built-In CompactFlash Media Slot/ 1TB Serial ATA Hard Drive/ 25.5' XBRITE-Full HD LCD/ Integrated Stereo A2DP Bluetooth Technology/ 6MB L2 Cache/ Microsoft Windows Vista Ultimate 64-Bit/ Black Finish"},
 {'role': 'assistant',
  'content': "Sony VAIO JS Series Black All-

In [45]:
# Enviando o prompt
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=message_objects
)
print(completion.choices[0].message['content'])

For a computer workstation setup costing less than $2000, I would recommend:

1. Sony VAIO RT Series Black All-In-One Desktop Computer - VGCRT150Y: This all-in-one desktop computer features a 25.5-inch XBRITE-Full HD LCD display, a powerful 2.66GHz Intel Core 2 Quad Q9400 processor, 8GB of memory, and a 1TB serial ATA hard drive. It also has built-in Bluetooth technology and comes with the Microsoft Windows Vista Ultimate 64-Bit operating system. The black finish adds a sleek and modern look to your workstation.

2. Sony VAIO JS Series Black All-In-One Desktop Computer - VGCJS130JB: This all-in-one desktop computer offers a 20.1-inch widescreen display with XBRITE-HiColor technology. It is equipped with a 2.5GHz Intel Pentium Dual-Core processor, 4GB of memory, and a 500GB hard drive. It also features a built-in 1.3-megapixel MOTION EYE camera and microphone for video conferencing. The black finish adds a touch of elegance to your workstation.

3. Apple MacBook Pro 2.4GHz Intel Core 2 