In [2]:
!pip install oracledb sentence-transformers oci

Collecting oracledb
  Using cached oracledb-2.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting oci
  Using cached oci-2.137.1-py3-none-any.whl.metadata (5.3 kB)
Collecting cryptography>=3.2.1 (from oracledb)
  Using cached cryptography-43.0.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.5.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting s

In [1]:
import os

def loadFAQs(directory_path):
    faqs = {}
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)
            with open(file_path) as f:
                raw_faq = f.read()
            faqs[filename] = [text.strip() for text in raw_faq.split('=====')]
    return faqs

faqs = loadFAQs('.')

In [2]:
docs = [{'text': filename + ' | ' + section, 'path': filename} for filename, sections in faqs.items() for section in sections]

In [3]:
import oracledb

connection = oracledb.connect(user="scott", password="Welcome_12345", dsn="myatp_medium", config_dir="/home/shamim/projects/tls_wallet", wallet_location="/home/shamim/projects/tls_wallet", wallet_password="Welcome_12345")

table_name = 'genaifaqs'
with connection.cursor() as cursor:
    cursor.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id NUMBER PRIMARY KEY,
            payload CLOB CHECK (payload IS JSON),
            vector VECTOR
        )""")

In [4]:
from sentence_transformers import SentenceTransformer
encoder = SentenceTransformer('all-MiniLM-L12-v2')

data = [{"id": idx, "vector_source": row['text'], "payload": row} for idx, row in enumerate(docs)]
texts = [row['vector_source'] for row in data]
embeddings = encoder.encode(texts, batch_size=10)

import array
for row, embedding in zip(data, embeddings):
    row['vector'] = array.array("f", embedding)

  from tqdm.autonotebook import tqdm, trange


In [5]:
import json

with connection.cursor() as cursor:
    cursor.execute(f"TRUNCATE TABLE {table_name}")
    prepared_data = [(row['id'], json.dumps(row['payload']), row['vector']) for row in data]
    cursor.executemany(f"INSERT INTO {table_name} (id, payload, vector) VALUES (:1, :2, :3)", prepared_data)
    connection.commit()

In [6]:
cr = connection.cursor()
r = cr.execute("SELECT * FROM genaifaqs f where rownum =1")
print(r.fetchall())

[(24, {'text': 'Generative_AI_FAQ.txt | Q25: What is deepfake technology?\nA: Deepfake uses AI to create realistic, altered videos or images of people.', 'path': 'Generative_AI_FAQ.txt'}, array('f', [-0.0007201445405371487, -0.0258498378098011, 0.007152569945901632, -0.003656314220279455, -0.0020476249046623707, 0.02976640872657299, -0.0202650036662817, -0.09278019517660141, 0.03025302290916443, 0.04996906593441963, -0.03872310370206833, -0.01933300867676735, -0.007471167482435703, -0.01518948096781969, -0.042043089866638184, -0.0028244946151971817, 0.022211210802197456, 0.12178391218185425, -0.03381387144327164, -0.0340578518807888, 0.09609763324260712, 0.015460986644029617, 0.019726844504475594, -0.0542815737426281, 0.021050969138741493, -0.11579629778862, 0.0516744926571846, 0.03576965257525444, 0.07302584499120712, -0.031815771013498306, 0.04366393759846687, 0.06992501020431519, 0.04570533707737923, 0.025224527344107628, -0.04681394621729851, -0.03197052329778671, -0.05179609730839

In [13]:
topK = 4
sql = f"""SELECT payload, vector_distance(vector, :vector, COSINE) AS score
          FROM {table_name}
          ORDER BY score
          FETCH FIRST {topK} ROWS ONLY"""

In [14]:
question = "What are GANs?"
embedding = list(encoder.encode(question))
vector = array.array("f", embedding)

results = []
with connection.cursor() as cursor:
    for (info, score,) in cursor.execute(sql, vector=vector):
        text_content = info.read()
        results.append((score, json.loads(text_content)))

In [15]:
print(results)

[(0.27902800283631, {'text': 'Generative_AI_FAQ.txt | Q6: What are GANs?\nA: GANs are models with two networks—generator and discriminator—that work together to produce realistic outputs.', 'path': 'Generative_AI_FAQ.txt'}), (0.5485436443629503, {'text': 'Generative_AI_FAQ.txt | Q26: Are deepfakes harmful?\nA: They can be used maliciously, but also have valid applications in entertainment.', 'path': 'Generative_AI_FAQ.txt'}), (0.5558175537684329, {'text': 'Generative_AI_FAQ.txt | ', 'path': 'Generative_AI_FAQ.txt'}), (0.5777062333924782, {'text': 'Generative_AI_FAQ.txt | Q22: What is AI-generated art?\nA: Artwork created by AI models trained on visual data to generate creative visuals.', 'path': 'Generative_AI_FAQ.txt'})]
