In [22]:
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load variables from a local .env file (if any) before any key is read.
load_dotenv()

from openai import OpenAI

# Raw OpenAI client, authenticated with the dedicated embeddings key taken
# from the environment (None is passed through when the variable is unset).
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY_EMBEDDINGS'),
)


In [23]:
# Read sample.txt as UTF-8 and keep the loader's result in `documents`.
loader = TextLoader(
    file_path='sample.txt',
    encoding='utf-8',
)
documents = loader.load()

In [24]:
# Split the loaded documents into chunks of at most 1000 characters,
# with 100 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents=documents)

In [25]:
# Show the text of the first chunk to sanity-check the split.
texts[0].page_content

"The History and Impact of the Internet\n\nThe advent of the internet marks one of the most significant technological developments in human history. Its impact on society, economy, \nand culture is profound, reshaping the way we live, work, and interact. This text explores the internet's origins, its evolution, and its multifaceted effects on the modern world.\n\nOrigins and Development\n\nThe internet's roots can be traced back to the late 1960s, with the development of ARPANET (Advanced Research Projects Agency Network) by the United States Department of Defense. \nThis early network laid the groundwork for today's internet. It was initially a means to share information and resources among computers at different locations."

In [26]:
# Number of chunks the splitter produced.
len(texts)

8

In [27]:
# Smoke test: request one embedding for a placeholder sentence directly
# through the raw OpenAI client.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="Your text string goes here",
)

# The response carries one entry per input; print the vector of the only one.
print(response.data[0].embedding)

[0.005172153003513813, 0.017217181622982025, -0.018686940893530846, -0.01854696311056614, -0.047256264835596085, -0.03026304580271244, 0.027659472078084946, 0.003663900075480342, 0.011233161203563213, 0.006396952550858259, -0.0016980969812721014, 0.01585940271615982, -0.0012702919775620103, -0.007873711176216602, 0.05991019308567047, 0.05030776187777519, -0.02751949429512024, 0.00991037767380476, -0.040397386997938156, 0.04999981448054314, -0.00041380725451745093, 0.0302350502461195, -0.013717753812670708, 0.03295060619711876, 0.01728717051446438, 0.016783252358436584, -0.0017374654999002814, 0.02042265608906746, 0.040789321064949036, -0.03773782029747963, -0.026119723916053772, -0.05002781003713608, 0.0241740420460701, -0.0551229752600193, -0.03227871656417847, 0.04235706478357315, 0.06472540646791458, 0.01469759363681078, -0.01566343568265438, -0.04132123664021492, 0.022200364619493484, 0.00736279459670186, 0.044960640370845795, 0.007107336539775133, -0.024118050932884216, 0.05240742

In [35]:
# LangChain embeddings wrapper using the same model as the raw-client test.
# NOTE(review): no key is passed here, while the raw client reads
# OPENAI_API_KEY_EMBEDDINGS — presumably this wrapper picks up a different
# environment variable; confirm both are set.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

# Embed the text of every chunk in a single call; one vector per chunk.
doc_vectors = embeddings.embed_documents(
    [chunk.page_content for chunk in texts]
)



In [36]:
# Display the vectors returned by embed_documents.
# NOTE(review): this echoes every float of every vector into the notebook;
# consider showing only len(doc_vectors) and doc_vectors[0][:5] on the next run.
doc_vectors

[[-0.013956095378081809,
  -0.050380145301006915,
  0.015809230830490423,
  0.0510502058301461,
  0.05306038369227328,
  -0.03834000308427922,
  -0.02617422358701534,
  0.029189494105496486,
  -0.033565823676807716,
  0.05574062580883002,
  0.038758789052346025,
  0.029566402221814683,
  -0.01197732450778219,
  -0.07655436702537706,
  -0.0013689118485248886,
  -0.026718646835618334,
  -0.040559575793084994,
  0.02437343684627637,
  0.009412250674485785,
  -0.01769377513737179,
  0.03431963990944411,
  0.028561311428105904,
  -0.030613370867272068,
  0.034466218909822384,
  0.021588499169025523,
  -0.021588499169025523,
  -0.0224470146876464,
  0.038172487951994426,
  -0.0334820679733105,
  -0.017714714994568684,
  -0.017295927163856695,
  -0.011401491473383851,
  -0.008789303892717022,
  0.058923409596950777,
  0.011663234100409493,
  0.018081153182288433,
  -0.012102960856995788,
  0.0015089440003153796,
  0.025566980766821654,
  0.03214194691503213,
  -0.0034366755985121016,
  -0.050

In [28]:
# Disabled alternative: build doc_vectors by sending each chunk's text to the
# raw OpenAI client one request at a time and collecting the returned vectors.
# NOTE(review): a cell containing only commented-out code — delete it or
# restore it once the preferred approach is settled.
# doc_vectors = []
# for text in texts:
#     response = client.embeddings.create(
#         input=text.page_content,
#         model="text-embedding-3-small"
#     )
#     doc_vectors.append(response.data[0].embedding)

In [29]:
# Display doc_vectors.
# NOTE(review): the output recorded for this cell differs from the In [36]
# listing only in the low-order digits, so presumably it was produced by a
# different embedding call (the now commented-out loop) — confirm by running
# the notebook top to bottom on a fresh kernel.
doc_vectors

[[-0.013956095091998577,
  -0.05038014426827431,
  0.015809230506420135,
  0.051050204783678055,
  0.053060382604599,
  -0.0383400022983551,
  -0.02617422305047512,
  0.029189493507146835,
  -0.03356582298874855,
  0.05574062466621399,
  0.038758788257837296,
  0.02956640161573887,
  -0.01197732426226139,
  -0.07655436545610428,
  -0.0013689118204638362,
  -0.02671864628791809,
  -0.04055957496166229,
  0.024373436346650124,
  0.009412250481545925,
  -0.0176937747746706,
  0.03431963920593262,
  0.028561310842633247,
  -0.03061337023973465,
  0.0344662182033062,
  0.02158849872648716,
  -0.02158849872648716,
  -0.0224470142275095,
  0.038172487169504166,
  -0.03348206728696823,
  -0.017714714631438255,
  -0.017295926809310913,
  -0.011401491239666939,
  -0.008789303712546825,
  0.05892340838909149,
  0.011663233861327171,
  0.01808115281164646,
  -0.012102960608899593,
  0.0015089439693838358,
  0.025566980242729187,
  0.032141946256160736,
  -0.00343667552806437,
  -0.0502545088529586

In [39]:
from langchain_community.vectorstores.pgvector import PGVector

# Build the database connection string from individual parameters.
#
# Host, user and password identify a real server and must not be stored in
# the notebook, so they are read from the environment with no fallback; a
# missing variable raises KeyError immediately instead of silently using a
# baked-in credential. Put them in the .env file read by load_dotenv() or
# export them in the shell:
#   PGVECTOR_HOST, PGVECTOR_USER, PGVECTOR_PASSWORD
#
# Driver, port and database name are not secrets and keep their defaults.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.environ.get("PGVECTOR_DRIVER", "psycopg2"),
    host=os.environ["PGVECTOR_HOST"],
    port=int(os.environ.get("PGVECTOR_PORT", "5432")),
    database=os.environ.get("PGVECTOR_DATABASE", "embeddings"),
    user=os.environ["PGVECTOR_USER"],
    password=os.environ["PGVECTOR_PASSWORD"],
)

In [40]:
COLLECTION_NAME = "test_collection"

# Create the vector store from the chunked documents, using `embeddings` to
# vectorise them and the connection string built in the previous cell.
# NOTE(review): re-running this cell presumably stores the same chunks again
# in the collection — confirm whether the collection should be reset first.
db = PGVector.from_documents(
    documents=texts,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)



In [44]:
query = "The World Wide Web"

# Ask the vector store for the single best match (k=1) along with its score.
# NOTE(review): the recorded run of this cell ended in
# "TypeError: 'float' object is not subscriptable"; the cause is not visible
# in this notebook — investigate before relying on `resp`.
resp = db.similarity_search_with_score(query=query, k=1)
resp

  .all()


TypeError: 'float' object is not subscriptable