In [1]:
import os

In [50]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
# NOTE(review): the name `Pinecone` imported here (from langchain.vectorstores)
# is rebound by `from pinecone import Pinecone` on the next line, so every later
# use of `Pinecone` in this notebook refers to the class from the `pinecone`
# package, not to this one.
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# NOTE(review): `PromptTemplate` is already imported at the top of this cell
# (from the `langchain` package root); this second import rebinds the same name.
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from dotenv import load_dotenv

In [3]:
# Load variables from a .env file into the environment so the os.getenv()
# calls in the next cell can see them (the recorded output `True` is the
# call's return value).
load_dotenv()

True

In [4]:
# Pinecone credentials come from the environment; each value is None when the
# corresponding variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")

In [5]:
# Extract data from PDF files
def load_pdf(data):
    """Load the PDF files found in the directory ``data``.

    Args:
        data: Path of the directory to scan; files are selected with the
            glob pattern ``*.pdf`` and read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs from the relative "data/" directory; the result feeds text_split() below.
extracted_data = load_pdf("data/")

In [7]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: The documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter's ``chunk_size`` argument.
            Defaults to 500, the value this notebook always used.
        chunk_overlap: Forwarded to the splitter's ``chunk_overlap`` argument.
            Defaults to 20, the value this notebook always used.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [30]:
# Split the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"length of text chunks: {len(text_chunks)}")
# Bare last expression: the notebook displays the text of the first chunk.
text_chunks[0].page_content

length of text chunks: 2175


'Fundamentals of Psychological Disorders'

In [9]:
def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance constructed with
        ``model_name="sentence-transformers/all-MiniLM-L6-v2"``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [10]:
# Build the embedding model once; create_embeddings() further down reads this
# notebook-level `embeddings` name.
embeddings = download_hugging_face_embeddings()

In [11]:
# Bare expression: display the model object's repr to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [31]:
# Sanity check: embed the text of the first chunk and print the length of the
# resulting vector.
query_result = embeddings.embed_query(text_chunks[0].page_content)
print("Length", len(query_result))

Length 384


In [33]:
# Prints the entire embedding computed in the previous cell (its length was
# reported there), so the output is one very long line.
print(query_result)

[0.06232380121946335, 0.054322633892297745, -0.000990081811323762, 0.06290406733751297, -0.011124338954687119, 0.12125708162784576, 0.05478745698928833, 0.03735773265361786, 0.03189612179994583, 0.028428202494978905, 0.03772749379277229, -0.017773399129509926, -0.013580797240138054, -0.00976752769201994, 0.0150513406842947, -0.03773066774010658, 0.021502234041690826, 0.03651547431945801, -0.017953291535377502, 0.017720971256494522, 0.0016600885428488255, -0.027009032666683197, -0.028610767796635628, -0.016111232340335846, -0.10294103622436523, 0.1086084321141243, -0.021212104707956314, -0.12708811461925507, -0.016753869131207466, 0.02798927202820778, 0.04147922620177269, 0.00505919149145484, 0.030136706307530403, 0.018960516899824142, 0.0012934660771861672, -0.04136962071061134, 0.05483477562665939, 0.038614802062511444, 0.02663295902311802, 0.0005681241163983941, -0.006757162511348724, -0.01690855622291565, 0.035277631133794785, 0.015885530039668083, -0.05792301520705223, -0.017839986

In [138]:
def create_embeddings(text_chunk, embedder=None):
    """Embed every chunk and build the parallel lists used for the upsert.

    Args:
        text_chunk: Iterable of objects exposing a ``page_content`` string.
        embedder: Optional object with an ``embed_query(text)`` method. When
            omitted, the notebook-level ``embeddings`` object is used, which is
            what this function always relied on before the parameter existed.

    Returns:
        A tuple ``(processed_embeddings, ids, metadata)`` of equal-length lists:
        one embedding per chunk, integer ids counting up from 1, and the raw
        chunk texts.
    """
    if embedder is None:
        # Looked up at call time, so calls without `embedder` behave as before.
        embedder = embeddings

    processed_embeddings = []
    ids = []
    metadata = []
    # enumerate(start=1) replaces the hand-maintained counter; ids still start at 1.
    for position, chunk in enumerate(text_chunk, start=1):
        processed_embeddings.append(embedder.embed_query(chunk.page_content))
        ids.append(position)
        metadata.append(chunk.page_content)
    return processed_embeddings, ids, metadata

In [139]:
# Embed all chunks; the three results are parallel lists (vectors, integer ids,
# chunk texts) that are zipped into Pinecone records further down.
processed_embeddings, ids, metadata = create_embeddings(text_chunks)


In [140]:
# Bare expression: displays the full list of chunk texts (one string per chunk),
# which produces a very long output.
metadata

['Fundamentals of Psychological Disorders',
 'Fundamentals of Psychological Disorders',
 'PDF Version of the Textbook – Fundamentals of Psychological Disorders – 3rd\nedition 5TR – version 3.5\nOrder a print copy:\nhttps://www.lulu.com/shop/lee-w-daffin-jr-and-alexis-bridley/fundamentals-of-psychological-disorders/pa\nperback/product-y2kqn4.html?q=daffin&page=1&pageSize=4',
 'This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0\nInternational License .\nAttribution-NonCommercial-ShareAlike 4.0 International\nOfficial translations of this license are available in other languages .\nCreative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal\nservices or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-',
 'client or other relationship. Creative Commons makes its licenses and related information available on\nan “as-is” basis. Creative Commons gives no warranties regarding its lic

In [121]:
# Print the embedding stored for the first chunk.
print(processed_embeddings[0])

[0.06232380121946335, 0.054322633892297745, -0.000990081811323762, 0.06290406733751297, -0.011124338954687119, 0.12125708162784576, 0.05478745698928833, 0.03735773265361786, 0.03189612179994583, 0.028428202494978905, 0.03772749379277229, -0.017773399129509926, -0.013580797240138054, -0.00976752769201994, 0.0150513406842947, -0.03773066774010658, 0.021502234041690826, 0.03651547431945801, -0.017953291535377502, 0.017720971256494522, 0.0016600885428488255, -0.027009032666683197, -0.028610767796635628, -0.016111232340335846, -0.10294103622436523, 0.1086084321141243, -0.021212104707956314, -0.12708811461925507, -0.016753869131207466, 0.02798927202820778, 0.04147922620177269, 0.00505919149145484, 0.030136706307530403, 0.018960516899824142, 0.0012934660771861672, -0.04136962071061134, 0.05483477562665939, 0.038614802062511444, 0.02663295902311802, 0.0005681241163983941, -0.006757162511348724, -0.01690855622291565, 0.035277631133794785, 0.015885530039668083, -0.05792301520705223, -0.017839986

In [122]:
# Check the container type of the embeddings (a plain list per the recorded output).
type(processed_embeddings)

list

In [123]:
#import numpy as np
#processed_embeddings = np.array(processed_embeddings)

In [124]:
# Re-check the container type; with the NumPy conversion in the previous cell
# commented out, it is still a list per the recorded output.
type(processed_embeddings)

list

In [116]:
# Print the embedding stored for the second chunk.
# NOTE(review): the saved output of this cell is formatted like a NumPy array
# and carries an execution count lower than the neighbouring cells, so it
# appears to come from a run made while the NumPy conversion was still active.
# Re-run the notebook top to bottom to refresh it.
print(processed_embeddings[1])

[ 6.23238012e-02  5.43226339e-02 -9.90081811e-04  6.29040673e-02
 -1.11243390e-02  1.21257082e-01  5.47874570e-02  3.73577327e-02
  3.18961218e-02  2.84282025e-02  3.77274938e-02 -1.77733991e-02
 -1.35807972e-02 -9.76752769e-03  1.50513407e-02 -3.77306677e-02
  2.15022340e-02  3.65154743e-02 -1.79532915e-02  1.77209713e-02
  1.66008854e-03 -2.70090327e-02 -2.86107678e-02 -1.61112323e-02
 -1.02941036e-01  1.08608432e-01 -2.12121047e-02 -1.27088115e-01
 -1.67538691e-02  2.79892720e-02  4.14792262e-02  5.05919149e-03
  3.01367063e-02  1.89605169e-02  1.29346608e-03 -4.13696207e-02
  5.48347756e-02  3.86148021e-02  2.66329590e-02  5.68124116e-04
 -6.75716251e-03 -1.69085562e-02  3.52776311e-02  1.58855300e-02
 -5.79230152e-02 -1.78399868e-02 -5.44603765e-02  3.61505896e-02
 -7.14405775e-02 -1.10347703e-01 -1.94131315e-03 -1.59015239e-03
  5.44757694e-02 -1.52262254e-02  2.28400528e-02  5.63276699e-03
  4.09503691e-02  7.50209391e-02 -6.64687455e-02  4.04216945e-02
  1.03125371e-01 -5.14706

In [147]:
# Build one record per chunk: the id as a string, the embedding under "values",
# and the chunk text stored under the "info" metadata key.
data = [
    {"id": str(record_id), "values": embedding, "metadata": {"info": chunk_text}}
    for record_id, embedding, chunk_text in zip(ids, processed_embeddings, metadata)
]


In [148]:
import numpy as np
import itertools
# Create the Pinecone client with the API key read from the environment earlier
# in the notebook.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the target index, passed to pc.Index() below.
# NOTE(review): the spelling presumably matches the index as it was created;
# confirm before "correcting" it, because this string is used for the lookup.
index_name = "psycology-cahtbot"

def chunks(iterable, batch_size=1):
    """Yield successive tuples of at most ``batch_size`` items from ``iterable``.

    The final tuple may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling slices until one comes back as the
    # empty tuple, i.e. until the source is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())

# Handle to the index named above.
index = pc.Index(index_name)

# Send the records in batches instead of one vector per request: with
# batch_size=1 every record costs its own network round-trip. 100 records per
# call keeps each request small while cutting the number of calls a hundredfold.
UPSERT_BATCH_SIZE = 100

for ids_vectors_chunk in chunks(data, batch_size=UPSERT_BATCH_SIZE):
    index.upsert(vectors=ids_vectors_chunk)

print("Data upserted successfully!")

Data upserted successfully!
