In [1]:
from langchain.document_loaders import UnstructuredFileLoader
import os
import nltk

# Fetch the NLTK resources this notebook downloads up front: the 'punkt_tab'
# data and the 'averaged_perceptron_tagger_eng' tagger.
for nltk_resource in ("punkt_tab", "averaged_perceptron_tagger_eng"):
    nltk.download(nltk_resource)

def get_document_content(documents):
    """Load one or more files and return their text as a single string.

    Args:
        documents: A single path string, or an iterable of path strings.

    Returns:
        The ``page_content`` of every loaded document, joined with a newline.
        Paths that do not exist are reported on stdout and skipped; loader
        errors are printed and processing continues with the next path.
    """
    # A bare string is treated as a one-element list of paths.
    paths = [documents] if isinstance(documents, str) else documents

    collected = []
    for path in paths:
        if os.path.exists(path):
            try:
                # UnstructuredFileLoader is used for flexible document loading.
                pages = UnstructuredFileLoader(path).load()
                for page in pages:
                    collected.append(page.page_content)
            except Exception as e:
                print(f"An error occurred while processing {path}: {e}")
        else:
            print(f"File not found: {path}")

    # One join at the end instead of repeated string concatenation.
    return "\n".join(collected)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ramgo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ramgo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [1]:
from PyPDF2 import PdfReader

def get_pdf_content(documents):
    """Extract the text of one or more PDFs into a single string.

    Args:
        documents: A single PDF path (string), or an iterable of PDF paths.

    Returns:
        The text of every page that yielded any, concatenated in reading
        order with no separator. A missing file or any other error is printed
        and processing continues with the next path.
    """
    pdf_paths = [documents] if isinstance(documents, str) else documents

    page_texts = []
    for pdf_path in pdf_paths:
        try:
            for page in PdfReader(pdf_path).pages:
                extracted = page.extract_text()
                # Pages for which extract_text() returned nothing are skipped.
                if not extracted:
                    continue
                page_texts.append(extracted)
        except FileNotFoundError:
            print(f"File not found: {pdf_path}")
        except Exception as e:
            print(f"An error occurred while processing {pdf_path}: {e}")

    return "".join(page_texts)

In [2]:
from langchain.text_splitter import CharacterTextSplitter

def get_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """Split text into chunks with LangChain's CharacterTextSplitter.

    The defaults reproduce the values that used to be hard-coded here, so
    existing ``get_chunks(text)`` calls behave as before.

    Args:
        text: The string to split.
        chunk_size: ``chunk_size`` passed to the splitter (length is measured
            with ``len``, i.e. in characters).
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        separator: ``separator`` the splitter breaks the text on.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty or None.
    """
    # Nothing to split, e.g. a PDF from which no text could be extracted.
    if not text:
        return []

    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)

In [3]:
import os

# PostgreSQL connection settings, passed as keyword arguments to
# psycopg2.connect(). Every value can be overridden with an environment
# variable so the host and credentials do not have to live in the notebook.
# The fallbacks are the values that were previously hard-coded here; prefer
# setting VECTOR_DB_PASSWORD / VECTOR_DB_HOST rather than relying on them.
DB_PARAMS = {
    "dbname": os.environ.get("VECTOR_DB_NAME", "vector_db"),
    "user": os.environ.get("VECTOR_DB_USER", "postgres"),
    "password": os.environ.get("VECTOR_DB_PASSWORD", "postgres"),
    "host": os.environ.get("VECTOR_DB_HOST", "16.171.250.194"),
    "port": os.environ.get("VECTOR_DB_PORT", "5433"),
}

In [4]:
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import PGVector

# SentenceTransformer instances keyed by model name, so a model is constructed
# once per kernel session instead of on every get_embeddings() call.
_EMBEDDING_MODELS = {}


def get_embeddings(chunk, model_name="all-mpnet-base-v2"):
    """Encode text with a SentenceTransformer model.

    Args:
        chunk: The input handed unchanged to ``SentenceTransformer.encode``.
        model_name: Name of the model to load. The default is the model this
            function has always used.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``chunk``.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # First use of this model name: construct it and keep it for reuse.
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model.encode(chunk)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import psycopg2

def get_db_connection():
    """Open a PostgreSQL connection described by ``DB_PARAMS``.

    Returns:
        A ``(connection, cursor)`` tuple. Nothing in this function closes
        either object, so the caller has to.
    """
    connection = psycopg2.connect(**DB_PARAMS)
    db_cursor = connection.cursor()
    print("Connected to the database successfully!")
    return connection, db_cursor

def insert_items(text, filename, embedding):
    """Insert one row into the ``embeddings`` table.

    A dedicated connection is opened for the insert and is always closed
    again, whether or not the insert succeeded. Failures are printed rather
    than raised, so a caller looping over many rows keeps going.

    Args:
        text: Value for the ``text`` column.
        filename: Value for the ``filename`` column; converted with ``str()``.
        embedding: Value for the ``embedding`` column, passed to the driver
            as-is.

    Returns:
        None.
    """
    conn = None
    cursor = None
    try:
        conn, cursor = get_db_connection()
        sql = "INSERT INTO embeddings (text, filename, embedding) VALUES (%s, %s, %s);"
        cursor.execute(sql, (text, str(filename), embedding))
        conn.commit()
        print("Data inserted successfully!")
    except Exception as e:
        print("Error inserting data:", e)
    finally:
        # Release the per-call connection; previously it was never closed, so
        # every inserted row left an open connection behind.
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if conn is not None:
                conn.close()


def get_data():
    """Fetch every row of the ``embeddings`` table.

    Returns:
        The list produced by ``cursor.fetchall()`` for ``SELECT *``.

    Raises:
        Whatever the connection or the query raises; the cursor and
        connection are closed before the error propagates.
    """
    conn, cursor = get_db_connection()
    try:
        cursor.execute("SELECT * FROM  embeddings;")
        return cursor.fetchall()
    finally:
        # Previously the connection was left open after every call.
        cursor.close()
        conn.close()


In [6]:
def process_and_store_pdfs(pdf_files):
    """Extract, chunk, embed and store the text of each PDF.

    For every file: read its text with ``get_pdf_content``, split it with
    ``get_chunks``, print the number of chunks, encode each chunk with the
    "all-mpnet-base-v2" SentenceTransformer model and store it through
    ``insert_items``.

    Args:
        pdf_files: An iterable of PDF paths, or a single path string.

    Returns:
        None. Errors raised while embedding or storing a file's chunks are
        printed and the next file is processed.
    """
    # Accept a single path as well as a list, as get_pdf_content does;
    # iterating a bare string would otherwise process it character by character.
    if isinstance(pdf_files, str):
        pdf_files = [pdf_files]

    # Loaded on first use and shared by all files; it used to be re-created
    # for every file.
    model = None

    for file in pdf_files:
        raw_text = get_pdf_content(file)

        text_chunks = get_chunks(raw_text)
        print(len(text_chunks))

        if model is None:
            model = SentenceTransformer("all-mpnet-base-v2")

        # insert_items() opens its own connection for each row, so this one is
        # not used for writing. It is kept so that an unreachable database
        # still raises here, before any chunk is embedded.
        conn, cursor = get_db_connection()

        try:
            for chunk in text_chunks:
                embedding = model.encode(chunk).tolist()

                # Insert data into database
                insert_items(chunk, file, embedding)

            print("All data processed and stored successfully!")

        except Exception as e:
            print("Error during processing:", e)

        finally:
            # Close this file's connection. The close calls used to sit after
            # the loop, so only the last file's connection was closed and an
            # empty input raised NameError.
            cursor.close()
            conn.close()

In [7]:
import json

def retrieve_similar_vectors(query_text, top_k=5):
    """Return the stored texts closest to ``query_text`` as one string.

    The query is embedded with ``get_embeddings`` and compared against the
    ``embedding`` column using pgvector's ``<=>`` distance operator.

    Args:
        query_text: The text to search for.
        top_k: Maximum number of rows to return.

    Returns:
        The ``text`` column of the matching rows joined with single spaces,
        or an empty string when the query fails (the error is printed).
    """
    query_vector = get_embeddings(query_text).tolist()

    conn, cursor = get_db_connection()

    # pgvector accepts a vector written as a JSON-style list literal.
    vector_str = json.dumps(query_vector)

    # Values are bound by the driver instead of being formatted into the SQL
    # text, so neither the vector nor top_k can alter the statement.
    query = """
    SELECT text, 1 - (embedding <=> %s::vector) AS similarity
    FROM embeddings
    ORDER BY embedding <=> %s::vector
    LIMIT %s;
    """

    try:
        cursor.execute(query, (vector_str, vector_str, top_k))
        results = cursor.fetchall()
    except Exception as e:
        print("Error retrieving vectors: ", e)
        results = []
    finally:
        cursor.close()
        conn.close()

    # Only the text column (index 0) is used; the similarity score is dropped.
    context_combined = " ".join(row[0] for row in results)
    return context_combined


In [None]:
from langchain_community.llms import Ollama
from operator import itemgetter
from langchain.prompts import PromptTemplate


def generate_response_ollama(query, model_name="mistral:latest"):
    """Answer ``query`` with an Ollama model, using retrieved text as context.

    Args:
        query: The user's question. It is used both to retrieve context via
            ``retrieve_similar_vectors`` and inside the prompt.
        model_name: Ollama model tag to run. The default is the model this
            function has always used.

    Returns:
        The model's response as returned by ``Ollama.invoke``.
    """
    context = retrieve_similar_vectors(query)
    model = Ollama(model=model_name, temperature=0.7, num_predict=1000)

    prompt = f"""You are an AI assistant. Use the following retrieved documents as context to answer the question.
    
    Context:
    {context}
    
    Question: {query}
    
    Provide a concise and accurate answer based on the given information.
    """

    # invoke() replaces calling the LLM object directly, which LangChain has
    # deprecated.
    response = model.invoke(prompt)

    return response


In [8]:
# PDFs to ingest; the relative paths are resolved against the kernel's
# current working directory.
file = [
    '../data/india_growth.pdf',
    '../data/mal_cinema.pdf',
    '../data/trump_tariff.pdf',
]
process_and_store_pdfs(file)

122
Connected to the database successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Connected to the database successfully!
Data inserted successfully!
Conn

KeyboardInterrupt: 

In [22]:
# Question passed to generate_response_ollama() in the next cell.
# NOTE(review): "labout" looks like a typo for "labour" -- confirm before
# changing it, since the recorded answer below was produced from this exact
# string.
query = "How reforming labout laws adds to growth?"

In [25]:
# Run the question through generate_response_ollama() and show the returned
# answer as this cell's output.
res = generate_response_ollama(query)
res

Connected to the database successfully!


' Reforming labor laws can contribute to growth by boosting manufacturing competitiveness. Overly rigid labor regulations can deter job creation in the organized sector, which can be addressed by rationalizing these regulations to balance flexibility for enterprises with fair working conditions and social security for laborers. This balance can encourage job creation and enhance overall economic growth.'