In [1]:
import json
import concurrent.futures
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.docstore.document import Document
import ollama as client

In [2]:
# Embedding configuration
# Model identifier handed to HuggingFaceBgeEmbeddings below.
model_name = "BAAI/bge-base-en"
# Forwarded as encode-time options; requests normalized embedding vectors.
encode_kwargs = {'normalize_embeddings': True}
# Embedding function that is later passed to the Chroma vector store.
# The model is pinned to the CPU via model_kwargs.
embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Loader for JSON Lines files (one JSON value per line)
class JSONLinesLoader:
    """Read a JSON Lines file into a list of decoded Python objects.

    Blank (whitespace-only) lines are ignored. Lines that fail to decode are
    reported on stdout together with their 1-based line number and skipped,
    so a single bad record does not abort the whole load.
    """

    def __init__(self, file_path):
        """Store the path of the file to read; the file is not opened here."""
        self.file_path = file_path

    def load(self):
        """Return the decoded value of every valid, non-blank line in file order.

        Returns:
            list: one decoded JSON value per successfully parsed line.

        Raises:
            OSError: if the file cannot be opened for reading.
        """
        documents = []
        with open(self.file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                # A whitespace-only line is not a record; without this guard
                # json.loads would raise and each blank line would be logged
                # as a decode error.
                if not line.strip():
                    continue
                try:
                    documents.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Best-effort load: report where the bad record is and move on.
                    print(f"Error decoding JSON on line {line_number}: {e}")
        return documents

In [4]:
# Wrap raw records as LangChain documents and run them through the text splitter
def process_documents(docs):
    """Serialize each record to JSON text, wrap it in a Document, and split.

    Args:
        docs: iterable of JSON-serializable records.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        the wrapped records.
    """
    # NOTE(review): json.dumps without `indent` emits no literal newline
    # characters, so the "\n" separator may never match and each record may
    # come back as a single chunk -- confirm this is intended.
    splitter = RecursiveCharacterTextSplitter(separators=["\n"])
    wrapped = []
    for record in docs:
        serialized = json.dumps(record)
        wrapped.append(Document(page_content=serialized))
    return splitter.split_documents(wrapped)

In [5]:
# Input path, relative to the notebook's working directory.
# NOTE(review): the file carries a .json extension but is parsed line by line
# (one JSON value per line) by JSONLinesLoader -- confirm the file format.
json_file_path = r'./input_people_data_02.json'
loader = JSONLinesLoader(json_file_path)
# `documents` holds one decoded object per line that parsed successfully.
documents = loader.load()

In [12]:
# Split the loaded records into fixed-size batches for parallel processing
batch_size = 100  # Adjust based on memory capacity
# Step through `documents` in strides of `batch_size`. Computing the count as
# `len(documents) // batch_size + 1` would append an empty trailing batch
# whenever the length is an exact multiple of `batch_size` (and a single empty
# batch for empty input); slicing by stride never produces an empty batch.
batches = [
    documents[start:start + batch_size]
    for start in range(0, len(documents), batch_size)
]
num_batches = len(batches)


In [7]:
# Sanity check: number of records that were loaded.
len(documents)

999

In [11]:

# Show only the number of batches; printing `batches` itself would dump every
# loaded record into the cell output.
print(len(batches))

10


In [13]:
# Run process_documents over every batch on a thread pool. executor.map yields
# results in the same order as `batches`, so batch order is preserved.
# NOTE(review): process_documents looks like pure-Python work, so threads may
# give little speed-up here -- confirm with a timing run.
with concurrent.futures.ThreadPoolExecutor() as executor:
    split_texts_batches = list(executor.map(process_documents, batches))

In [14]:
# Concatenate the per-batch chunk lists into one flat list, keeping batch order
split_texts = [
    chunk
    for chunk_batch in split_texts_batches
    for chunk in chunk_batch
]

In [15]:
# Create the vector database
# Embeds every chunk in `split_texts` with `embedding` and stores the result in
# a Chroma collection named "local-rag".
# NOTE(review): no persist directory is passed, so presumably the collection
# lives only in this kernel's memory; re-running this cell may also add the
# same chunks a second time -- confirm both against the Chroma docs.
vector_db = Chroma.from_documents(
    documents=split_texts, 
    embedding=embedding,
    collection_name="local-rag"
)

In [16]:
# Function to ask a question using Llama3 model with dynamic context handling
def _format_context(context):
    """Render a context value as plain prompt text.

    Lists/tuples are rendered item by item and joined with blank lines; an
    object exposing ``page_content`` (e.g. a retrieved document) contributes
    that text; anything else is converted with ``str``. Plain strings pass
    through unchanged.
    """
    if isinstance(context, (list, tuple)):
        return "\n\n".join(_format_context(item) for item in context)
    return str(getattr(context, "page_content", context))


def ask_question(query, context=None, model="llama3"):
    """Send `query` to the chat model, optionally grounded in `context`.

    Args:
        query: the user's question; sent as the user message.
        context: optional grounding material. May be a string, or a list/tuple
            of strings or objects with a ``page_content`` attribute. When it is
            truthy it is rendered to text and placed before the query;
            otherwise the model is asked to answer from general knowledge.
        model: model name passed to ``client.chat``.

    Returns:
        str: the reply text, or a fixed fallback message when the response has
        no message content or the call raises. Errors are printed, not raised.
    """
    if context:
        system_prompt = "You are an assistant that provides detailed information based on the provided context."
        # Use the documents' text rather than the Python repr of the list, so
        # the prompt is not padded with repr escaping and wrapper noise.
        user_prompt = f"{_format_context(context)}\n{query}"
    else:
        system_prompt = "You are an assistant that provides detailed information based on global knowledge."
        user_prompt = query

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    try:
        response = client.chat(
            model=model,
            messages=messages
        )

        if 'message' in response and 'content' in response['message']:
            return response['message']['content']
        return "There was an issue with processing your request."

    except Exception as e:
        # Deliberate best-effort: callers receive a message string instead of
        # an exception, so the notebook keeps running.
        print(f"Exception occurred: {e}")
        return "There was an error processing your request."

In [18]:
# User interaction
# Read the user's question from stdin.
query = input("Enter prompt: ")
# Ask the model, without any retrieval context, to expand the question into a
# longer passage; that passage is used as the similarity-search text later on.
prompt = f'AI agent, please expand one or two paragraph to my prompt starts here: {query}'
# ask_question returns a fallback message string on failure rather than
# raising, so `expanded_context` may hold that message instead of an expansion.
expanded_context = ask_question(prompt)
print("Expanded context:", expanded_context)

Expanded context: Here's an expansion on your prompt:

Ivy League colleges are a group of prestigious private universities in the United States known for their academic excellence and selectivity. Many notable individuals have studied at these institutions, going on to achieve great success in various fields. For instance, Barack Obama, the 44th President of the United States, attended Columbia University, an Ivy League school. Another example is Michelle Obama, who studied sociology and African American studies at Princeton University, also an Ivy League institution.

Some notable persons who have studied from Ivy colleges include: Elon Musk (Wharton School, University of Pennsylvania); Steve Jobs (Harvard College); Mark Zuckerberg (Columbia University); Bill Gates (Harvard College); Oprah Winfrey (Indiana University Bloomington, not an Ivy League school but a notable alumna); Jeff Bezos (Princeton University); and many more. These individuals have gone on to found successful companie

In [19]:
# Query the vector store with the model-expanded text (not the raw user query)
# and keep the returned documents for the final answer.
r1 = vector_db.similarity_search(expanded_context)
print("Similarity search results:", r1)

Similarity search results: [Document(page_content='{"person_id": 93800062, "name": "Wifredo \\"Wifi\\" Fernandez", "education": [{"institution_name": "Harvard Business School Executive Education", "degree": "NaN", "subject": "NaN", "started_on": "2015-01-01", "ended_on": "2015-01-01"}, {"institution_name": "American University", "degree": "Masters;Master Of Arts", "subject": "Teaching", "started_on": "2009-01-01", "ended_on": "2011-01-01"}, {"institution_name": "University Of Miami School Of Law", "degree": "Doctor Of Jurisprudence;Doctorates;Masters;Master Of Laws", "subject": "International Law;Law", "started_on": "2017-01-01", "ended_on": "2020-01-01"}, {"institution_name": "Comillas Pontifical University", "degree": "NaN", "subject": "Business Administration;Management;Business Administration And Management", "started_on": "2007-01-01", "ended_on": "2007-01-01"}, {"institution_name": "Ransom Everglades School", "degree": "NaN", "subject": "NaN", "started_on": "2001-01-01", "ended_o

In [20]:
# Answer the original user query, supplying the retrieved documents as context.
r2 = ask_question(query, r1)
print("Final response:", r2)

Final response: Based on the provided data, it appears that Sean Galligan has attended Cornell University and Purdue University, which are both Ivy League institutions.

Here is a summary of Sean Galligan's educational background:

* Master of Business Administration (MBA) from Cornell Johnson Graduate School of Management
* MBA/Masters degree from Cornell University
* Bachelor of Science in Economics from Purdue University

Please note that this analysis only considers the provided data and may not be exhaustive or accurate.
