In [1]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.2.1-py3-none-any.whl (9.7 kB)
Collecting httpx<0.28.0,>=0.27.0 (from ollama)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<0.28.0,>=0.27.0->ollama)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, ollama
Successfully installed h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 ollama-0.2.1


In [2]:
!ollama pull llama3

/bin/bash: line 1: ollama: command not found


In [None]:
!pip install transformers
!pip install langchain
!pip install faiss-cpu

In [4]:
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import ollama as client

In [5]:
# Path to the JSON file
file_path = './input_people_data_02.json'

# Define JSONLinesLoader class for loading data
class JSONLinesLoader:
    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self) -> list:
        person_profile_documents = []
        with open(self.file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    doc = json.loads(line)
                    person_profile_documents.append(doc)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
        return person_profile_documents

In [6]:
# Function to load data from JSON lines file
def load_data(file_path):
    loader = JSONLinesLoader(file_path)
    return loader.load()

# Function to convert list of dictionaries to list of Document objects
def convert_to_documents(data):
    doc_objects = []
    for doc in data:
        doc_obj = Document(page_content=json.dumps(doc))
        doc_objects.append(doc_obj)
    return doc_objects

In [7]:
# Function to generate embeddings for a list of Document objects
def generate_embeddings(documents, model_name='sentence-transformers/paraphrase-MiniLM-L6-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    embeddings = []
    for doc in documents:
        inputs = tokenizer(doc.page_content, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            embeddings.append(embedding)

    return np.array(embeddings)

In [8]:
# Function to create Faiss index from embeddings
def create_faiss_index(embeddings):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index

In [9]:
# Function to initialize data and return index, texts, and data
def initialize_data(file_path):
    data = load_data(file_path)
    doc_objects = convert_to_documents(data)

    # Split the text using a text splitter
    text_splitter = RecursiveCharacterTextSplitter(separators=["\n"])
    texts_split = text_splitter.split_documents(doc_objects)
    texts = [text.page_content for text in texts_split]

    # Generate embeddings and create Faiss index
    embeddings = generate_embeddings(texts_split)
    index = create_faiss_index(embeddings)

    return index, texts, data, embeddings

index, texts, data, embeddings1 = initialize_data(file_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [10]:

# Function to ask a question using Llama3 model with dynamic context handling
def ask_question(query, context=None, model="llama3"):
    if context:
        messages = [
            {"role": "system", "content": "You are an assistant that provides detailed information based on the provided context."},
            {"role": "user", "content": f"{context}\n{query}"}
        ]
    else:
        messages = [
            {"role": "system", "content": "You are an assistant that provides detailed information based on global knowledge."},
            {"role": "user", "content": query}
        ]

    try:
        response = client.chat(
            model=model,
            messages=messages
        )

        if 'message' in response and 'content' in response['message']:
            return response['message']['content']
        else:
            return "There was an issue with processing your request."

    except Exception as e:
        print(f"Exception occurred: {e}")
        return "There was an error processing your request."

In [11]:
# Function to generate embeddings for a single document
def generate_single_embedding(doc, model_name='sentence-transformers/paraphrase-MiniLM-L6-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    inputs = tokenizer(doc, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    return embedding

In [12]:
# Function to find the most relevant document based on similarity
def find_most_relevant(embeddings1, query_embedding, k=5):
    index = faiss.IndexFlatIP(embeddings1.shape[1])  # Inner product similarity
    index.add(embeddings1)

    D, I = index.search(np.array([query_embedding]), k)

    return D, I


In [13]:
# Load data and create embeddings
index, texts, data, embeddings1 = initialize_data(file_path)

# Get user input for the question
user_question = input("Please enter your question: ")


Please enter your question: give me peoples who studied in ivy colleges


In [14]:
# Expand the context
context_prompt = "AI agent please expand my prompt in one or two paragraph, as my domain data has less information, it only context information."
expanded_context = ask_question(context_prompt)

# Combine expanded context with the specific question
combined_query = f"{expanded_context}\n\n{user_question}"

# Generate embedding for combined query
query_embedding = generate_single_embedding(combined_query)

Exception occurred: [Errno 111] Connection refused


In [16]:
#prints query_ embeddings
print(query_embedding)

# Find the most relevant documents
distances, indices = find_most_relevant(embeddings1, query_embedding)

[ 1.70612067e-01 -7.10931897e-01  4.09224808e-01  3.12073559e-01
 -2.10713539e-02 -2.37189129e-01 -6.67946115e-02 -5.05026132e-02
 -2.83200979e-01  2.42712833e-02  2.84803301e-01 -1.67838395e-01
 -3.46805692e-01  2.50130184e-02 -1.99801534e-01  1.32172666e-02
 -2.02473298e-01 -3.50832045e-01 -8.95698741e-02  1.42563917e-02
 -1.86493527e-02 -2.92569518e-01 -4.33049560e-01 -1.13128789e-03
  2.81940818e-01  1.62465781e-01  9.03644785e-03 -8.80559310e-02
  1.36337474e-01 -1.94146812e-01 -2.11221486e-01  1.76700890e-01
  7.73104548e-01  1.32332757e-01  7.62806982e-02 -4.04409356e-02
  8.68688375e-02 -1.88621357e-01  1.54245151e-02 -3.85796614e-02
 -2.76334226e-01 -1.35373667e-01  4.17823076e-01  6.40113831e-01
 -9.36709866e-02 -3.10908616e-01 -1.41832843e-01 -1.94114484e-02
  3.89066413e-02  4.67425704e-01 -2.84429014e-01  4.03908223e-01
  3.41622718e-02  1.84457668e-03 -3.82494122e-01  4.14429575e-01
  3.24732900e-01  3.90869796e-01 -1.09451495e-01  6.11305058e-01
  1.29908016e-02 -1.57055

In [17]:
# Find the most relevant documents
# distances, indices = find_most_relevant(embeddings1, query_embedding)

# Print results
print("Nearest neighbors:")
for i in range(len(indices[0])):
    idx = indices[0][i]
    similarity_score = distances[0][i]
    print(f"Document {idx}: Similarity score = {similarity_score}, Content: {texts[idx]}")

Nearest neighbors:
Document 965: Similarity score = 7.577175140380859, Content: {"person_id": 51183773, "name": "Keenan G.", "education": [{"institution_name": "Lehigh University", "degree": "NaN", "subject": "Business Administration;Engineering", "started_on": "1994-01-01", "ended_on": "1995-01-01"}, {"institution_name": "University Of Massachusetts Lowell", "degree": "NaN", "subject": "Business Administration;Engineering", "started_on": "1993-01-01", "ended_on": "1994-01-01"}, {"institution_name": "Lehigh University", "degree": "NaN", "subject": "Supply Chain Management;Management", "started_on": "2001-01-01", "ended_on": "2002-01-01"}, {"institution_name": "Penn State University", "degree": "Bachelors;Bachelor Of Science", "subject": "Business Administration", "started_on": "1998-01-01", "ended_on": "2000-01-01"}, {"institution_name": "American College Students", "degree": "NaN", "subject": "NaN", "started_on": "NaN", "ended_on": "1999-01-01"}], "employments": [{"company_name": "Nok