## JSON PARSING AND PROCESSING

In [1]:
# JSON PROCESSING STRATEGIES

from langchain_community.document_loaders import JSONLoader
import json

# Location of the sample company JSON. The path is relative, so it resolves
# against the working directory the notebook kernel is started from.
employee_data_path = "../data/json_files/company.json"

In [2]:
print("Loading JSON with different strategies...\n")

# Strategy 1: let JSONLoader split the file for us. The jq expression selects
# each element of the top-level "employees" array, and every match becomes its
# own Document. text_content=False is needed because each match is a JSON
# object rather than a plain string; the object ends up serialised as JSON
# text in page_content.
employee_loader = JSONLoader(
    file_path=employee_data_path,
    jq_schema=".employees[]",
    text_content=False,
)
employee_data = employee_loader.load()

# Summarise the result: document count, the first document, then the full list.
print(f"Loaded {len(employee_data)} documents using JSONLoader with jq_schema '.employees[]'.\n")
print(f"First document content:\n{employee_data[0]}\n")
print(employee_data)

Loading JSON with different strategies...

Loaded 5 documents using JSONLoader with jq_schema '.employees[]'.

First document content:
page_content='{"id": 1, "name": "Rahul Sharma", "role": "Backend Developer", "age": 29, "department_id": 1, "projects": [{"project_id": "P1001", "name": "AI Chatbot"}]}' metadata={'source': 'E:\\data-projects\\rag-projects\\rag-intro\\data\\json_files\\company.json', 'seq_num': 1}

[Document(metadata={'source': 'E:\\data-projects\\rag-projects\\rag-intro\\data\\json_files\\company.json', 'seq_num': 1}, page_content='{"id": 1, "name": "Rahul Sharma", "role": "Backend Developer", "age": 29, "department_id": 1, "projects": [{"project_id": "P1001", "name": "AI Chatbot"}]}'), Document(metadata={'source': 'E:\\data-projects\\rag-projects\\rag-intro\\data\\json_files\\company.json', 'seq_num': 2}, page_content='{"id": 2, "name": "Priya Singh", "role": "Frontend Developer", "age": 26, "department_id": 1, "projects": [{"project_id": "P1001", "name": "AI Chatbot"

In [3]:
# Custom JSON Parsing
from langchain_core.documents import Document
from typing import List

def parse_json_custom(file_path: str) -> List[Document]:
    """Build one Document per employee from a company JSON file.

    The file is expected to hold a JSON object with an ``employees`` array.
    Each employee becomes a Document whose ``page_content`` is a short
    human-readable summary (name, age, department id, role, project names)
    and whose ``metadata`` carries the structured fields for filtering.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        A list of Documents in the same order as the ``employees`` array.
        The list is empty when the ``employees`` key is absent.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an employee lacks ``id``, ``name``, ``age``,
            ``department_id`` or ``role``, or a project lacks ``name``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default (often a legacy code page on Windows), which would mis-decode or
    # reject non-ASCII characters in the data.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    documents = []
    for employee in data.get('employees', []):
        # "projects" is optional; look it up once and reuse it for both the
        # text summary and the metadata.
        projects = employee.get('projects', [])
        project_names = [p['name'] for p in projects]

        content = (
            f"Name: {employee['name']}\n"
            f"Age: {employee['age']}\n"
            f"Department ID: {employee['department_id']}\n"
            f"Role: {employee['role']}\n"
            f"Projects: {project_names}"
        )
        emp_doc = Document(page_content=content, metadata={
            'employee_id': employee['id'],
            'department_id': employee['department_id'],
            'name': employee['name'],
            'role': employee['role'],
            'projects': projects
        })
        documents.append(emp_doc)

    return documents

In [4]:
# Run the custom parser on the same file used above. As the last expression in
# the cell, the returned list of Documents is displayed as the cell output.
parse_json_custom(employee_data_path)

[Document(metadata={'employee_id': 1, 'department_id': 1, 'name': 'Rahul Sharma', 'role': 'Backend Developer', 'projects': [{'project_id': 'P1001', 'name': 'AI Chatbot'}]}, page_content="Name: Rahul Sharma\nAge: 29\nDepartment ID: 1\nRole: Backend Developer\nProjects: ['AI Chatbot']"),
 Document(metadata={'employee_id': 2, 'department_id': 1, 'name': 'Priya Singh', 'role': 'Frontend Developer', 'projects': [{'project_id': 'P1001', 'name': 'AI Chatbot'}]}, page_content="Name: Priya Singh\nAge: 26\nDepartment ID: 1\nRole: Frontend Developer\nProjects: ['AI Chatbot']"),
 Document(metadata={'employee_id': 3, 'department_id': 2, 'name': 'Amit Verma', 'role': 'Sales Executive', 'projects': [{'project_id': 'P1002', 'name': 'CRM Upgrade'}]}, page_content="Name: Amit Verma\nAge: 32\nDepartment ID: 2\nRole: Sales Executive\nProjects: ['CRM Upgrade']"),
 Document(metadata={'employee_id': 4, 'department_id': 2, 'name': 'Neha Gupta', 'role': 'Account Manager', 'projects': [{'project_id': 'P1002', '