In [2]:
import pymongo
import pandas as pd
import json
import os
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# --- 1. CONFIGURATION & CONNECTIONS ---
# The connection string can be overridden with the MONGO_URI environment
# variable; the default targets a MongoDB instance on this machine.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017/")
# NOTE: `client` is the MongoDB client. Do not reuse this name for other
# service clients later in the notebook.
client = pymongo.MongoClient(MONGO_URI)
db = client["helix_hr_db"]

# Load a free, local embedding model (runs on your CPU/GPU)
print("Loading Embedding Model (Sentence-Transformers)...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- 2. STRUCTURED DATA (CSV) ---
def ingest_csv(path):
    """Replace the ``employees`` collection with the rows of a CSV file.

    The file is read and validated *before* the existing collection is
    cleared, so a bad or empty input never wipes previously ingested data.
    If a ``join_date`` column is present it is normalised to ``YYYY-MM-DD``
    strings.

    Args:
        path: Location of the CSV file to load.

    Raises:
        ValueError: If the CSV contains no data rows.
    """
    print(f"üöÄ Ingesting CSV: {path}")
    df = pd.read_csv(path)
    if df.empty:
        # insert_many() rejects an empty list; fail here, before deleting anything.
        raise ValueError(f"No employee rows found in {path}; existing data left untouched.")

    if 'join_date' in df.columns:
        df['join_date'] = pd.to_datetime(df['join_date']).dt.strftime('%Y-%m-%d')
    records = df.to_dict('records')

    # Full refresh: drop the old documents only once the new ones are ready.
    db.employees.delete_many({})
    db.employees.insert_many(records)
    print(f"‚úÖ Saved {len(df)} employees.")

# --- 3. SEMI-STRUCTURED DATA (JSON) ---
def ingest_json(path):
    """Replace the ``attendance`` collection with the contents of a JSON file.

    A top-level JSON array is stored as one document per element; any other
    top-level value is wrapped in a single-element list.
    NOTE(review): a top-level object therefore becomes ONE document, so
    per-employee queries on ``employee_id`` will only match if that key sits
    at the top level of the object -- confirm the file's layout.

    The payload is validated before the existing collection is cleared, so a
    bad input never wipes previously ingested data.

    Args:
        path: Location of the JSON file to load.

    Raises:
        ValueError: If the file holds no documents, or an entry is not a
            JSON object.
    """
    print(f"üöÄ Ingesting JSON: {path}")
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding. 'utf-8-sig' also tolerates a leading byte-order mark.
    with open(path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)

    # If data is a list, insert directly; otherwise wrap it in a list
    documents = data if isinstance(data, list) else [data]
    if not documents:
        raise ValueError(f"No attendance records found in {path}; existing data left untouched.")
    if not all(isinstance(entry, dict) for entry in documents):
        raise ValueError(f"Every attendance record in {path} must be a JSON object.")

    db.attendance.delete_many({})
    db.attendance.insert_many(documents)
    print("‚úÖ Attendance logs synchronized.")

# --- 4. UNSTRUCTURED DATA (PDF + VECTORIZATION) ---
def ingest_pdf_vectors(path):
    """Chunk a PDF, embed each chunk, and replace the ``policy_vectors`` collection.

    Each page's text is split on blank lines; fragments shorter than 20
    characters (after stripping) are dropped. Every surviving chunk is stored
    as ``{"text", "embedding", "metadata": {"source", "page"}}`` with a
    1-based page number.

    The PDF is fully parsed and embedded before the old collection is
    cleared, so a failure while reading or encoding leaves the existing
    vectors in place.

    Args:
        path: Location of the PDF file to vectorize.
    """
    print(f"üöÄ Vectorizing PDF: {path}")
    reader = PdfReader(path)
    source_name = os.path.basename(path)

    chunk_texts = []
    chunk_pages = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Guard against pages with no extractable text (e.g. scanned images).
        text = page.extract_text() or ""

        # Simple chunking: split by double newlines or large blocks
        for raw_chunk in text.split('\n\n'):
            chunk = raw_chunk.strip()
            if len(chunk) < 20:
                continue  # Skip tiny fragments
            chunk_texts.append(chunk)
            chunk_pages.append(page_number)

    docs = []
    if chunk_texts:
        # One batched encode call; the text that is embedded is exactly the
        # text that is stored.
        vectors = model.encode(chunk_texts).tolist()
        docs = [
            {
                "text": chunk,
                "embedding": vector,
                "metadata": {"source": source_name, "page": page_number},
            }
            for chunk, vector, page_number in zip(chunk_texts, vectors, chunk_pages)
        ]

    # Full refresh of the collection, done only once the new documents exist.
    db.policy_vectors.delete_many({})
    if docs:  # insert_many() rejects an empty list
        db.policy_vectors.insert_many(docs)

    chunk_count = len(docs)
    print(f"‚úÖ Created {chunk_count} vector chunks in MongoDB.")

# --- 5. EXECUTION ---
# Folder holding the three input files. Override it with the HELIX_DATA_DIR
# environment variable instead of editing the notebook; the default keeps the
# original location.
DATA_DIR = os.environ.get("HELIX_DATA_DIR", "C:/Users/user/Downloads")
csv_file = f"{DATA_DIR}/employee_master.csv"
json_file = f"{DATA_DIR}/attendance_logs_detailed.json"
pdf_file = f"{DATA_DIR}/Helix_Pro_Policy_v2.pdf"

# Run the three ingestion steps in order; the first failure aborts the rest
# and is reported as a single error line rather than a traceback.
try:
    ingest_csv(csv_file)
    ingest_json(json_file)
    ingest_pdf_vectors(pdf_file)

    # Summarise what actually landed in each collection.
    print("\nüî• SUCCESS: All Helix Corp data is now in MongoDB!")
    print(f"Employees: {db.employees.count_documents({})}")
    print(f"Logs: {db.attendance.count_documents({})}")
    print(f"Policy Vectors: {db.policy_vectors.count_documents({})}")

except Exception as e:
    print(f"‚ùå Critical Error: {e}")

Loading Embedding Model (Sentence-Transformers)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


üöÄ Ingesting CSV: C:/Users/user/Downloads/employee_master.csv
‚úÖ Saved 500 employees.
üöÄ Ingesting JSON: C:/Users/user/Downloads/attendance_logs_detailed.json
‚úÖ Attendance logs synchronized.
üöÄ Vectorizing PDF: C:/Users/user/Downloads/Helix_Pro_Policy_v2.pdf
‚úÖ Created 13 vector chunks in MongoDB.

üî• SUCCESS: All Helix Corp data is now in MongoDB!
Employees: 500
Logs: 1
Policy Vectors: 13


In [8]:
import numpy as np
import redis

# Redis handle used as the response cache (local instance, default port).
r_cache = redis.Redis(
    host='localhost',
    port=6379,
    decode_responses=True,
)

_TOP_K_CHUNKS = 2          # policy chunks handed to the LLM as context
_ATTENDANCE_LIMIT = 3      # attendance documents included per employee
_CACHE_TTL_SECONDS = 3600  # how long a generated answer stays cached


def _cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid a NaN score, which would make the ranking below unreliable.
        return 0.0
    return float(np.dot(a, b) / denominator)


def helix_hr_assistant(user_query, employee_id=None):
    """Answer an HR question using policy chunks and optional employee data.

    Pipeline: Redis cache lookup -> brute-force cosine search over every
    document in ``db.policy_vectors`` -> optional employee/attendance lookup
    -> ``get_llm_response`` -> cache the answer for one hour.

    Redis is strictly best-effort: if it is unreachable, the request is
    still answered, it just is not cached.

    Args:
        user_query: The question text.
        employee_id: Optional employee identifier; falsy values mean no
            personal data is looked up.
            NOTE(review): the lookup is an exact match, so a string ID will
            not match numeric IDs -- confirm how ``employee_id`` is stored.

    Returns:
        The generated answer, the cached answer prefixed with
        ``[CACHED RESPONSE] ``, or ``Logic Error: ...`` if generation fails.
    """
    # The answer can contain personal data, so the cache key must include the
    # employee; otherwise one employee's answer would be served to another.
    cache_owner = employee_id if employee_id else None
    cache_key = f"helix_hr:{cache_owner!r}:{user_query}"

    # --- STEP 1: Check Redis Cache ---
    try:
        cached_response = r_cache.get(cache_key)
        if cached_response:
            return f"[CACHED RESPONSE] {cached_response}"
    except redis.RedisError:
        pass  # If Redis is down, just continue to DB

    # --- STEP 2: Local Vector Search ---
    # Encode the user question
    query_vector = model.encode(user_query)

    # Fetch all PDF chunks from MongoDB and score them in Python
    all_chunks = list(db.policy_vectors.find())

    if not all_chunks:
        context = "No policy documents found in the database."
    else:
        # Score every chunk against the query
        for chunk in all_chunks:
            chunk['score'] = _cosine_similarity(query_vector, chunk['embedding'])

        # Sort by best score and take the top matches
        top_matches = sorted(all_chunks, key=lambda x: x['score'], reverse=True)[:_TOP_K_CHUNKS]
        context = "\n".join([m['text'] for m in top_matches])

    # --- STEP 3: Personal Data Lookup ---
    personal_data = ""
    if employee_id:
        emp_record = db.employees.find_one({"employee_id": employee_id})
        if emp_record:
            # Attendance is only reported alongside a known employee record.
            attendance = list(
                db.attendance.find({"employee_id": employee_id}).limit(_ATTENDANCE_LIMIT)
            )
            personal_data = f"Employee Record: {emp_record}\nRecent Attendance: {attendance}"

    # --- STEP 4: Generate Real Response ---
    # get_llm_response must be defined by the time this function is called.
    try:
        final_answer = get_llm_response(user_query, context, personal_data)
    except Exception as e:
        return f"Logic Error: {e}"

    # --- STEP 5: Save to Redis ---
    try:
        r_cache.setex(cache_key, _CACHE_TTL_SECONDS, final_answer)
    except redis.RedisError:
        pass  # A cache write failure must not discard a good answer

    return final_answer

# Confirmation banner printed once this cell has (re)defined the assistant.
print(
    "‚úÖ Assistant updated! You can now use the Gradio chat without MongoDB Atlas errors."
)

‚úÖ Assistant updated! You can now use the Gradio chat without MongoDB Atlas errors.


In [9]:
import gradio as gr

def chat_interface(query, emp_id):
    """Gradio callback: forward a question and optional employee ID to the assistant.

    A text box yields a string even when left blank, so a string ID is
    trimmed and a blank/whitespace-only value is passed on as ``None``
    (meaning "no personal data lookup").

    Args:
        query: The question typed by the user.
        emp_id: The employee ID typed by the user; may be empty.

    Returns:
        Whatever ``helix_hr_assistant`` returns for the normalised inputs.
    """
    if isinstance(emp_id, str):
        emp_id = emp_id.strip() or None
    return helix_hr_assistant(query, emp_id)

# Two free-text inputs (question, employee ID) mapped to one text answer.
demo = gr.Interface(
    title="Helix Corp HR AI Bot",
    description="Ask me about company policies or your attendance records.",
    fn=chat_interface,
    inputs=["text", "text"],
    outputs="text",
)

# Start the local web UI.
demo.launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




In [10]:
import openai

# The API key is read from the OPENAI_API_KEY environment variable; never
# paste a key into the notebook. The client is created lazily so this cell
# can run before the key is configured.
_openai_client = None


def _get_openai_client():
    """Create the OpenAI client on first use and reuse it afterwards."""
    global _openai_client
    if _openai_client is None:
        import os  # local import keeps this cell self-contained

        # Prefer the environment variable; fall back to openai.api_key if the
        # user set it elsewhere.
        api_key = os.environ.get("OPENAI_API_KEY") or getattr(openai, "api_key", None)
        _openai_client = openai.OpenAI(api_key=api_key or None)
    return _openai_client


def generate_grounded_response(user_query, context, personal_data):
    """Ask GPT-4o to answer an HR query using only the supplied context.

    Args:
        user_query: The question asked by the user.
        context: Policy text retrieved for this question.
        personal_data: Employee record / attendance text, or an empty string.

    Returns:
        The text content of the model's first completion choice.
    """
    prompt = f"""
    You are the Helix Corp HR Assistant. Use the provided context and employee data to answer the query.
    
    GUIDELINES:
    1. If the answer isn't in the context, say "I don't have that information." 
    2. Do not hallucinate.
    3. If a calculation is requested (like annual leave), show your step-by-step math based on the join date and policy.

    CONTEXT FROM POLICY PDF:
    {context}

    EMPLOYEE DATA (CSV/JSON):
    {personal_data}

    USER QUERY: {user_query}
    """

    # Use a dedicated OpenAI client here: the notebook-level name `client`
    # is the MongoDB client and has no chat API.
    response = _get_openai_client().chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a precise and helpful HR bot."},
            {"role": "user", "content": prompt}
        ],
        temperature=0 # Keep it factual
    )

    return response.choices[0].message.content

# Expose the function under the name the assistant calls
get_llm_response = generate_grounded_response

In [None]:
# Install the missing library
!pip install openai

In [None]:
pip install gradio

In [7]:
import logging
from datetime import datetime

# Route log records at INFO level and above to helix_bot.log
logging.basicConfig(
    level=logging.INFO,
    filename='helix_bot.log',
)

def validate_system_readiness():
    """Probe each dependency and report whether the bot can serve requests.

    Checks, in order: MongoDB responds to ``ping``, Redis responds to
    ``ping``, and the ``policy_vectors`` and ``employees`` collections are
    non-empty. A probe that raises (for example because the service is
    unreachable) is reported as not ready instead of aborting the check.

    Returns:
        True only if every check passed.
    """
    print("üîç Final System Check...")
    # Probes are callables so that one failing service cannot prevent the
    # others from being evaluated and reported.
    probes = {
        "MongoDB": lambda: db.command("ping")["ok"] == 1.0,
        "Redis": lambda: r_cache.ping(),
        "Policy Data": lambda: db.policy_vectors.count_documents({}) > 0,
        "Employee Data": lambda: db.employees.count_documents({}) > 0
    }

    checks = {}
    for service, probe in probes.items():
        try:
            checks[service] = bool(probe())
        except Exception:
            checks[service] = False

    for service, status in checks.items():
        if status:
            print(f"‚úÖ {service} is Ready")
        else:
            print(f"‚ùå {service} is Missing/Empty")

    return all(checks.values())

# Run the check and state the overall outcome either way, so a failed run is
# never mistaken for a cell that simply printed nothing.
if validate_system_readiness():
    print("üöÄ SYSTEM ONLINE: Helix HR Bot is ready for deployment.")
else:
    print("SYSTEM NOT READY: resolve the failed checks above before deployment.")

üîç Final System Check...
‚úÖ MongoDB is Ready
‚úÖ Redis is Ready
‚úÖ Policy Data is Ready
‚úÖ Employee Data is Ready
üöÄ SYSTEM ONLINE: Helix HR Bot is ready for deployment.
