# Placement Data Extractor
This notebook extracts structured data from placement documents for use in a retrieval-augmented generation (RAG) pipeline.

In [None]:
# Install dependencies
!pip install PyMuPDF python-docx python-pptx openpyxl pandas pytesseract Pillow tqdm transformers torch accelerate sentencepiece vllm -q

In [None]:
# Report whether CUDA is usable and list every GPU that torch can see.
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")

# One line per visible device: its index and its reported name.
gpu_indices = range(torch.cuda.device_count())
for i in gpu_indices:
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

In [None]:
# Restrict this process to the second GPU (index 1), which is expected to be
# the less loaded one.
import os
import sys

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# CUDA_VISIBLE_DEVICES is only honoured if it is set before CUDA is first
# initialized in the process. If torch has already initialized CUDA (for
# example by querying device names), the assignment above is silently ignored,
# so say so instead of letting the run land on the wrong GPU.
_loaded_torch = sys.modules.get("torch")
if _loaded_torch is not None and _loaded_torch.cuda.is_initialized():
    print(
        "WARNING: CUDA is already initialized in this kernel, so "
        "CUDA_VISIBLE_DEVICES=1 will not take effect. Restart the kernel and "
        "run this cell before any cell that queries torch.cuda."
    )

In [None]:
# Make the project checkout importable by putting it first on the module
# search path, ahead of any installed package of the same name.
import sys

_project_root = '/home/ram/v_rag'
sys.path.insert(0, _project_root)

In [None]:
# Point the extractor's config at this machine's input and output directories.
# Path is imported in this cell because it is the first cell that uses it;
# relying on a later cell's import raises NameError when run top to bottom.
from pathlib import Path

from extractor import config

# NOTE(review): later cells read config.FACTS_OUTPUT / config.SEMANTIC_OUTPUT.
# If those are derived from OUTPUT_DIR when the config module is imported,
# they will not follow the override below — confirm against extractor/config.
config.PLACEMENTS_DIR = Path("/home/ram/v_rag/Placements")  # Update this path
config.OUTPUT_DIR = Path("/home/ram/v_rag/output")

# Create the output directory (and any missing parents); no error if it exists.
config.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Placements dir: {config.PLACEMENTS_DIR}")
print(f"Output dir: {config.OUTPUT_DIR}")

In [None]:
# Sanity-check the directory scanner before running the full extraction.
from pathlib import Path
from extractor.directory_scanner import scan_placements_directory

# `entries` is reused by the file-reading test cell further down.
entries = scan_placements_directory(config.PLACEMENTS_DIR)
print(f"Found {len(entries)} placement entries")

# Preview at most the first five entries: key plus number of attached files.
preview = entries[:5]
for entry in preview:
    print(f"  - {entry.primary_key}: {len(entry.files)} files")

In [None]:
# Smoke-test the file readers on the first scanned entry.
from extractor.file_readers import read_file

# Only the first entry, and at most its first two files, are read.
if entries:
    test_entry = entries[0]
    print(f"Testing with: {test_entry.primary_key}")
    for f in test_entry.files[:2]:
        # The "No content" fallback below implies read_file may return a falsy
        # value. Normalize that to "" so the length report cannot raise on
        # None. NOTE(review): read_file's return contract is not visible
        # here — confirm whether it can actually return None.
        text = read_file(f) or ""
        print(f"\n{f.name}: {len(text)} chars")
        # Show a 500-character preview, or a placeholder when nothing was read.
        print(text[:500] if text else "No content")

In [None]:
# Run the full extraction.
# Builds a PlacementDataExtractor with its default constructor arguments and
# calls run() with save_intermediate=True.
# NOTE(review): presumably this writes the files that the results cell reads
# back via config.FACTS_OUTPUT / config.SEMANTIC_OUTPUT — confirm in
# extractor/main_extractor.
from extractor.main_extractor import PlacementDataExtractor

extractor = PlacementDataExtractor()
extractor.run(save_intermediate=True)

In [None]:
# Load the extraction outputs and print a short summary.
import json

# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(config.FACTS_OUTPUT, 'r', encoding='utf-8') as f:
    facts = json.load(f)

with open(config.SEMANTIC_OUTPUT, 'r', encoding='utf-8') as f:
    semantic = json.load(f)

print(f"Total facts extracted: {len(facts)}")
print(f"Total semantic chunks: {len(semantic)}")

# Show the first fact, if any; an empty result must not raise IndexError.
print("\nSample fact:")
if facts:
    print(json.dumps(facts[0], indent=2))
else:
    print("No facts extracted")

In [None]:
# Show the first semantic chunk, if any; an empty result must not raise
# IndexError.
print("\nSample semantic chunk:")
if semantic:
    print(json.dumps(semantic[0], indent=2))
else:
    print("No semantic chunks extracted")

In [None]:
# Show the first semantic chunk, if any; an empty result must not raise
# IndexError.
# NOTE(review): this cell repeats the preceding "Sample semantic chunk" cell;
# consider removing one of them.
print("\nSample semantic chunk:")
if semantic:
    print(json.dumps(semantic[0], indent=2))
else:
    print("No semantic chunks extracted")