# Quick Diagnostic - File Discovery

Run these cells to quickly diagnose what's happening with file discovery.

In [None]:
# Configuration
# Root directory inspected by the diagnostic cells in this notebook.
DOWNLOAD_PATH = "/tmp/documents"

# File extensions (with leading dot) that the diagnostics look for.
FILE_EXTENSIONS = [
    ".md",
    ".txt",
    ".html",
]

## Check if directory exists and is accessible

In [None]:
import os
from pathlib import Path

print(f"Checking path: {DOWNLOAD_PATH}")
print(f"Exists: {os.path.exists(DOWNLOAD_PATH)}")
print(f"Is directory: {os.path.isdir(DOWNLOAD_PATH)}")

if os.path.exists(DOWNLOAD_PATH):
    print(f"\nDirectory info:")
    # Use ls to quickly see what's there
    !ls -lah {DOWNLOAD_PATH}

## Count files with `find` instead of a Python tree walk (fast)

In [None]:
# Use shell commands for speed
print("Total files (all types):")
!find {DOWNLOAD_PATH} -type f | wc -l

print("\nMarkdown files:")
!find {DOWNLOAD_PATH} -type f -name "*.md" | wc -l

print("\nText files:")
!find {DOWNLOAD_PATH} -type f -name "*.txt" | wc -l

print("\nHTML files:")
!find {DOWNLOAD_PATH} -type f -name "*.html" | wc -l

## Get file list (fast shell version)

In [None]:
import subprocess

# Use find command - much faster than os.walk for network filesystems.
# The command is passed as an argument list (no shell), so DOWNLOAD_PATH is
# never re-parsed by a shell and may contain spaces or quote characters.

# Build the "-name *.md -o -name *.txt ..." alternatives from FILE_EXTENSIONS
# so the search always matches the notebook configuration.
name_tests = []
for ext in FILE_EXTENSIONS:
    if name_tests:
        name_tests.append("-o")
    name_tests += ["-name", f"*{ext}"]

cmd = ["find", DOWNLOAD_PATH, "-type", "f"]
if name_tests:
    # Group the alternatives so "-type f" applies to every one of them.
    # With an empty FILE_EXTENSIONS no name filter is added (all files match).
    cmd += ["(", *name_tests, ")"]
# NUL-terminate each path so unusual file names (leading/trailing spaces,
# embedded newlines) are recovered exactly as they are on disk.
cmd.append("-print0")

result = subprocess.run(cmd, capture_output=True, text=True)

# find reports problems (missing path, permission denied) on stderr; surface
# them instead of silently showing a short or empty list.
if result.returncode != 0 or result.stderr:
    print(f"WARNING: find exited with code {result.returncode}")
    if result.stderr:
        print(result.stderr.strip())

files = [path for path in result.stdout.split("\0") if path]

print(f"Found {len(files)} files")
print("\nFirst 10 files:")
for f in files[:10]:
    print(f"  {f}")

# Save for later use
discovered_files = files

## Test reading ONE file

In [None]:
# Smoke test: try to read the first discovered file as UTF-8 text.
if not discovered_files:
    print("No files found!")
else:
    test_file = discovered_files[0]
    print(f"Testing file read: {test_file}")

    try:
        with open(test_file, mode='r', encoding='utf-8') as fh:
            content = fh.read()

        # Report the size and a short preview so the result can be eyeballed.
        print(f"SUCCESS: Read {len(content)} characters")
        print(f"First 200 chars: {content[:200]}")
    except Exception as err:
        # Diagnostic cell: show whatever went wrong instead of aborting the run.
        print(f"ERROR: {err}")

## Check disk I/O performance

In [None]:
import time

# Test how long it takes to stat all files
if discovered_files:
    print(f"Testing I/O performance on {len(discovered_files)} files...")

    sizes = []
    failed = 0
    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not affected by wall-clock adjustments the way time.time() is.
    start = time.perf_counter()
    for file_path in discovered_files:
        try:
            sizes.append(os.path.getsize(file_path))
        except OSError:
            # File vanished or is inaccessible: count it rather than hide it.
            failed += 1
    elapsed = time.perf_counter() - start

    print(f"Time to stat {len(sizes)} files: {elapsed:.2f} seconds")
    if sizes:
        print(f"Average: {elapsed/len(sizes)*1000:.2f} ms per file")
        print(f"Total size: {sum(sizes)/1024/1024:.2f} MB")
    else:
        # Every stat failed; skip the average to avoid dividing by zero.
        print("Could not stat any of the discovered files!")
    if failed:
        print(f"WARNING: {failed} file(s) could not be stat'ed")

    # More than 10 seconds in total for metadata lookups is treated as slow.
    if elapsed > 10:
        print("\nWARNING: Filesystem is VERY slow! This may be a network mount issue.")
else:
    print("No files to test!")