diff --git a/.gitignore b/.gitignore index 1dc4703..1e97e64 100644 --- a/.gitignore +++ b/.gitignore @@ -18,13 +18,19 @@ env/ .vscode/ *.swp -# Data +# Data and cache *.db metrics.db +.features_cache # Logs *.log +# Old/backup files +*.old +*.bak +*.tmp + # OS .DS_Store Thumbs.db diff --git a/DISTRIBUTION.md b/DISTRIBUTION.md new file mode 100644 index 0000000..bf1c561 --- /dev/null +++ b/DISTRIBUTION.md @@ -0,0 +1,119 @@ +# Distribution Setup Complete + +## Summary +Cluster Health Monitor v1.0.0 is now ready for portable ZIP distribution. + +## What Was Implemented + +### 1. Code Cleanup +- Removed debug print statements from workloads.py +- No emojis or verbose logging in code +- Clean, concise comments throughout + +### 2. Feature Detection & Caching +- `monitor/utils/features.py`: Runtime feature detection +- Detects: nvidia-smi, cupy, torch, gpu_benchmark availability +- Results cached in `.features_cache` JSON file +- Fast subsequent loads (no repeated checks) + +### 3. Requirements Simplified +- Single `requirements.txt` file +- Core dependencies required +- GPU libraries (cupy/torch) commented as optional +- Setup script prompts for GPU library installation + +### 4. PowerShell Setup Script +- `setup.ps1`: Automated Windows setup wizard +- Checks Python 3.8+ +- Detects NVIDIA drivers and CUDA version +- Creates virtual environment +- Installs dependencies +- Prompts for CuPy or PyTorch based on CUDA version +- Runs feature detection and caching +- Verifies installation + +### 5. Update Mechanism +- CLI: `python health_monitor.py --update` +- Web: "Check for Updates" button in header +- Checks GitHub releases API +- Downloads and applies updates automatically +- Preserves venv, config, and data + +### 6. 
Feature Graying in UI +- `/api/features` endpoint returns feature flags, re-detected on every request so newly installed packages are picked up +- JavaScript checks features on page load +- Disables benchmark controls if GPU libraries not available +- Visual feedback: opacity 0.5, cursor not-allowed +- Alert message explains missing libraries + +### 7. Multi-GPU Support +- Already implemented in gpu.py collector +- Loops through all NVIDIA GPUs via NVML +- Web UI displays all GPUs in grid +- Benchmark supports any GPU (defaults to GPU 0) + +### 8. Portable ZIP Distribution +- `package.ps1`: Creates distribution ZIP +- Includes: monitor/, health_monitor.py, config.yaml, requirements.txt, setup.ps1, README.md, LICENSE +- Excludes: venv, __pycache__, .features_cache, *.db +- ~50KB compressed size +- Ready for GitHub releases + +### 9. Updated Documentation +- README.md rewritten for ZIP distribution +- Installation: Download → Extract → Run setup.ps1 +- Troubleshooting section updated +- Simplified project structure +- Removed development-focused content + +## Files Created/Modified + +### New Files +- `monitor/utils/features.py` - Feature detection +- `monitor/utils/update.py` - Update mechanism +- `monitor/utils/__init__.py` - Utils module exports +- `setup.ps1` - Windows setup wizard +- `package.ps1` - Distribution packaging script + +### Modified Files +- `health_monitor.py` - Added --update and --port flags +- `monitor/api/server.py` - Added /api/features, /api/update/* endpoints +- `monitor/api/templates/index.html` - Update button, feature graying +- `monitor/benchmark/workloads.py` - Removed debug prints +- `requirements.txt` - Simplified to single file +- `README.md` - Complete rewrite for ZIP distribution + +### Removed Files +- `requirements-base.txt` - Merged into requirements.txt +- `requirements-gpu.txt` - Merged into requirements.txt +- `setup.py` - No longer using pip package +- `MANIFEST.in` - No longer needed +- `BUILD.md` - Removed +- `CHECKLIST.md` - Removed +- `RELEASE_NOTES.md` - Removed + +## Usage + +### For End Users 
+1. Download `cluster-health-monitor-v1.0.0.zip` from releases +2. Extract to desired location +3. Run `setup.ps1` in PowerShell +4. Activate venv and run: `python health_monitor.py` (web dashboard is the default) +5. Access dashboard at http://localhost:8090 + +### For Distribution +1. Run `.\package.ps1` to create ZIP +2. Upload `cluster-health-monitor-v1.0.0.zip` to GitHub releases +3. Users download and follow above steps + +### For Updates +Users can update via: +- CLI: `python health_monitor.py --update` +- Web: Click "Check for Updates" button + +## Next Steps (Future) +- Create GitHub Actions workflow for automated releases +- Add version check on startup (optional notification) +- Multi-platform support (Linux setup.sh) +- Configuration wizard in web UI +- Export/import settings diff --git a/README.md b/README.md index f058a28..5418853 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ Real-time GPU and system monitoring with web dashboard and CLI interface. Featur ## Features ### Monitoring + - Real-time GPU metrics (utilization, memory, temperature, power) - System metrics (CPU, memory, disk I/O) - Web dashboard with live charts @@ -12,6 +13,7 @@ Real-time GPU and system monitoring with web dashboard and CLI interface. Featur - Historical data storage and alerting ### GPU Benchmarking + - GEMM (matrix multiplication) stress test - Particle simulation workload - Auto-scaling stress test (dynamically increases load to 98% GPU utilization) @@ -20,82 +22,74 @@ Real-time GPU and system monitoring with web dashboard and CLI interface. Featur ## Requirements -### Core Monitoring (Always Available) - Python 3.8+ - NVIDIA GPU with drivers installed -- `nvidia-smi` command available - -### GPU Benchmarking (Optional) -- CUDA Toolkit 12.0+ or compatible -- One of: - - CuPy: `pip install cupy-cuda12x` (or appropriate CUDA version) - - PyTorch: `pip install torch --index-url https://download.pytorch.org/whl/cu121` +- CUDA Toolkit 12.0+ (for benchmarking) ## Installation -### 1. 
Clone Repository -```bash -git clone https://github.com/DataBoySu/cluster-monitor.git -cd cluster-health-monitor -``` +### 1. Download -### 2. Create Virtual Environment -```bash -python -m venv .venv -``` - -Activate: -- Windows: `.venv\Scripts\activate` -- Linux/Mac: `source .venv/bin/activate` +Download the latest release ZIP from [Releases](https://github.com/DataBoySu/cluster-monitor/releases). -### 3. Install Dependencies +Extract to your desired location: -**Basic Monitoring:** -```bash -pip install -r requirements.txt +```powershell +Expand-Archive cluster-health-monitor-v1.0.0.zip -DestinationPath C:\Tools\ +cd C:\Tools\cluster-health-monitor ``` -**With GPU Benchmarking (CuPy):** -```bash -pip install -r requirements.txt -pip install cupy-cuda12x # Adjust for your CUDA version -``` +### 2. Run Setup -**With GPU Benchmarking (PyTorch):** -```bash -pip install -r requirements.txt -pip install torch --index-url https://download.pytorch.org/whl/cu121 +```powershell +.\setup.ps1 ``` -### 4. Verify Installation -```bash +The setup script will: + +- Check for NVIDIA drivers and CUDA +- Create Python virtual environment +- Install required dependencies +- Prompt for optional GPU benchmark libraries (CuPy or PyTorch) +- Verify installation + +### 3. Verify + +```powershell +.\venv\Scripts\Activate.ps1 python health_monitor.py --help ``` ## Usage -### Web Dashboard (Recommended) -```bash -python health_monitor.py monitor --web +### Web Dashboard (Default) + +```powershell +python health_monitor.py +# Change port: python health_monitor.py --port 3000 ``` Access at: http://localhost:8090 Features: + - Real-time GPU/system metrics - Interactive benchmark controls - Live performance charts - Historical data visualization +- In-dashboard updates ### Terminal Dashboard -```bash -python health_monitor.py monitor + +```powershell +python health_monitor.py cli ``` Displays live metrics in terminal with auto-refresh. 
### CLI Benchmark -```bash + +```powershell # Quick 15-second test python health_monitor.py benchmark --mode quick @@ -136,13 +130,15 @@ The Stress Test mode automatically increases workload intensity: 4. Continues scaling up to 15 times or until 98% GPU utilization achieved Example progression: -``` + +```text 100K particles → 200K → 400K → 800K → 1.2M → 1.8M → 2.2M → 2.6M (94% GPU util) ``` ## Benchmark Types ### GEMM (Matrix Multiplication) + Dense matrix multiplication for maximum compute stress. Measures TFLOPS. ```bash @@ -150,6 +146,7 @@ python health_monitor.py benchmark --type gemm --mode stress-test ``` ### Particle Simulation + Vectorized particle physics simulation with collision detection. Measures steps/second. ```bash @@ -178,33 +175,6 @@ storage: path: ./metrics.db ``` -## Project Structure - -``` -cluster-health-monitor/ -├── monitor/ -│ ├── benchmark/ -│ │ ├── config.py # Benchmark configuration -│ │ ├── storage.py # Baseline storage (SQLite) -│ │ ├── workloads.py # GPU workloads (GEMM/Particle) -│ │ └── runner.py # Benchmark orchestration -│ ├── collectors/ -│ │ ├── gpu.py # GPU metrics via nvidia-smi -│ │ ├── system.py # CPU, memory, disk -│ │ └── network.py # Network info -│ ├── storage/ -│ │ └── sqlite.py # Metrics persistence -│ ├── api/ -│ │ ├── server.py # FastAPI web server -│ │ └── templates/ -│ │ └── index.html # Web dashboard -│ └── cli/ -│ └── benchmark_cli.py # CLI commands -├── config.yaml # Configuration -├── requirements.txt # Dependencies -└── health_monitor.py # Main entry point -``` - ## API Endpoints When running web server (`--web`): @@ -215,57 +185,36 @@ When running web server (`--web`): - `POST /api/benchmark/start` - Start benchmark - `GET /api/benchmark/status` - Benchmark progress - `POST /api/benchmark/stop` - Stop benchmark -- `GET /api/benchmark/results` - Get results -- `GET /api/benchmark/baseline` - Get baseline for GPU -## Troubleshooting +## Updates -### "nvidia-smi not found" -- Install NVIDIA drivers -- Add 
nvidia-smi to PATH -- Verify: `nvidia-smi` in terminal +### CLI -### "No CUDA libraries found" -Benchmarking features disabled without CUDA libraries. Install CuPy or PyTorch. +```powershell +python health_monitor.py --update +``` -### Web dashboard not loading data -- Check terminal for errors -- Verify port 8090 is available -- Check firewall settings -- Try: `http://127.0.0.1:8090` +### Web Dashboard -### Benchmark not scaling GPU to 98% -- Increase max_scales in runner.py -- Check GPU has available memory -- Verify no other GPU workloads running -- Try different benchmark type (GEMM vs Particle) +Click the "Check for Updates" button in the dashboard. -## Performance Tips +## Troubleshooting -1. **Close other GPU applications** during benchmarking -2. **Adequate cooling** for stress tests -3. **Monitor temperatures** - tests will stop at temp limit -4. **Use Stress Test mode** to find maximum GPU performance -5. **Run Extended mode** for stability validation +### "nvidia-smi not found" +Install NVIDIA drivers from https://www.nvidia.com/download/index.aspx -## Development +### "No CUDA Toolkit found" +Download CUDA from https://developer.nvidia.com/cuda-downloads +Re-run `.\setup.ps1` after installation. -### Run Tests -```bash -pytest tests/ -``` +### Web dashboard not loading data +- Check port 8090 is available +- Try: `http://127.0.0.1:8090` +- Check firewall settings -### Code Structure -- Modular design: config, storage, workloads, runner separated -- Clean API exports via `__init__.py` -- Type hints throughout -- Comprehensive error handling +### Benchmark features grayed out -### Contributing -1. Fork repository -2. Create feature branch -3. Add tests for new features -4. Submit pull request +GPU benchmark libraries not installed. Run setup script and select CuPy or PyTorch installation. 
## License @@ -279,6 +228,4 @@ MIT License - See LICENSE file ## Support -- Issues: GitHub Issues -- Documentation: This README -- CUDA setup: https://developer.nvidia.com/cuda-downloads +GitHub: https://github.com/DataBoySu/cluster-monitor diff --git a/health_monitor.py b/health_monitor.py index 50e917f..0c82fba 100644 --- a/health_monitor.py +++ b/health_monitor.py @@ -295,12 +295,40 @@ async def main(): @click.group(invoke_without_command=True) @click.option('--config', '-c', type=click.Path(), help='Configuration file path.') +@click.option('--port', '-p', type=int, help='Web server port (default: 8090).') +@click.option('--update', is_flag=True, help='Check for and install updates.') @click.pass_context -def cli(ctx, config): +def cli(ctx, config, port, update): """Cluster Health Monitor: Real-time GPU and system health monitoring.""" + if update: + from monitor.utils import check_for_updates, perform_update + console.print("\n[cyan]Checking for updates...[/cyan]") + + status = check_for_updates() + + if status.get('error'): + console.print(f"[red]{status['error']}[/red]") + return + + if not status['available']: + console.print(f"[green]You have the latest version ({status['current']})[/green]") + return + + console.print(f"\n[yellow]Update available:[/yellow]") + console.print(f" Current: {status['current']}") + console.print(f" Latest: {status['latest']}") + + if click.confirm("\nDownload and install update?"): + console.print("\n[cyan]Downloading update...[/cyan]") + if perform_update(): + console.print("[green]Update complete! Restart the application.[/green]") + else: + console.print("[red]Update failed. 
Try again later.[/red]") + return + ctx.obj = {'config_path': config} if ctx.invoked_subcommand is None: - _run_app(config, port=None, nodes=None, once=False, web_mode=True) + _run_app(config, port=port, nodes=None, once=False, web_mode=True) @cli.command() @click.option('--port', '-p', type=int, help='Web server port (overrides config).') diff --git a/monitor/__version__.py b/monitor/__version__.py new file mode 100644 index 0000000..09ad572 --- /dev/null +++ b/monitor/__version__.py @@ -0,0 +1,5 @@ +"""Version information for Cluster Health Monitor.""" + +__version__ = "1.0.0" +__author__ = "DataBoySu" +__license__ = "MIT" diff --git a/monitor/api/server.py b/monitor/api/server.py index 89dfff0..ce4b8be 100644 --- a/monitor/api/server.py +++ b/monitor/api/server.py @@ -11,6 +11,7 @@ from fastapi import FastAPI, HTTPException, BackgroundTasks from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse +from fastapi.staticfiles import StaticFiles from monitor.collectors.gpu import GPUCollector from monitor.collectors.system import SystemCollector @@ -20,6 +21,7 @@ # Path to the templates directory, relative to this file TEMPLATE_DIR = Path(__file__).parent / "templates" +STATIC_DIR = Path(__file__).parent / "static" def create_app(config: Dict[str, Any]) -> FastAPI: @@ -29,6 +31,9 @@ def create_app(config: Dict[str, Any]) -> FastAPI: version="1.0.0" ) + # Mount static files + app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static") + storage = MetricsStorage(config['storage']['path']) alert_engine = AlertEngine(config.get('alerts', {})) @@ -81,7 +86,23 @@ async def get_gpus(): @app.get("/api/processes") async def get_processes(): collector = GPUCollector() - return {'processes': collector.collect_processes()} + gpus = collector.collect() + processes = collector.collect_processes() + + # Calculate total VRAM usage from processes + gpu_memory_stats = {} + for gpu in gpus: + if not gpu.get('error'): + gpu_memory_stats[gpu['index']] = 
{ + 'total': gpu.get('memory_total', 0), + 'used': gpu.get('memory_used', 0), + 'free': gpu.get('memory_free', 0) + } + + return { + 'processes': processes, + 'gpu_memory': gpu_memory_stats + } @app.get("/api/system") async def get_system(): @@ -110,6 +131,28 @@ async def get_available_metrics(): ] } + @app.get("/api/features") + async def get_features_endpoint(): + """Get available features (always fresh to detect newly installed packages).""" + from monitor.utils.features import detect_features + return detect_features(force=True) + + @app.post("/api/update/check") + async def check_update(): + """Check for available updates.""" + from monitor.utils import check_for_updates + return check_for_updates() + + @app.post("/api/update/install") + async def install_update(): + """Install available update.""" + from monitor.utils import perform_update + success = perform_update() + if success: + return {'status': 'success', 'message': 'Update installed. Restart application.'} + else: + return {'status': 'error', 'message': 'Update failed'} + @app.get("/api/export/json") async def export_json(hours: int = 24): metrics = await storage.query(hours=hours) diff --git a/monitor/api/static/main.js b/monitor/api/static/main.js new file mode 100644 index 0000000..da7f82a --- /dev/null +++ b/monitor/api/static/main.js @@ -0,0 +1,810 @@ +let countdown = 5; +let historyChart = null; + +// Tab switching +document.querySelectorAll('.tab').forEach(tab => { + tab.addEventListener('click', () => { + document.querySelectorAll('.tab').forEach(t => t.classList.remove('active')); + document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active')); + tab.classList.add('active'); + document.getElementById(tab.dataset.tab).classList.add('active'); + + if (tab.dataset.tab === 'history') loadHistory(); + if (tab.dataset.tab === 'processes') loadProcesses(); + if (tab.dataset.tab === 'benchmark') { loadBenchmarkResults(); loadBaseline(); } + }); +}); + +async function 
loadBenchmarkResults() { + try { + const response = await fetch('/api/benchmark/results'); + const results = await response.json(); + if (results && results.status !== 'no_results') { + displayBenchmarkResults(results); + } + } catch (error) { + console.error('Error loading benchmark results:', error); + } +} + +async function fetchStatus() { + try { + console.log('Fetching status from /api/status...'); + const response = await fetch('/api/status'); + console.log('Response status:', response.status); + + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + const data = await response.json(); + console.log('Received data:', data); + updateDashboard(data); + } catch (error) { + console.error('Error fetching status:', error); + document.getElementById('gpu-list').innerHTML = '
Error: Failed to fetch GPU data. Check console for details.
'; + document.getElementById('system-info').innerHTML = '
Error loading system info
'; + } +} + +function updateDashboard(data) { + console.log('Updating dashboard with:', data); + const badge = document.getElementById('status-badge'); + badge.className = 'status-badge status-' + data.status; + badge.textContent = data.status.toUpperCase(); + + // Add tooltip with alert count + const alertCount = data.alerts ? data.alerts.length : 0; + if (data.status === 'warning' && alertCount > 0) { + badge.setAttribute('data-tooltip', `${alertCount} active alert${alertCount > 1 ? 's' : ''}`); + } else if (data.status === 'info') { + badge.setAttribute('data-tooltip', 'System information available'); + } else { + badge.setAttribute('data-tooltip', 'All systems operational'); + } + + const gpuList = document.getElementById('gpu-list'); + gpuList.innerHTML = data.metrics.gpus.map(gpu => { + if (gpu.error) return `
Error: ${gpu.error}
`; + + const util = gpu.utilization || 0; + const memPct = gpu.memory_total > 0 ? (gpu.memory_used / gpu.memory_total * 100) : 0; + const temp = gpu.temperature || 0; + + return ` +
+
+ GPU ${gpu.index}: ${gpu.name} + ${temp}C +
+
+ Utilization + ${util}% +
+
+
+
+
+ Memory + ${(gpu.memory_used/1024).toFixed(1)}/${(gpu.memory_total/1024).toFixed(1)} GB +
+
+
+
+
+ Power + ${(gpu.power || 0).toFixed(0)}W +
+
+ `; + }).join(''); + + const sys = data.metrics.system; + document.getElementById('system-info').innerHTML = ` +
Hostname${sys.hostname || 'N/A'}
+
CPU${(sys.cpu_percent || 0).toFixed(1)}%
+
+
Memory${(sys.memory_used_gb || 0).toFixed(1)}/${(sys.memory_total_gb || 0).toFixed(1)} GB
+
+
Disk${(sys.disk_used_gb || 0).toFixed(1)}/${(sys.disk_total_gb || 0).toFixed(1)} GB
+
+ `; + + const alertsList = document.getElementById('alerts-list'); + if (data.alerts && data.alerts.length > 0) { + alertsList.innerHTML = data.alerts.map(a => `
${a.severity.toUpperCase()}: ${a.message}
`).join(''); + } else { + alertsList.innerHTML = '
No active alerts
'; + } + + document.getElementById('last-update').textContent = 'Last update: ' + new Date().toLocaleTimeString(); +} + +async function loadHistory() { + const metric = document.getElementById('metric-select').value; + const hours = document.getElementById('hours-select').value; + + try { + const historyResponse = await fetch(`/api/history?metric=${metric}&hours=${hours}`); + const historyData = await historyResponse.json(); + + const ctx = document.getElementById('historyChart').getContext('2d'); + + if (historyChart) historyChart.destroy(); + + const getUnit = (metric) => { + if (metric.includes('utilization') || metric.includes('percent')) return '%'; + if (metric.includes('memory_used')) return 'MB'; + if (metric.includes('temperature')) return '°C'; + if (metric.includes('power')) return 'W'; + return ''; + } + + const unit = getUnit(metric); + + const yAxisOptions = { + ticks: { color: '#a0a0a0' }, + grid: { color: '#4a4a4a' }, + beginAtZero: true, + title: { + display: true, + text: unit, + color: '#a0a0a0', + font: { + size: 14, + weight: 'bold' + } + } + }; + + if (metric.includes('utilization') || metric.includes('percent')) { + yAxisOptions.suggestedMax = 100; + } + if (metric.includes('temperature')) { + yAxisOptions.suggestedMax = 100; + } + + if (metric.startsWith('gpu_') && metric.includes('_memory_used')) { + const statusResponse = await fetch('/api/status'); + const statusData = await statusResponse.json(); + const gpuIndex = parseInt(metric.split('_')[1]); + const gpu = statusData.metrics.gpus[gpuIndex]; + if (gpu && gpu.memory_total) { + yAxisOptions.max = gpu.memory_total; + } + } + + historyChart = new Chart(ctx, { + type: 'line', + data: { + labels: historyData.data.map(d => new Date(d.timestamp).toLocaleTimeString()), + datasets: [{ + label: document.getElementById('metric-select').selectedOptions[0].text, + data: historyData.data.map(d => d.value), + borderColor: '#76b900', + backgroundColor: 'rgba(118, 185, 0, 0.1)', + fill: true, + 
tension: 0.3 + }] + }, + options: { + responsive: true, + plugins: { + legend: { + display: true, + labels: { + color: '#f0f0f0', + font: { + size: 14 + } + } + } + }, + scales: { + x: { + ticks: { color: '#a0a0a0', maxTicksLimit: 10 }, + grid: { color: '#4a4a4a' } + }, + y: yAxisOptions + } + } + }); + } catch (error) { + console.error('Error loading history:', error); + } +} + +async function loadProcesses() { + try { + const response = await fetch('/api/processes'); + const data = await response.json(); + + // Update VRAM bar if we have GPU memory stats + if (data.gpu_memory && Object.keys(data.gpu_memory).length > 0) { + const gpuKeys = Object.keys(data.gpu_memory); + const gpu0 = data.gpu_memory[gpuKeys[0]]; + + if (gpu0 && gpu0.total > 0) { + const usedGB = (gpu0.used / 1024).toFixed(1); + const totalGB = (gpu0.total / 1024).toFixed(1); + const freeGB = (gpu0.free / 1024).toFixed(1); + const usedPct = ((gpu0.total - gpu0.free) / gpu0.total) * 100; + + document.getElementById('vram-bar-container').style.display = 'block'; + document.getElementById('vram-used-bar').style.width = usedPct + '%'; + document.getElementById('vram-free').textContent = `${usedGB} / ${totalGB} GB (${freeGB} GB Free)`; + + // Change color based on usage - solid colors only + const bar = document.getElementById('vram-used-bar'); + if (usedPct > 90) { + bar.style.background = 'var(--accent-red)'; + } else if (usedPct > 70) { + bar.style.background = 'var(--accent-yellow)'; + } else { + bar.style.background = 'var(--accent-green)'; + } + } + } + + const tbody = document.getElementById('process-list'); + if (!data.processes || data.processes.length === 0) { + tbody.innerHTML = 'No GPU processes running'; + } else { + // Check if any process has utilization data + const hasUtilData = data.processes.some(p => p.gpu_utilization !== null && p.gpu_utilization !== undefined); + + // Sort by GPU memory usage (descending) + const sorted = data.processes.sort((a, b) => (b.gpu_memory_mb || 0) - 
(a.gpu_memory_mb || 0)); + + tbody.innerHTML = sorted.map(p => { + let utilDisplay; + if (p.gpu_utilization !== null && p.gpu_utilization !== undefined) { + utilDisplay = `${p.gpu_utilization.toFixed(1)}%`; + } else if (hasUtilData) { + // Some processes have data, this one doesn't + utilDisplay = 'N/A'; + } else { + // No processes have data - show helpful message on first row only + if (sorted.indexOf(p) === 0) { + utilDisplay = 'Not available'; + } else { + utilDisplay = ''; + } + } + + return ` + + ${p.pid} + ${p.name || 'N/A'} + GPU ${p.gpu_index} + ${utilDisplay} + ${p.username || 'N/A'} + + `; + }).join(''); + } + } catch (error) { + console.error('Error loading processes:', error); + document.getElementById('process-list').innerHTML = + 'Error loading processes'; + } +} + +function exportData(format) { + const hours = document.getElementById('export-hours').value; + window.location.href = `/api/export/${format}?hours=${hours}`; +} + +// Benchmark functions +let benchmarkPollInterval = null; +let benchCharts = {}; +let selectedMode = 'quick'; +let selectedBenchType = 'gemm'; + +// Load baseline on page load +async function loadBaseline() { + try { + const response = await fetch('/api/benchmark/baseline?benchmark_type=' + selectedBenchType); + const baseline = await response.json(); + if (baseline && baseline.status !== 'no_baseline') { + document.getElementById('baseline-info').style.display = 'block'; + const benchTypeLabel = baseline.benchmark_type === 'gemm' ? 'GEMM' : 'Particle'; + document.getElementById('baseline-details').innerHTML = ` +
GPU${baseline.gpu_name}
+
Type${benchTypeLabel}
+
Iterations${baseline.iterations_completed}
+
Avg Iteration${baseline.avg_iteration_time_ms.toFixed(2)} ms
+
Avg Temp${baseline.avg_temperature.toFixed(1)} C
+

Saved: ${new Date(baseline.timestamp).toLocaleString()}

+ `; + } else { + document.getElementById('baseline-info').style.display = 'none'; + } + } catch (error) { + console.error('Error loading baseline:', error); + } +} + +function selectBenchType(type) { + selectedBenchType = type; + // Reload baseline when benchmark type changes + loadBaseline(); + document.querySelectorAll('.type-btn').forEach(btn => { + btn.classList.toggle('active', btn.dataset.type === type); + }); + + // Update description + const descriptions = { + 'gemm': 'Dense matrix multiplication for maximum GPU compute stress. Measures TFLOPS.', + 'particle': '2D particle physics simulation with millions of particles. Measures steps/second.' + }; + document.getElementById('type-description').textContent = descriptions[type] || ''; + + // Show/hide type-specific settings in custom mode + document.getElementById('gemm-settings').style.display = type === 'gemm' ? 'block' : 'none'; + document.getElementById('particle-settings').style.display = type === 'particle' ? 'block' : 'none'; +} + +function selectMode(mode) { + selectedMode = mode; + document.querySelectorAll('.mode-btn').forEach(btn => { + btn.classList.toggle('active', btn.dataset.mode === mode); + }); + document.getElementById('custom-controls').style.display = mode === 'custom' ? 
'block' : 'none'; + + // Update mode description + const descriptions = { + 'quick': 'Quick baseline test - 15 seconds with fixed workload size', + 'standard': 'Standard benchmark - 60 seconds with fixed workload size', + 'extended': 'Extended burn-in test - 180 seconds with fixed workload size for thorough validation', + 'stress-test': 'Stress test - 60 seconds with AUTO-SCALING workload that dynamically increases to push GPU to 98% utilization', + 'custom': 'Custom configuration - set your own duration, limits, and workload parameters' + }; + document.getElementById('mode-description').textContent = descriptions[mode] || ''; +} + +function updateSliderValue(type) { + const slider = document.getElementById('custom-' + type); + const input = document.getElementById('custom-' + type + '-val'); + input.value = slider.value; +} + +// Sync input to slider +['duration', 'temp', 'memory', 'power', 'matrix', 'particles'].forEach(type => { + const input = document.getElementById('custom-' + type + '-val'); + if (input) { + input.addEventListener('change', () => { + document.getElementById('custom-' + type).value = input.value; + }); + } +}); + +async function startBenchmark() { + const btn = document.getElementById('start-bench-btn'); + const stopBtn = document.getElementById('stop-bench-btn'); + btn.disabled = true; + btn.textContent = 'Running...'; + stopBtn.style.display = 'inline-block'; + + document.getElementById('benchmark-progress').style.display = 'block'; + document.getElementById('benchmark-live-charts').style.display = 'block'; + document.getElementById('benchmark-results').innerHTML = ''; + document.getElementById('bench-stop-reason').textContent = ''; + document.getElementById('iteration-counter').style.display = 'inline'; + document.getElementById('iteration-counter').textContent = 'Iteration #0'; + + // Build URL with params + let url = '/api/benchmark/start?benchmark_type=' + selectedBenchType; + + // Handle different modes + if (selectedMode === 'quick') 
{ + url += '&mode=fixed&duration=15&auto_scale=false'; + } else if (selectedMode === 'standard') { + url += '&mode=fixed&duration=60&auto_scale=false'; + } else if (selectedMode === 'stress-test') { + url += '&mode=stress&duration=60&auto_scale=true'; + } else if (selectedMode === 'extended') { + url += '&mode=fixed&duration=180&auto_scale=false'; + } else if (selectedMode === 'custom') { + url += '&mode=custom&auto_scale=false'; + url += '&duration=' + document.getElementById('custom-duration-val').value; + url += '&temp_limit=' + document.getElementById('custom-temp-val').value; + url += '&memory_limit=' + document.getElementById('custom-memory-val').value; + url += '&power_limit=' + document.getElementById('custom-power-val').value; + if (selectedBenchType === 'gemm') { + url += '&matrix_size=' + document.getElementById('custom-matrix-val').value; + } else if (selectedBenchType === 'particle') { + const particles = Math.round(parseFloat(document.getElementById('custom-particles-val').value) * 1000000); + url += '&num_particles=' + particles; + } + } + + // Initialize live charts + initLiveCharts(); + + try { + await fetch(url, { method: 'POST' }); + benchmarkPollInterval = setInterval(pollBenchmarkStatus, 500); + } catch (error) { + console.error('Error starting benchmark:', error); + btn.disabled = false; + btn.textContent = 'Start Benchmark'; + stopBtn.style.display = 'none'; + } +} + +async function stopBenchmark() { + try { + await fetch('/api/benchmark/stop', { method: 'POST' }); + } catch (error) { + console.error('Error stopping benchmark:', error); + } +} + +function createSmallChart(canvasId, color, maxY = null) { + const ctx = document.getElementById(canvasId).getContext('2d'); + return new Chart(ctx, { + type: 'line', + data: { labels: [], datasets: [{ data: [], borderColor: color, backgroundColor: color + '20', fill: true, tension: 0.3, pointRadius: 0 }] }, + options: { + responsive: true, + plugins: { legend: { display: false } }, + scales: { + x: { 
display: false }, + y: { min: 0, max: maxY, ticks: { color: '#a0a0a0' }, grid: { color: '#4a4a4a' } } + } + } + }); +} + +function initLiveCharts() { + Object.values(benchCharts).forEach(c => c.destroy()); + benchCharts = { + utilization: createSmallChart('chartUtilization', '#76b900', 100), + temperature: createSmallChart('chartTemperature', '#ffc107', 100), + memory: createSmallChart('chartMemory', '#00a0ff'), + power: createSmallChart('chartPower', '#dc3545') + }; +} + +async function pollBenchmarkStatus() { + try { + const [statusRes, samplesRes] = await Promise.all([ + fetch('/api/benchmark/status'), + fetch('/api/benchmark/samples') + ]); + const status = await statusRes.json(); + const samplesData = await samplesRes.json(); + + document.getElementById('bench-progress-bar').style.width = status.progress + '%'; + document.getElementById('bench-percent').textContent = status.progress + '%'; + document.getElementById('iteration-counter').textContent = 'Iteration #' + (status.iterations || 0); + document.getElementById('workload-info').textContent = 'Workload: ' + (status.workload_type || 'N/A'); + document.getElementById('bench-workload').textContent = status.workload_type || ''; + + // Update live charts with samples + if (samplesData.samples && benchCharts.utilization) { + const samples = samplesData.samples; + const labels = samples.map(s => s.elapsed_sec + 's'); + + benchCharts.utilization.data.labels = labels; + benchCharts.utilization.data.datasets[0].data = samples.map(s => s.utilization || 0); + benchCharts.utilization.update('none'); + + benchCharts.temperature.data.labels = labels; + benchCharts.temperature.data.datasets[0].data = samples.map(s => s.temperature_c || 0); + benchCharts.temperature.update('none'); + + benchCharts.memory.data.labels = labels; + benchCharts.memory.data.datasets[0].data = samples.map(s => s.memory_used_mb || 0); + benchCharts.memory.update('none'); + + benchCharts.power.data.labels = labels; + 
benchCharts.power.data.datasets[0].data = samples.map(s => s.power_w || 0); + benchCharts.power.update('none'); + } + + if (!status.running) { + clearInterval(benchmarkPollInterval); + document.getElementById('start-bench-btn').disabled = false; + document.getElementById('start-bench-btn').textContent = 'Start Benchmark'; + document.getElementById('stop-bench-btn').style.display = 'none'; + document.getElementById('bench-status').textContent = 'Completed'; + + const resultsResponse = await fetch('/api/benchmark/results'); + const results = await resultsResponse.json(); + displayBenchmarkResults(results); + + // Reload baseline if saved + loadBaseline(); + } + } catch (error) { + console.error('Error polling benchmark:', error); + } +} + +function displayBenchmarkResults(results) { + if (!results || results.status === 'no_results') { + document.getElementById('benchmark-results').innerHTML = '

No results available

'; + return; + } + + const gpuInfo = results.gpu_info || {}; + const config = results.config || {}; + const scores = results.scores || {}; + const baseline = results.baseline; + const perf = results.performance || {}; + + // Show stop reason if benchmark was stopped early + if (results.stop_reason && results.stop_reason !== 'Duration completed') { + document.getElementById('bench-stop-reason').textContent = 'Stopped: ' + results.stop_reason; + } + + // Baseline comparison + let baselineComparison = ''; + if (baseline) { + const iterDiff = results.iterations_completed - baseline.iterations_completed; + const iterPct = ((iterDiff / baseline.iterations_completed) * 100).toFixed(1); + const iterColor = iterDiff >= 0 ? 'var(--accent-green)' : 'var(--accent-red)'; + baselineComparison = ` +
+

vs Baseline

+
+ Iterations + ${iterDiff >= 0 ? '+' : ''}${iterDiff} (${iterPct}%) +
+
+ Baseline Iterations + ${baseline.iterations_completed} +
+
+ `; + } + + // Performance results section - varies by benchmark type + let perfMetrics = ''; + if (results.benchmark_type === 'gemm' || perf.tflops !== undefined) { + perfMetrics = ` +
+ ${perf.tflops || 0} + TFLOPS +
+
+ ${perf.gflops || 0} + GFLOPS +
+
+ ${results.iterations_completed || 0} + iterations +
+ `; + } else if (results.benchmark_type === 'particle' || perf.steps_per_second !== undefined) { + const particlesPerSec = perf.particles_updated_per_second || 0; + const formattedParticles = particlesPerSec >= 1e9 + ? (particlesPerSec / 1e9).toFixed(2) + 'B' + : particlesPerSec >= 1e6 + ? (particlesPerSec / 1e6).toFixed(2) + 'M' + : particlesPerSec.toLocaleString(); + perfMetrics = ` +
+ ${(perf.steps_per_second || 0).toLocaleString()} + steps/sec +
+
+ ${formattedParticles} + particles/sec +
+
+ ${(perf.total_steps || 0).toLocaleString()} + total steps +
+ `; + } else { + perfMetrics = ` +
+ ${results.iterations_completed || 0} + iterations +
+
+ ${results.avg_iteration_time_ms || 0} + ms/iter +
+
+ ${results.iterations_per_second || 0} + iter/sec +
+ `; + } + + let html = ` +
+

GPU Info

+
Name${gpuInfo.name || 'N/A'}
+
Memory${gpuInfo.memory_total_mb ? Math.round(gpuInfo.memory_total_mb) + ' MB' : 'N/A'}
+
Driver${gpuInfo.driver_version || 'N/A'}
+
+ +
+

Performance Results

+
+ ${perfMetrics} +
+ ${results.saved_as_baseline ? '

Saved as new baseline

' : ''} +
+ + ${baselineComparison} + +
+

Test Config

+
Workload${results.workload_type || 'N/A'}
+
Mode${config.mode || 'N/A'}
+
Type${config.benchmark_type || 'N/A'}
+
Duration${results.duration_actual_sec || 0}s / ${config.duration_seconds || 0}s
+
Stop Reason${results.stop_reason || 'N/A'}
+
+ +
+

Scores

+
Stability${scores.stability || 0}/100
+
Thermal${scores.thermal || 0}/100
+
Performance${scores.performance || 0}/100
+
Overall${scores.overall || 0}/100
+
+ +

Metrics Summary

+ `; + + const metrics = [ + { key: 'utilization', label: 'Utilization', unit: '%' }, + { key: 'temperature_c', label: 'Temperature', unit: 'C' }, + { key: 'memory_used_mb', label: 'Memory Used', unit: 'MB' }, + { key: 'power_w', label: 'Power Draw', unit: 'W' } + ]; + + metrics.forEach(m => { + const data = results[m.key]; + if (data) { + html += ` +
+
+ ${m.label} +
+
Min${data.min} ${m.unit}
+
Avg${data.avg} ${m.unit}
+
Max${data.max} ${m.unit}
+
+ `; + } + }); + + html += `

Benchmark completed at ${new Date(results.timestamp).toLocaleString()}

`; + + document.getElementById('benchmark-results').innerHTML = html; +} + +async function checkForUpdates() { + const btn = document.getElementById('update-btn'); + btn.disabled = true; + btn.textContent = 'Checking...'; + btn.removeAttribute('data-tooltip'); + + try { + const response = await fetch('/api/update/check', { method: 'POST' }); + const data = await response.json(); + + if (data.available) { + btn.textContent = `Update: ${data.latest}`; + btn.classList.remove('success', 'error'); + btn.disabled = false; + btn.setAttribute('data-tooltip', `Current: ${data.current} → Latest: ${data.latest}`); + + btn.onclick = async () => { + btn.textContent = 'Installing...'; + btn.disabled = true; + const install = await fetch('/api/update/install', { method: 'POST' }); + const result = await install.json(); + + if (result.status === 'success') { + btn.textContent = '✓ Restart App'; + btn.classList.add('success'); + btn.setAttribute('data-tooltip', 'Update installed - restart application'); + } else { + btn.textContent = '✗ Update Failed'; + btn.classList.add('error'); + btn.setAttribute('data-tooltip', result.message); + btn.disabled = false; + } + }; + } else if (data.error) { + btn.textContent = '✗ Check Failed'; + btn.classList.add('error'); + btn.setAttribute('data-tooltip', data.error); + btn.disabled = false; + } else { + btn.textContent = '✓ Latest Version'; + btn.classList.add('success'); + btn.setAttribute('data-tooltip', `Version ${data.current}`); + setTimeout(() => { + btn.textContent = 'Check for Updates'; + btn.classList.remove('success'); + btn.disabled = false; + btn.removeAttribute('data-tooltip'); + }, 3000); + } + } catch (error) { + btn.textContent = '✗ Network Error'; + btn.classList.add('error'); + btn.setAttribute('data-tooltip', 'Could not connect to update server'); + btn.disabled = false; + } +} + +function tick() { + countdown--; + document.getElementById('countdown').textContent = countdown; + if (countdown <= 0) { + countdown = 5; + 
fetchStatus(); + + // Auto-refresh active tab content + const activeTab = document.querySelector('.tab-content.active'); + if (activeTab) { + const tabId = activeTab.id; + if (tabId === 'processes-tab') { + loadProcesses(); + } else if (tabId === 'history-tab') { + const activeChart = document.querySelector('.chart-tab.active'); + if (activeChart) { + loadHistory(); + } + } + } + } +} + +async function loadFeatures() { + try { + const response = await fetch('/api/features'); + const features = await response.json(); + + // Disable benchmark controls if GPU benchmark not available + if (!features.gpu_benchmark) { + const benchTab = document.querySelector('[data-tab="benchmark"]'); + const startBtn = document.getElementById('start-bench-btn'); + const typeButtons = document.querySelectorAll('.type-btn'); + const modeButtons = document.querySelectorAll('.mode-btn'); + + if (benchTab) { + benchTab.classList.add('disabled'); + benchTab.setAttribute('data-tooltip', 'Install CuPy or PyTorch for GPU benchmarking'); + benchTab.style.pointerEvents = 'auto'; + } + + if (startBtn) { + startBtn.disabled = true; + startBtn.style.opacity = '0.5'; + startBtn.style.cursor = 'not-allowed'; + startBtn.title = 'GPU benchmark libraries not installed'; + } + + typeButtons.forEach(btn => { + btn.disabled = true; + btn.style.opacity = '0.5'; + btn.style.cursor = 'not-allowed'; + }); + + modeButtons.forEach(btn => { + btn.disabled = true; + btn.style.opacity = '0.5'; + btn.style.cursor = 'not-allowed'; + }); + } + } catch (error) { + console.error('Error loading features:', error); + } +} + +fetchStatus(); +loadBaseline(); +loadFeatures(); +setInterval(tick, 1000); diff --git a/monitor/api/static/style.css b/monitor/api/static/style.css new file mode 100644 index 0000000..0100125 --- /dev/null +++ b/monitor/api/static/style.css @@ -0,0 +1,374 @@ +@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@300;400;600;700&display=swap'); + +:root { + --bg-primary: #1a1a1a; + 
--bg-secondary: #2a2a2a; + --bg-tertiary: #3a3a3a; + --text-primary: #f0f0f0; + --text-secondary: #a0a0a0; + --accent-green: #76b900; + --accent-blue: #00a0ff; + --accent-yellow: #ffc107; + --accent-red: #dc3545; + --border-color: #4a4a4a; +} + +* { margin: 0; padding: 0; box-sizing: border-box; } + +body { + font-family: 'JetBrains Mono', 'Consolas', 'Monaco', monospace; + background: var(--bg-primary); + color: var(--text-primary); + line-height: 1.6; +} + +.container { max-width: 1400px; margin: 0 auto; padding: 20px; } + +header { + background: var(--accent-green); + padding: 20px 30px; + border-radius: 12px; + margin-bottom: 20px; + display: flex; + justify-content: space-between; + align-items: center; +} + +header h1 { + font-size: 1.5em; + color: #000; +} + +.header-right { + display: flex; + gap: 15px; + align-items: center; +} + +.update-btn { + padding: 8px 16px; + background: var(--accent-blue); + border: none; + border-radius: 6px; + color: white; + cursor: pointer; + font-weight: bold; + transition: all 0.2s; + position: relative; +} + +.update-btn:hover { + opacity: 0.8; +} + +.update-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.update-btn.success { + background: var(--accent-green); + cursor: default; +} + +.update-btn.error { + background: var(--accent-red); +} + +.update-btn::before { + content: attr(data-tooltip); + position: absolute; + bottom: 100%; + right: 0; + background: rgba(0, 0, 0, 0.9); + color: white; + padding: 8px 12px; + border-radius: 6px; + font-size: 0.85em; + white-space: nowrap; + opacity: 0; + pointer-events: none; + transition: opacity 0.2s; + margin-bottom: 5px; +} + +.update-btn:hover::before { + opacity: 1; +} + +.tab { + padding: 10px 20px; + background: var(--bg-secondary); + border: 1px solid var(--border-color); + border-radius: 8px 8px 0 0; + cursor: pointer; + color: var(--text-secondary); + transition: all 0.2s; + position: relative; +} + +.tab:hover { background: var(--bg-tertiary); } +.tab.active { 
+ background: var(--accent-green); + color: #000; + border-color: var(--accent-green); +} + +.tab.disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.tab.disabled::after { + content: attr(data-tooltip); + position: absolute; + bottom: 100%; + left: 50%; + transform: translateX(-50%); + background: rgba(0, 0, 0, 0.9); + color: white; + padding: 8px 12px; + border-radius: 6px; + font-size: 0.85em; + white-space: nowrap; + opacity: 0; + pointer-events: none; + transition: opacity 0.2s; + margin-bottom: 5px; +} + +.tab.disabled:hover::after { + opacity: 1; +} + +.status-badge { + padding: 6px 16px; + border-radius: 20px; + font-weight: bold; + text-transform: uppercase; + font-size: 0.85em; + color: #000; + position: relative; + cursor: default; +} + +.status-badge::before { + content: attr(data-tooltip); + position: absolute; + bottom: 100%; + right: 0; + background: rgba(0, 0, 0, 0.9); + color: white; + padding: 8px 12px; + border-radius: 6px; + font-size: 0.85em; + white-space: nowrap; + opacity: 0; + pointer-events: none; + transition: opacity 0.2s; + margin-bottom: 5px; +} + +.status-badge:hover::before { + opacity: 1; +} + +.status-healthy { background: var(--accent-blue); } +.status-info { background: var(--accent-green); } +.status-warning { background: var(--accent-yellow); } + +/* Tabs */ +.tabs { + display: flex; + gap: 5px; + margin-bottom: 20px; + border-bottom: 2px solid var(--border-color); + padding-bottom: 10px; +} + +.tab { + padding: 10px 20px; + background: var(--bg-secondary); + border: 1px solid var(--border-color); + border-radius: 8px 8px 0 0; + cursor: pointer; + color: var(--text-secondary); + transition: all 0.2s; +} + +.tab:hover { background: var(--bg-tertiary); } +.tab.active { + background: var(--accent-green); + color: #000; + border-color: var(--accent-green); +} + +.tab-content { display: none; } +.tab-content.active { display: block; } + +/* Cards */ +.grid { + display: grid; + grid-template-columns: repeat(auto-fit, 
minmax(300px, 1fr)); + gap: 20px; + margin-bottom: 20px; +} + +.card { + background: var(--bg-secondary); + border: 1px solid var(--border-color); + border-radius: 12px; + padding: 20px; +} + +.card h2 { + color: var(--accent-green); + margin-bottom: 15px; + font-size: 1.1em; +} + +.gpu-card { + background: var(--bg-tertiary); + border-radius: 8px; + padding: 15px; + margin-bottom: 10px; +} + +.gpu-header { + display: flex; + justify-content: space-between; + margin-bottom: 10px; +} + +.gpu-name { font-weight: bold; } +.gpu-temp { color: var(--accent-yellow); } +.gpu-temp.hot { color: var(--accent-red); } + +.progress-bar { + background: var(--bg-secondary); + border-radius: 4px; + height: 8px; + margin: 8px 0; + overflow: hidden; +} + +.progress-fill { + height: 100%; + background: var(--accent-green); + transition: width 0.3s ease; +} + +.progress-fill.warn { background: var(--accent-yellow); } +.progress-fill.crit { background: var(--accent-red); } + +.metric-row { + display: flex; + justify-content: space-between; + padding: 5px 0; + border-bottom: 1px solid var(--border-color); +} + +.metric-label { color: var(--text-secondary); font-size: 0.9em; } +.metric-value { font-weight: bold; } + +/* Process table */ +.process-table { + width: 100%; + border-collapse: collapse; + font-size: 0.9em; +} + +.process-table th, .process-table td { + padding: 10px; + text-align: left; + border-bottom: 1px solid var(--border-color); +} + +.process-table th { + background: var(--bg-tertiary); + color: var(--accent-green); +} + +.process-table tr:hover { background: var(--bg-tertiary); } + +/* Chart */ +.chart-container { + background: var(--bg-secondary); + border-radius: 12px; + padding: 20px; + margin-bottom: 20px; +} + +.chart-controls { + display: flex; + gap: 10px; + margin-bottom: 15px; + flex-wrap: wrap; +} + +.chart-controls select, .chart-controls button { + padding: 8px 15px; + background: var(--bg-tertiary); + border: 1px solid var(--border-color); + border-radius: 
6px; + color: var(--text-primary); + cursor: pointer; +} + +.chart-controls button:hover { + background: var(--accent-green); + color: #000; +} + +/* Export */ +.export-section { + display: flex; + gap: 15px; + flex-wrap: wrap; +} + +.export-btn { + padding: 12px 25px; + background: var(--accent-green); + border: none; + border-radius: 8px; + color: #000; + cursor: pointer; + font-size: 1em; + font-weight: bold; +} + +.export-btn:hover { opacity: 0.9; } +.export-btn.secondary { + background: var(--bg-tertiary); + border: 1px solid var(--border-color); + color: var(--text-primary); +} + +footer { + text-align: center; + padding: 20px; + color: var(--text-secondary); + font-size: 0.9em; +} + +.alert-item { + background: var(--bg-tertiary); + border-left: 4px solid var(--accent-yellow); + padding: 10px 15px; + margin-bottom: 10px; + border-radius: 0 8px 8px 0; +} + +.mode-btn, .type-btn { + padding: 10px 20px; + background: var(--bg-tertiary); + border: 1px solid var(--border-color); + border-radius: 6px; + color: var(--text-primary); + cursor: pointer; + transition: all 0.2s; +} + +.mode-btn:hover, .type-btn:hover { background: var(--bg-secondary); border-color: var(--accent-green); } +.mode-btn.active, .type-btn.active { background: var(--accent-green); color: #000; border-color: var(--accent-green); } diff --git a/monitor/api/templates/benchmark_cli.py b/monitor/api/templates/benchmark_cli.py deleted file mode 100644 index e69de29..0000000 diff --git a/monitor/api/templates/index.html b/monitor/api/templates/index.html index 8c6759f..954d22c 100644 --- a/monitor/api/templates/index.html +++ b/monitor/api/templates/index.html @@ -4,257 +4,10 @@ Cluster Health Monitor + + + -
@@ -263,7 +16,10 @@

Cluster Health Monitor

Auto-refresh: 5s
-
HEALTHY
+
+ +
HEALTHY
+
@@ -320,6 +76,15 @@

Alerts

GPU Processes

+ @@ -327,7 +92,7 @@

GPU Processes

- + @@ -483,641 +248,6 @@

Pow - + \ No newline at end of file diff --git a/monitor/benchmark/gpu_bench.py.old b/monitor/benchmark/gpu_bench.py.old deleted file mode 100644 index d52ba02..0000000 --- a/monitor/benchmark/gpu_bench.py.old +++ /dev/null @@ -1,749 +0,0 @@ -"""GPU Benchmark with GEMM and Particle Simulation stress tests. - -Uses standard GPU libraries (cupy/torch) for stable, production-ready stress testing. -No JIT compilation - simple vectorized operations for maximum GPU load. -""" - -import time -import subprocess -import json -import sqlite3 -import threading -import math -from typing import Dict, Any, List, Optional -from datetime import datetime -from dataclasses import dataclass -from pathlib import Path - - -@dataclass -class BenchmarkConfig: - mode: str = "fixed" # fixed, stress, or adaptive - benchmark_type: str = "gemm" # "gemm" or "particle" - duration_seconds: int = 30 - memory_limit_mb: int = 0 # 0 = no limit - temp_limit_c: int = 85 - power_limit_w: int = 0 # 0 = no limit - sample_interval_ms: int = 500 - # GEMM-specific settings - matrix_size: int = 2048 # Reduced for stability - # Particle-specific settings - num_particles: int = 100000 # 100k particles default - # Auto-scaling settings - auto_scale: bool = False # Automatically scale workload to reach target utilization - target_gpu_util: int = 98 # Target GPU utilization % - - @classmethod - def from_mode(cls, mode: str, benchmark_type: str = "gemm") -> 'BenchmarkConfig': - presets = { - 'quick': cls(mode='quick', benchmark_type=benchmark_type, duration_seconds=15, temp_limit_c=85, sample_interval_ms=500), - 'standard': cls(mode='standard', benchmark_type=benchmark_type, duration_seconds=60, temp_limit_c=85, sample_interval_ms=500), - 'stress': cls(mode='stress', benchmark_type=benchmark_type, duration_seconds=180, temp_limit_c=92, sample_interval_ms=250), - } - return presets.get(mode, cls(mode='standard', benchmark_type=benchmark_type)) - - @classmethod - def custom(cls, duration: int, temp_limit: int, 
memory_limit: int = 0, power_limit: int = 0, - benchmark_type: str = "gemm", matrix_size: int = 2048, - num_particles: int = 100000) -> 'BenchmarkConfig': - return cls( - mode='custom', - benchmark_type=benchmark_type, - duration_seconds=duration, - temp_limit_c=temp_limit, - memory_limit_mb=memory_limit, - power_limit_w=power_limit, - sample_interval_ms=500, - matrix_size=matrix_size, - num_particles=num_particles - ) - - -class BaselineStorage: - """Storage for benchmark baseline results.""" - - def __init__(self, db_path: str = './metrics.db'): - self.db_path = Path(db_path) - self._ensure_table() - - def _ensure_table(self): - conn = sqlite3.connect(str(self.db_path)) - - # Check if table exists and has old schema - cursor = conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' AND name='benchmark_baseline'" - ) - table_exists = cursor.fetchone() is not None - - if table_exists: - # Check if benchmark_type column exists - cursor = conn.execute("PRAGMA table_info(benchmark_baseline)") - columns = [row[1] for row in cursor.fetchall()] - - if 'benchmark_type' not in columns: - # Migrate old table - drop and recreate - conn.execute('DROP TABLE IF EXISTS benchmark_baseline') - - # Create table with new schema - conn.execute(''' - CREATE TABLE IF NOT EXISTS benchmark_baseline ( - gpu_name TEXT NOT NULL, - benchmark_type TEXT NOT NULL, - timestamp TEXT NOT NULL, - iterations_completed INTEGER, - avg_iteration_time_ms REAL, - avg_utilization REAL, - avg_temperature REAL, - avg_power REAL, - avg_memory_used REAL, - results_json TEXT, - PRIMARY KEY (gpu_name, benchmark_type) - ) - ''') - conn.commit() - conn.close() - - def save_baseline(self, gpu_name: str, benchmark_type: str, results: Dict[str, Any]): - conn = sqlite3.connect(str(self.db_path)) - conn.execute(''' - INSERT OR REPLACE INTO benchmark_baseline - (gpu_name, benchmark_type, timestamp, iterations_completed, avg_iteration_time_ms, - avg_utilization, avg_temperature, avg_power, avg_memory_used, 
results_json) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ''', ( - gpu_name, - benchmark_type, - results.get('timestamp', datetime.now().isoformat()), - results.get('iterations_completed', 0), - results.get('avg_iteration_time_ms', 0), - results.get('utilization', {}).get('avg', 0), - results.get('temperature_c', {}).get('avg', 0), - results.get('power_w', {}).get('avg', 0), - results.get('memory_used_mb', {}).get('avg', 0), - json.dumps(results) - )) - conn.commit() - conn.close() - - def get_baseline(self, gpu_name: str, benchmark_type: str) -> Optional[Dict[str, Any]]: - conn = sqlite3.connect(str(self.db_path)) - conn.row_factory = sqlite3.Row - cursor = conn.execute( - 'SELECT * FROM benchmark_baseline WHERE gpu_name = ? AND benchmark_type = ?', - (gpu_name, benchmark_type) - ) - row = cursor.fetchone() - conn.close() - - if row: - return { - 'gpu_name': row['gpu_name'], - 'benchmark_type': row['benchmark_type'], - 'timestamp': row['timestamp'], - 'iterations_completed': row['iterations_completed'], - 'avg_iteration_time_ms': row['avg_iteration_time_ms'], - 'avg_utilization': row['avg_utilization'], - 'avg_temperature': row['avg_temperature'], - 'avg_power': row['avg_power'], - 'avg_memory_used': row['avg_memory_used'], - 'full_results': json.loads(row['results_json']) if row['results_json'] else None - } - return None - - -class GPUStressWorker: - """GPU stress workload - uses standard GPU libraries (cupy/torch).""" - - def __init__(self, benchmark_type: str = "gemm", config: Optional[BenchmarkConfig] = None): - self.iterations = 0 - self.benchmark_type = benchmark_type - self.config = config or BenchmarkConfig() - self.workload_type = "Detecting..." 
- self._method = None # 'cupy', 'torch', or 'passive' - self._initialized = False - - # Performance tracking - self.total_flops = 0.0 - self.total_steps = 0 - - # GPU state - self._gpu_arrays = {} - - self._detect_and_setup() - - def _detect_and_setup(self): - """Detect available GPU libraries and setup workload.""" - # Try cupy first - try: - import cupy as cp - self._method = 'cupy' - self._cp = cp - self._setup_cupy() - self._initialized = True - return - except ImportError: - pass - except Exception as e: - print(f"cupy failed: {e}") - - # Try torch - try: - import torch - if torch.cuda.is_available(): - self._method = 'torch' - self._torch = torch - self._setup_torch() - self._initialized = True - return - except ImportError: - pass - except Exception as e: - print(f"torch failed: {e}") - - # Fallback: passive monitoring - self._method = 'passive' - self.workload_type = "Passive Monitoring (cupy/torch not available - run your own GPU workload)" - - def _setup_cupy(self): - """Setup workload using cupy.""" - cp = self._cp - n = self.config.matrix_size if self.benchmark_type == "gemm" else self.config.num_particles - - if self.benchmark_type == "gemm": - self.workload_type = f"GEMM {n}x{n} (cupy)" - self._gpu_arrays['A'] = cp.random.rand(n, n, dtype=cp.float32) - self._gpu_arrays['B'] = cp.random.rand(n, n, dtype=cp.float32) - self._flops_per_iter = 2.0 * (n ** 3) - else: - self.workload_type = f"Particle Sim ({n:,} particles, cupy)" - self._gpu_arrays['x'] = cp.random.rand(n, dtype=cp.float32) * 1000.0 - self._gpu_arrays['y'] = cp.random.rand(n, dtype=cp.float32) * 1000.0 - self._gpu_arrays['vx'] = (cp.random.rand(n, dtype=cp.float32) - 0.5) * 10.0 - self._gpu_arrays['vy'] = (cp.random.rand(n, dtype=cp.float32) - 0.5) * 10.0 - - def _setup_torch(self): - """Setup workload using torch.""" - torch = self._torch - device = torch.device('cuda') - n = self.config.matrix_size if self.benchmark_type == "gemm" else self.config.num_particles - - if self.benchmark_type 
== "gemm": - self.workload_type = f"GEMM {n}x{n} (torch)" - self._gpu_arrays['A'] = torch.randn(n, n, device=device, dtype=torch.float32) - self._gpu_arrays['B'] = torch.randn(n, n, device=device, dtype=torch.float32) - self._flops_per_iter = 2.0 * (n ** 3) - else: - self.workload_type = f"Particle Sim ({n:,} particles, torch)" - self._gpu_arrays['x'] = torch.rand(n, device=device, dtype=torch.float32) * 1000.0 - self._gpu_arrays['y'] = torch.rand(n, device=device, dtype=torch.float32) * 1000.0 - self._gpu_arrays['vx'] = (torch.rand(n, device=device, dtype=torch.float32) - 0.5) * 10.0 - self._gpu_arrays['vy'] = (torch.rand(n, device=device, dtype=torch.float32) - 0.5) * 10.0 - - def run_iteration(self) -> float: - """Run one iteration. Returns time in ms.""" - start = time.perf_counter() - - if not self._initialized or self._method == 'passive': - # Passive mode - just sleep and count iterations - try: - subprocess.run( - ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader'], - capture_output=True, timeout=2 - ) - except Exception: - pass - time.sleep(0.05) - self.iterations += 1 - return (time.perf_counter() - start) * 1000 - - # Run GPU workload - if self.benchmark_type == "gemm": - self._run_gemm() - elif self.benchmark_type == "particle": - self._run_particle() - - self.iterations += 1 - return (time.perf_counter() - start) * 1000 - - def _run_gemm(self): - """Run GEMM (matrix multiply) workload.""" - if self._method == 'cupy': - A = self._gpu_arrays['A'] - B = self._gpu_arrays['B'] - C = self._cp.matmul(A, B) - self._cp.cuda.Stream.null.synchronize() - self.total_flops += self._flops_per_iter - elif self._method == 'torch': - A = self._gpu_arrays['A'] - B = self._gpu_arrays['B'] - C = self._torch.matmul(A, B) - self._torch.cuda.synchronize() - self.total_flops += self._flops_per_iter - - def _run_particle(self): - """Run particle simulation workload.""" - if self._method == 'cupy': - cp = self._cp - x = self._gpu_arrays['x'] - y = 
self._gpu_arrays['y'] - vx = self._gpu_arrays['vx'] - vy = self._gpu_arrays['vy'] - dt = 0.001 - - # Simple particle update: gravity, velocity, position, wall bounce - vy = vy + 9.81 * dt - x = x + vx * dt - y = y + vy * dt - - # Wall bouncing - mask_x_min = x < 0 - mask_x_max = x > 1000.0 - mask_y_min = y < 0 - mask_y_max = y > 1000.0 - - x[mask_x_min] = 0 - x[mask_x_max] = 1000.0 - vx[mask_x_min | mask_x_max] *= -0.8 - - y[mask_y_min] = 0 - y[mask_y_max] = 1000.0 - vy[mask_y_min | mask_y_max] *= -0.8 - - # Update arrays - self._gpu_arrays['x'] = x - self._gpu_arrays['y'] = y - self._gpu_arrays['vx'] = vx - self._gpu_arrays['vy'] = vy - - cp.cuda.Stream.null.synchronize() - self.total_steps += 1 - - elif self._method == 'torch': - torch = self._torch - x = self._gpu_arrays['x'] - y = self._gpu_arrays['y'] - vx = self._gpu_arrays['vx'] - vy = self._gpu_arrays['vy'] - dt = 0.001 - - # Simple particle update - vy = vy + 9.81 * dt - x = x + vx * dt - y = y + vy * dt - - # Wall bouncing - x = torch.clamp(x, 0, 1000.0) - y = torch.clamp(y, 0, 1000.0) - - vx[x <= 0] *= -0.8 - vx[x >= 1000.0] *= -0.8 - vy[y <= 0] *= -0.8 - vy[y >= 1000.0] *= -0.8 - - # Update arrays - self._gpu_arrays['x'] = x - self._gpu_arrays['y'] = y - self._gpu_arrays['vx'] = vx - self._gpu_arrays['vy'] = vy - - torch.cuda.synchronize() - self.total_steps += 1 - - def reset(self): - """Reset counters.""" - self.iterations = 0 - self.total_flops = 0.0 - self.total_steps = 0 - - def cleanup(self): - """Free GPU memory.""" - if self._method == 'cupy': - for key in list(self._gpu_arrays.keys()): - self._gpu_arrays[key] = None - elif self._method == 'torch': - for key in list(self._gpu_arrays.keys()): - if self._gpu_arrays[key] is not None: - del self._gpu_arrays[key] - self._gpu_arrays.clear() - - def scale_workload(self, scale_factor: float = 1.5): - """Scale workload size up (for auto-scaling stress test).""" - if not self._initialized or self._method == 'passive': - return - - if self.benchmark_type 
== "gemm": - # Increase matrix size - old_size = self.config.matrix_size - new_size = int(old_size * math.sqrt(scale_factor)) - self.config.matrix_size = new_size - - # Recreate arrays - if self._method == 'cupy': - cp = self._cp - self._gpu_arrays['A'] = cp.random.rand(new_size, new_size, dtype=cp.float32) - self._gpu_arrays['B'] = cp.random.rand(new_size, new_size, dtype=cp.float32) - self._flops_per_iter = 2.0 * (new_size ** 3) - elif self._method == 'torch': - torch = self._torch - device = torch.device('cuda') - self._gpu_arrays['A'] = torch.randn(new_size, new_size, device=device, dtype=torch.float32) - self._gpu_arrays['B'] = torch.randn(new_size, new_size, device=device, dtype=torch.float32) - self._flops_per_iter = 2.0 * (new_size ** 3) - - self.workload_type = f"GEMM {new_size}x{new_size} ({self._method})" - - elif self.benchmark_type == "particle": - # Increase particle count - old_count = self.config.num_particles - new_count = int(old_count * scale_factor) - self.config.num_particles = new_count - - # Recreate arrays - if self._method == 'cupy': - cp = self._cp - self._gpu_arrays['x'] = cp.random.rand(new_count, dtype=cp.float32) * 1000.0 - self._gpu_arrays['y'] = cp.random.rand(new_count, dtype=cp.float32) * 1000.0 - self._gpu_arrays['vx'] = (cp.random.rand(new_count, dtype=cp.float32) - 0.5) * 10.0 - self._gpu_arrays['vy'] = (cp.random.rand(new_count, dtype=cp.float32) - 0.5) * 10.0 - elif self._method == 'torch': - torch = self._torch - device = torch.device('cuda') - self._gpu_arrays['x'] = torch.rand(new_count, device=device, dtype=torch.float32) * 1000.0 - self._gpu_arrays['y'] = torch.rand(new_count, device=device, dtype=torch.float32) * 1000.0 - self._gpu_arrays['vx'] = (torch.rand(new_count, device=device, dtype=torch.float32) - 0.5) * 10.0 - self._gpu_arrays['vy'] = (torch.rand(new_count, device=device, dtype=torch.float32) - 0.5) * 10.0 - - self.workload_type = f"Particle Sim ({new_count:,} particles, {self._method})" - - def 
get_performance_stats(self, elapsed_seconds: float) -> Dict[str, Any]: - """Get performance statistics.""" - stats = { - 'iterations': self.iterations, - 'workload_type': self.workload_type, - } - - if self.benchmark_type == "gemm" and elapsed_seconds > 0: - tflops = (self.total_flops / elapsed_seconds) / 1e12 - stats['total_flops'] = self.total_flops - stats['tflops'] = round(tflops, 3) - stats['gflops'] = round(tflops * 1000, 2) - elif self.benchmark_type == "particle" and elapsed_seconds > 0: - stats['total_steps'] = self.total_steps - stats['steps_per_second'] = round(self.total_steps / elapsed_seconds, 2) - stats['particles_updated_per_second'] = round( - (self.total_steps * self.config.num_particles) / elapsed_seconds, 0 - ) - - return stats - - -class GPUBenchmark: - """GPU Benchmark with real-time monitoring and stress workload.""" - - def __init__(self, db_path: str = './metrics.db'): - self.running = False - self.should_stop = False - self.config: Optional[BenchmarkConfig] = None - self.samples: List[Dict[str, Any]] = [] - self.stop_reason: Optional[str] = None - self.start_time: Optional[float] = None - self.progress = 0 - self.current_phase = "" - self.results: Dict[str, Any] = {} - self.baseline_storage = BaselineStorage(db_path) - self.stress_worker: Optional[GPUStressWorker] = None - self.iteration_times: List[float] = [] - self.completed_full = False - self.db_path = db_path - - def get_gpu_info(self) -> Dict[str, Any]: - """Get GPU information.""" - try: - result = subprocess.run( - ['nvidia-smi', '--query-gpu=name,memory.total,driver_version,pcie.link.gen.current,pcie.link.width.current', - '--format=csv,noheader,nounits'], - capture_output=True, text=True, timeout=10 - ) - - if result.returncode != 0: - return {'error': 'nvidia-smi failed'} - - parts = [p.strip() for p in result.stdout.strip().split(',')] - return { - 'name': parts[0], - 'memory_total_mb': float(parts[1]), - 'driver_version': parts[2], - 'pcie_gen': parts[3], - 'pcie_width': 
parts[4], - } - except Exception as e: - return {'error': str(e)} - - def sample_metrics(self) -> Dict[str, Any]: - """Collect a single sample of GPU metrics.""" - try: - result = subprocess.run( - ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw', - '--format=csv,noheader,nounits'], - capture_output=True, text=True, timeout=5 - ) - - if result.returncode != 0: - return {'error': 'nvidia-smi failed'} - - parts = [p.strip() for p in result.stdout.strip().split(',')] - - return { - 'timestamp': time.time(), - 'utilization': float(parts[0]) if parts[0] != '[N/A]' else 0, - 'memory_used_mb': float(parts[1]) if parts[1] != '[N/A]' else 0, - 'memory_total_mb': float(parts[2]) if parts[2] != '[N/A]' else 0, - 'temperature_c': float(parts[3]) if parts[3] != '[N/A]' else 0, - 'power_w': float(parts[4]) if parts[4] != '[N/A]' else 0, - } - except Exception as e: - return {'error': str(e), 'timestamp': time.time()} - - def check_stop_conditions(self, sample: Dict[str, Any]) -> Optional[str]: - """Check if any stop condition is met.""" - if self.should_stop: - return "User stopped" - - if 'error' in sample: - return f"GPU error: {sample['error']}" - - if self.config.temp_limit_c > 0 and sample.get('temperature_c', 0) >= self.config.temp_limit_c: - return f"Temperature limit reached ({sample['temperature_c']}C >= {self.config.temp_limit_c}C)" - - if self.config.power_limit_w > 0 and sample.get('power_w', 0) >= self.config.power_limit_w: - return f"Power limit reached ({sample['power_w']}W >= {self.config.power_limit_w}W)" - - if self.config.memory_limit_mb > 0 and sample.get('memory_used_mb', 0) >= self.config.memory_limit_mb: - return f"Memory limit reached ({sample['memory_used_mb']}MB >= {self.config.memory_limit_mb}MB)" - - return None - - def run_stress_benchmark(self) -> Dict[str, Any]: - """Run the stress benchmark with real GPU workload.""" - self.current_phase = "Running GPU Stress Test" - self.samples = [] - 
self.iteration_times = [] - self.stress_worker.reset() - self.start_time = time.time() - self.completed_full = False - - sample_interval = self.config.sample_interval_ms / 1000.0 - last_sample_time = 0 - last_scale_check = 0 - scale_interval = 2.0 # Check for scaling every 2 seconds - scale_count = 0 - max_scales = 5 # Limit scaling attempts - - while True: - elapsed = time.time() - self.start_time - - # Check duration - if elapsed >= self.config.duration_seconds: - self.stop_reason = "Duration completed" - self.completed_full = True - break - - # Run one iteration of stress work - iter_time = self.stress_worker.run_iteration() - self.iteration_times.append(iter_time) - - # Sample metrics periodically - if elapsed - last_sample_time >= sample_interval: - sample = self.sample_metrics() - sample['elapsed_sec'] = round(elapsed, 2) - sample['iterations'] = self.stress_worker.iterations - sample['last_iter_ms'] = round(iter_time, 2) - self.samples.append(sample) - last_sample_time = elapsed - - # Auto-scaling logic: check if we need to increase workload - if self.config.auto_scale and elapsed - last_scale_check >= scale_interval: - gpu_util = sample.get('utilization', 0) - if gpu_util < 93 and scale_count < max_scales: - print(f"[Auto-Scale] GPU util {gpu_util}% < target, scaling up workload...") - self.stress_worker.scale_workload(1.5) - scale_count += 1 - # Update workload type in current phase - self.current_phase = f"Auto-Scaling: {self.stress_worker.workload_type}" - last_scale_check = elapsed - - # Check stop conditions - stop = self.check_stop_conditions(sample) - if stop: - self.stop_reason = stop - break - - # Update progress - self.progress = int((elapsed / self.config.duration_seconds) * 100) - - return self._calculate_results() - - def _calculate_results(self) -> Dict[str, Any]: - """Calculate benchmark results from samples.""" - if not self.samples: - return {'error': 'No samples collected'} - - valid_samples = [s for s in self.samples if 'error' not in s] 
- - if not valid_samples: - return {'error': 'All samples had errors'} - - def calc_stats(key: str) -> Dict[str, float]: - values = [s.get(key, 0) for s in valid_samples] - return { - 'min': round(min(values), 2), - 'max': round(max(values), 2), - 'avg': round(sum(values) / len(values), 2), - } - - avg_iter_time = sum(self.iteration_times) / len(self.iteration_times) if self.iteration_times else 0 - elapsed_sec = time.time() - self.start_time - - results = { - 'duration_actual_sec': round(elapsed_sec, 2), - 'samples_collected': len(valid_samples), - 'stop_reason': self.stop_reason, - 'completed_full': self.completed_full, - 'workload_type': self.stress_worker.workload_type, - 'benchmark_type': self.config.benchmark_type, - 'iterations_completed': self.stress_worker.iterations, - 'avg_iteration_time_ms': round(avg_iter_time, 2), - 'iterations_per_second': round(1000 / avg_iter_time, 2) if avg_iter_time > 0 else 0, - 'utilization': calc_stats('utilization'), - 'memory_used_mb': calc_stats('memory_used_mb'), - 'temperature_c': calc_stats('temperature_c'), - 'power_w': calc_stats('power_w'), - } - - # Add performance stats from stress worker - perf_stats = self.stress_worker.get_performance_stats(elapsed_sec) - results['performance'] = perf_stats - - # Calculate scores based on performance - temp_range = results['temperature_c']['max'] - results['temperature_c']['min'] - stability_score = max(0, 100 - int(temp_range * 5)) - thermal_score = max(0, min(100, int((90 - results['temperature_c']['max']) * 5))) - - # Performance score based on benchmark type - if self.config.benchmark_type == 'gemm': - # Score based on TFLOPS (scale: 1 TFLOPS = 10 points, max 100) - tflops = perf_stats.get('tflops', 0) - perf_score = min(100, int(tflops * 10)) - elif self.config.benchmark_type == 'particle': - # Score based on steps per second (scale: 1M steps/sec = 10 points) - sps = perf_stats.get('steps_per_second', 0) - perf_score = min(100, int(sps / 100000)) - else: - perf_score = 
min(100, int(results['iterations_completed'] / 10)) - - results['scores'] = { - 'stability': stability_score, - 'thermal': thermal_score, - 'performance': perf_score, - 'overall': (stability_score + thermal_score + perf_score) // 3 - } - - return results - - def start(self, config: BenchmarkConfig) -> None: - """Start benchmark with given configuration.""" - self.config = config - self.running = True - self.should_stop = False - self.stop_reason = None - self.progress = 0 - self.samples = [] - self.completed_full = False - - # Initialize stress worker with benchmark type from config - self.stress_worker = GPUStressWorker( - benchmark_type=config.benchmark_type, - config=config - ) - - try: - gpu_info = self.get_gpu_info() - - self.results = { - 'timestamp': datetime.now().isoformat(), - 'config': { - 'mode': config.mode, - 'benchmark_type': config.benchmark_type, - 'duration_seconds': config.duration_seconds, - 'temp_limit_c': config.temp_limit_c, - 'power_limit_w': config.power_limit_w, - 'memory_limit_mb': config.memory_limit_mb, - 'matrix_size': config.matrix_size if config.benchmark_type == 'gemm' else None, - 'num_particles': config.num_particles if config.benchmark_type == 'particle' else None, - }, - 'gpu_info': gpu_info, - 'status': 'running', - } - - # Get baseline for comparison (benchmark-type specific) - if 'name' in gpu_info: - baseline = self.baseline_storage.get_baseline(gpu_info['name'], config.benchmark_type) - if baseline: - self.results['baseline'] = baseline - - results = self.run_stress_benchmark() - self.results.update(results) - self.results['status'] = 'completed' - - # Save as baseline only if completed fully - if self.completed_full and 'name' in gpu_info: - self.baseline_storage.save_baseline(gpu_info['name'], config.benchmark_type, self.results) - self.results['saved_as_baseline'] = True - - except Exception as e: - self.results['status'] = 'failed' - self.results['error'] = str(e) - finally: - self.running = False - self.progress = 100 
- - def stop(self) -> None: - """Stop the benchmark.""" - self.should_stop = True - - def get_status(self) -> Dict[str, Any]: - """Get current benchmark status.""" - return { - 'running': self.running, - 'progress': self.progress, - 'phase': self.current_phase, - 'samples_count': len(self.samples), - 'iterations': self.stress_worker.iterations if self.stress_worker else 0, - 'workload_type': self.stress_worker.workload_type if self.stress_worker else 'N/A', - 'latest_sample': self.samples[-1] if self.samples else None, - } - - def get_samples(self) -> List[Dict[str, Any]]: - """Get all collected samples for real-time graphing.""" - return self.samples.copy() - - def get_results(self) -> Dict[str, Any]: - """Get benchmark results.""" - return self.results - - def get_baseline(self, benchmark_type: str = "gemm") -> Optional[Dict[str, Any]]: - """Get stored baseline for current GPU and benchmark type.""" - gpu_info = self.get_gpu_info() - if 'name' in gpu_info: - return self.baseline_storage.get_baseline(gpu_info['name'], benchmark_type) - return None - - -# Global instance -_benchmark: Optional[GPUBenchmark] = None - -def get_benchmark_instance() -> GPUBenchmark: - global _benchmark - if _benchmark is None: - _benchmark = GPUBenchmark() - return _benchmark diff --git a/monitor/benchmark/workloads.py b/monitor/benchmark/workloads.py index 7bc118f..6e15bb6 100644 --- a/monitor/benchmark/workloads.py +++ b/monitor/benchmark/workloads.py @@ -35,8 +35,8 @@ def _detect_and_setup(self): return except ImportError: pass - except Exception as e: - print(f"cupy failed: {e}") + except Exception: + pass # Try torch try: @@ -49,8 +49,8 @@ def _detect_and_setup(self): return except ImportError: pass - except Exception as e: - print(f"torch failed: {e}") + except Exception: + pass # Fallback: passive monitoring self._method = 'passive' diff --git a/monitor/collectors/gpu.py b/monitor/collectors/gpu.py index dfa0597..c822f50 100644 --- a/monitor/collectors/gpu.py +++ 
b/monitor/collectors/gpu.py @@ -2,6 +2,8 @@ import subprocess import os +import csv +import io from typing import List, Dict, Any try: @@ -43,9 +45,12 @@ def collect(self) -> List[Dict[str, Any]]: return self._collect_nvidia_smi() def collect_processes(self) -> List[Dict[str, Any]]: - """Get detailed process info for all GPUs.""" + """Get detailed process info for all GPUs with utilization.""" + # Try to get utilization data from nvidia-smi accounting mode + utilization_map = self._get_process_utilization() + if not self.nvml_initialized: - return self._collect_processes_nvidia_smi() + return self._collect_processes_nvidia_smi(utilization_map) processes = [] try: @@ -62,6 +67,7 @@ def collect_processes(self) -> List[Dict[str, Any]]: 'gpu_name': gpu_name, 'pid': proc.pid, 'gpu_memory_mb': proc.usedGpuMemory / (1024**2) if proc.usedGpuMemory else 0, + 'gpu_utilization': utilization_map.get(proc.pid, {}).get('gpu_util', None), 'name': 'Unknown', 'username': 'Unknown', } @@ -85,8 +91,46 @@ def collect_processes(self) -> List[Dict[str, Any]]: return processes - def _collect_processes_nvidia_smi(self) -> List[Dict[str, Any]]: + def _get_process_utilization(self) -> Dict[int, Dict[str, Any]]: + """Get per-process GPU utilization using nvidia-smi accounting mode. + + Note: This only works for CUDA/compute workloads, not graphics processes. 
+ Requires accounting mode to be enabled: nvidia-smi --accounting-mode=1 + """ + utilization_map = {} + + try: + # Try to query accounted apps (requires accounting mode enabled) + result = subprocess.run( + ['nvidia-smi', '--query-accounted-apps=pid,gpu_util,mem_util', + '--format=csv,noheader,nounits'], + capture_output=True, text=True, timeout=5 + ) + + if result.returncode == 0 and result.stdout.strip(): + reader = csv.reader(io.StringIO(result.stdout)) + for row in reader: + if len(row) >= 2: + try: + pid = int(row[0].strip()) + gpu_util = float(row[1].strip()) if row[1].strip() != '[N/A]' else None + utilization_map[pid] = { + 'gpu_util': gpu_util, + } + except (ValueError, IndexError): + continue + except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e: + # Accounting mode not available or nvidia-smi failed + # This is expected for graphics workloads or when accounting is disabled + pass + + return utilization_map + + def _collect_processes_nvidia_smi(self, utilization_map: Dict[int, Dict[str, Any]] = None) -> List[Dict[str, Any]]: """Fallback process collection via nvidia-smi.""" + if utilization_map is None: + utilization_map = {} + try: result = subprocess.run( ['nvidia-smi', '--query-compute-apps=gpu_uuid,pid,used_memory,process_name', @@ -103,11 +147,13 @@ def _collect_processes_nvidia_smi(self) -> List[Dict[str, Any]]: continue parts = [p.strip() for p in line.split(',')] if len(parts) >= 4: + pid = int(parts[1]) processes.append({ 'gpu_index': 0, - 'pid': int(parts[1]), + 'pid': pid, 'gpu_memory_mb': float(parts[2]) if parts[2] != '[N/A]' else 0, 'name': parts[3], + 'gpu_utilization': utilization_map.get(pid, {}).get('gpu_util', None), }) return processes except Exception: @@ -186,12 +232,15 @@ def _collect_nvidia_smi(self) -> List[Dict[str, Any]]: parts = [p.strip() for p in line.split(',')] if len(parts) >= 7: + mem_used = float(parts[3]) if parts[3] != '[N/A]' else 0 + mem_total = float(parts[4]) if parts[4] != '[N/A]' else 0 
gpus.append({ 'index': int(parts[0]), 'name': parts[1], 'utilization': int(parts[2]) if parts[2] != '[N/A]' else 0, - 'memory_used': float(parts[3]) if parts[3] != '[N/A]' else 0, - 'memory_total': float(parts[4]) if parts[4] != '[N/A]' else 0, + 'memory_used': mem_used, + 'memory_total': mem_total, + 'memory_free': mem_total - mem_used, 'temperature': int(parts[5]) if parts[5] != '[N/A]' else 0, 'power': float(parts[6]) if parts[6] != '[N/A]' else 0, }) diff --git a/monitor/utils/__init__.py b/monitor/utils/__init__.py new file mode 100644 index 0000000..e0738fc --- /dev/null +++ b/monitor/utils/__init__.py @@ -0,0 +1,6 @@ +"""Utility modules for cluster health monitor.""" + +from .features import detect_features, get_features, refresh_features +from .update import check_for_updates, perform_update + +__all__ = ['detect_features', 'get_features', 'refresh_features', 'check_for_updates', 'perform_update'] diff --git a/monitor/utils/features.py b/monitor/utils/features.py new file mode 100644 index 0000000..2ac71fd --- /dev/null +++ b/monitor/utils/features.py @@ -0,0 +1,79 @@ +"""Feature detection and caching system.""" + +import json +import os +from pathlib import Path +from typing import Dict + +CACHE_FILE = '.features_cache' + +def _detect_nvidia_smi() -> bool: + """Check if nvidia-smi is available.""" + try: + import subprocess + result = subprocess.run(['nvidia-smi', '--version'], + capture_output=True, timeout=5) + return result.returncode == 0 + except Exception: + return False + +def _detect_cupy() -> bool: + """Check if cupy is available.""" + try: + import cupy as cp + cp.cuda.Device(0).compute_capability + return True + except Exception: + return False + +def _detect_torch() -> bool: + """Check if torch with CUDA is available.""" + try: + import torch + return torch.cuda.is_available() + except Exception: + return False + +def detect_features(force: bool = False) -> Dict[str, bool]: + """ + Detect available features. Uses cache unless force=True. 
+ + Returns: + dict: {'nvidia_smi': bool, 'cupy': bool, 'torch': bool, 'gpu_benchmark': bool} + """ + cache_path = Path(CACHE_FILE) + + # Check cache + if not force and cache_path.exists(): + try: + with open(cache_path, 'r') as f: + return json.load(f) + except Exception: + pass + + # Detect features + features = { + 'nvidia_smi': _detect_nvidia_smi(), + 'cupy': _detect_cupy(), + 'torch': _detect_torch(), + } + + # GPU benchmark available if cupy or torch available + features['gpu_benchmark'] = features['cupy'] or features['torch'] + + # Cache results + try: + with open(cache_path, 'w') as f: + json.dump(features, f, indent=2) + except Exception: + pass + + return features + +def get_features() -> Dict[str, bool]: + """Get cached features or detect them.""" + return detect_features(force=False) + +def refresh_features() -> Dict[str, bool]: + """Force feature re-detection.""" + return detect_features(force=True) diff --git a/monitor/utils/update.py b/monitor/utils/update.py new file mode 100644 index 0000000..42ec5fe --- /dev/null +++ b/monitor/utils/update.py @@ -0,0 +1,181 @@ +"""Update mechanism for cluster health monitor.""" + +import requests +import subprocess +import sys +import zipfile +from pathlib import Path +from typing import Optional, Dict + +GITHUB_API = "https://api.github.com/repos/DataBoySu/cluster-monitor/releases/latest" +CURRENT_VERSION = "1.0.0" + +def get_latest_version() -> Optional[Dict]: + """ + Check GitHub for latest release. 
+ + Returns: + dict with 'version' and 'download_url', or None if error + """ + try: + response = requests.get(GITHUB_API, timeout=10) + response.raise_for_status() + data = response.json() + + version = data.get('tag_name', '').lstrip('v') + assets = data.get('assets', []) + + # Find ZIP asset + download_url = None + for asset in assets: + if asset['name'].endswith('.zip'): + download_url = asset['browser_download_url'] + break + + if not download_url: + return None + + return { + 'version': version, + 'download_url': download_url, + 'name': data.get('name', ''), + 'body': data.get('body', '') + } + except Exception: + return None + +def compare_versions(v1: str, v2: str) -> int: + """ + Compare two version strings. + + Returns: + -1 if v1 < v2, 0 if equal, 1 if v1 > v2 + """ + def parse_version(v): + return [int(x) for x in v.split('.')] + + try: + parts1 = parse_version(v1) + parts2 = parse_version(v2) + + for p1, p2 in zip(parts1, parts2): + if p1 < p2: + return -1 + elif p1 > p2: + return 1 + + if len(parts1) < len(parts2): + return -1 + elif len(parts1) > len(parts2): + return 1 + + return 0 + except Exception: + return 0 + +def download_update(url: str, dest: Path) -> bool: + """Download update ZIP file.""" + try: + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + + with open(dest, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + return True + except Exception: + return False + +def apply_update(zip_path: Path) -> bool: + """ + Extract update ZIP and replace files. 
+ """ + try: + # Extract to temp directory + temp_dir = Path("update_temp") + temp_dir.mkdir(exist_ok=True) + + with zipfile.ZipFile(zip_path, 'r') as zf: + zf.extractall(temp_dir) + + # Find extracted directory + extracted = list(temp_dir.glob("cluster-health-monitor*")) + if not extracted: + return False + + src = extracted[0] + + # Copy files (excluding venv, cache, data) + import shutil + exclude = {'venv', '__pycache__', '.features_cache', '*.db', 'config.yaml'} + + for item in src.rglob('*'): + if item.is_file(): + rel = item.relative_to(src) + + # Skip excluded patterns + skip = False + for ex in exclude: + if ex in str(rel): + skip = True + break + + if not skip: + dest = Path(rel) + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(item, dest) + + # Cleanup + shutil.rmtree(temp_dir) + zip_path.unlink() + + return True + except Exception: + return False + +def check_for_updates() -> Dict: + """ + Check for updates and return status. + + Returns: + dict with 'available', 'current', 'latest', 'info' + """ + latest = get_latest_version() + + if not latest: + return { + 'available': False, + 'current': CURRENT_VERSION, + 'latest': None, + 'error': 'Could not check for updates' + } + + latest_version = latest['version'] + is_newer = compare_versions(latest_version, CURRENT_VERSION) > 0 + + return { + 'available': is_newer, + 'current': CURRENT_VERSION, + 'latest': latest_version, + 'info': latest + } + +def perform_update() -> bool: + """ + Download and apply update. 
+ + Returns: + True if successful + """ + latest = get_latest_version() + if not latest: + return False + + # Download + zip_path = Path("update.zip") + if not download_update(latest['download_url'], zip_path): + return False + + # Apply + return apply_update(zip_path) diff --git a/package.ps1 b/package.ps1 new file mode 100644 index 0000000..4aa4e7f --- /dev/null +++ b/package.ps1 @@ -0,0 +1,81 @@ +# Package Cluster Health Monitor for distribution +# Creates portable ZIP for users to download + +$ErrorActionPreference = "Stop" + +$VERSION = "1.0.0" +$OUTPUT_NAME = "cluster-health-monitor-v$VERSION" +$OUTPUT_ZIP = "$OUTPUT_NAME.zip" + +Write-Host "`n=== Packaging Cluster Health Monitor v$VERSION ===" -ForegroundColor Cyan + +# Clean previous build +if (Test-Path $OUTPUT_NAME) { + Write-Host "Removing old build directory..." -ForegroundColor Yellow + Remove-Item $OUTPUT_NAME -Recurse -Force +} + +if (Test-Path $OUTPUT_ZIP) { + Write-Host "Removing old ZIP..." -ForegroundColor Yellow + Remove-Item $OUTPUT_ZIP -Force +} + +# Create build directory +Write-Host "Creating build directory..." -ForegroundColor Yellow +New-Item -ItemType Directory -Path $OUTPUT_NAME | Out-Null + +# Copy project files +Write-Host "Copying project files..." -ForegroundColor Yellow + +$include = @( + "monitor", + "health_monitor.py", + "config.yaml", + "requirements.txt", + "setup.ps1", + "README.md", + "LICENSE" +) + +foreach ($item in $include) { + if (Test-Path $item) { + Write-Host " - $item" + Copy-Item $item -Destination $OUTPUT_NAME -Recurse + } +} + +# Remove unwanted files +Write-Host "Cleaning build directory..." 
-ForegroundColor Yellow + +$cleanup = @( + "__pycache__", + "*.pyc", + "*.pyo", + ".features_cache", + "*.db", + ".pytest_cache", + ".vscode" +) + +Get-ChildItem $OUTPUT_NAME -Recurse -Force | Where-Object { + $file = $_ + $cleanup | Where-Object { $file.Name -like $_ } | Select-Object -First 1 +} | Remove-Item -Force -Recurse + +# Create ZIP +Write-Host "`nCreating ZIP archive..." -ForegroundColor Yellow +Compress-Archive -Path $OUTPUT_NAME -DestinationPath $OUTPUT_ZIP -Force + +# Calculate size +$size = (Get-Item $OUTPUT_ZIP).Length / 1MB + +Write-Host "`n=== Package Complete ===" -ForegroundColor Green +Write-Host "Output: $OUTPUT_ZIP" -ForegroundColor Cyan +Write-Host "Size: $([math]::Round($size, 2)) MB" -ForegroundColor Cyan +Write-Host "`nUpload to GitHub Releases:" -ForegroundColor Yellow +Write-Host "https://github.com/DataBoySu/cluster-monitor/releases/new`n" -ForegroundColor Cyan + +# Cleanup build directory +if (Test-Path $OUTPUT_NAME) { + Remove-Item $OUTPUT_NAME -Recurse -Force +} diff --git a/requirements.txt b/requirements.txt index 2766821..e273e50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ -# Core dependencies (required for CLI) -nvidia-ml-py>=12.535.0 # NVIDIA GPU metrics via NVML (replaces deprecated pynvml) -psutil>=5.9.0 # System metrics (CPU, RAM, disk) -pyyaml>=6.0 # Configuration file parsing -click>=8.1.0 # Command line interface -rich>=13.0.0 # Terminal UI +# Core dependencies fastapi>=0.104.0 -uvicorn>=0.24.0 +uvicorn[standard]>=0.24.0 +psutil>=5.9.0 +pyyaml>=6.0 +click>=8.1.0 +rich>=13.0.0 +numpy>=1.24.0 +requests>=2.31.0 -# GPU Benchmark dependencies -numpy>=1.24.0 # Array operations (required) -# For active GPU stress testing, install ONE of: -# pip install cupy-cuda13x (NVIDIA CUDA 13.x) -# pip install torch --index-url https://download.pytorch.org/whl/cu131 -# Without cupy/torch, benchmark runs in passive monitoring mode +# GPU libraries (optional - install based on your CUDA version) +# Option 1: CuPy 
(recommended for CUDA 12.x) +# cupy-cuda12x>=12.0.0 + +# Option 2: PyTorch (install separately with correct CUDA version) +# pip install torch --index-url https://download.pytorch.org/whl/cu121 diff --git a/setup.ps1 b/setup.ps1 new file mode 100644 index 0000000..d5cd3a7 --- /dev/null +++ b/setup.ps1 @@ -0,0 +1,172 @@ +# Cluster Health Monitor - Setup Script +# Automated setup for Windows with CUDA detection + +$ErrorActionPreference = "Stop" + +Write-Host "`n=== Cluster Health Monitor Setup ===" -ForegroundColor Cyan +Write-Host "Version 1.0.0`n" -ForegroundColor Cyan + +# Check Python +Write-Host "Checking Python..." -ForegroundColor Yellow +try { + $pythonVersion = python --version 2>&1 + Write-Host "[OK] $pythonVersion" -ForegroundColor Green + + # Check version >= 3.8 + $versionMatch = $pythonVersion -match "Python (\d+)\.(\d+)" + if ($versionMatch) { + $major = [int]$matches[1] + $minor = [int]$matches[2] + if ($major -lt 3 -or ($major -eq 3 -and $minor -lt 8)) { + Write-Host "[ERROR] Python 3.8+ required" -ForegroundColor Red + exit 1 + } + } +} catch { + Write-Host "[ERROR] Python not found. Install from https://www.python.org/downloads/" -ForegroundColor Red + exit 1 +} + +# Check NVIDIA drivers +Write-Host "`nChecking NVIDIA drivers..." -ForegroundColor Yellow +$nvidiaFound = $false +try { + $nvidiaSmi = nvidia-smi --version 2>&1 | Out-String + Write-Host "[OK] nvidia-smi found" -ForegroundColor Green + $nvidiaFound = $true + + # Parse driver version + if ($nvidiaSmi -match "Driver Version: ([\d\.]+)") { + Write-Host "Driver Version: $($matches[1])" -ForegroundColor Cyan + } +} catch { + Write-Host "[WARNING] nvidia-smi not found" -ForegroundColor Yellow + Write-Host "Install NVIDIA drivers: https://www.nvidia.com/download/index.aspx" -ForegroundColor Yellow +} + +# Check CUDA +Write-Host "`nChecking CUDA Toolkit..." 
-ForegroundColor Yellow +$cudaFound = $false +$cudaVersion = "" + +try { + $nvccVersion = nvcc --version 2>&1 | Out-String + if ($nvccVersion -match "release (\d+\.\d+)") { + $cudaVersion = $matches[1] + $cudaFound = $true + Write-Host "[OK] CUDA $cudaVersion detected" -ForegroundColor Green + } +} catch { + Write-Host "[WARNING] CUDA Toolkit not found" -ForegroundColor Yellow + Write-Host "GPU benchmarking requires CUDA. Download from:" -ForegroundColor Yellow + Write-Host "https://developer.nvidia.com/cuda-downloads" -ForegroundColor Cyan +} + +# Summary and user choice +Write-Host "`n=== Setup Options ===" -ForegroundColor Cyan +Write-Host "This will:" -ForegroundColor White +Write-Host " 1. Create Python virtual environment" -ForegroundColor White +Write-Host " 2. Install core dependencies (FastAPI, uvicorn, etc.)" -ForegroundColor White + +if ($cudaFound) { + Write-Host " 3. Optionally install GPU libraries (CuPy or PyTorch)" -ForegroundColor White +} else { + Write-Host " 3. Run in monitoring-only mode (no GPU benchmarking)" -ForegroundColor Yellow +} + +Write-Host "" +$continue = Read-Host "Continue with setup? [Y/n]" +if ($continue -eq "n" -or $continue -eq "N") { + Write-Host "`nSetup cancelled." -ForegroundColor Yellow + exit 0 +} + +# Create virtual environment +Write-Host "`nCreating virtual environment..." -ForegroundColor Yellow +if (Test-Path "venv") { + Write-Host "[OK] Virtual environment already exists" -ForegroundColor Green +} else { + python -m venv venv + Write-Host "[OK] Virtual environment created" -ForegroundColor Green +} + +# Activate venv +Write-Host "`nActivating virtual environment..." -ForegroundColor Yellow +& ".\venv\Scripts\Activate.ps1" + +# Upgrade pip +Write-Host "`nUpgrading pip..." -ForegroundColor Yellow +python -m pip install --upgrade pip --quiet + +# Install core dependencies +Write-Host "`nInstalling core dependencies..." 
-ForegroundColor Yellow +pip install -r requirements.txt --quiet +Write-Host "[OK] Core dependencies installed" -ForegroundColor Green + +# GPU benchmark libraries +if ($cudaFound) { + Write-Host "`n=== GPU Benchmark Libraries ===" -ForegroundColor Cyan + Write-Host "CUDA $cudaVersion detected. Install GPU libraries for benchmarking?" -ForegroundColor Yellow + Write-Host " 1) CuPy (recommended for CUDA 12.x)" -ForegroundColor White + Write-Host " 2) PyTorch (alternative)" -ForegroundColor White + Write-Host " 3) Skip (passive monitoring only)" -ForegroundColor White + + $choice = Read-Host "`nChoice [1-3]" + + switch ($choice) { + "1" { + Write-Host "`nInstalling CuPy..." -ForegroundColor Yellow + if ($cudaVersion -match "^12") { + pip install cupy-cuda12x --quiet + } elseif ($cudaVersion -match "^11") { + pip install cupy-cuda11x --quiet + } else { + Write-Host "[WARNING] Unsupported CUDA version, trying cuda12x" -ForegroundColor Yellow + pip install cupy-cuda12x --quiet + } + Write-Host "[OK] CuPy installed" -ForegroundColor Green + } + "2" { + Write-Host "`nInstalling PyTorch..." -ForegroundColor Yellow + if ($cudaVersion -match "^12") { + pip install torch --index-url https://download.pytorch.org/whl/cu121 --quiet + } elseif ($cudaVersion -match "^11") { + pip install torch --index-url https://download.pytorch.org/whl/cu118 --quiet + } + Write-Host "[OK] PyTorch installed" -ForegroundColor Green + } + "3" { + Write-Host "[OK] Skipping GPU libraries" -ForegroundColor Yellow + } + default { + Write-Host "[OK] Skipping GPU libraries" -ForegroundColor Yellow + } + } +} + +# Detect features and cache +Write-Host "`nDetecting available features..." -ForegroundColor Yellow +python -c "from monitor.utils import detect_features; detect_features(force=True)" +Write-Host "[OK] Features cached" -ForegroundColor Green + +# Verify installation +Write-Host "`nVerifying installation..." 
-ForegroundColor Yellow +try { + python health_monitor.py --help > $null 2>&1 + Write-Host "[OK] Installation verified" -ForegroundColor Green +} catch { + Write-Host "[ERROR] Installation verification failed" -ForegroundColor Red + exit 1 +} + +# Complete +Write-Host "`n=== Setup Complete ===" -ForegroundColor Green +Write-Host "`nTo get started:" -ForegroundColor Cyan +Write-Host " .\venv\Scripts\Activate.ps1" -ForegroundColor White +Write-Host " python health_monitor.py" -ForegroundColor White +Write-Host "`nAccess web dashboard at: http://localhost:8090" -ForegroundColor Cyan +Write-Host "Change port: python health_monitor.py --port 3000" -ForegroundColor DarkGray +Write-Host "`nOther commands:" -ForegroundColor Cyan +Write-Host " python health_monitor.py cli - Terminal dashboard" -ForegroundColor White +Write-Host " python health_monitor.py benchmark - GPU benchmark" -ForegroundColor White +Write-Host " python health_monitor.py --update - Check for updates`n" -ForegroundColor White

PID Process GPUGPU MemoryGPU Util % User