# 🧪 VLM Accuracy Testing - Google Colab

Notebook untuk testing akurasi model VLM (Vision Language Model) dengan berbagai jenis gambar.

**Model yang diuji:**
- Google Gemini 2.0 Flash (`gemini-2.0-flash-exp`)
- OpenAI GPT-4o (optional; belum ada kode pengujiannya di notebook ini)

**Jenis Testing:**
1. Objek Bersejarah (Artifact)
2. Bangunan/Arsitektur
3. Alam/Lingkungan
4. Analisis Umum

## 📦 Install Dependencies

In [None]:
# Install the third-party packages for this notebook.
# The leading `!` runs the line as a shell command; `-q` keeps pip's output quiet.
!pip install google-generativeai pillow requests pandas matplotlib seaborn scikit-learn -q

## 🔑 Setup API Keys

In [None]:
import google.generativeai as genai
from google.colab import userdata
import os

# Resolve the Gemini API key.
# Option 1 (recommended): read it from Colab Secrets.
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
except Exception as exc:
    # `except Exception` rather than a bare `except:` so that Ctrl-C / kernel
    # interrupts still propagate instead of silently triggering the fallback.
    print(f"⚠️ Could not read GEMINI_API_KEY from Colab Secrets ({type(exc).__name__}).")
    GEMINI_API_KEY = None

# Option 2: ask for the key interactively; getpass does not echo what is typed.
if not GEMINI_API_KEY:
    from getpass import getpass
    GEMINI_API_KEY = getpass('Enter your Gemini API Key: ')

# Fail fast on an empty key instead of reporting success and failing on the
# first API call much later in the notebook.
GEMINI_API_KEY = (GEMINI_API_KEY or '').strip()
if not GEMINI_API_KEY:
    raise ValueError("Gemini API key is empty; set the GEMINI_API_KEY secret or enter it manually.")

# Configure Gemini
genai.configure(api_key=GEMINI_API_KEY)
print("✅ API Key configured!")

## 📥 Upload Test Images

In [None]:
from google.colab import files
import io
from PIL import Image

# Ask the user to upload one or more image files for testing.
print("Upload gambar untuk testing (bisa multiple files):")
uploaded = files.upload()

# Decode every upload, keeping the raw bytes next to the decoded PIL image
# (the raw bytes are what the tester methods receive later).
test_images = []
for filename, data in uploaded.items():
    try:
        image = Image.open(io.BytesIO(data))
    except OSError as exc:
        # PIL signals an undecodable file with UnidentifiedImageError, an
        # OSError subclass. Skip that file instead of aborting the whole loop.
        print(f"⚠️ Skipped: {filename} (not a readable image: {exc})")
        continue
    test_images.append({
        'filename': filename,
        'image': image,
        'data': data
    })
    print(f"✅ Loaded: {filename} ({image.size})")

print(f"\n📊 Total gambar: {len(test_images)}")

## 🛠️ VLM Service Class

In [None]:
import json
import base64
import time
from typing import Dict, List, Any

class VLMTester:
    """Run image-analysis prompts against a Gemini model and collect the results.

    Every ``test_*`` method sends one fixed Indonesian-language prompt together
    with the image and returns the dictionary produced by ``analyze_image``.
    """

    def __init__(self, api_key: str, model_name: str = 'gemini-2.0-flash-exp'):
        """Configure the Gemini client and create the model handle.

        Args:
            api_key: Gemini API key passed to ``genai.configure``.
            model_name: Model identifier passed to ``genai.GenerativeModel``.
                Defaults to the model this notebook was written for.
        """
        genai.configure(api_key=api_key)
        self.model_name = model_name
        self.model = genai.GenerativeModel(model_name)

    @staticmethod
    def _parse_response(text: str) -> Dict[str, Any]:
        """Return the first JSON object embedded in ``text``.

        The model may wrap its JSON in prose or Markdown fences, and may add
        trailing remarks that themselves contain braces. Decoding starts at
        each ``{`` in turn and stops at the end of the first object that
        parses, so surrounding text cannot invalidate an otherwise valid
        answer. When no object can be decoded the whole text is returned
        under the ``'description'`` key.
        """
        decoder = json.JSONDecoder()
        start = text.find('{')
        while start != -1:
            try:
                parsed, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                # Not a JSON object at this brace; try the next one.
                start = text.find('{', start + 1)
            else:
                return parsed
        return {'description': text}

    def analyze_image(self, image_data: bytes, prompt: str) -> Dict[str, Any]:
        """Analyze an image with a custom prompt.

        Args:
            image_data: Encoded image bytes, opened with ``Image.open``.
            prompt: Instruction text sent to the model along with the image.

        Returns:
            On success: ``{'success': True, 'result': <parsed dict>,
            'raw_text': <model text>, 'processing_time': <seconds>}``.
            On any exception: ``{'success': False, 'error': <message>,
            'processing_time': 0}``. Errors are reported, never raised.
        """
        try:
            # Prepare image
            image = Image.open(io.BytesIO(image_data))

            # Generate content. perf_counter is monotonic, so the measured
            # duration cannot be distorted by system clock adjustments.
            start_time = time.perf_counter()
            response = self.model.generate_content([prompt, image])
            processing_time = time.perf_counter() - start_time

            text = response.text

            return {
                'success': True,
                'result': self._parse_response(text),
                'raw_text': text,
                'processing_time': processing_time
            }
        except Exception as e:
            # Deliberately broad: one failing image must not abort a batch run.
            return {
                'success': False,
                'error': str(e),
                'processing_time': 0
            }

    def test_artifact_analysis(self, image_data: bytes) -> Dict[str, Any]:
        """Run the historical-artifact analysis prompt on the image."""
        prompt = """Analisis gambar objek bersejarah ini dalam Bahasa Indonesia.
        
        Berikan informasi detail:
        1. Identifikasi jenis objek
        2. Estimasi periode/era
        3. Kondisi objek
        4. Tingkat kerusakan (none/low/medium/high/severe)
        5. Nilai historis
        6. Rekomendasi preservasi
        
        Format JSON:
        {
            "object_type": "jenis objek",
            "period": "periode/era",
            "condition": "kondisi",
            "damage_level": "none/low/medium/high/severe",
            "historical_value": "nilai historis",
            "recommendations": ["rekomendasi1", "rekomendasi2"],
            "confidence": "0-100%"
        }"""
        return self.analyze_image(image_data, prompt)

    def test_building_analysis(self, image_data: bytes) -> Dict[str, Any]:
        """Run the building/architecture analysis prompt on the image."""
        prompt = """Analisis bangunan dalam gambar ini (Bahasa Indonesia).
        
        Berikan:
        1. Jenis bangunan
        2. Gaya arsitektur
        3. Kondisi struktur
        4. Deteksi kerusakan (retak, roboh, dll)
        5. Tingkat kerusakan (none/low/medium/high/severe)
        6. Prioritas perbaikan
        
        Format JSON:
        {
            "building_type": "jenis",
            "architecture_style": "gaya",
            "structural_condition": "kondisi",
            "detected_damages": ["kerusakan1", "kerusakan2"],
            "damage_level": "none/low/medium/high/severe",
            "repair_priority": ["prioritas1", "prioritas2"],
            "confidence": "0-100%"
        }"""
        return self.analyze_image(image_data, prompt)

    def test_nature_analysis(self, image_data: bytes) -> Dict[str, Any]:
        """Run the nature/environment analysis prompt on the image."""
        prompt = """Analisis foto alam/lingkungan ini (Bahasa Indonesia).
        
        Berikan:
        1. Jenis ekosistem/area
        2. Kondisi lingkungan
        3. Kesehatan vegetasi
        4. Potensi masalah (erosi, deforestasi, dll)
        5. Tingkat kerusakan (none/low/medium/high/severe)
        6. Rekomendasi konservasi
        
        Format JSON:
        {
            "ecosystem_type": "jenis",
            "environmental_condition": "kondisi",
            "vegetation_health": "kesehatan",
            "potential_issues": ["masalah1", "masalah2"],
            "damage_level": "none/low/medium/high/severe",
            "conservation_recommendations": ["rekomendasi1"],
            "confidence": "0-100%"
        }"""
        return self.analyze_image(image_data, prompt)

    def test_general_analysis(self, image_data: bytes) -> Dict[str, Any]:
        """Run the general-purpose analysis prompt on the image."""
        prompt = """Analisis gambar ini secara detail (Bahasa Indonesia).
        
        Berikan:
        1. Deskripsi singkat
        2. Objek-objek utama
        3. Aktivitas/kejadian
        4. Kondisi keseluruhan
        5. Tag/kategori
        
        Format JSON:
        {
            "description": "deskripsi",
            "main_objects": ["objek1", "objek2"],
            "activities": ["aktivitas1"],
            "overall_condition": "kondisi",
            "tags": ["tag1", "tag2"],
            "confidence": "0-100%"
        }"""
        return self.analyze_image(image_data, prompt)

# Create the tester instance used by the test-running code further down.
tester = VLMTester(api_key=GEMINI_API_KEY)
print("✅ VLM Tester initialized!")

## 🧪 Run Tests

In [None]:
import pandas as pd

# Menu key -> (label stored in the results, banner printed before the run,
# tester method to call). Dict order is the order used by "run everything".
ANALYSIS_MENU = {
    '1': ('artifact', "\n📜 Artifact Analysis...", tester.test_artifact_analysis),
    '2': ('building', "\n🏛️ Building Analysis...", tester.test_building_analysis),
    '3': ('nature', "\n🌿 Nature Analysis...", tester.test_nature_analysis),
    '4': ('general', "\n🔍 General Analysis...", tester.test_general_analysis),
}
RUN_ALL_CHOICE = '5'

# Choose the test type
print("Pilih jenis analisis:")
print("1. Objek Bersejarah (Artifact)")
print("2. Bangunan/Arsitektur")
print("3. Alam/Lingkungan")
print("4. Analisis Umum")
print("5. Test Semua Jenis")

# Re-prompt until the answer is a known option; an unknown answer would
# otherwise run nothing while still reporting completion.
test_type = input("Pilih (1-5): ").strip()
while test_type not in ANALYSIS_MENU and test_type != RUN_ALL_CHOICE:
    print(f"⚠️ Pilihan tidak valid: {test_type!r}")
    test_type = input("Pilih (1-5): ").strip()

if test_type == RUN_ALL_CHOICE:
    selected_analyses = list(ANALYSIS_MENU.values())
else:
    selected_analyses = [ANALYSIS_MENU[test_type]]

results = []

for img_data in test_images:
    filename = img_data['filename']
    data = img_data['data']

    print(f"\n🔍 Testing: {filename}")
    print("=" * 50)

    for label, banner, run_analysis in selected_analyses:
        print(banner)
        result = run_analysis(data)
        # One flat record per (image, analysis) pair.
        results.append({
            'filename': filename,
            'test_type': label,
            **result
        })
        print(f"⏱️ Time: {result.get('processing_time', 0):.2f}s")
        if result['success']:
            print(f"📊 Result: {json.dumps(result['result'], indent=2, ensure_ascii=False)}")
        else:
            # Show the failure reason right away instead of hiding it in `results`.
            print(f"❌ Failed: {result.get('error', 'unknown error')}")

if results:
    print("\n✅ All tests completed!")
else:
    print("\n⚠️ No tests were run: no images have been loaded.")

## 📊 Analyze Results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One row per executed test (see the records appended to `results`).
df_results = pd.DataFrame(results)

if df_results.empty:
    # With no rows the frame has no 'success' / 'processing_time' columns,
    # so every statistic below would raise KeyError.
    print("⚠️ No test results to analyze. Run the tests first.")
else:
    succeeded = df_results['success'].astype(bool)
    n_success = int(succeeded.sum())
    n_failed = len(df_results) - n_success

    # Failed runs carry a placeholder processing_time of 0, so timing
    # statistics are computed from successful runs only.
    df_ok = df_results[succeeded]

    print("📊 Test Results Summary:")
    print("=" * 50)
    print(f"Total tests: {len(results)}")
    print(f"Successful: {n_success}")
    print(f"Failed: {n_failed}")
    if n_success:
        print(f"\nAverage processing time (successful tests): {df_ok['processing_time'].mean():.2f}s")
        print(f"Min time: {df_ok['processing_time'].min():.2f}s")
        print(f"Max time: {df_ok['processing_time'].max():.2f}s")
    else:
        print("\nNo successful tests, so no timing statistics are available.")

    # Visualize processing times
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    if n_success:
        sns.barplot(data=df_ok, x='test_type', y='processing_time')
    plt.title('Processing Time by Test Type')
    plt.xlabel('Test Type')
    plt.ylabel('Time (seconds)')
    plt.xticks(rotation=45)

    plt.subplot(1, 2, 2)
    # Pair each count with its own label and colour, and drop empty slices.
    # Fixed labels against value_counts() break when only one outcome occurs
    # and are swapped when failures outnumber successes.
    outcomes = [
        ('Success', n_success, '#4CAF50'),
        ('Failed', n_failed, '#F44336'),
    ]
    shown = [outcome for outcome in outcomes if outcome[1] > 0]
    plt.pie(
        [count for _, count, _ in shown],
        labels=[label for label, _, _ in shown],
        autopct='%1.1f%%',
        colors=[color for _, _, color in shown],
    )
    plt.title('Success Rate')

    plt.tight_layout()
    plt.show()

    # Display detailed results
    print("\n📋 Detailed Results:")
    display(df_results[['filename', 'test_type', 'success', 'processing_time']])

## 🎯 Accuracy Evaluation (Manual)

In [None]:
# Manual accuracy evaluation: the user rates each successful result from 1 to 5
# and the ratings are summarized per test type.

print("Manual Accuracy Evaluation")
print("="*50)
print("\nUntuk setiap hasil, beri rating akurasi (1-5):")
print("1 = Sangat tidak akurat")
print("2 = Tidak akurat")
print("3 = Cukup akurat")
print("4 = Akurat")
print("5 = Sangat akurat\n")


def _ask_rating(prompt="   Rating (1-5): "):
    """Prompt until the user enters a whole number from 1 to 5 and return it.

    Re-prompting keeps one typo from raising ValueError and discarding all
    ratings entered so far.
    """
    while True:
        answer = input(prompt).strip()
        try:
            rating = int(answer)
        except ValueError:
            rating = None
        if rating is not None and 1 <= rating <= 5:
            return rating
        print("   ⚠️ Masukkan angka bulat 1-5.")


accuracy_scores = []

for idx, result in enumerate(results):
    # Only successful runs have model output worth rating.
    if result['success']:
        print(f"\n{idx+1}. File: {result['filename']}")
        print(f"   Type: {result['test_type']}")
        print(f"   Result: {result.get('raw_text', '')[:200]}...")

        score = _ask_rating()
        accuracy_scores.append({
            'filename': result['filename'],
            'test_type': result['test_type'],
            'accuracy_score': score
        })

# Explicit columns keep the frame well-formed even when nothing was rated.
df_accuracy = pd.DataFrame(accuracy_scores, columns=['filename', 'test_type', 'accuracy_score'])

if df_accuracy.empty:
    print("\n⚠️ No successful results were rated; skipping accuracy metrics.")
else:
    # Calculate metrics
    print("\n📊 Accuracy Metrics:")
    print("="*50)
    print(f"Average accuracy: {df_accuracy['accuracy_score'].mean():.2f}/5")
    print(f"Accuracy by type:")
    print(df_accuracy.groupby('test_type')['accuracy_score'].mean())

    # Visualize
    plt.figure(figsize=(10, 5))
    sns.barplot(data=df_accuracy, x='test_type', y='accuracy_score')
    plt.title('Average Accuracy Score by Test Type')
    plt.xlabel('Test Type')
    plt.ylabel('Accuracy Score (1-5)')
    plt.ylim(0, 5)
    plt.axhline(y=3, color='r', linestyle='--', label='Acceptable threshold')
    plt.legend()
    plt.show()

## 💾 Export Results

In [None]:
# Write the per-test results table to CSV, then pass the file to files.download.
results_csv = 'vlm_test_results.csv'
df_results.to_csv(results_csv, index=False)
print("✅ Results exported to: vlm_test_results.csv")
files.download(results_csv)

# The manual ratings are exported only when at least one was collected.
if accuracy_scores:
    accuracy_csv = 'vlm_accuracy_scores.csv'
    df_accuracy.to_csv(accuracy_csv, index=False)
    print("✅ Accuracy scores exported to: vlm_accuracy_scores.csv")
    files.download(accuracy_csv)

## 📝 Summary Report

In [None]:
# Build the Markdown report as a list of sections and join them once at the end.
timings = df_results['processing_time']

report_sections = [f"""
# VLM Accuracy Test Report

## Test Configuration
- Model: Google Gemini 2.0 Flash
- Total Images: {len(test_images)}
- Total Tests: {len(results)}

## Performance Metrics
- Success Rate: {df_results['success'].mean()*100:.1f}%
- Average Processing Time: {timings.mean():.2f}s
- Min Processing Time: {timings.min():.2f}s
- Max Processing Time: {timings.max():.2f}s

## Accuracy Metrics
"""]

# The accuracy section is filled in only when manual ratings were collected.
if accuracy_scores:
    mean_by_type = df_accuracy.groupby('test_type')['accuracy_score'].mean()
    report_sections.append(f"""
- Average Accuracy Score: {df_accuracy['accuracy_score'].mean():.2f}/5
- Accuracy by Type:
""")
    report_sections.extend(
        f"  - {kind}: {avg:.2f}/5\n" for kind, avg in mean_by_type.items()
    )

report_sections.append(f"""
## Test Types Distribution
{df_results['test_type'].value_counts().to_string()}

## Recommendations
- Model shows good performance for VLM tasks
- Average processing time is acceptable for production use
- Consider fine-tuning for specific use cases if accuracy < 4/5
""")

report = "".join(report_sections)
print(report)

# Save the report as UTF-8 Markdown, then pass the file to files.download.
report_path = 'vlm_test_report.md'
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)

print("\n✅ Report saved to: vlm_test_report.md")
files.download(report_path)