# 🚀 PaniniFS - Mode Cloud Autonome

**100% Cloud Native** - Clonage automatique des repos GitHub

## 🎯 Workflow Autonome
1. **Auto-détection** : Mode Colab vs Local
2. **Clonage repos** : Clonage automatique des repos GitHub configurés
3. **Scan optimisé** : Limites strictes pour performance
4. **Embeddings** : Pipeline complet temps réel
5. **Recherche** : Interface interactive

In [None]:
# 🔧 SETUP AUTONOME - Détection environnement et installation
import os
import sys
import subprocess
import time
from pathlib import Path

# Détection mode Cloud (Colab/Kaggle/etc)
IS_CLOUD = 'google.colab' in sys.modules or '/kaggle/' in os.environ.get('PATH', '') or 'COLAB_GPU' in os.environ
print(f"🌍 Mode détecté: {'☁️ CLOUD' if IS_CLOUD else '🖥️ LOCAL'}")

if IS_CLOUD:
    # Installation dépendances cloud
    print("📦 Installation dépendances cloud...")
    !pip install sentence-transformers torch --quiet
    BASE_PATH = Path('/content')
else:
    BASE_PATH = Path('/home/stephane/GitHub')

print(f"📁 Répertoire de travail: {BASE_PATH}")
os.chdir(BASE_PATH)


In [None]:
# 🔄 CLONAGE AUTOMATIQUE DES REPOS
def clone_repos_cloud():
    """Clone every configured GitHub repository under BASE_PATH.

    A repository whose target directory already exists is reported as
    present and is not cloned again. Each clone is given 60 seconds; when a
    clone times out, fails, or raises, whatever was written to the target
    directory is removed so that a later run retries the clone instead of
    mistaking a partial checkout for a complete one.

    Returns:
        list[str]: Names of the repositories available on disk (already
        present or freshly cloned), in configuration order.
    """
    # Local import: only this function needs it, for cleaning up failed clones.
    import shutil

    repos_config = {
        'PaniniFS-1': 'https://github.com/stephanedenis/PaniniFS.git',
        'Pensine': 'https://github.com/stephanedenis/Pensine.git',
        'totoro-automation': 'https://github.com/stephanedenis/totoro-automation.git',
        'hexagonal-demo': 'https://github.com/stephanedenis/hexagonal-demo.git',
    }

    cloned_repos = []

    for repo_name, repo_url in repos_config.items():
        repo_path = BASE_PATH / repo_name

        if repo_path.exists():
            print(f"✅ {repo_name} déjà présent")
            cloned_repos.append(repo_name)
            continue

        try:
            print(f"📥 Clonage {repo_name}...")
            result = subprocess.run(
                ['git', 'clone', repo_url, str(repo_path)],
                capture_output=True,
                text=True,
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            print(f"⏱️ Timeout clonage {repo_name}")
        except Exception as e:
            # Best effort: one failing repo must not prevent the others.
            print(f"❌ Erreur {repo_name}: {e}")
        else:
            if result.returncode == 0:
                print(f"✅ {repo_name} cloné avec succès")
                cloned_repos.append(repo_name)
                continue
            print(f"⚠️ Erreur clonage {repo_name}: {result.stderr}")

        # Only unsuccessful clones reach this point. On timeout the git
        # process is killed and cannot clean up after itself, so drop the
        # partial directory; otherwise the exists() check above would report
        # it as "déjà présent" on the next run.
        shutil.rmtree(repo_path, ignore_errors=True)

    return cloned_repos

# Populate available_repos: cloned on the fly in cloud mode, assumed to be
# already checked out in local mode.
if not IS_CLOUD:
    # Local mode - rely on the existing checkouts
    available_repos = ['PaniniFS-1', 'Pensine', 'totoro-automation', 'hexagonal-demo']
    print(f"📦 Mode local - {len(available_repos)} repos configurés")
else:
    start_time = time.time()
    available_repos = clone_repos_cloud()
    clone_time = time.time() - start_time
    print(
        f"\n🎯 Clonage terminé en {clone_time:.2f}s",
        f"📦 {len(available_repos)} repos disponibles: {', '.join(available_repos)}",
        sep="\n",
    )


In [None]:
# 🔍 SCAN SOURCES CLOUD-OPTIMISÉ
def _collect_repo_files(repo_path, repo_name, pattern, file_type,
                        max_files, max_file_size, max_chars):
    """Collect up to ``max_files`` usable files matching ``pattern``.

    Walks ``repo_path`` recursively. A file is kept when it is a regular
    file no larger than ``max_file_size`` bytes whose stripped text is
    longer than 50 characters. Anything that cannot be inspected or read
    (dangling symlink, permission error, file removed meanwhile) is skipped
    on its own, so one bad entry never discards the rest.

    Returns:
        list[dict]: One record per kept file with keys ``repo``, ``path``
        (relative to ``repo_path``), ``type``, ``content`` (truncated to
        ``max_chars`` characters) and ``size`` (character count of the full
        decoded text).
    """
    collected = []
    for file_path in repo_path.rglob(pattern):
        if len(collected) >= max_files:
            break
        try:
            # is_file() is False for directories and dangling symlinks,
            # which would otherwise make stat()/read_text() raise.
            if not file_path.is_file():
                continue
            if file_path.stat().st_size > max_file_size:
                continue
            content = file_path.read_text(encoding='utf-8', errors='replace')
        except OSError:
            continue

        if len(content.strip()) <= 50:  # ignore empty / near-empty files
            continue

        collected.append({
            'repo': repo_name,
            'path': str(file_path.relative_to(repo_path)),
            'type': file_type,
            'content': content[:max_chars],
            'size': len(content),
        })
    return collected


def scan_sources_cloud_optimized():
    """Scan the available repositories with strict per-repo limits.

    For every name in ``available_repos`` that exists under ``BASE_PATH``,
    collects at most 30 Python files (first 5000 characters each) and at
    most 15 Markdown files (first 3000 characters each), ignoring files
    larger than 100KB. A repository whose directory walk fails is reported
    and skipped.

    Returns:
        tuple[list[dict], dict]: The source records of all scanned repos,
        and a stats dict with ``total_files``, ``total_size`` and
        ``repos_scanned``.
    """
    # Cloud-friendly limits
    MAX_PY_FILES_PER_REPO = 30
    MAX_MD_FILES_PER_REPO = 15
    MAX_FILE_SIZE = 100 * 1024  # 100KB max

    all_sources = []
    scan_stats = {'total_files': 0, 'total_size': 0, 'repos_scanned': 0}

    for repo_name in available_repos:
        repo_path = BASE_PATH / repo_name

        if not repo_path.exists():
            print(f"⚠️ Repo {repo_name} non trouvé")
            continue

        print(f"🔍 Scan {repo_name}...")

        try:
            py_sources = _collect_repo_files(
                repo_path, repo_name, '*.py', 'python',
                MAX_PY_FILES_PER_REPO, MAX_FILE_SIZE, 5000,
            )
            md_sources = _collect_repo_files(
                repo_path, repo_name, '*.md', 'markdown',
                MAX_MD_FILES_PER_REPO, MAX_FILE_SIZE, 3000,
            )
        except Exception as e:
            # Only failures of the directory walk itself reach this point;
            # per-file problems are already handled by the helper.
            print(f"❌ Erreur scan {repo_name}: {e}")
            continue

        repo_sources = py_sources + md_sources
        all_sources.extend(repo_sources)
        scan_stats['repos_scanned'] += 1
        scan_stats['total_files'] += len(repo_sources)
        scan_stats['total_size'] += sum(s['size'] for s in repo_sources)

        print(f"  📄 {len(repo_sources)} fichiers ({len(py_sources)} .py + {len(md_sources)} .md)")

    return all_sources, scan_stats

# Run the scan, time it, and print a short summary
print("\n📁 SCAN SOURCES CLOUD-OPTIMISÉ", "=" * 40, sep="\n")

start_time = time.time()
sources, stats = scan_sources_cloud_optimized()
scan_time = time.time() - start_time

print("\n".join([
    f"\n⏱️ Scan terminé en {scan_time:.2f}s",
    f"🎯 {stats['total_files']} sources consolidées",
    f"📊 {stats['repos_scanned']} repos scannés",
    f"💾 {stats['total_size'] / 1024:.1f}KB total",
]))

if sources:
    print("\n✅ SOURCES PRÊTES POUR EMBEDDINGS")
else:
    # Nothing to embed: most likely the clone step did not produce any repo
    print("\n⚠️ AUCUNE SOURCE TROUVÉE")
    print("💡 Vérifiez le clonage des repos")


In [None]:
# 🧠 GÉNÉRATION EMBEDDINGS CLOUD-OPTIMISÉE
if not sources:
    # Nothing was scanned, so there is nothing to embed
    print("⚠️ Pas de sources - skip embeddings")
    embeddings_ready = False
else:
    print("\n🧠 GÉNÉRATION EMBEDDINGS", "=" * 30, sep="\n")

    try:
        from sentence_transformers import SentenceTransformer
        import torch

        # Use the GPU when one is visible to torch, otherwise the CPU
        device = 'cpu' if not torch.cuda.is_available() else 'cuda'
        print(f"🔧 Device: {device}")

        print("📥 Chargement modèle...")
        model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

        # One text document and one metadata record per scanned source,
        # kept in the same order so indexes line up.
        documents, metadata = [], []
        for source in sources:
            metadata.append({field: source[field] for field in ('repo', 'path', 'type')})
            documents.append(
                f"Repo: {source['repo']}\nFile: {source['path']}\nType: {source['type']}\n\nContent:\n{source['content']}"
            )

        # Cap the number of documents to keep the run time bounded
        MAX_DOCS = 150
        if len(documents) > MAX_DOCS:
            print(f"⚡ Limitation à {MAX_DOCS} docs pour performance cloud")
            documents, metadata = documents[:MAX_DOCS], metadata[:MAX_DOCS]

        print(f"🔄 Génération embeddings pour {len(documents)} documents...")
        start_time = time.time()

        # Encode in batches (larger on GPU) to limit peak memory
        batch_size = 16 if device != 'cuda' else 32
        embeddings = model.encode(
            documents,
            batch_size=batch_size,
            show_progress_bar=True,
        )

        emb_time = time.time() - start_time

        print(f"✅ Embeddings générés en {emb_time:.2f}s")
        print(f"📊 {len(embeddings)} vecteurs de dimension {embeddings.shape[1]}")
        print(f"⚡ Performance: {len(documents)/emb_time:.1f} docs/sec")

    except Exception as e:
        print(f"❌ Erreur embeddings: {e}")
        embeddings_ready = False
    else:
        embeddings_ready = True


In [None]:
# 🔎 RECHERCHE SÉMANTIQUE INTERACTIVE
if not embeddings_ready:
    search_function_ready = False
    print("⚠️ Recherche non disponible - problème embeddings")
else:
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    def semantic_search_cloud(query, top_k=5):
        """Return the ``top_k`` documents most similar to ``query``.

        The query is encoded with the notebook-level ``model`` and compared
        to ``embeddings`` by cosine similarity. Each hit is a dict with
        ``rank`` (1-based), ``score``, ``repo``, ``path``, ``type`` and a
        ``content_preview`` of at most 300 characters followed by '...'
        when the document is longer. Any error is printed and yields an
        empty list.
        """
        try:
            query_vector = model.encode([query])
            scores = cosine_similarity(query_vector, embeddings)[0]

            # Indexes of the best scores, highest first
            best = np.argsort(scores)[::-1][:top_k]

            hits = []
            for rank, doc_index in enumerate(best, start=1):
                info = metadata[doc_index]
                text = documents[doc_index]
                preview = text if len(text) <= 300 else text[:300] + '...'
                hits.append({
                    'rank': rank,
                    'score': float(scores[doc_index]),
                    'repo': info['repo'],
                    'path': info['path'],
                    'type': info['type'],
                    'content_preview': preview
                })
            return hits

        except Exception as e:
            print(f"❌ Erreur recherche: {e}")
            return []

    # Search banner with a few sample queries
    print("\n🔎 RECHERCHE SÉMANTIQUE INTERACTIVE", "=" * 35, sep="\n")
    print(
        "🎯 Exemples de requêtes:",
        "  - 'filesystem implementation'",
        "  - 'neural network training'",
        "  - 'configuration files'",
        "  - 'error handling'",
        sep="\n",
    )

    # Smoke test with a fixed query, timed for the final report
    test_query = "filesystem implementation"
    print(f"\n🧪 Test automatique: '{test_query}'")

    start_time = time.time()
    results = semantic_search_cloud(test_query, top_k=3)
    search_time = time.time() - start_time

    print(f"⚡ Recherche en {search_time:.3f}s")

    if results:
        print("\n📊 RÉSULTATS:")
        for result in results:
            print(
                f"\n{result['rank']}. 📁 {result['repo']}/{result['path']}",
                f"   🎯 Score: {result['score']:.3f} | Type: {result['type']}",
                f"   📝 {result['content_preview'][:150]}...",
                sep="\n",
            )

    print("\n✅ SYSTÈME PRÊT POUR RECHERCHE INTERACTIVE")
    search_function_ready = True


In [None]:
# 📊 FINAL REPORT - AUTONOMOUS CLOUD MODE
# Every value produced by an earlier cell is looked up defensively, so the
# report still renders when a previous step failed or was never run.
print("\n🎉 RAPPORT FINAL - MODE CLOUD AUTONOME")
print("=" * 45)

# At cell/module top level, globals() is the namespace earlier cells wrote to.
_ns = globals()

if 'IS_CLOUD' in _ns:
    _mode = 'CLOUD' if _ns['IS_CLOUD'] else 'LOCAL'
else:
    _mode = 'N/A'

# Count embeddings only when the embedding step actually succeeded. Testing
# the *value* of embeddings_ready (not the string literal, which is always
# truthy) avoids a NameError when `embeddings` was never created.
_has_embeddings = bool(_ns.get('embeddings_ready')) and 'embeddings' in _ns

report = {
    'mode': _mode,
    'repos_clones': len(_ns['available_repos']) if 'available_repos' in _ns else 0,
    'sources_scannees': len(_ns['sources']) if 'sources' in _ns else 0,
    'embeddings_generes': len(_ns['embeddings']) if _has_embeddings else 0,
    'recherche_active': _ns.get('search_function_ready', False),
    'performance': {
        'clonage': f"{_ns['clone_time']:.2f}s" if 'clone_time' in _ns else 'N/A',
        'scan': f"{_ns['scan_time']:.2f}s" if 'scan_time' in _ns else 'N/A',
        'embeddings': f"{_ns['emb_time']:.2f}s" if 'emb_time' in _ns else 'N/A',
        'recherche': f"{_ns['search_time']:.3f}s" if 'search_time' in _ns else 'N/A'
    }
}

print(f"🌍 Mode: {report['mode']}")
print(f"📦 Repos clonés: {report['repos_clones']}")
print(f"📄 Sources scannées: {report['sources_scannees']}")
print(f"🧠 Embeddings générés: {report['embeddings_generes']}")
print(f"🔎 Recherche: {'✅ ACTIVE' if report['recherche_active'] else '❌ INACTIVE'}")

print("\n⚡ PERFORMANCE:")
for step, time_val in report['performance'].items():
    print(f"  {step.capitalize()}: {time_val}")

# Total time covers clone + scan + embeddings; steps that did not run add 0
total_time = sum(
    _ns[_timer]
    for _timer in ('clone_time', 'scan_time', 'emb_time')
    if _timer in _ns
)

print(f"\n🏁 TEMPS TOTAL: {total_time:.2f}s")

if report['recherche_active']:
    print("\n🎯 SYSTÈME 100% OPÉRATIONNEL")
    print("💡 Utilisez: semantic_search_cloud('votre requête')")
else:
    print("\n⚠️ SYSTÈME PARTIELLEMENT OPÉRATIONNEL")
    print("💡 Vérifiez les étapes précédentes")

print("\n🚀 MODE CLOUD AUTONOME COMPLÉTÉ !")


# 🎯 Utilisation Interactive

## Recherche Personnalisée
```python
# Exemples de recherches
results = semantic_search_cloud("neural network training", top_k=5)
results = semantic_search_cloud("configuration files", top_k=3)
results = semantic_search_cloud("error handling patterns", top_k=5)
```

## Exploration des Repos
```python
# Voir les repos disponibles
print("Repos disponibles:", available_repos)

# Statistiques par repo
repo_stats = {}
for source in sources:
    repo = source['repo']
    if repo not in repo_stats:
        repo_stats[repo] = {'python': 0, 'markdown': 0}
    repo_stats[repo][source['type']] += 1

for repo, stats in repo_stats.items():
    print(f"{repo}: {stats['python']} .py + {stats['markdown']} .md")
```

**✅ Système Cloud Autonome Opérationnel !**