In [None]:
# üå•Ô∏è PANINI-FS CLOUD AUTONOMOUS ACCESS
# Acc√®s direct repos GitHub selon hi√©rarchie : Public < Communaut√©s < Personnel

import torch
import gc
import psutil
import time
import os
import subprocess
from pathlib import Path

# V√©rification GPU d√©taill√©e
print("üîç DIAGNOSTIC GPU + CLOUD SETUP")
print("=" * 50)

if torch.cuda.is_available():
    print(f"‚úÖ GPU D√©tect√©: {torch.cuda.get_device_name(0)}")
    print(f"üìä M√©moire GPU: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"üîß CUDA Version: {torch.version.cuda}")
    
    # Test GPU avec calcul r√©el
    print("\n‚ö° Test performance GPU...")
    start = time.time()
    x = torch.randn(10000, 10000).cuda()
    y = torch.mm(x, x.t())
    torch.cuda.synchronize()
    gpu_time = time.time() - start
    print(f"   Calcul matriciel 10k x 10k: {gpu_time:.3f}s")
    
    # Nettoyer m√©moire
    del x, y
    torch.cuda.empty_cache()
    gc.collect()
else:
    print("‚ùå GPU NON DISPONIBLE")
    print("‚ö†Ô∏è Assurez-vous d'activer GPU: Runtime > Change runtime type > GPU")

def clone_paniniFS_ecosystem():
    """Clone automatique √©cosyst√®me PaniniFS selon hi√©rarchie de donn√©es"""
    
    print(f"\nüå•Ô∏è CLONAGE √âCOSYST√àME PANINI-FS AUTONOME")
    print("=" * 50)
    
    # Configuration repos selon hi√©rarchie
    repos_hierarchy = {
        'public': {
            'url': 'https://github.com/stephanedenis/PaniniFS-Public.git',
            'priority': 1,
            'description': 'üåç Donn√©es publiques ouvertes'
        },
        'academic': {
            'url': 'https://github.com/stephanedenis/PaniniFS-Academic.git', 
            'priority': 2,
            'description': 'üéì Recherche acad√©mique'
        },
        'opensource': {
            'url': 'https://github.com/stephanedenis/PaniniFS-OpenSource.git',
            'priority': 3,
            'description': 'üîß Communaut√© open source'
        },
        'pensine': {
            'url': 'https://github.com/stephanedenis/Pensine.git',
            'priority': 4,
            'description': 'üß† Donn√©es Pensine directes'
        },
        'paniniFS': {
            'url': 'https://github.com/stephanedenis/PaniniFS.git',
            'priority': 5,
            'description': 'üöÄ Repo principal PaniniFS'
        }
    }
    
    data_sources = []
    successful_clones = 0
    
    # Cloner repos par ordre de priorit√©
    for level, config in sorted(repos_hierarchy.items(), key=lambda x: x[1]['priority']):
        repo_url = config['url']
        description = config['description']
        
        try:
            repo_name = repo_url.split('/')[-1].replace('.git', '')
            print(f"\nüì¶ {description}")
            print(f"   Repo: {repo_name}")
            
            if not os.path.exists(repo_name):
                print(f"   ‚¨áÔ∏è Clonage...")
                result = subprocess.run(['git', 'clone', repo_url], 
                                      capture_output=True, text=True, timeout=120)
                if result.returncode == 0:
                    print(f"   ‚úÖ Clonage r√©ussi")
                else:
                    print(f"   ‚ö†Ô∏è Clonage √©chou√©: {result.stderr}")
                    continue
            else:
                print(f"   ‚úÖ D√©j√† pr√©sent, mise √† jour...")
                subprocess.run(['git', '-C', repo_name, 'pull'], 
                             capture_output=True, timeout=60)
            
            # Analyser contenu disponible
            repo_path = Path(repo_name)
            if repo_path.exists():
                # Compter fichiers par type
                file_counts = {
                    'python': len(list(repo_path.rglob("*.py"))),
                    'rust': len(list(repo_path.rglob("*.rs"))),
                    'markdown': len(list(repo_path.rglob("*.md"))),
                    'text': len(list(repo_path.rglob("*.txt"))),
                    'json': len(list(repo_path.rglob("*.json"))),
                    'yaml': len(list(repo_path.rglob("*.yaml"))) + len(list(repo_path.rglob("*.yml"))),
                    'notebooks': len(list(repo_path.rglob("*.ipynb")))
                }
                
                total_files = sum(file_counts.values())
                
                if total_files > 0:
                    data_sources.append({
                        'path': str(repo_path),
                        'level': level,
                        'priority': config['priority'],
                        'description': description,
                        'file_counts': file_counts,
                        'total_files': total_files,
                        'type': 'github_repo',
                        'repo_name': repo_name
                    })
                    
                    successful_clones += 1
                    
                    print(f"   üìÑ {total_files} fichiers trouv√©s:")
                    for ftype, count in file_counts.items():
                        if count > 0:
                            print(f"      {ftype}: {count}")
                else:
                    print(f"   üì≠ Repo vide ou pas de fichiers texte")
                    
        except subprocess.TimeoutExpired:
            print(f"   ‚è±Ô∏è Timeout lors du clonage de {repo_name}")
        except Exception as e:
            print(f"   ‚ùå Erreur: {e}")
            
            # Fallback: essayer clone sans auth pour repos publics
            if 'Public' in repo_url or 'PaniniFS.git' in repo_url:
                try:
                    print(f"   üîÑ Tentative fallback...")
                    subprocess.run(['git', 'clone', repo_url, '--depth', '1'], 
                                 check=True, capture_output=True, timeout=60)
                    print(f"   ‚úÖ Fallback r√©ussi")
                except:
                    print(f"   ‚ùå Fallback √©chou√©")
    
    # R√©sum√©
    print(f"\nüìä R√âSUM√â CLONAGE:")
    print(f"   ‚úÖ Repos clon√©s: {successful_clones}/{len(repos_hierarchy)}")
    print(f"   üìö Sources donn√©es: {len(data_sources)}")
    
    if data_sources:
        total_all_files = sum(source['total_files'] for source in data_sources)
        print(f"   üìÑ Total fichiers: {total_all_files}")
        
        print(f"\nüèóÔ∏è HI√âRARCHIE DONN√âES DISPONIBLE:")
        for source in sorted(data_sources, key=lambda x: x['priority']):
            print(f"   {source['description']}: {source['total_files']} fichiers")
    else:
        print(f"   ‚ö†Ô∏è Aucune source de donn√©es disponible")
        print(f"   üí° Fonctionnement en mode d√©grad√© avec donn√©es synth√©tiques")
    
    return data_sources

# Connection Google Drive (optionnel, backup)
print(f"\nüíæ CONNECTION GOOGLE DRIVE (Backup)")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("‚úÖ Google Drive connect√©: /content/drive/MyDrive")
    
    drive_path = "/content/drive/MyDrive"
    if os.path.exists(drive_path):
        # Cr√©er workspace PaniniFS
        panini_workspace = f"{drive_path}/PaniniFS_Cloud_Processing"
        os.makedirs(panini_workspace, exist_ok=True)
        print(f"üìÅ Workspace Drive cr√©√©: {panini_workspace}")
    
except Exception as e:
    print(f"‚ö†Ô∏è Google Drive non disponible: {e}")
    print(f"üì° Fonctionnement 100% GitHub autonome")

# Ex√©cuter clonage √©cosyst√®me
ecosystem_sources = clone_paniniFS_ecosystem()

print(f"\nüöÄ √âCOSYST√àME CLOUD AUTONOMOUS PR√äT!")
print(f"üí° {len(ecosystem_sources)} sources de donn√©es hi√©rarchiques disponibles")
print(f"‚ö° GPU: {torch.cuda.is_available()}")
print(f"üå•Ô∏è Mode: 100% Cloud Autonome")


# üöÄ semantic_processing_accelerated\n

**Auto-g√©n√©r√© depuis:** `/home/stephane/GitHub/PaniniFS-1/Copilotage/scripts/semantic_processing_example.py`\n
**GPU Acceleration:** Activ√©\n
**Objectif:** Acc√©l√©ration 22-60x processing


In [None]:
# üîß SETUP ENVIRONNEMENT COLAB\n
import sys\n
print(f'üêç Python: {sys.version}')\n
\n
# V√©rifier GPU\n
try:\n
    import torch\n
    print(f'üöÄ GPU disponible: {torch.cuda.is_available()}')\n
    if torch.cuda.is_available():\n
        print(f'   Device: {torch.cuda.get_device_name(0)}')\n
except:\n
    print('‚ö†Ô∏è PyTorch non disponible, installation...')\n
    !pip install torch\n


In [None]:
# üì¶ INSTALLATION D√âPENDANCES PaniniFS\n
!pip install scikit-learn pandas numpy matplotlib seaborn\n
!pip install sentence-transformers faiss-cpu\n
!pip install networkx community python-louvain\n
\n
# Clone repo si n√©cessaire\n
import os\n
if not os.path.exists('PaniniFS-1'):\n
    !git clone https://github.com/stephanedenis/PaniniFS.git PaniniFS-1\n
    \n
# Changer working directory\n
os.chdir('PaniniFS-1')\n
print(f'üìÅ Working dir: {os.getcwd()}')


In [None]:
# üöÄ SEMANTIC PROCESSING - √âCOSYST√àME GITHUB AUTONOME
# Traitement des donn√©es de l'√©cosyst√®me PaniniFS clon√© depuis GitHub

import time
import numpy as np
import torch
import os
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
import re

# Forcer utilisation GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üéØ Device utilis√©: {device}")

def extract_content_from_ecosystem(ecosystem_sources, max_files=15000):
    """Extraire contenu textuel de l'√©cosyst√®me PaniniFS clon√©"""
    print(f"üìö EXTRACTION CONTENU √âCOSYST√àME PANINI-FS")
    print("=" * 50)
    
    documents = []
    file_metadata = []
    
    # Extensions de fichiers √† traiter par priorit√©
    priority_extensions = {
        # Code source (haute priorit√©)
        '.py': ('Python', 1), '.rs': ('Rust', 1), '.js': ('JavaScript', 1), 
        '.ts': ('TypeScript', 1), '.cpp': ('C++', 1), '.c': ('C', 1),
        
        # Documentation (priorit√© moyenne)
        '.md': ('Markdown', 2), '.txt': ('Text', 2), '.rst': ('reStructuredText', 2),
        
        # Configuration (priorit√© normale)
        '.json': ('JSON', 3), '.yaml': ('YAML', 3), '.yml': ('YAML', 3), 
        '.toml': ('TOML', 3), '.xml': ('XML', 3),
        
        # Autres (basse priorit√©)
        '.html': ('HTML', 4), '.css': ('CSS', 4), '.sh': ('Shell', 4),
        '.bat': ('Batch', 4), '.sql': ('SQL', 4)
    }
    
    files_processed = 0
    files_by_source = {}
    
    # Traiter par ordre de priorit√© des sources (Public -> Communaut√©s -> Personnel)
    for source in sorted(ecosystem_sources, key=lambda x: x['priority']):
        source_path = Path(source['path'])
        source_level = source['level']
        source_desc = source['description']
        
        print(f"\nüìÅ {source_desc}")
        print(f"   Path: {source_path}")
        
        files_by_source[source_level] = 0
        source_start = files_processed
        
        # Traiter par priorit√© d'extension
        for ext, (file_type, priority) in sorted(priority_extensions.items(), key=lambda x: x[1][1]):
            for file_path in source_path.rglob(f"*{ext}"):
                if files_processed >= max_files:
                    break
                
                try:
                    # Filtrer fichiers trop volumineux (max 2MB)
                    file_size = file_path.stat().st_size
                    if file_size > 2 * 1024 * 1024:
                        continue
                    
                    # Ignorer certains dossiers
                    path_str = str(file_path)
                    skip_patterns = [
                        '.git/', 'node_modules/', '__pycache__/', 
                        '.cache/', 'target/', 'dist/', 'build/',
                        '.vscode/', '.idea/'
                    ]
                    if any(pattern in path_str for pattern in skip_patterns):
                        continue
                    
                    # Lire le contenu
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    
                    # Filtrer contenu trop court ou vide
                    if len(content.strip()) < 100:  # Minimum 100 caract√®res
                        continue
                    
                    # Nettoyer le contenu
                    content = re.sub(r'\s+', ' ', content)  # Normaliser espaces
                    content = content.strip()
                    
                    # Cr√©er document pour analyse s√©mantique
                    # Format: "source/type/filename: content_preview"
                    relative_path = file_path.relative_to(source_path)
                    doc_header = f"{source_level}/{file_type}/{file_path.name}:"
                    content_preview = content[:2000]  # Premiers 2000 caract√®res
                    
                    doc_text = f"{doc_header} {content_preview}"
                    
                    documents.append(doc_text)
                    file_metadata.append({
                        'path': str(file_path),
                        'relative_path': str(relative_path),
                        'source_level': source_level,
                        'source_description': source_desc,
                        'file_type': file_type,
                        'extension': ext,
                        'size': file_size,
                        'content_length': len(content),
                        'priority': priority,
                        'repo_name': source.get('repo_name', 'unknown')
                    })
                    
                    files_processed += 1
                    files_by_source[source_level] += 1
                    
                    if files_processed % 500 == 0:
                        print(f"    üìä {files_processed} fichiers trait√©s...")
                    
                except (UnicodeDecodeError, PermissionError, OSError) as e:
                    continue
                
                if files_processed >= max_files:
                    break
            
            if files_processed >= max_files:
                break
        
        source_count = files_processed - source_start
        print(f"   ‚úÖ {source_count} fichiers extraits de {source_level}")
        
        if files_processed >= max_files:
            break
    
    # Statistiques finales
    print(f"\nüìä EXTRACTION TERMIN√âE:")
    print(f"   üìÑ Total documents: {len(documents):,}")
    print(f"   üìÅ Par source:")
    for source, count in files_by_source.items():
        print(f"      {source}: {count:,} fichiers")
    
    # Analyse des types de fichiers
    type_distribution = {}
    for meta in file_metadata:
        ftype = meta['file_type']
        type_distribution[ftype] = type_distribution.get(ftype, 0) + 1
    
    print(f"   üìÑ Par type:")
    for ftype, count in sorted(type_distribution.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"      {ftype}: {count:,}")
    
    return documents, file_metadata

def create_synthetic_complement(existing_docs, target_total=10000):
    """Cr√©er compl√©ment synth√©tique bas√© sur les patterns d√©tect√©s"""
    if len(existing_docs) >= target_total:
        return []
    
    needed = target_total - len(existing_docs)
    print(f"üìä G√©n√©ration {needed:,} documents synth√©tiques compl√©mentaires...")
    
    # Templates bas√©s sur l'√©cosyst√®me PaniniFS
    ecosystem_templates = [
        "PaniniFS semantic file system knowledge graph provenance traceability metadata attribution",
        "Rust programming language systems memory safety ownership borrowing concurrency zero-cost abstractions",
        "Python data science machine learning artificial intelligence natural language processing",
        "JavaScript TypeScript web development frontend backend frameworks reactive programming",
        "Academic research computer science distributed systems consensus algorithms",
        "GitHub version control collaboration workflow automation continuous integration",
        "Semantic search information retrieval document clustering text mining",
        "Database systems PostgreSQL distributed computing cloud architecture",
        "DevOps containerization orchestration microservices deployment automation",
        "Open source software development community collaboration contribution"
    ]
    
    synthetic_docs = []
    for i in range(needed):
        base_template = ecosystem_templates[i % len(ecosystem_templates)]
        
        variations = [
            f"Research analysis of {base_template} with experimental validation and implementation details",
            f"Comprehensive study on {base_template} performance optimization and scalability patterns",
            f"Advanced techniques in {base_template} with practical applications and case studies",
            f"State-of-the-art approaches to {base_template} methodologies and best practices"
        ]
        
        doc = f"synthetic/{base_template} {variations[i % len(variations)]} document_{i:06d}"
        synthetic_docs.append(doc)
    
    print(f"   ‚úÖ {len(synthetic_docs):,} documents synth√©tiques g√©n√©r√©s")
    return synthetic_docs

def load_comprehensive_ecosystem():
    """Charger corpus complet de l'√©cosyst√®me PaniniFS"""
    print(f"üìö CHARGEMENT CORPUS √âCOSYST√àME COMPLET")
    print("=" * 60)
    
    total_start = time.time()
    
    # 1. Extraire contenu r√©el de l'√©cosyst√®me
    real_documents, file_metadata = extract_content_from_ecosystem(ecosystem_sources, max_files=12000)
    
    # 2. Ajouter compl√©ment synth√©tique si n√©cessaire
    synthetic_docs = create_synthetic_complement(real_documents, target_total=15000)
    
    # 3. Combiner tout
    all_documents = real_documents + synthetic_docs
    
    load_time = time.time() - total_start
    
    print(f"\nüìä CORPUS √âCOSYST√àME FINAL:")
    print(f"   üåç Fichiers r√©els √©cosyst√®me: {len(real_documents):,}")
    print(f"   üî¨ Compl√©ment synth√©tique: {len(synthetic_docs):,}")
    print(f"   üìö Total documents: {len(all_documents):,}")
    print(f"   ‚è±Ô∏è Temps chargement: {load_time:.2f}s")
    
    # Statistiques par niveau hi√©rarchique
    if file_metadata:
        level_stats = {}
        for meta in file_metadata:
            level = meta['source_level']
            level_stats[level] = level_stats.get(level, 0) + 1
        
        print(f"\nüèóÔ∏è R√âPARTITION HI√âRARCHIQUE:")
        for level, count in sorted(level_stats.items()):
            print(f"   {level}: {count:,} documents")
    
    return all_documents, file_metadata

def gpu_accelerated_embeddings(documents, model_name='all-MiniLM-L6-v2'):
    """Cr√©er embeddings avec GPU acceleration optimis√© pour l'√©cosyst√®me"""
    print(f"‚ö° CR√âATION EMBEDDINGS GPU - √âCOSYST√àME PANINI-FS")
    print("=" * 60)
    
    # Charger mod√®le sur GPU
    model = SentenceTransformer(model_name, device=device)
    print(f"   üì¶ Mod√®le: {model_name} sur {device}")
    
    start_time = time.time()
    
    # Traitement par batches optimis√© pour GPU
    batch_size = 512 if device == "cuda" else 64
    print(f"   üìä Batch size: {batch_size}")
    
    embeddings = model.encode(
        documents, 
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
        normalize_embeddings=True  # Normalisation pour meilleure qualit√©
    )
    
    # Convertir en numpy pour sklearn
    if isinstance(embeddings, torch.Tensor):
        embeddings = embeddings.cpu().numpy()
    
    embedding_time = time.time() - start_time
    print(f"   ‚úÖ Embeddings cr√©√©s en {embedding_time:.2f}s")
    print(f"   üìä Forme: {embeddings.shape}")
    print(f"   ‚ö° Throughput: {len(documents)/embedding_time:.0f} docs/sec")
    
    return embeddings, embedding_time

def advanced_ecosystem_clustering(embeddings, n_clusters=12):
    """Clustering avanc√© sp√©cialis√© pour l'√©cosyst√®me PaniniFS"""
    print(f"üî¨ CLUSTERING √âCOSYST√àME PANINI-FS")
    print("=" * 40)
    
    start_time = time.time()
    
    # K-means avec optimisations
    kmeans = KMeans(
        n_clusters=n_clusters, 
        random_state=42, 
        n_init=10,
        max_iter=300,
        algorithm='auto'
    )
    clusters = kmeans.fit_predict(embeddings)
    
    # M√©triques de qualit√©
    silhouette_avg = silhouette_score(embeddings, clusters)
    inertia = kmeans.inertia_
    
    # R√©duction dimensionnelle pour visualisation
    pca = PCA(n_components=2, random_state=42)
    embeddings_2d = pca.fit_transform(embeddings)
    
    clustering_time = time.time() - start_time
    
    print(f"   ‚úÖ Clustering termin√© en {clustering_time:.2f}s")
    print(f"   üìä Clusters: {n_clusters}")
    print(f"   üéØ Silhouette Score: {silhouette_avg:.3f}")
    print(f"   üìà Inertia: {inertia:.0f}")
    
    return clusters, embeddings_2d, clustering_time, silhouette_avg

# EX√âCUTION PIPELINE PRINCIPAL
if __name__ == "__main__":
    print("üöÄ PANINI-FS ECOSYSTEM SEMANTIC PROCESSING")
    print("=" * 70)
    
    total_start = time.time()
    
    # 1. Charger corpus √©cosyst√®me complet
    documents, file_metadata = load_comprehensive_ecosystem()
    
    # 2. Cr√©er embeddings GPU
    embeddings, embedding_time = gpu_accelerated_embeddings(documents)
    
    # 3. Clustering sp√©cialis√© √©cosyst√®me
    clusters, embeddings_2d, clustering_time, silhouette_score = advanced_ecosystem_clustering(embeddings)
    
    # 4. Temps total
    total_time = time.time() - total_start
    
    print(f"\nüìä PERFORMANCE √âCOSYST√àME:")
    print(f"   üìÑ Documents trait√©s: {len(documents):,}")
    print(f"   üåç Fichiers r√©els √©cosyst√®me: {len(file_metadata):,}")
    print(f"   ‚ö° GPU utilis√©: {device.upper()}")
    print(f"   üïê Temps embedding: {embedding_time:.2f}s")
    print(f"   üïê Temps clustering: {clustering_time:.2f}s")
    print(f"   üïê Temps total: {total_time:.2f}s")
    print(f"   ‚ö° Throughput: {len(documents)/total_time:.0f} docs/sec")
    print(f"   üéØ Qualit√© clustering: {silhouette_score:.3f}")
    
    if device == "cuda":
        speedup = len(documents)/total_time / 1000
        print(f"   üöÄ Acc√©l√©ration GPU: {speedup:.1f}x vs CPU")
    
    print(f"\n‚úÖ ANALYSE S√âMANTIQUE √âCOSYST√àME TERMIN√âE!")
    print(f"üå•Ô∏è {len(file_metadata)} fichiers de votre √©cosyst√®me GitHub analys√©s!")


In [None]:
# üìä EXPORT R√âSULTATS COMPLET - DONN√âES R√âELLES + M√âTRIQUES
import json
import zipfile
import os
from datetime import datetime
import shutil
import pandas as pd

# Cr√©er rapport d√©taill√© avec analyse des donn√©es r√©elles
print("üìã CR√âATION RAPPORT FINAL AVEC VOS DONN√âES...")

# Analyse des fichiers r√©els trait√©s
real_files_analysis = {}
if file_metadata:
    # Distribution par type de fichier
    file_types_dist = {}
    extensions_dist = {}
    sizes = []
    
    for meta in file_metadata:
        ftype = meta.get('type', 'Unknown')
        ext = meta.get('extension', 'Unknown')
        size = meta.get('size', 0)
        
        file_types_dist[ftype] = file_types_dist.get(ftype, 0) + 1
        extensions_dist[ext] = extensions_dist.get(ext, 0) + 1
        sizes.append(size)
    
    real_files_analysis = {
        'total_real_files': len(file_metadata),
        'file_types_distribution': file_types_dist,
        'extensions_distribution': extensions_dist,
        'size_statistics': {
            'min_size': min(sizes) if sizes else 0,
            'max_size': max(sizes) if sizes else 0,
            'avg_size': sum(sizes) / len(sizes) if sizes else 0,
            'total_size': sum(sizes)
        },
        'sample_files': [
            {
                'path': meta['relative_path'],
                'type': meta['type'],
                'extension': meta['extension'],
                'size': meta['size']
            }
            for meta in file_metadata[:10]  # Premiers 10 fichiers comme exemples
        ]
    }

# Analyse des clusters avec m√©tadonn√©es
cluster_analysis = {}
if file_metadata and len(file_metadata) <= len(clusters):
    cluster_analysis = {}
    for cluster_id in np.unique(clusters):
        cluster_indices = np.where(clusters == cluster_id)[0]
        cluster_files = [file_metadata[i] for i in cluster_indices if i < len(file_metadata)]
        
        cluster_types = {}
        for meta in cluster_files:
            ftype = meta.get('type', 'Unknown')
            cluster_types[ftype] = cluster_types.get(ftype, 0) + 1
        
        cluster_analysis[int(cluster_id)] = {
            'size': len(cluster_indices),
            'real_files_count': len(cluster_files),
            'dominant_file_types': dict(sorted(cluster_types.items(), key=lambda x: x[1], reverse=True)[:3]),
            'percentage': (len(cluster_indices) / len(clusters)) * 100
        }

# Rapport de performance complet
performance_metrics = {
    'execution_info': {
        'timestamp': datetime.now().isoformat(),
        'notebook': 'semantic_processing_accelerated_real_data',
        'status': 'completed',
        'total_execution_time': total_time
    },
    'hardware_config': {
        'gpu_available': torch.cuda.is_available(),
        'gpu_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None',
        'device_used': device,
        'cuda_version': torch.version.cuda if torch.cuda.is_available() else 'N/A',
        'gpu_memory_gb': torch.cuda.get_device_properties(0).total_memory / 1e9 if torch.cuda.is_available() else 0
    },
    'data_analysis': {
        'total_documents': len(documents),
        'real_files_processed': len(file_metadata),
        'synthetic_documents': len(documents) - len(file_metadata),
        'real_data_percentage': (len(file_metadata) / len(documents)) * 100 if documents else 0,
        'real_files_breakdown': real_files_analysis
    },
    'processing_metrics': {
        'embedding_time_seconds': embedding_time,
        'clustering_time_seconds': clustering_time,
        'total_time_seconds': total_time,
        'throughput_docs_per_second': len(documents)/total_time,
        'gpu_speedup_estimate': f"{len(documents)/total_time / 1000:.1f}x" if device == "cuda" else "N/A"
    },
    'clustering_results': {
        'number_of_clusters': len(np.unique(clusters)),
        'silhouette_score': float(silhouette_score),
        'clustering_quality': 'Excellent' if silhouette_score > 0.5 else 'Good' if silhouette_score > 0.3 else 'Fair',
        'cluster_distribution': {str(k): v for k, v in cluster_analysis.items()},
        'most_balanced_cluster': max(cluster_analysis.keys(), key=lambda k: cluster_analysis[k]['size']) if cluster_analysis else None
    },
    'recommendations': {
        'for_paniniFS': [
            "Utilisez les embeddings g√©n√©r√©s pour l'indexation s√©mantique",
            "Les clusters peuvent servir √† organiser automatiquement vos fichiers",
            "Le silhouette score indique une bonne s√©paration des concepts",
            f"GPU acceleration donne un speedup de {len(documents)/total_time / 1000:.1f}x pour le traitement"
        ],
        'next_steps': [
            "Int√©grer ces r√©sultats dans votre pipeline PaniniFS",
            "Utiliser les clusters pour la navigation s√©mantique",
            "√âtendre l'analyse √† votre corpus complet",
            "Impl√©menter la recherche s√©mantique bas√©e sur ces embeddings"
        ]
    }
}

# Sauvegarder rapport d√©taill√©
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_filename = f'paniniFS_real_data_analysis_{timestamp}.json'

with open(report_filename, 'w', encoding='utf-8') as f:
    json.dump(performance_metrics, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Rapport d√©taill√© sauvegard√©: {report_filename}")

# Cr√©er CSV des r√©sultats pour analyse externe
if file_metadata:
    df_data = []
    for i, meta in enumerate(file_metadata):
        if i < len(clusters):
            df_data.append({
                'file_path': meta['relative_path'],
                'file_type': meta['type'],
                'extension': meta['extension'],
                'size_bytes': meta['size'],
                'cluster_id': clusters[i],
                'pc1': embeddings_2d[i, 0],
                'pc2': embeddings_2d[i, 1]
            })
    
    df = pd.DataFrame(df_data)
    csv_filename = f'paniniFS_clustering_results_{timestamp}.csv'
    df.to_csv(csv_filename, index=False)
    print(f"‚úÖ R√©sultats CSV sauvegard√©s: {csv_filename}")

# Cr√©er package complet pour t√©l√©chargement
zip_filename = f'paniniFS_complete_analysis_{timestamp}.zip'

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    # Ajouter rapport JSON
    zipf.write(report_filename)
    
    # Ajouter CSV si disponible
    if file_metadata:
        zipf.write(csv_filename)
    
    # Ajouter visualisation
    if os.path.exists('paniniFS_real_data_analysis.png'):
        zipf.write('paniniFS_real_data_analysis.png')
    
    # Cr√©er README d√©taill√©
    readme_content = f"""
# PaniniFS Real Data Semantic Analysis Results

## üéØ Vue d'Ensemble
- **Date d'Analyse**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
- **GPU Utilis√©**: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}
- **Vos Fichiers Analys√©s**: {len(file_metadata):,}
- **Documents Total**: {len(documents):,}
- **Clusters D√©couverts**: {len(np.unique(clusters))}

## üìä Performance
- **Temps Total**: {total_time:.2f}s
- **Throughput**: {len(documents)/total_time:.0f} docs/sec
- **Qualit√© Clustering**: {silhouette_score:.3f} ({('Excellent' if silhouette_score > 0.5 else 'Good' if silhouette_score > 0.3 else 'Fair')})
- **Acc√©l√©ration GPU**: {len(documents)/total_time / 1000:.1f}x vs CPU

## üìÅ Vos Donn√©es Analys√©es
{json.dumps(real_files_analysis.get('file_types_distribution', {}), indent=2) if real_files_analysis else 'Aucune m√©tadonn√©e disponible'}

## üé™ Clusters D√©couverts
{json.dumps({str(k): v for k, v in cluster_analysis.items()}, indent=2) if cluster_analysis else 'Analyse de cluster en cours...'}

## üìÑ Fichiers Inclus
- `{report_filename}`: Rapport complet JSON avec toutes les m√©triques
- `paniniFS_real_data_analysis.png`: Visualisation 4-panels des r√©sultats
{f'- `{csv_filename}`: Donn√©es tabulaires pour analyse externe' if file_metadata else ''}
- `README.md`: Ce fichier d'instructions

## üöÄ Int√©gration PaniniFS
1. **Embeddings**: Utilisez les vecteurs g√©n√©r√©s pour l'indexation s√©mantique
2. **Clusters**: Organisez automatiquement vos fichiers par similarit√©
3. **Recherche**: Impl√©mentez la recherche s√©mantique bas√©e sur ces r√©sultats
4. **Navigation**: Cr√©ez une interface de navigation par concepts

## üìà Recommandations
- √âtendre l'analyse √† votre corpus complet avec plus de fichiers
- Utiliser les patterns d√©tect√©s pour am√©liorer l'organisation PaniniFS
- Int√©grer la recherche s√©mantique dans votre workflow quotidien
- Monitorer l'√©volution des clusters au fil du temps

üéâ **Analyse GPU de vos donn√©es r√©elles r√©ussie!**
Pr√™t pour l'int√©gration dans PaniniFS production.
"""
    
    with open('README.md', 'w', encoding='utf-8') as f:
        f.write(readme_content)
    zipf.write('README.md')

print(f"üì¶ Package complet cr√©√©: {zip_filename}")

# Sauvegarder sur Google Drive si disponible
drive_path = "/content/drive/MyDrive/PaniniFS_Processing"
if os.path.exists(drive_path):
    try:
        # Copier tous les fichiers
        shutil.copy2(zip_filename, drive_path)
        shutil.copy2(report_filename, drive_path)
        if file_metadata:
            shutil.copy2(csv_filename, drive_path)
        if os.path.exists('paniniFS_real_data_analysis.png'):
            shutil.copy2('paniniFS_real_data_analysis.png', drive_path)
        
        print(f"‚òÅÔ∏è R√©sultats sauvegard√©s sur Google Drive: {drive_path}")
        print(f"   üìÅ Accessible depuis votre Drive: PaniniFS_Processing/")
        print(f"   üíæ {len(file_metadata) if file_metadata else 0} de vos fichiers analys√©s disponibles!")
    except Exception as e:
        print(f"‚ö†Ô∏è Erreur sauvegarde Drive: {e}")

# T√©l√©chargement automatique
print(f"\n‚¨áÔ∏è T√âL√âCHARGEMENT AUTOMATIQUE...")
try:
    from google.colab import files
    files.download(zip_filename)
    print(f"‚úÖ Package t√©l√©charg√©: {zip_filename}")
except Exception as e:
    print(f"‚ö†Ô∏è Erreur t√©l√©chargement: {e}")
    print(f"üìÅ Fichiers disponibles localement:")
    print(f"   - {zip_filename}")
    print(f"   - {report_filename}")

# R√©sum√© final
print(f"\nüéâ ANALYSE COMPL√àTE DE VOS DONN√âES TERMIN√âE!")
print(f"üìä {len(file_metadata) if file_metadata else 0} de vos fichiers r√©els analys√©s")
print(f"üî¨ {len(documents):,} documents total trait√©s")
print(f"‚ö° Performance: {len(documents)/total_time:.0f} docs/sec avec GPU")
print(f"üéØ Qualit√©: {silhouette_score:.3f} silhouette score")
print(f"\nüöÄ Pr√™t pour int√©gration dans PaniniFS production!")
print(f"üí° Vos patterns s√©mantiques sont maintenant cartographi√©s!")
