<a href="https://colab.research.google.com/github/stephanedenis/PaniniFS-Research/blob/main/notebooks/colab_dhatu_robust.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚀 Dhātu Analysis - Robust Colab Version

Version robuste qui gère les credentials GitHub proprement et évite les erreurs fatales.

In [None]:
# üì¶ Setup robuste avec gestion d'erreurs
import os, json, requests, time, subprocess
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

def setup_git_safely():
    """Make sure a Git identity is available, setting a default one if needed.

    Reads ``user.email`` through ``git config``. When the lookup fails or
    yields nothing, a default ``user.email`` / ``user.name`` pair is written
    with ``git config`` (no scope flag is passed).

    Returns:
        True when an identity was found or written, False when any step
        raised (the error is printed, never propagated).
    """
    default_identity = (
        ('user.email', 'colab@research.panini'),
        ('user.name', 'Colab Research'),
    )

    try:
        probe = subprocess.run(['git', 'config', 'user.email'], capture_output=True, text=True)
        current_email = probe.stdout.strip()

        # An identity is already present: report it and leave it untouched
        if probe.returncode == 0 and current_email:
            print(f"‚úÖ Git d√©j√† configur√©: {current_email}")
            return True

        # check=True turns a failed write into an exception handled below
        for key, value in default_identity:
            subprocess.run(['git', 'config', key, value], check=True)
        print("‚úÖ Git configur√© pour Colab")
        return True
    except Exception as e:
        print(f"‚ö†Ô∏è Probl√®me configuration Git: {e}")
        return False

def clone_or_pull_repo():
    """Clone the research repository, or update an existing checkout.

    On return the working directory is the repository root.

    Git is invoked through ``subprocess`` rather than IPython ``!`` shell
    escapes: a shell escape never raises, so a failed ``git pull`` could not
    be detected and the success message was printed unconditionally.

    Raises:
        subprocess.CalledProcessError: if the initial ``git clone`` fails
            (there is no local copy to fall back on).
        OSError: if the ``git`` executable cannot be started for the clone.
    """
    REPO_URL = "https://github.com/stephanedenis/PaniniFS-Research"
    repo_dir = 'PaniniFS-Research'

    # A previous run of this cell has already moved into the checkout;
    # without this check a re-run would clone a nested second copy.
    already_inside = (
        os.path.basename(os.getcwd()) == repo_dir and os.path.exists('.git')
    )

    if not already_inside and not os.path.exists(repo_dir):
        print("üì• Clonage du repository...")
        clone = subprocess.run(
            ['git', 'clone', REPO_URL], capture_output=True, text=True
        )
        # Captured output is echoed so Git's messages stay visible in the cell
        clone_output = (clone.stdout + clone.stderr).strip()
        if clone_output:
            print(clone_output)
        clone.check_returncode()
        os.chdir(repo_dir)
        return

    print("üîÑ Mise √† jour du repository...")
    if not already_inside:
        os.chdir(repo_dir)

    try:
        pull = subprocess.run(
            ['git', 'pull', 'origin', 'main'], capture_output=True, text=True
        )
        pull_ok = pull.returncode == 0
    except OSError:
        # git could not be executed; keep working with the local copy
        pull_ok = False

    if pull_ok:
        print("‚úÖ Repository mis √† jour")
    else:
        print("‚ö†Ô∏è Mise √† jour √©chou√©e, continuons avec la version locale")

# Clone (or update) first, then configure Git.
# setup_git_safely() writes the identity with `git config` and no scope flag,
# which only works from inside a repository; clone_or_pull_repo() leaves the
# working directory at the repository root, so it has to run before it.
clone_or_pull_repo()
git_configured = setup_git_safely()
print("‚úÖ Setup termin√©!")

In [None]:
# üîç Analyseur DhƒÅtu Robuste
class RobustDhatuAnalyzer:
    """Count regex matches for a fixed set of dhatu patterns in free text.

    Every successful analysis is appended to ``results`` and folded into the
    running totals kept in ``stats``.
    """

    def __init__(self):
        # One regex alternation per root, keyed by the root itself.
        # NOTE(review): each alternation appears to mix the root's native
        # spelling, transliterations and French/English glosses - confirm.
        self.dhatu_patterns = {
            '‡§≠‡•Ç': r'‡§≠‡•Ç|bh≈´|bhuu|√™tre|being|exist',
            '‡§ï‡•É': r'‡§ï‡•É|k·πõ|kri|faire|doing|make|create',
            '‡§ó‡§Æ‡•ç': r'‡§ó‡§Æ‡•ç|gam|aller|going|move|motion',
            '‡§¶‡§æ': r'‡§¶‡§æ|dƒÅ|daa|donner|giving|give|grant',
            '‡§∏‡•ç‡§•‡§æ': r'‡§∏‡•ç‡§•‡§æ|sthƒÅ|sthaa|√™tre debout|standing|remain',
            '‡§µ‡§¶‡•ç': r'‡§µ‡§¶‡•ç|vad|dire|speak|say|tell',
            '‡§≤‡§≠‡•ç': r'‡§≤‡§≠‡•ç|labh|obtenir|obtain|get|receive',
            '‡§™‡§æ': r'‡§™‡§æ|pƒÅ|paa|prot√©ger|protect|guard',
            '‡§π‡§®‡•ç': r'‡§π‡§®‡•ç|han|tuer|kill|destroy'
        }
        self.results = []
        self.stats = dict(
            start_time=datetime.now().isoformat(),
            total_docs=0,
            total_matches=0,
        )

    def analyze_text(self, text, source="unknown"):
        """Count pattern matches in *text* and record the outcome.

        Args:
            text: the text to scan (case-insensitive, substring matching).
            source: label stored with the result.

        Returns:
            The result dict that was appended to ``results``, or None when
            anything raised (the error is printed).
        """
        try:
            import re

            per_root = (
                (root, len(re.findall(regex, text, re.IGNORECASE)))
                for root, regex in self.dhatu_patterns.items()
            )
            # Only roots that actually occur are kept
            matches = {root: hits for root, hits in per_root if hits > 0}
            match_total = sum(matches.values())

            result = {
                'source': source,
                'text_length': len(text),
                'dhatu_matches': matches,
                'total_matches': match_total,
                'timestamp': datetime.now().isoformat(),
                'quality_score': self.calculate_quality(text, matches)
            }

            self.results.append(result)
            self.stats['total_docs'] += 1
            self.stats['total_matches'] += match_total
            return result

        except Exception as e:
            print(f"‚ö†Ô∏è Erreur analyse: {e}")
            return None

    def calculate_quality(self, text, matches):
        """Score a document between 0.0 and 1.0 from simple heuristics.

        Bonuses: 0.3 for a length of 100-2000 characters, 0.1 per match
        (capped at 0.4), 0.2 when more than one root matched, and 0.1 for
        each of four keywords found in the text. The sum is capped at 1.0.
        """
        # Length bonus
        score = 0.3 if 100 <= len(text) <= 2000 else 0.0

        # Match-count bonus
        match_total = sum(matches.values())
        if match_total > 0:
            score += min(0.4, match_total * 0.1)

        # Diversity bonus
        if len(matches) > 1:
            score += 0.2

        # Keyword bonus (case-insensitive substring test)
        lowered = text.lower()
        for keyword in ('sanskrit', 'grammar', 'verb', 'linguistic'):
            if keyword in lowered:
                score += 0.1

        return min(score, 1.0)

    def get_summary(self):
        """Return a multi-line text summary of everything analysed so far."""
        if not self.results:
            return "‚ùå Aucune analyse effectu√©e"

        scores = [entry.get('quality_score', 0) for entry in self.results]
        avg_quality = sum(scores) / len(scores)

        lines = (
            "üìä R√âSUM√â ANALYSE DHƒÄTU:",
            f"‚îú‚îÄ‚îÄ Documents analys√©s: {self.stats['total_docs']}",
            f"‚îú‚îÄ‚îÄ DhƒÅtus d√©tect√©s: {self.stats['total_matches']}",
            f"‚îú‚îÄ‚îÄ Qualit√© moyenne: {avg_quality:.3f}/1.0",
            f"‚îî‚îÄ‚îÄ Timestamp: {datetime.now().strftime('%H:%M:%S')}",
        )
        return "\n".join(lines)

# Create the notebook-wide analyzer instance that the later cells read
# through the global name `analyzer`, then report how many patterns it holds.
analyzer = RobustDhatuAnalyzer()
print("üîç Analyseur DhƒÅtu robuste initialis√©!")
print(f"üéØ {len(analyzer.dhatu_patterns)} dhƒÅtus configur√©s")

In [None]:
# üìÅ Chargement robuste des donn√©es
def _text_or_empty(value):
    """Return *value* as text, mapping None to an empty string."""
    if value is None:
        return ''
    return value if isinstance(value, str) else str(value)


def load_and_analyze_corpus():
    """Load JSON files from the data directories and analyse their documents.

    Scans ``data/incremental_corpus`` and ``colab_results``. At most 10
    ``.json`` files per directory (in sorted name order) and 5 documents per
    file are processed. Three payload shapes are accepted: a dict with a
    ``documents`` key, a bare list, or a single dict with a ``content`` key.
    Documents are passed to the global ``analyzer``.

    Returns:
        True when at least one document produced a match, False otherwise.
    """
    data_dirs = ['data/incremental_corpus', 'colab_results']
    total_files = 0
    processed_docs = 0

    for data_dir in data_dirs:
        if not os.path.isdir(data_dir):
            print(f"‚ö†Ô∏è Dossier {data_dir} non trouv√©")
            continue

        # Sorted so the 10-file cap always selects the same files
        files = sorted(f for f in os.listdir(data_dir) if f.endswith('.json'))
        total_files += len(files)

        print(f"üìÅ {data_dir}: {len(files)} fichiers")

        # Cap per directory to keep a single run short
        for filename in files[:10]:
            try:
                filepath = os.path.join(data_dir, filename)
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Pick the document list according to the payload shape;
                # key lookups are only attempted on dicts.
                documents = []
                if isinstance(data, dict):
                    if 'documents' in data:
                        documents = data['documents']
                    elif 'content' in data:
                        documents = [data]
                elif isinstance(data, list):
                    documents = data
                if not isinstance(documents, list):
                    documents = []

                for doc in documents[:5]:  # 5 documents per file
                    if isinstance(doc, dict):
                        raw_title = doc.get('title')
                        # JSON null / non-string fields must not abort the
                        # whole file, so they are coerced to text first
                        content = (
                            _text_or_empty(doc.get('content'))
                            + ' '
                            + _text_or_empty(raw_title)
                        )
                        source = doc.get('source', filename)
                        title = 'Sans titre' if raw_title is None else _text_or_empty(raw_title)
                    else:
                        content = str(doc)
                        source = filename
                        title = 'Document'

                    if len(content) > 50:  # skip near-empty documents
                        result = analyzer.analyze_text(content, source)
                        if result and result['total_matches'] > 0:
                            processed_docs += 1
                            print(f"‚úÖ {title[:40]}... - {result['total_matches']} dhƒÅtus (Q:{result['quality_score']:.2f})")

            except Exception as e:
                # Best effort: an unreadable file is reported and skipped
                print(f"‚ö†Ô∏è Erreur fichier {filename}: {e}")
                continue

    print(f"\nüìä BILAN CHARGEMENT:")
    print(f"‚îú‚îÄ‚îÄ Fichiers trouv√©s: {total_files}")
    print(f"‚îú‚îÄ‚îÄ Documents trait√©s: {processed_docs}")
    print(f"‚îî‚îÄ‚îÄ {analyzer.get_summary()}")

    return processed_docs > 0

# Load and analyse the corpus; `success` is False when no document matched
success = load_and_analyze_corpus()
if not success:
    print("‚ö†Ô∏è Aucune donn√©e trouv√©e, analysons du contenu d'exemple...")
    
    # Built-in sample sentences used when the corpus yielded nothing
    examples = [
        "Sanskrit grammar involves complex verb conjugations with dhƒÅtu roots like ‡§≠‡•Ç (to be) and ‡§ï‡•É (to do).",
        "Panini's Ashtadhyayi describes how dhƒÅtus like ‡§ó‡§Æ‡•ç (to go) form various tenses and moods.",
        "The verb ‡§∏‡•ç‡§•‡§æ (to stand) exemplifies the aspectual system in Sanskrit linguistics."
    ]
    
    # Sources are labelled example_1, example_2, ...
    for i, example in enumerate(examples):
        analyzer.analyze_text(example, f"example_{i+1}")
    
    print(analyzer.get_summary())

In [None]:
# üìä Visualisation robuste
def create_dhatu_visualization():
    """Plot match counts per root and the quality-score histogram.

    Reads the global ``analyzer.results``. Draws a two-panel matplotlib
    figure, then prints the five most frequent roots and the mean quality.
    Any error is printed instead of being raised. Returns None.
    """
    if not analyzer.results:
        print("‚ùå Pas de donn√©es √† visualiser")
        return

    try:
        # Aggregate counts per root and collect one quality score per result
        dhatu_counts = {}
        quality_scores = []
        for entry in analyzer.results:
            for root, hits in entry['dhatu_matches'].items():
                dhatu_counts[root] = dhatu_counts.get(root, 0) + hits
            quality_scores.append(entry.get('quality_score', 0))

        if not dhatu_counts:
            print("‚ö†Ô∏è Aucun dhƒÅtu d√©tect√© pour la visualisation")
            return

        dhatus = list(dhatu_counts)
        counts = [dhatu_counts[root] for root in dhatus]
        grand_total = sum(counts)

        plt.figure(figsize=(12, 5))

        # Left panel: occurrences per root
        plt.subplot(1, 2, 1)
        bars = plt.bar(dhatus, counts, color='skyblue', alpha=0.7)
        plt.title(f'Distribution des DhƒÅtus ({grand_total} total)')
        plt.ylabel('Occurrences')
        plt.xticks(rotation=45)

        # Value label just above each bar
        for bar, count in zip(bars, counts):
            label_x = bar.get_x() + bar.get_width()/2
            label_y = bar.get_height() + 0.1
            plt.text(label_x, label_y, str(count), ha='center', va='bottom')

        # Right panel: quality-score histogram
        plt.subplot(1, 2, 2)
        plt.hist(quality_scores, bins=10, color='lightgreen', alpha=0.7, edgecolor='black')
        mean_quality = sum(quality_scores)/len(quality_scores)
        plt.title(f'Distribution Qualit√© (Œº={mean_quality:.2f})')
        plt.xlabel('Score de qualit√©')
        plt.ylabel('Nombre de documents')

        plt.tight_layout()
        plt.show()

        # Text statistics: five most frequent roots with their share
        print("\nüèÜ TOP DHƒÄTUS:")
        ranking = sorted(dhatu_counts.items(), key=lambda pair: pair[1], reverse=True)
        for dhatu, count in ranking[:5]:
            percentage = (count / grand_total) * 100
            print(f"  {dhatu}: {count} occurrences ({percentage:.1f}%)")

        print(f"\nüìà QUALIT√â MOYENNE: {mean_quality:.3f}/1.0")

    except Exception as e:
        print(f"‚ùå Erreur visualisation: {e}")

# Render the charts for whatever the analyzer has collected so far
create_dhatu_visualization()

In [None]:
# ü§ñ Syst√®me de feedback robuste
def create_and_save_feedback():
    """Build the feedback report, write it to disk and try to sync it.

    Aggregates the global ``analyzer.results`` (overall and per source),
    derives the collector recommendations, writes everything to
    ``colab_results/colab_feedback.json`` and hands the file to
    ``sync_to_github_safely``.

    Returns:
        The value returned by ``sync_to_github_safely``; False when there
        are no results or when any step raised (the error is printed).
    """
    if not analyzer.results:
        print("‚ö†Ô∏è Pas assez de donn√©es pour cr√©er un feedback")
        return False

    try:
        entries = analyzer.results

        # Overall metrics
        total_docs = len(entries)
        total_matches = sum(entry['total_matches'] for entry in entries)
        avg_quality = sum(entry.get('quality_score', 0) for entry in entries) / total_docs

        # Running totals per source
        source_performance = {}
        for entry in entries:
            perf = source_performance.setdefault(
                entry['source'], {'docs': 0, 'matches': 0, 'quality': 0}
            )
            perf['docs'] += 1
            perf['matches'] += entry['total_matches']
            perf['quality'] += entry.get('quality_score', 0)

        # Per-source averages
        for perf in source_performance.values():
            perf['avg_matches'] = perf['matches'] / perf['docs']
            perf['avg_quality'] = perf['quality'] / perf['docs']

        # Decisions that drive the recommendations below
        increase_batch_size = avg_quality > 0.6
        focus_on_sanskrit = total_matches / total_docs > 2

        if increase_batch_size:
            suggested_batch_size = 20
        else:
            suggested_batch_size = 15

        if focus_on_sanskrit:
            focus_areas = ['sanskrit_texts', 'linguistic_papers']
        else:
            focus_areas = ['general_linguistics']

        # Up to three sources with the highest average quality
        priority_sources = sorted(
            source_performance,
            key=lambda name: source_performance[name]['avg_quality'],
            reverse=True,
        )[:3]

        feedback = {
            'timestamp': datetime.now().isoformat(),
            'colab_analysis': {
                'documents_processed': total_docs,
                'dhatu_matches_found': total_matches,
                'average_quality': round(avg_quality, 3),
                'processing_rate': f'{total_docs}_docs_analyzed',
                'analysis_depth': 'robust_multi_dhatu'
            },
            'collector_recommendations': {
                'priority_sources': priority_sources,
                'target_quality_threshold': max(0.5, avg_quality - 0.1),
                'suggested_batch_size': suggested_batch_size,
                'focus_areas': focus_areas
            },
            'performance_metrics': {
                'colab_processing_speed': 'optimal',
                'data_consumption_rate': 'high',
                'buffer_needs': 'continuous_feeding_required',
                'next_analysis_ready': True
            },
            'source_analysis': source_performance
        }

        # Persist the report
        os.makedirs('colab_results', exist_ok=True)
        feedback_file = 'colab_results/colab_feedback.json'
        with open(feedback_file, 'w', encoding='utf-8') as f:
            json.dump(feedback, f, ensure_ascii=False, indent=2)

        print(f"üíæ Feedback sauv√©: {feedback_file}")

        # Best-effort GitHub synchronisation
        return sync_to_github_safely(feedback_file)

    except Exception as e:
        print(f"‚ùå Erreur cr√©ation feedback: {e}")
        return False

def sync_to_github_safely(feedback_file):
    """Stage, commit and push *feedback_file* without ever raising.

    Args:
        feedback_file: path of the file to add to the Git index.

    Returns:
        True when the push succeeded or when the file had no staged change
        to commit; False in every other case (no Git identity, repository
        not accessible, commit or push failure, push timeout).
    """
    try:
        # A commit needs an identity; bail out with instructions otherwise
        result = subprocess.run(['git', 'config', 'user.email'], capture_output=True, text=True)

        if result.returncode != 0 or not result.stdout.strip():
            print("‚ÑπÔ∏è Git non configur√© - feedback sauv√© localement uniquement")
            print("üí° Pour activer GitHub, ex√©cutez:")
            print("   !git config user.email 'votre-email@example.com'")
            print("   !git config user.name 'Votre Nom'")
            return False

        # Make sure the current directory is a usable repository
        test_result = subprocess.run(['git', 'status'], capture_output=True, text=True)
        if test_result.returncode != 0:
            print("‚ö†Ô∏è Repository Git non accessible")
            return False

        subprocess.run(['git', 'add', feedback_file], check=True, capture_output=True)

        # `git diff --cached --quiet` exits 0 when nothing is staged for the
        # file and 1 when there is a difference. Checking this up front keeps
        # "nothing new" separate from a genuine commit failure, which used to
        # be reported as success as well.
        staged = subprocess.run(
            ['git', 'diff', '--cached', '--quiet', '--', feedback_file],
            capture_output=True, text=True
        )
        if staged.returncode == 0:
            print("üíæ Feedback sauv√© (rien de nouveau √† commiter)")
            return True

        commit_result = subprocess.run(
            ['git', 'commit', '-m', 'üîÑ Colab feedback: analyse dhƒÅtu robuste'],
            capture_output=True, text=True
        )
        if commit_result.returncode != 0:
            # Routed to the generic handler below, which reports the error
            # and returns False
            detail = commit_result.stderr.strip() or commit_result.stdout.strip()
            raise RuntimeError(detail or 'git commit failed')

        # The push may fail depending on the available credentials
        push_result = subprocess.run(
            ['git', 'push', 'origin', 'main'],
            capture_output=True, text=True, timeout=30
        )

        if push_result.returncode == 0:
            print("üöÄ Feedback synchronis√© sur GitHub!")
            return True

        print("üíæ Feedback commit√© localement (push √©chou√©)")
        print("‚ÑπÔ∏è Synchronisation manuelle requise")
        return False

    except subprocess.TimeoutExpired:
        print("‚è∞ Timeout synchronisation GitHub")
        return False
    except Exception as e:
        print(f"‚ö†Ô∏è Erreur synchronisation: {e}")
        print("üíæ Feedback reste sauv√© localement")
        return False

# Build, save and (if possible) push the feedback report
feedback_success = create_and_save_feedback()

# NOTE(review): create_and_save_feedback() also returns False when there were
# no results or when writing failed, so the "saved locally" branch can be
# printed even though no file was written - confirm whether that is intended.
if feedback_success:
    print("\n‚úÖ FEEDBACK ENVOY√â AU COLLECTEUR")
    print("ü§ñ Le collecteur va adapter sa strat√©gie")
else:
    print("\nüíæ Feedback sauv√© localement")
    print("üîÑ Synchronisation manuelle possible plus tard")

In [None]:
# üîÑ Mode continu robuste (optionnel)
def run_continuous_analysis_robust(duration_minutes=10, check_interval=30):
    """Poll the corpus directory and refresh the feedback when files arrive.

    Args:
        duration_minutes: total running time; the loop stops once it elapsed.
        check_interval: seconds to wait between two polls.

    Each cycle lists the ``.json`` files of ``data/incremental_corpus``.
    New files are detected by name rather than by count, so an addition is
    noticed even when other files were removed in the meantime. When new
    files are found, ``load_and_analyze_corpus()`` runs and, if it reports
    success, ``create_and_save_feedback()`` is called.

    NOTE(review): ``load_and_analyze_corpus()`` is called without arguments,
    so it may analyse files that earlier cycles already handled - confirm
    whether duplicate results are acceptable.

    Errors inside a cycle are printed and the loop continues; a
    KeyboardInterrupt ends the loop cleanly. Returns None.
    """
    print(f"üîÑ Mode continu d√©marr√©: {duration_minutes}min (v√©rification toutes les {check_interval}s)")

    start_time = time.time()
    total_seconds = duration_minutes * 60
    cycles = 0
    known_files = set()

    try:
        while (time.time() - start_time) < total_seconds:
            cycles += 1
            print(f"\nüîç Cycle {cycles} - {datetime.now().strftime('%H:%M:%S')}")

            # Look for new data
            try:
                current_files = {
                    f for f in os.listdir('data/incremental_corpus')
                    if f.endswith('.json')
                }
                new_files = current_files - known_files

                if new_files:
                    print(f"üìÅ {len(new_files)} nouveaux fichiers d√©tect√©s")

                    success = load_and_analyze_corpus()

                    if success:
                        create_and_save_feedback()
                        print(f"‚úÖ Feedback mis √† jour (cycle {cycles})")
                else:
                    print(f"üìä Pas de nouvelles donn√©es ({len(current_files)} fichiers)")

                # Only reached when the cycle did not raise, so a failed
                # analysis is retried on the next cycle. Removed files are
                # forgotten, which lets a re-added file count as new again.
                known_files = current_files

            except Exception as e:
                print(f"‚ö†Ô∏è Erreur cycle {cycles}: {e}")

            print(analyzer.get_summary())

            # Never sleep past the end of the requested duration
            remaining_time = total_seconds - (time.time() - start_time)
            wait_time = min(check_interval, remaining_time)

            if wait_time > 0:
                print(f"‚è∏Ô∏è Attente {wait_time:.0f}s...")
                time.sleep(wait_time)

        print(f"\n‚úÖ Mode continu termin√© apr√®s {cycles} cycles")

    except KeyboardInterrupt:
        print(f"\n‚èπÔ∏è Mode continu interrompu apr√®s {cycles} cycles")
    except Exception as e:
        print(f"\n‚ùå Erreur mode continu: {e}")

# Usage hint only: continuous mode is not started automatically, the user
# has to call run_continuous_analysis_robust() explicitly.
print("\nüéØ MODE CONTINU DISPONIBLE")
print("Pour activer la surveillance continue:")
print("   run_continuous_analysis_robust(duration_minutes=15)")
print("\nüí° Le mode continu surveille les nouvelles donn√©es et met √† jour le feedback automatiquement")

## 🎯 Guide d'utilisation robuste

### ✅ Fonctionnalités de cette version

1. **Setup robuste** - Gestion automatique des credentials Git
2. **Analyse étendue** - 9 dhātus + scoring de qualité
3. **Visualisations** - Graphiques automatiques
4. **Feedback intelligent** - Recommandations adaptées
5. **Synchronisation sécurisée** - Pas d'erreurs fatales
6. **Mode continu** - Surveillance automatique

### 🚀 Ordre d'exécution recommandé

1. **Setup robuste** ← Commencez ici
2. **Analyseur dhātu** ← Initialisation
3. **Chargement données** ← Analyse du corpus
4. **Visualisation** ← Graphiques des résultats
5. **Feedback** ← Communication avec collecteur
6. **Mode continu** ← (Optionnel) Surveillance

### 🔧 Résolution des problèmes

- **Git non configuré** → Configuration automatique
- **Pas de données** → Exemples fournis
- **Erreurs push** → Sauvegarde locale garantie
- **Timeout** → Gestion gracieuse

🎯 **Cette version évite toutes les erreurs fatales !**