<a href="https://colab.research.google.com/github/stephanedenis/PaniniFS-Research/blob/main/notebooks/colab_dhatu_simple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🚀 Dhātu Analysis - Simple & Fast

Carnet Colab simple pour analyser les dhātus et interagir avec le système de collecte.

In [None]:
# 📦 Setup rapide
import os, json, requests, time
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

# GitHub setup
REPO_URL = "https://github.com/stephanedenis/PaniniFS-Research"
if not os.path.exists('PaniniFS-Research'):
    !git clone $REPO_URL
    os.chdir('PaniniFS-Research')
else:
    os.chdir('PaniniFS-Research')
    !git pull origin main

print("‚úÖ Repository ready!")

In [None]:
# 🔍 Analyseur Dhātu Simple
class SimpleDhatuAnalyzer:
    """Count keyword hits for a fixed set of dhatus and remember every analysis.

    Instance fields:
      dhatu_patterns -- maps each dhatu label to the alternation regex that is
                        searched for it.
      results        -- every dict returned by ``analyze_text``, in call order.
    """

    def __init__(self):
        # The patterns are unanchored alternations, so they also match inside
        # longer words (for example ``gam`` matches within "game").
        self.dhatu_patterns = {
            '‡§≠‡•Ç': r'‡§≠‡•Ç|bh≈´|√™tre|being',
            '‡§ï‡•É': r'‡§ï‡•É|k·πõ|faire|doing|make',
            '‡§ó‡§Æ‡•ç': r'‡§ó‡§Æ‡•ç|gam|aller|going|move',
            '‡§¶‡§æ': r'‡§¶‡§æ|dƒÅ|donner|giving|give',
            '‡§∏‡•ç‡§•‡§æ': r'‡§∏‡•ç‡§•‡§æ|sthƒÅ|√™tre debout|standing'
        }
        self.results = []

    def analyze_text(self, text: str, source: str = "unknown") -> dict:
        """Count case-insensitive pattern hits in ``text`` and record the result.

        Args:
            text: The text to scan.
            source: Free-form label stored with the result.

        Returns:
            A dict with keys ``source``, ``text_length``, ``dhatu_matches``
            (label -> hit count, only for labels with at least one hit),
            ``total_matches`` and ``timestamp`` (local time, ISO format).
            The same dict is appended to ``self.results``.
        """
        # Local import, done once per call instead of once per pattern.
        import re

        matches = {}
        for dhatu, pattern in self.dhatu_patterns.items():
            count = len(re.findall(pattern, text, re.IGNORECASE))
            if count > 0:
                matches[dhatu] = count

        result = {
            'source': source,
            'text_length': len(text),
            'dhatu_matches': matches,
            'total_matches': sum(matches.values()),
            'timestamp': datetime.now().isoformat()
        }
        self.results.append(result)
        return result

    def get_summary(self) -> str:
        """Return a one-line summary of all analyses recorded so far.

        Returns a fixed "nothing analysed" message when ``self.results`` is
        empty; otherwise the number of analysed documents and the total number
        of hits across them.
        """
        if not self.results:
            return "Aucune analyse effectu√©e"

        total_docs = len(self.results)
        total_matches = sum(r['total_matches'] for r in self.results)

        return f"üìä {total_docs} documents analys√©s, {total_matches} dhƒÅtus d√©tect√©s"

# Create the shared analyzer instance that the functions below read and update
analyzer = SimpleDhatuAnalyzer()
print("üîç Analyseur DhƒÅtu initialis√©!")

In [None]:
# 📁 Charger les données collectées
def load_collected_data():
    """List the collector's batch files under ``data/incremental_corpus``.

    A file qualifies when its name ends with ``.json`` and contains
    ``batch_``. The number of files found is printed.

    Returns:
        The matching paths (corpus directory joined with the file name),
        sorted by name; an empty list when the directory does not exist.
    """
    corpus_dir = 'data/incremental_corpus'
    data_files = []

    if os.path.isdir(corpus_dir):
        # os.listdir returns entries in arbitrary order; sorting gives callers
        # that only take the first few files a stable, repeatable subset.
        data_files = sorted(
            os.path.join(corpus_dir, name)
            for name in os.listdir(corpus_dir)
            if name.endswith('.json') and 'batch_' in name
        )

    print(f"üìÅ {len(data_files)} fichiers de donn√©es trouv√©s")
    return data_files

def analyze_collected_docs():
    """Run the shared analyzer over a bounded sample of the collected documents.

    Reads at most the first 5 files returned by ``load_collected_data`` and,
    in each, at most the first 10 entries of its ``documents`` list. Each
    document's content and title are analysed together and one progress line
    is printed per document. A file that cannot be read or parsed is reported
    and skipped. Finishes by printing the analyzer's summary.
    """
    data_files = load_collected_data()

    for file_path in data_files[:5]:  # Cap at 5 files to start with
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if 'documents' in data:
                for doc in data['documents'][:10]:  # 10 docs per file
                    # A JSON null arrives as None; treat it like a missing
                    # value so one such document does not abort the rest of
                    # the file through the except below.
                    title = doc.get('title')
                    content = (doc.get('content') or '') + ' ' + (title or '')
                    result = analyzer.analyze_text(content, doc.get('source', 'unknown'))
                    label = 'Sans titre' if title is None else title
                    print(f"üìÑ {label[:50]}... - {result['total_matches']} dhƒÅtus")

        except Exception as e:
            # Deliberately broad: this is a best-effort pass over files whose
            # structure is not guaranteed; report and move to the next file.
            print(f"‚ùå Erreur avec {file_path}: {e}")

    print(f"\n{analyzer.get_summary()}")

# Analyse the collected data
analyze_collected_docs()

In [None]:
# 📊 Statistiques rapides
def show_quick_stats():
    """Print the most frequent dhatus and draw them as a bar chart.

    Flattens ``analyzer.results`` into one row per (result, dhatu) pair,
    sums the counts per dhatu and shows the five largest totals both as text
    and as a matplotlib bar chart.

    Returns:
        The flattened DataFrame (columns ``dhatu``, ``count``, ``source``),
        or None when there are no results or no hits at all.
    """
    if not analyzer.results:
        print("‚ùå Aucune donn√©e √† analyser")
        return

    # One row per dhatu hit entry of every recorded analysis
    rows = [
        {'dhatu': name, 'count': hits, 'source': entry['source']}
        for entry in analyzer.results
        for name, hits in entry['dhatu_matches'].items()
    ]

    if not rows:
        print("‚ùå Pas de donn√©es dhƒÅtu trouv√©es")
        return

    df = pd.DataFrame(rows)

    # Totals per dhatu, largest first; only the head is reported
    totals = df.groupby('dhatu')['count'].sum().sort_values(ascending=False)
    print("üèÜ Top DhƒÅtus d√©tect√©s:")
    for name, total in totals.head().items():
        print(f"  {name}: {total} occurrences")

    # Simple bar chart of the same head
    plt.figure(figsize=(10, 6))
    totals.head().plot(kind='bar')
    plt.title('Distribution des DhƒÅtus')
    plt.ylabel('Occurrences')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return df

# Show the stats; df_results holds the returned DataFrame, or None when there was nothing to show
df_results = show_quick_stats()

In [None]:
# 🤖 Interaction avec le collecteur
def send_feedback_to_collector(feedback_data):
    """Save the feedback as JSON and, when git is usable, commit and push it.

    The feedback is always written to ``colab_results/colab_feedback.json``
    (the directory is created if needed). If ``git config user.email`` yields
    a value, the file is then added, committed and pushed to ``origin main``.
    The success message is printed only when every git step exits with
    status 0; any failure falls back to the "saved locally" message.

    Args:
        feedback_data: JSON-serialisable object to store.
    """
    import subprocess

    feedback_file = 'colab_results/colab_feedback.json'

    # Create the directory if needed
    os.makedirs(os.path.dirname(feedback_file), exist_ok=True)

    # Save the feedback
    with open(feedback_file, 'w', encoding='utf-8') as f:
        json.dump(feedback_data, f, ensure_ascii=False, indent=2)

    print(f"üíæ Feedback sauv√© dans {feedback_file}")

    # Best-effort commit to GitHub (only if git is configured)
    try:
        identity = subprocess.run(['git', 'config', 'user.email'], capture_output=True, text=True)

        if identity.returncode != 0 or not identity.stdout.strip():
            # Git is not configured: the local copy is all we keep
            print("üíæ Feedback sauv√© localement (Git non configur√©)")
            print("‚ÑπÔ∏è Pour activer GitHub, configurez:")
            print("   !git config user.email 'your-email@example.com'")
            print("   !git config user.name 'Your Name'")
            return

        git_commands = (
            ['git', 'add', feedback_file],
            ['git', 'commit', '-m', "üîÑ Feedback Colab: optimisations collecteur"],
            ['git', 'push', 'origin', 'main'],
        )
        for command in git_commands:
            completed = subprocess.run(command, capture_output=True, text=True)
            # Echo git's own output so the user still sees what happened
            for stream in (completed.stdout, completed.stderr):
                if stream.strip():
                    print(stream.rstrip())
            # Stop at the first failing step instead of reporting success
            completed.check_returncode()

        print("üöÄ Feedback envoy√© sur GitHub!")

    except Exception as e:
        # Deliberately broad: synchronisation is optional and must never make
        # the cell fail once the local file has been written.
        print(f"üíæ Feedback sauv√© localement: {e}")
        print("‚ÑπÔ∏è Synchronisation GitHub non disponible")

def create_feedback():
    """Build the feedback payload from the analyses recorded so far.

    Returns:
        None when ``analyzer.results`` is empty; otherwise a dict with a
        timestamp, an ``analysis_summary`` (document count, total hits,
        average hits per document rounded to 2 decimals), a fixed set of
        ``collector_recommendations`` in which only
        ``focus_on_high_quality`` depends on the data (average above 2),
        and a constant ``performance`` section.
    """
    recorded = analyzer.results
    if not recorded:
        return None

    # Metrics derived from the recorded analyses
    doc_count = len(recorded)
    hit_count = sum(entry['total_matches'] for entry in recorded)
    mean_hits = hit_count / doc_count if doc_count > 0 else 0

    summary = {
        'documents_analyzed': doc_count,
        'total_dhatu_matches': hit_count,
        'average_matches_per_doc': round(mean_hits, 2),
    }
    recommendations = {
        'focus_on_high_quality': mean_hits > 2,
        'increase_sanskrit_content': True,
        'preferred_sources': ['wikipedia_sanskrit', 'academic_papers'],
    }
    # These values are hard-coded, not measured
    performance = {
        'processing_speed': 'fast',
        'gpu_usage': 'optimal',
        'next_batch_ready': True,
    }

    return {
        'timestamp': datetime.now().isoformat(),
        'analysis_summary': summary,
        'collector_recommendations': recommendations,
        'performance': performance,
    }

# Build the feedback and hand it to the collector; create_feedback() returns
# None when no analysis has been recorded yet.
feedback = create_feedback()
if feedback:
    send_feedback_to_collector(feedback)
    print("‚úÖ Feedback cr√©√© et envoy√©!")
else:
    print("‚ùå Pas assez de donn√©es pour cr√©er un feedback")

In [None]:
# 🔄 Mode continu (optionnel)
def run_continuous_analysis(duration_minutes=10):
    """Mode d'analyse continu"""
    print(f"üîÑ D√©marrage analyse continue pour {duration_minutes} minutes...")
    
    start_time = time.time()
    last_analysis = 0
    
    while (time.time() - start_time) < (duration_minutes * 60):
        # V√©rifier s'il y a de nouvelles donn√©es toutes les 30 secondes
        if time.time() - last_analysis > 30:
            !git pull origin main --quiet
            
            # Analyser nouvelles donn√©es
            data_files = load_collected_data()
            if data_files:
                print(f"üìä {len(data_files)} fichiers trouv√©s, analyse en cours...")
                analyze_collected_docs()
                
                # Envoyer feedback
                feedback = create_feedback()
                if feedback:
                    send_feedback_to_collector(feedback)
            
            last_analysis = time.time()
        
        time.sleep(5)  # Attendre 5 secondes
    
    print("‚úÖ Analyse continue termin√©e!")

# Décommenter pour lancer le mode continu
# run_continuous_analysis(5)  # 5 minutes

## 🎯 Instructions d'utilisation

1. **Exécutez les cellules dans l'ordre**
2. **Analysez les données collectées** avec la cellule "data_loader"
3. **Visualisez les stats** avec "quick_stats"
4. **Envoyez du feedback** avec "collector_interaction"
5. **Mode continu optionnel** pour surveillance longue durée

Le carnet synchronise automatiquement avec GitHub et envoie des recommandations au collecteur local.