In [None]:
#!/usr/bin/env python3
"""
Enhanced NIRF RAG Query System v3.0 - Advanced Analytics Edition
Advanced querying with Qdrant vector database and Groq API
NEW: Faculty analytics, NIRF parameter analysis, advanced comparison modes
"""

import subprocess
import sys
import json
import warnings
import os
from datetime import datetime
from typing import List, Dict, Optional, Any, Tuple
import time
from pathlib import Path
from collections import Counter, defaultdict
import statistics
import re

# Quiet the runtime before the heavy libraries below are imported:
# ignore all Python warnings and set two environment variables
# (presumably read by TensorFlow and the HF tokenizers library - confirm).
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

def install_packages():
    """Pip-install each third-party dependency of this script, one at a time.

    pip's own output is discarded; a one-line success/failure marker is
    printed per package. A failed install is reported and the loop moves on
    to the next package.
    """
    required = (
        'qdrant-client',
        'sentence-transformers',
        'groq',
        'pandas',
        'numpy',
        'torch',
        'rich',
        'tqdm',
        'matplotlib',
        'seaborn',
        'plotly',
    )

    print("📦 Installing required packages...")
    for name in required:
        # Use the running interpreter so packages land in the active environment.
        pip_command = [sys.executable, '-m', 'pip', 'install', name]
        print(f"   Installing {name}...", end=' ')
        try:
            subprocess.check_call(
                pip_command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except subprocess.CalledProcessError:
            print(f"❌ Failed to install {name}")
        else:
            print("✅")

# Auto-install packages
# Import the third-party dependencies. If any one of them is missing, report
# the ImportError, pip-install the full package list via install_packages(),
# and then retry the same imports once.
try:
    from qdrant_client import QdrantClient
    from qdrant_client.models import Filter, FieldCondition, MatchValue, Range
    from sentence_transformers import SentenceTransformer
    from groq import Groq
    import pandas as pd
    import numpy as np
    from rich.console import Console
    from rich.table import Table
    from rich.progress import Progress, track
    from rich.panel import Panel
    from rich.columns import Columns
    from rich.text import Text
    from tqdm import tqdm
except ImportError as e:
    print(f"Installing missing packages... ({e})")
    install_packages()
    # Second attempt: an ImportError raised here is not caught and propagates.
    from qdrant_client import QdrantClient
    from qdrant_client.models import Filter, FieldCondition, MatchValue, Range
    from sentence_transformers import SentenceTransformer
    from groq import Groq
    import pandas as pd
    import numpy as np
    from rich.console import Console
    from rich.table import Table
    from rich.progress import Progress, track
    from rich.panel import Panel
    from rich.columns import Columns
    from rich.text import Text
    from tqdm import tqdm

class NIRFAnalytics:
    """Stateless helpers that derive faculty and ranking statistics from NIRF records.

    Every method is static; the class acts as a namespace that also carries
    the ``NIRF_PARAMETERS`` reference table.
    """

    # NIRF Parameter Definitions with Weightages
    NIRF_PARAMETERS = {
        'TLR': {
            'name': 'Teaching, Learning & Resources',
            'weight': 30,
            'sub_params': ['SS', 'FSR', 'FQE', 'FRU'],
            'description': 'Student Strength, Faculty-Student Ratio, Faculty Qualification & Experience, Financial Resources & Utilization'
        },
        'RP': {
            'name': 'Research and Professional Practice',
            'weight': 30,
            'sub_params': ['PU', 'QP', 'IPR', 'FPPP'],
            'description': 'Publications, Quality Publications, IPR & Patents, Faculty Professional Practice'
        },
        'GO': {
            'name': 'Graduation Outcomes',
            'weight': 20,
            'sub_params': ['GPH', 'GUE', 'GMS', 'GPHD'],
            'description': 'Graduate Placement & Higher Studies, University Examinations, Median Salary, PhD Students'
        },
        'OI': {
            'name': 'Outreach and Inclusivity',
            'weight': 10,
            'sub_params': ['RD', 'WD', 'ESCS', 'PCS'],
            'description': 'Regional Diversity, Women Diversity, Economically & Socially Challenged Students, Physically Challenged Students'
        },
        'PR': {
            'name': 'Perception',
            'weight': 10,
            'sub_params': ['PEER', 'EMP', 'ACAD'],
            'description': 'Peer Perception, Employer Perception, Academic Reputation'
        }
    }

    # Patterns tried, in order, by parse_faculty_from_text. Each one captures
    # (name, years of experience). Compiled once here instead of on every call.
    _FACULTY_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'Professor\s+([A-Za-z\s\.]+).*?(\d+)\s*years?',
            r'Dr\.?\s+([A-Za-z\s\.]+).*?Professor.*?(\d+)\s*years?',
            r'([A-Za-z\s\.]+)\s*-\s*Professor.*?(\d+)\s*years?',
        )
    )

    @staticmethod
    def _parse_experience(value: Any) -> int:
        """Convert one raw ``experience`` value to whole years; 0 when unparsable."""
        if isinstance(value, str):
            # Strings such as "12 years": take the first run of digits.
            digits = re.findall(r'\d+', value)
            return int(digits[0]) if digits else 0
        try:
            return int(value) if value else 0
        except (TypeError, ValueError, OverflowError):
            # e.g. a list, NaN or infinity - treat like any other unparsable value.
            return 0

    @staticmethod
    def _tally(values: List[Any]) -> Dict:
        """Count occurrences of each value; unhashable values are counted by ``str()``."""
        try:
            return dict(Counter(values))
        except TypeError:
            return dict(Counter(str(value) for value in values))

    @staticmethod
    def extract_faculty_stats(payload_data: Dict) -> Dict:
        """Extract and analyze faculty statistics from a result payload.

        Faculty records are taken from ``payload_data['faculty_records']`` if
        that key exists, otherwise from ``payload_data['faculty_data']``. When
        neither yields a non-empty list/tuple, the payload's ``content`` text is
        scanned with :meth:`parse_faculty_from_text` instead.

        Malformed input never raises: a payload that is not a dict produces the
        empty default statistics, and a record with an unparsable experience
        contributes 0 years instead of discarding the whole analysis.

        Args:
            payload_data: Payload dictionary of one search hit.

        Returns:
            Dict with keys ``total_faculty``, ``avg_experience``,
            ``designation_breakdown``, ``qualification_breakdown``,
            ``experience_distribution`` and ``faculty_records``.
        """
        faculty_stats = {
            'total_faculty': 0,
            'avg_experience': 0,
            'designation_breakdown': {},
            'qualification_breakdown': {},
            'experience_distribution': {},
            'faculty_records': []
        }

        if not isinstance(payload_data, dict):
            return faculty_stats

        if 'faculty_records' in payload_data:
            faculty_records = payload_data['faculty_records']
        elif 'faculty_data' in payload_data:
            faculty_records = payload_data['faculty_data']
        else:
            faculty_records = []

        # Only sequences of records are meaningful; anything else (a count,
        # a string, None, ...) is ignored so the text fallback can run.
        if not isinstance(faculty_records, (list, tuple)):
            faculty_records = []

        if not faculty_records:
            # Search for faculty information in text content
            faculty_records = NIRFAnalytics.parse_faculty_from_text(
                payload_data.get('content', '')
            )

        if not faculty_records:
            return faculty_stats

        faculty_stats['faculty_records'] = faculty_records
        faculty_stats['total_faculty'] = len(faculty_records)

        # Non-dict entries still count towards the total but carry no details.
        detailed = [record for record in faculty_records if isinstance(record, dict)]
        experiences = [
            NIRFAnalytics._parse_experience(record.get('experience', 0))
            for record in detailed
        ]
        designations = [record.get('designation', 'Unknown') for record in detailed]
        qualifications = [record.get('qualification', 'Unknown') for record in detailed]

        if experiences:
            faculty_stats['avg_experience'] = statistics.mean(experiences)
            faculty_stats['experience_distribution'] = {
                '0-5 years': sum(1 for e in experiences if 0 <= e <= 5),
                '6-15 years': sum(1 for e in experiences if 6 <= e <= 15),
                '16-25 years': sum(1 for e in experiences if 16 <= e <= 25),
                '26+ years': sum(1 for e in experiences if e > 25)
            }

        faculty_stats['designation_breakdown'] = NIRFAnalytics._tally(designations)
        faculty_stats['qualification_breakdown'] = NIRFAnalytics._tally(qualifications)

        return faculty_stats

    @staticmethod
    def parse_faculty_from_text(text: str) -> List[Dict]:
        """Parse faculty information from free-text content.

        Each pattern in ``_FACULTY_PATTERNS`` is applied in turn; every match
        becomes a record with ``name``, ``designation`` (always ``'Professor'``)
        and ``experience`` (years, int). The same person can be matched by more
        than one pattern; no de-duplication is performed.

        Args:
            text: Text to scan. Empty or non-string input yields ``[]``.

        Returns:
            At most 50 faculty records, in pattern order then match order.
        """
        if not text or not isinstance(text, str):
            return []

        faculty_list = []
        for pattern in NIRFAnalytics._FACULTY_PATTERNS:
            for match in pattern.finditer(text):
                faculty_list.append({
                    'name': match.group(1).strip(),
                    'designation': 'Professor',
                    'experience': int(match.group(2)) if match.group(2) else 0
                })

        return faculty_list[:50]  # Limit to avoid too much data

    @staticmethod
    def analyze_nirf_trends(results: List[Dict]) -> Dict:
        """Analyze NIRF ranking trends across years.

        Results are grouped by ``metadata['year']``. A result is skipped when
        its year or rank is missing (``'Unknown'`` / ``'N/A'``), when its rank
        cannot be read as an integer, or when it has no ``content`` key. Ranks
        such as ``'101-150'`` or ``'>200'`` are reduced to their first number.

        Args:
            results: Processed search results, each with ``metadata`` and
                ``content`` keys.

        Returns:
            Dict with ``year_wise_ranks`` (per-year average/best/worst rank and
            institution count) and ``insights`` (one message comparing the two
            most recent years when at least two years are present). The keys
            ``rank_improvement``, ``score_trends`` and ``parameter_analysis``
            are present but always empty.
        """
        trends = {
            'year_wise_ranks': {},
            'rank_improvement': {},
            'score_trends': {},
            'parameter_analysis': {},
            'insights': []
        }

        year_data = defaultdict(list)

        for result in results:
            metadata = result['metadata']
            year = metadata.get('year', 'Unknown')
            rank = metadata.get('rank', 'N/A')
            college = metadata.get('college_name', '')

            if year == 'Unknown' or rank == 'N/A':
                continue

            # Build the entry completely before touching year_data, so that a
            # skipped record cannot leave an empty year bucket behind.
            try:
                entry = {
                    'college': college,
                    'rank': int(str(rank).replace('>', '').replace('<', '').split('-')[0]),
                    'content': result['content']
                }
            except (ValueError, KeyError):
                continue
            year_data[str(year)].append(entry)

        # Calculate trends
        for year, data in year_data.items():
            ranks = [d['rank'] for d in data]
            trends['year_wise_ranks'][year] = {
                'avg_rank': statistics.mean(ranks),
                'institutions': len(data),
                'best_rank': min(ranks),
                'worst_rank': max(ranks)
            }

        # Generate insights: compare the two latest years (string sort order).
        if len(year_data) > 1:
            years = sorted(year_data.keys())
            recent_avg = trends['year_wise_ranks'][years[-1]]['avg_rank']
            prev_avg = trends['year_wise_ranks'][years[-2]]['avg_rank']

            # A numerically lower rank is the better one.
            if recent_avg < prev_avg:
                trends['insights'].append(f"📈 Ranking improved from {prev_avg:.1f} to {recent_avg:.1f}")
            elif recent_avg > prev_avg:
                trends['insights'].append(f"📉 Ranking declined from {prev_avg:.1f} to {recent_avg:.1f}")
            else:
                trends['insights'].append(f"➡️ Ranking remained stable at {recent_avg:.1f}")

        return trends

class EnhancedNIRFRAG:
    def __init__(self):
        """Initialize the RAG system with enhanced configurations.

        Sets up the console and analytics helper, reads API configuration,
        creates empty client slots (filled later by ``setup_connections``),
        and initializes query defaults, the query cache and session counters.
        Finally prints a banner and the NIRF parameter overview table.

        Configuration is read from the environment variables ``QDRANT_URL``,
        ``QDRANT_API_KEY`` and ``GROQ_API_KEY``.
        """
        self.console = Console()
        self.analytics = NIRFAnalytics()

        # API configurations
        # NOTE(security): the Qdrant fallbacks below are the values that were
        # previously hard-coded here; they are kept only so existing setups keep
        # working. Secrets do not belong in source control - rotate this key and
        # supply it through the environment instead.
        self.QDRANT_URL = os.environ.get(
            "QDRANT_URL",
            "https://b5651607-31ce-49ba-916d-c35c89d731d2.us-east4-0.gcp.cloud.qdrant.io",
        )
        self.QDRANT_API_KEY = os.environ.get(
            "QDRANT_API_KEY",
            "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.0ApHZL4Qn_A8bx7FCC62nx-IOrHI84W7GZlUZEyVgKk",
        )
        # The assignment previously had no value at all (a syntax error).
        # None when the variable is unset; presumably the Groq client then
        # reports the missing key when it is constructed - verify against the SDK.
        self.GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

        # Initialize clients
        self.qdrant_client = None
        self.groq_client = None
        self.encoder = None

        # Set default collection
        self.selected_collection = '200528'

        # Query configuration
        self.default_limit = 10
        self.similarity_threshold = 0.3
        self.cache_enabled = True
        self.query_cache = {}

        # Session statistics
        self.session_stats = {
            'queries_executed': 0,
            'total_results': 0,
            'avg_response_time': 0,
            'start_time': datetime.now(),
            'faculty_analyzed': 0,
            'trends_generated': 0
        }

        self.console.print("🚀 Enhanced NIRF RAG System v3.0 - Advanced Analytics Edition", style="bold green")
        self.show_nirf_parameters_overview()

    def show_nirf_parameters_overview(self):
        """Print a table of the NIRF parameters: code, full name, weight and sub-parameters."""
        overview = Table(title="🎯 NIRF Ranking Parameters & Weightages")

        # (header, style, width) for each column, in display order.
        column_specs = (
            ("Parameter", "cyan", 10),
            ("Full Name", "magenta", 25),
            ("Weight", "green", 8),
            ("Sub-Parameters", "yellow", 20),
        )
        for header, colour, width in column_specs:
            overview.add_column(header, style=colour, width=width)

        for code, info in self.analytics.NIRF_PARAMETERS.items():
            weight_label = f"{info['weight']}%"
            sub_param_label = ", ".join(info['sub_params'])
            overview.add_row(code, info['name'], weight_label, sub_param_label)

        self.console.print(overview)

    def setup_connections(self) -> bool:
        """Create the Qdrant client, the Groq client and the sentence encoder.

        The three steps run in that order under a progress bar. Any exception
        is reported on the console and turns the result into ``False``.

        Returns:
            True when all three objects were created, False otherwise.
        """
        def connect_qdrant():
            self.qdrant_client = QdrantClient(
                url=self.QDRANT_URL,
                api_key=self.QDRANT_API_KEY
            )

        def connect_groq():
            self.groq_client = Groq(api_key=self.GROQ_API_KEY)

        def load_encoder():
            self.encoder = SentenceTransformer('all-MiniLM-L6-v2')

        # (progress label, action) pairs, executed top to bottom.
        steps = (
            ("Connecting to Qdrant...", connect_qdrant),
            ("Connecting to Groq...", connect_groq),
            ("Loading sentence transformer...", load_encoder),
        )

        try:
            with Progress() as bar:
                step_id = bar.add_task("Setting up connections...", total=3)
                for label, action in steps:
                    bar.update(step_id, description=label)
                    action()
                    bar.advance(step_id)

            self.console.print("✅ All connections established successfully", style="bold green")
            return True

        except Exception as e:
            self.console.print(f"❌ Connection failed: {e}", style="bold red")
            return False

    def search_similar_chunks(self, query: str, college_filter: str = None, year_filter: str = None, limit: int = None) -> List[Dict]:
        """Enhanced search with manual filtering and improved error handling"""
        if not self.selected_collection:
            self.console.print("❌ No collection selected", style="bold red")
            return []

        # Check cache first
        cache_key = f"{query}_{college_filter}_{year_filter}_{limit}"
        if self.cache_enabled and cache_key in self.query_cache:
            self.console.print("🔄 Using cached results", style="yellow")
            return self.query_cache[cache_key]

        start_time = time.time()

        try:
            # Generate query embedding
            search_query = query
            if college_filter:
                search_query += f" {college_filter} {college_filter.replace(' ', '').upper()}"

            query_embedding = self.encoder.encode([search_query])[0].tolist()

            # Search without filters first (to avoid indexing issues)
            self.console.print("🔍 Searching vector database...", style="cyan")
            search_results = self.qdrant_client.search(
                collection_name=self.selected_collection,
                query_vector=query_embedding,
                limit=limit or (self.default_limit * 3),
                with_payload=True,
                score_threshold=self.similarity_threshold
            )

            # Manual filtering and enhanced data extraction
            processed_results = []
            for result in search_results:
                # Manual college name filtering
                if college_filter:
                    college_name = result.payload.get('college_name', '').lower()
                    institution_name = result.payload.get('institution', '').lower()
                    if (college_filter.lower() not in college_name and
                        college_filter.lower() not in institution_name):
                        continue

                # Manual year filtering
                if year_filter:
                    result_year = str(result.payload.get('year', ''))
                    if year_filter not in result_year:
                        continue

                # Extract faculty statistics if available
                faculty_stats = self.analytics.extract_faculty_stats(result.payload)

                processed_result = {
                    'content': result.payload.get('content', ''),
                    'score': result.score,
                    'metadata': {
                        'college_name': result.payload.get('college_name', 'N/A'),
                        'year': result.payload.get('year', 'N/A'),
                        'content_type': result.payload.get('content_type', 'N/A'),
                        'chunk_type': result.payload.get('chunk_type', 'N/A'),
                        'department': result.payload.get('department', 'N/A'),
                        'state': result.payload.get('state', 'N/A'),
                        'city': result.payload.get('city', 'N/A'),
                        'rank': result.payload.get('rank', 'N/A'),
                        'ranking_category': result.payload.get('ranking_category', 'N/A'),
                        'nirf_id': result.payload.get('nirf_id', 'N/A'),
                        'faculty_stats': faculty_stats if faculty_stats['total_faculty'] > 0 else None
                    },
                    'raw_payload': result.payload  # Keep raw data for advanced analysis
                }

                processed_results.append(processed_result)

            # Limit results
            if limit:
                processed_results = processed_results[:limit]

            # Cache results
            if self.cache_enabled:
                self.query_cache[cache_key] = processed_results

            # Update stats
            response_time = time.time() - start_time
            self.update_session_stats(len(processed_results), response_time)

            return processed_results

        except Exception as e:
            self.console.print(f"❌ Search failed: {e}", style="bold red")
            return []

    def generate_response(self, query: str, context_chunks: List[Dict], response_style: str = "comprehensive") -> str:
        """Enhanced response generation with faculty analytics and NIRF parameter analysis"""
        if not context_chunks:
            return "I couldn't find relevant information in the NIRF database to answer your question. Please try a different query or check if the institution name is correct."

        # Analyze faculty data across all results
        all_faculty_stats = []
        institutions_with_faculty = {}

        for chunk in context_chunks:
            faculty_data = chunk['metadata'].get('faculty_stats')
            if faculty_data and faculty_data['total_faculty'] > 0:
                all_faculty_stats.append(faculty_data)
                inst_name = chunk['metadata']['college_name']
                institutions_with_faculty[inst_name] = faculty_data
                self.session_stats['faculty_analyzed'] += faculty_data['total_faculty']

        # Analyze NIRF trends
        trends_analysis = self.analytics.analyze_nirf_trends(context_chunks)
        if trends_analysis['insights']:
            self.session_stats['trends_generated'] += 1

        # Prepare enhanced context
        context_items = []
        for i, chunk in enumerate(context_chunks):
            faculty_info = ""
            if chunk['metadata'].get('faculty_stats') and chunk['metadata']['faculty_stats']['total_faculty'] > 0:
                fs = chunk['metadata']['faculty_stats']
                faculty_info = f"""
Faculty Statistics:
- Total Faculty: {fs['total_faculty']}
- Average Experience: {fs['avg_experience']:.1f} years
- Top Designations: {dict(list(fs['designation_breakdown'].items())[:3]) if fs['designation_breakdown'] else 'N/A'}
- Experience Distribution: {fs['experience_distribution']}"""

            context_items.append(f"""
[Result {i+1}] Relevance Score: {chunk['score']:.3f}
Institution: {chunk['metadata']['college_name']}
Year: {chunk['metadata']['year']}
NIRF Rank: {chunk['metadata']['rank']}
Ranking Category: {chunk['metadata']['ranking_category']}
Location: {chunk['metadata']['city']}, {chunk['metadata']['state']}
Content Type: {chunk['metadata']['content_type']}
Section: {chunk['metadata']['chunk_type']}
{faculty_info}

Content: {chunk['content']}
""")

        context = "\n".join(context_items)

        # Add trends analysis to context
        trends_context = ""
        if trends_analysis['year_wise_ranks']:
            trends_context = f"""
NIRF Trends Analysis:
{json.dumps(trends_analysis['year_wise_ranks'], indent=2)}
Key Insights: {'; '.join(trends_analysis['insights'])}
"""

        # Add faculty summary
        faculty_summary = ""
        if all_faculty_stats:
            total_faculty = sum(fs['total_faculty'] for fs in all_faculty_stats)
            avg_exp_all = statistics.mean([fs['avg_experience'] for fs in all_faculty_stats if fs['avg_experience'] > 0])
            faculty_summary = f"""
Faculty Analysis Summary:
- Total Faculty Analyzed: {total_faculty} across {len(institutions_with_faculty)} institutions
- Overall Average Experience: {avg_exp_all:.1f} years
- Institutions with Faculty Data: {list(institutions_with_faculty.keys())}
"""

        # Style-specific prompts with NIRF parameter focus
        style_prompts = {
            "comprehensive": f"""Provide a detailed, comprehensive answer with:
1. Specific NIRF ranking data, scores, and analysis
2. Faculty statistics and trends where available
3. NIRF parameter breakdown (TLR: 30%, RP: 30%, GO: 20%, OI: 10%, PR: 10%)
4. Year-wise trends and performance analysis
5. Institutional strengths and areas for improvement""",

            "concise": "Provide a brief, focused answer highlighting key NIRF ranking information, top faculty stats, and main trends only.",

            "comparative": f"""Compare NIRF rankings and performance across different institutions or years, focusing on:
1. Ranking improvements/declines
2. Faculty strength comparisons
3. Parameter-wise performance (TLR, RP, GO, OI, PR)
4. Regional/state-wise analysis""",

            "analytical": f"""Provide deep analytical insights and trends based on NIRF data:
1. Statistical analysis of ranking patterns
2. Faculty development trends and their impact on rankings
3. NIRF parameter correlation analysis
4. Predictive insights for future performance
5. Recommendations for improvement""",

            "faculty_focused": f"""Focus specifically on faculty analysis:
1. Detailed faculty statistics and demographics
2. Experience distribution and designation breakdown
3. Faculty-student ratios and their NIRF impact
4. Comparison of faculty strength across institutions
5. Faculty development recommendations""",

            "trend_analysis": f"""Focus on trends and temporal analysis:
1. Year-over-year ranking changes
2. Parameter-wise improvement trends
3. Institutional growth patterns
4. Comparative regional development
5. Future trajectory predictions"""
        }

        # Create enhanced prompt
        prompt = f"""You are an expert assistant for NIRF (National Institutional Ranking Framework) data analysis with deep knowledge of Indian higher education rankings.

NIRF PARAMETER FRAMEWORK:
1. TLR (Teaching, Learning & Resources): 30% - Faculty quality, student-faculty ratio, infrastructure
2. RP (Research & Professional Practice): 30% - Publications, patents, research funding
3. GO (Graduation Outcomes): 20% - Placements, higher studies, median salary
4. OI (Outreach & Inclusivity): 10% - Diversity, social responsibility
5. PR (Perception): 10% - Peer and employer perception

NIRF Context Information:
{context}

{trends_context}

{faculty_summary}

User Question: {query}

Response Style: {style_prompts.get(response_style, style_prompts['comprehensive'])}

Instructions:
1. Analyze the query in context of NIRF parameters and weightages
2. Include specific NIRF ranks, scores, and parameter analysis
3. Incorporate faculty statistics and trends where available
4. Reference year-wise data and performance trends
5. Provide actionable insights for institutional improvement
6. Use relevance scores to prioritize the most accurate information
7. Format numerical data clearly (ranks, scores, percentages, faculty counts)
8. If faculty data is available, analyze its potential impact on TLR scores
9. Connect trends to NIRF parameter performance where possible
10. Provide context about institutional development and growth patterns

Answer:"""

        try:
            response = self.groq_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are a highly specialized assistant for NIRF (National Institutional Ranking Framework) data analysis with expertise in Indian higher education rankings, faculty analytics, and institutional development trends."},
                    {"role": "user", "content": prompt}
                ],
                model="llama3-70b-8192",
                max_tokens=2000,
                temperature=0.1
            )

            return response.choices[0].message.content

        except Exception as e:
            self.console.print(f"❌ Response generation failed: {e}", style="bold red")
            if "503" in str(e):
                summary = self.generate_fallback_response(context_chunks, faculty_summary, trends_analysis)
                return f"Groq service temporarily unavailable. {summary}"
            return "I encountered an error while generating the response. Please try again."

    def generate_fallback_response(self, results: List[Dict], faculty_summary: str, trends_analysis: Dict) -> str:
        """Generate a structured fallback response when AI is unavailable"""
        response_parts = []

        # Basic statistics
        institutions = set()
        years = set()
        ranks = []

        for result in results:
            institutions.add(result['metadata']['college_name'])
            years.add(str(result['metadata']['year']))
            rank = result['metadata']['rank']
            if rank != 'N/A':
                try:
                    rank_num = int(str(rank).replace('>', '').replace('<', '').split('-')[0])
                    ranks.append(rank_num)
                except:
                    pass

        response_parts.append(f"📊 Found {len(results)} relevant NIRF entries")
        response_parts.append(f"🏛️ Institutions: {', '.join(list(institutions)[:3])}")
        response_parts.append(f"📅 Years covered: {', '.join(sorted(years))}")

        if ranks:
            response_parts.append(f"🏆 Ranking range: {min(ranks)} to {max(ranks)}")

        if faculty_summary:
            response_parts.append(faculty_summary.strip())

        if trends_analysis['insights']:
            response_parts.append(f"📈 Trends: {'; '.join(trends_analysis['insights'])}")

        return " | ".join(response_parts)

    def update_session_stats(self, results_count: int, response_time: float):
        """Update session statistics"""
        self.session_stats['queries_executed'] += 1
        self.session_stats['total_results'] += results_count

        # Calculate running average
        prev_avg = self.session_stats['avg_response_time']
        n = self.session_stats['queries_executed']
        self.session_stats['avg_response_time'] = (prev_avg * (n - 1) + response_time) / n

    def show_session_stats(self):
        """Display session statistics as a two-column table on the console.

        Shows the counters kept in ``self.session_stats``, the elapsed time
        since ``start_time`` (fractional seconds dropped) and the number of
        entries currently held in the query cache.
        """
        stats = self.session_stats
        session_duration = datetime.now() - stats['start_time']

        table = Table(title="📊 Advanced Session Statistics")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Queries Executed", str(stats['queries_executed']))
        table.add_row("Total Results", str(stats['total_results']))
        table.add_row("Faculty Records Analyzed", str(stats['faculty_analyzed']))
        table.add_row("Trends Generated", str(stats['trends_generated']))
        table.add_row("Average Response Time", f"{stats['avg_response_time']:.2f}s")
        table.add_row("Session Duration", str(session_duration).split('.')[0])
        # len(query_cache) is the number of stored query results, not how often
        # the cache was hit (hits are not counted anywhere), so the row is
        # labelled for what it actually shows.
        table.add_row("Cached Queries", str(len(self.query_cache)))

        self.console.print(table)

    def show_faculty_analysis(self, results: List[Dict]):
        """Display detailed faculty analysis"""
        faculty_institutions = {}

        for result in results:
            faculty_data = result['metadata'].get('faculty_stats')
            if faculty_data and faculty_data['total_faculty'] > 0:
                inst_name = result['metadata']['college_name']
                faculty_institutions[inst_name] = faculty_data

        if not faculty_institutions:
            self.console.print("📊 No faculty data available in current results", style="yellow")
            return

        for inst_name, faculty_data in faculty_institutions.items():
            panel_content = f"""
👥 Total Faculty: {faculty_data['total_faculty']}
⏱️ Average Experience: {faculty_data['avg_experience']:.1f} years

📊 Designation Breakdown:
{self.format_dict_for_display(faculty_data['designation_breakdown'])}

🎓 Experience Distribution:
{self.format_dict_for_display(faculty_data['experience_distribution'])}

📚 Qualification Breakdown:
{self.format_dict_for_display(faculty_data['qualification_breakdown'])}
            """

            panel = Panel(
                panel_content.strip(),
                title=f"👨‍🏫 Faculty Analysis: {inst_name}",
                border_style="blue"
            )
            self.console.print(panel)

    def format_dict_for_display(self, data_dict: Dict) -> str:
        """Format dictionary for nice display"""
        if not data_dict:
            return "No data available"

        formatted_items = []
        for key, value in data_dict.items():
            if isinstance(value, float):
                formatted_items.append(f"  • {key}: {value:.1f}")
            else:
                formatted_items.append(f"  • {key}: {value}")

        return "\n".join(formatted_items)

    def export_results(self, query: str, results: List[Dict], response: str, format: str = "json"):
        """Enhanced export with faculty and analytics data"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"nirf_advanced_query_{timestamp}.{format}"

        try:
            if format == "json":
                # Extract all faculty data
                faculty_data_export = {}
                for result in results:
                    if result['metadata'].get('faculty_stats'):
                        inst_name = result['metadata']['college_name']
                        faculty_data_export[inst_name] = result['metadata']['faculty_stats']

                trends_data = self.analytics.analyze_nirf_trends(results)

                export_data = {
                    'query': query,
                    'timestamp': timestamp,
                    'collection': self.selected_collection,
                    'results_count': len(results),
                    'ai_response': response,
                    'faculty_analytics': faculty_data_export,
                    'trends_analysis': trends_data,
                    'nirf_parameters': self.analytics.NIRF_PARAMETERS,
                    'session_stats': self.session_stats,
                    'results': results
                }

                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(export_data, f, indent=2, ensure_ascii=False, default=str)

            elif format == "csv":
                flattened_results = []
                for result in results:
                    faculty_stats = result['metadata'].get('faculty_stats', {})
                    flat_result = {
                        'query': query,
                        'content': result['content'],
                        'score': result['score'],
                        'faculty_count': faculty_stats.get('total_faculty', 0),
                        'avg_faculty_experience': faculty_stats.get('avg_experience', 0),
                        **result['metadata']
                    }
                    # Remove complex nested data for CSV
                    flat_result.pop('faculty_stats', None)
                    flattened_results.append(flat_result)

                df = pd.DataFrame(flattened_results)
                df.to_csv(filename, index=False)

            elif format == "txt":
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"NIRF Advanced Query Results v3.0\n")
                    f.write(f"===================================\n\n")
                    f.write(f"Query: {query}\n")
                    f.write(f"Timestamp: {timestamp}\n")
                    f.write(f"Collection: {self.selected_collection}\n")
                    f.write(f"Results Count: {len(results)}\n\n")

                    # NIRF Parameters Overview
                    f.write("NIRF PARAMETERS FRAMEWORK:\n")
                    f.write("-" * 50 + "\n")
                    for param, details in self.analytics.NIRF_PARAMETERS.items():
                        f.write(f"{param}: {details['name']} ({details['weight']}%)\n")
                        f.write(f"    {details['description']}\n\n")

                    f.write(f"AI Response:\n{response}\n\n")

                    # Faculty Analytics
                    faculty_institutions = {}
                    for result in results:
                        if result['metadata'].get('faculty_stats'):
                            inst_name = result['metadata']['college_name']
                            faculty_institutions[inst_name] = result['metadata']['faculty_stats']

                    if faculty_institutions:
                        f.write("FACULTY ANALYTICS:\n")
                        f.write("-" * 50 + "\n")
                        for inst_name, faculty_data in faculty_institutions.items():
                            f.write(f"\n{inst_name}:\n")
                            f.write(f"  Total Faculty: {faculty_data['total_faculty']}\n")
                            f.write(f"  Average Experience: {faculty_data['avg_experience']:.1f} years\n")
                            f.write(f"  Designation Breakdown: {faculty_data['designation_breakdown']}\n")
                            f.write(f"  Experience Distribution: {faculty_data['experience_distribution']}\n")

                    # Trends Analysis
                    trends_data = self.analytics.analyze_nirf_trends(results)
                    if trends_data['insights']:
                        f.write(f"\nTRENDS ANALYSIS:\n")
                        f.write("-" * 50 + "\n")
                        for insight in trends_data['insights']:
                            f.write(f"• {insight}\n")
                        f.write(f"\nYear-wise Data: {json.dumps(trends_data['year_wise_ranks'], indent=2)}\n")

                    f.write("\nDetailed Results:\n")
                    f.write("-" * 50 + "\n")
                    for i, result in enumerate(results, 1):
                        f.write(f"\n{i}. Score: {result['score']:.3f}\n")
                        f.write(f"   Institution: {result['metadata']['college_name']}\n")
                        f.write(f"   Year: {result['metadata']['year']}\n")
                        f.write(f"   NIRF Rank: {result['metadata']['rank']}\n")
                        f.write(f"   Location: {result['metadata']['city']}, {result['metadata']['state']}\n")

                        if result['metadata'].get('faculty_stats'):
                            fs = result['metadata']['faculty_stats']
                            f.write(f"   Faculty: {fs['total_faculty']} (Avg Exp: {fs['avg_experience']:.1f} years)\n")

                        f.write(f"   Content: {result['content'][:200]}...\n")

            self.console.print(f"✅ Advanced results exported to {filename}", style="bold green")

        except Exception as e:
            self.console.print(f"❌ Export failed: {e}", style="bold red")

    def interactive_query(self):
        """Run the interactive NIRF query loop until the user quits.

        Each pass reads one line from stdin. The built-in commands
        ('exit', 'help', 'stats', 'params', 'faculty', 'trends',
        'clear_cache') are matched case-insensitively and handled without
        running a search. Any other non-empty line is treated as a query:
        the user is prompted for optional institution/year filters, a
        response style and a result limit; the search results, generated
        response and trend insights are then displayed, followed by the
        optional export and additional-analysis prompts.

        The loop ends on 'exit', Ctrl-C, or end of input (EOF). Any other
        exception raised while handling a query is reported and the loop
        continues with the next prompt.
        """
        self.console.print("\n🔍 Interactive Query Mode - Advanced NIRF Analytics", style="bold blue")
        self.console.print(f"Collection: {self.selected_collection}")
        self.console.print("Available commands: 'exit', 'help', 'stats', 'clear_cache', 'faculty', 'trends', 'params'")

        # Menu lines shown before the style prompt, in menu order.
        styles = [
            "comprehensive - Detailed NIRF analysis with all data",
            "concise - Brief ranking information only",
            "comparative - Compare across institutions/years",
            "analytical - Deep trends and insights analysis",
            "faculty_focused - Focus on faculty statistics",
            "trend_analysis - Focus on temporal trends"
        ]

        # Numeric menu shortcuts mapped to the style names passed to generate_response.
        style_map = {
            '1': 'comprehensive', '2': 'concise', '3': 'comparative',
            '4': 'analytical', '5': 'faculty_focused', '6': 'trend_analysis'
        }
        valid_styles = set(style_map.values())

        while True:
            try:
                query = input("\n📝 Enter your NIRF query: ").strip()
                command = query.lower()

                if command == 'exit':
                    self.console.print("👋 Goodbye!", style="bold blue")
                    break

                if command == 'help':
                    self.show_help()
                    continue

                if command == 'stats':
                    self.show_session_stats()
                    continue

                if command == 'params':
                    self.show_nirf_parameters_overview()
                    continue

                if command == 'faculty':
                    self.console.print("💡 Try queries like: 'faculty statistics of IIT Delhi', 'AMU faculty analysis'", style="yellow")
                    continue

                if command == 'trends':
                    self.console.print("💡 Try queries like: 'NIRF ranking trends of AMU', 'year wise performance analysis'", style="yellow")
                    continue

                if command == 'clear_cache':
                    self.query_cache.clear()
                    self.console.print("🗑️ Cache cleared", style="yellow")
                    continue

                if not query:
                    continue

                # Enhanced filter input
                college_filter = input("Institution name (partial match, press Enter to skip): ").strip()
                year_filter = input("Year (e.g., 2023, press Enter to skip): ").strip()

                # Enhanced response style selection
                self.console.print("\n📊 Available Response Styles:")
                for i, style in enumerate(styles, 1):
                    self.console.print(f"  {i}. {style}", style="cyan")

                # Lowercased so that e.g. "Concise" selects the concise style
                # instead of silently falling back to the default.
                style_choice = input("Response style (1-6 or name) [comprehensive]: ").strip().lower()
                response_style = style_map.get(style_choice, style_choice)
                if response_style not in valid_styles:
                    response_style = 'comprehensive'

                # Number of results: blank, non-numeric, zero or negative
                # input all fall back to the configured default.
                limit_text = input(f"Number of results (default {self.default_limit}): ").strip()
                try:
                    limit = int(limit_text) if limit_text else self.default_limit
                except ValueError:
                    limit = self.default_limit
                if limit <= 0:
                    limit = self.default_limit

                # Perform search
                self.console.print("🔍 Searching NIRF database with advanced analytics...", style="yellow")
                results = self.search_similar_chunks(
                    query,
                    college_filter if college_filter else None,
                    year_filter if year_filter else None,
                    limit
                )

                if not results:
                    self.console.print("❌ No relevant NIRF data found", style="bold red")
                    self.console.print("💡 Try different keywords or check institution name spelling", style="yellow")
                    continue

                # Generate response
                self.console.print("🤖 Generating advanced NIRF analysis...", style="yellow")
                response = self.generate_response(query, results, response_style)

                # Display results
                self.console.print("\n🎯 Advanced NIRF Query Results:", style="bold green")
                self.console.print(f"📊 Found {len(results)} relevant entries")
                self.console.print(f"\n💬 AI Analysis ({response_style}):", style="bold blue")
                self.console.print(response)

                # Show detailed results
                self.show_detailed_results(results)

                # Show faculty analysis if available
                self.show_faculty_analysis(results)

                # Show trends analysis
                trends_analysis = self.analytics.analyze_nirf_trends(results)
                if trends_analysis['insights']:
                    self.console.print("\n📈 Trends Analysis:", style="bold magenta")
                    for insight in trends_analysis['insights']:
                        self.console.print(f"  • {insight}", style="green")

                # Export option with enhanced formats
                export_choice = input("\n📄 Export results? (y/n): ").strip().lower()
                if export_choice == 'y':
                    # Lowercased so "CSV"/"TXT" are honoured rather than
                    # being replaced by the json default.
                    format_choice = input("Format (json/csv/txt) [json]: ").strip().lower()
                    if format_choice not in ['json', 'csv', 'txt']:
                        format_choice = 'json'
                    self.export_results(query, results, response, format_choice)

                # Advanced analysis options
                advanced_choice = input("\n🔬 Run additional analysis? (faculty/trends/compare/n): ").strip().lower()
                if advanced_choice == 'faculty':
                    self.show_faculty_analysis(results)
                elif advanced_choice == 'trends':
                    self.show_detailed_trends(results)
                elif advanced_choice == 'compare':
                    self.show_comparison_analysis(results)

            except (KeyboardInterrupt, EOFError):
                # EOF means stdin is exhausted; retrying input() would raise
                # again immediately, so treat it like Ctrl-C and leave the loop.
                self.console.print("\n👋 Goodbye!", style="bold blue")
                break
            except Exception as e:
                self.console.print(f"❌ Query failed: {e}", style="bold red")

    def show_detailed_trends(self, results: List[Dict]):
        """Print a year-by-year table of rank statistics for *results*.

        Runs ``self.analytics.analyze_nirf_trends`` on the results and
        renders one row per entry of its ``'year_wise_ranks'`` mapping,
        ordered by year key, showing that entry's ``institutions``,
        ``avg_rank``, ``best_rank`` and ``worst_rank`` values. When the
        mapping is empty a short notice is printed instead of a table.

        Args:
            results: Search hits to analyse; passed through unchanged to
                ``analyze_nirf_trends``.
        """
        trends = self.analytics.analyze_nirf_trends(results)

        if not trends['year_wise_ranks']:
            self.console.print("📊 No trend data available", style="yellow")
            return

        table = Table(title="📈 Year-wise NIRF Performance Analysis")
        table.add_column("Year", style="cyan")
        table.add_column("Institutions", style="green")
        table.add_column("Avg Rank", style="yellow")
        table.add_column("Best Rank", style="magenta")
        table.add_column("Worst Rank", style="red")

        for year, data in sorted(trends['year_wise_ranks'].items()):
            table.add_row(
                # Table cells must be strings/renderables; the year key may
                # be an int depending on how the metadata was stored.
                str(year),
                str(data['institutions']),
                f"{data['avg_rank']:.1f}",
                str(data['best_rank']),
                str(data['worst_rank'])
            )

        self.console.print(table)

    def show_comparison_analysis(self, results: List[Dict]):
        """Print a per-institution comparison table built from *results*.

        Results are grouped by ``metadata['college_name']``. For each
        institution the table shows the distinct years seen, the mean of
        the ranks that could be parsed as integers, the faculty count and
        average experience from the most recent result carrying
        ``faculty_stats``, and the number of distinct content types.
        Values that are missing or not positive are shown as "N/A".

        Args:
            results: Search hits; each must have a ``metadata`` dict with
                ``college_name``, ``year``, ``content_type`` and ``rank``
                keys, and optionally ``faculty_stats``.
        """
        institutions = {}

        for result in results:
            metadata = result['metadata']
            inst_name = metadata['college_name']
            if inst_name not in institutions:
                institutions[inst_name] = {
                    'years': set(),
                    'ranks': [],
                    'faculty_count': 0,
                    'avg_experience': 0,
                    'content_types': set()
                }
            entry = institutions[inst_name]

            entry['years'].add(str(metadata['year']))
            entry['content_types'].add(metadata['content_type'])

            rank = metadata['rank']
            if rank != 'N/A':
                try:
                    # Ranks may be written as ">100", "<50" or as a band
                    # such as "101-150"; keep the leading number only.
                    rank_num = int(str(rank).replace('>', '').replace('<', '').split('-')[0])
                except ValueError:
                    # Non-numeric rank text is left out of the average.
                    pass
                else:
                    entry['ranks'].append(rank_num)

            faculty_stats = metadata.get('faculty_stats')
            if faculty_stats:
                entry['faculty_count'] = faculty_stats['total_faculty']
                entry['avg_experience'] = faculty_stats['avg_experience']

        table = Table(title="🏆 Institutional Comparison Analysis")
        table.add_column("Institution", style="cyan", width=25)
        table.add_column("Years", style="green")
        table.add_column("Avg Rank", style="yellow")
        table.add_column("Faculty", style="magenta")
        table.add_column("Avg Exp", style="blue")
        table.add_column("Data Types", style="white")

        for inst_name, data in institutions.items():
            # 0 doubles as the "no usable rank" marker for the N/A check below.
            avg_rank = statistics.mean(data['ranks']) if data['ranks'] else 0
            years_joined = ", ".join(sorted(data['years']))
            years_str = years_joined[:20] + "..." if len(years_joined) > 20 else years_joined

            table.add_row(
                inst_name[:22] + "..." if len(inst_name) > 22 else inst_name,
                years_str,
                f"{avg_rank:.1f}" if avg_rank > 0 else "N/A",
                str(data['faculty_count']) if data['faculty_count'] > 0 else "N/A",
                f"{data['avg_experience']:.1f}" if data['avg_experience'] > 0 else "N/A",
                str(len(data['content_types']))
            )

        self.console.print(table)

    def show_detailed_results(self, results: List[Dict]):
        """Render the search hits as a fixed-width summary table.

        One row is printed per hit with its 1-based position, similarity
        score, institution, year, NIRF rank, faculty summary, state and a
        short content preview. Long text cells are cut and suffixed with
        "...". Nothing is printed when *results* is empty.
        """
        if not results:
            return

        def clip(text, limit):
            # Keep at most `limit` characters, marking any cut with "...".
            if len(text) > limit:
                return text[:limit] + "..."
            return text

        # (header, style, width) for every column, left to right.
        column_specs = (
            ("Rank", "cyan", 4),
            ("Score", "green", 6),
            ("Institution", "magenta", 20),
            ("Year", "yellow", 6),
            ("NIRF Rank", "red", 8),
            ("Faculty", "blue", 8),
            ("State", "white", 12),
            ("Content Preview", "bright_black", 25),
        )

        table = Table(title="📊 Detailed NIRF Results with Analytics")
        for header, colour, width in column_specs:
            table.add_column(header, style=colour, width=width)

        for position, hit in enumerate(results, 1):
            meta = hit['metadata']

            # Faculty cell is only filled when stats exist and report staff.
            stats = meta.get('faculty_stats')
            if stats and stats['total_faculty'] > 0:
                faculty_cell = f"{stats['total_faculty']} ({stats['avg_experience']:.0f}y)"
            else:
                faculty_cell = "N/A"

            table.add_row(
                str(position),
                f"{hit['score']:.3f}",
                clip(meta['college_name'], 18),
                str(meta['year']),
                str(meta['rank']),
                faculty_cell,
                clip(meta['state'], 10),
                clip(hit['content'], 22),
            )

        self.console.print(table)

    def show_help(self):
        """Enhanced comprehensive help information"""
        help_text = """
🔍 Enhanced NIRF RAG Query System v3.0 - Advanced Analytics Edition

🎯 NIRF PARAMETER FRAMEWORK:
• TLR (Teaching, Learning & Resources): 30% - Faculty quality, infrastructure, student-faculty ratio
• RP (Research & Professional Practice): 30% - Publications, patents, research funding
• GO (Graduation Outcomes): 20% - Placements, higher studies, median salary
• OI (Outreach & Inclusivity): 10% - Diversity, social responsibility
• PR (Perception): 10% - Peer and employer perception

🏛️ SAMPLE QUERIES FOR ALIGARH:
• "NIRF ranking of Aligarh Muslim University faculty analysis"
• "AMU faculty statistics and NIRF performance 2023"
• "Aligarh university TLR parameter analysis"
• "Faculty development trends at AMU"
• "Research publications ranking of Aligarh Muslim University"

🔬 ADVANCED QUERY TYPES:
• "Compare faculty strength IIT Delhi vs AMU"
• "NIRF trends analysis for engineering colleges"
• "Faculty experience distribution in top universities"
• "Year-wise ranking improvement patterns"
• "Parameter-wise performance analysis TLR RP GO"

🚀 NEW FEATURES v3.0:
• Faculty Analytics: Demographics, experience, qualifications
• Trends Analysis: Year-over-year performance tracking
• NIRF Parameter Analysis: TLR, RP, GO, OI, PR breakdown
• Comparative Analysis: Multi-institutional comparisons
• Advanced Export: JSON with analytics, enhanced CSV/TXT

📊 RESPONSE STYLES:
• comprehensive: Detailed analysis with all NIRF parameters
• concise: Brief ranking and key metrics only
• comparative: Cross-institutional/temporal comparisons
• analytical: Deep statistical insights and trends
• faculty_focused: Detailed faculty demographics and analysis
• trend_analysis: Temporal patterns and future projections

🛠️ INTERACTIVE COMMANDS:
• 'params' - Show NIRF parameters framework
• 'faculty' - Faculty analysis tips and examples
• 'trends' - Trending analysis examples
• 'stats' - Session statistics with analytics
• 'clear_cache' - Clear query cache
• 'help' - Show this comprehensive help

🎨 ADVANCED ANALYSIS OPTIONS:
After each query, you can run additional analysis:
• faculty - Detailed faculty breakdown and statistics
• trends - Year-wise performance analysis
• compare - Cross-institutional comparison table

💡 EXPERT TIPS:
• Use institution abbreviations: AMU, IIT, NIT, IIM, AIIMS
• Combine parameters: "TLR analysis of top engineering colleges"
• Specify analysis type: "faculty trends", "ranking patterns"
• Use comparative queries: "AMU vs JMI NIRF comparison"
• Include years for trend analysis: "2019-2023 performance"

📈 FACULTY ANALYSIS CAPABILITIES:
• Total faculty count and average experience
• Designation breakdown (Professor, Associate, Assistant)
• Experience distribution across career stages
• Qualification analysis (PhD, M.Tech, etc.)
• Impact assessment on TLR parameter scores

🔄 TRENDS ANALYSIS FEATURES:
• Year-over-year ranking changes
• Parameter-wise improvement tracking
• Institutional growth patterns
• Regional performance comparisons
• Predictive insights for future performance

🏆 COMPARISON ANALYSIS:
• Multi-institutional ranking comparisons
• Faculty strength analysis across universities
• Parameter-wise performance benchmarking
• Regional and category-wise analysis
• Historical performance tracking
        """
        self.console.print(help_text, style="cyan")

    def run(self):
        """Start the system and hand control to the interactive prompt.

        Announces the start-up and the selected collection, then calls
        ``setup_connections``; if that reports failure the method returns
        without doing anything else. Otherwise it prints the welcome
        panels and the command hint, runs ``interactive_query`` and,
        once that returns, prints the session statistics.
        """
        self.console.print("🚀 Starting Enhanced NIRF RAG System v3.0 - Advanced Analytics...", style="bold green")
        self.console.print(f"🎯 Using collection: {self.selected_collection}", style="bold blue")

        # Nothing below can work without live connections.
        connected = self.setup_connections()
        if not connected:
            return

        # (title, border colour, body) for each welcome panel, left to right.
        panel_specs = (
            ("Faculty Features", "green", "🏛️ Faculty Analytics\n📊 Experience Analysis\n👥 Demographics"),
            ("Trends Analysis", "blue", "📈 Year-wise Trends\n🎯 Parameter Analysis\n🏆 Ranking Patterns"),
            ("NIRF Parameters", "magenta", "🔍 TLR Analysis\n📚 RP Metrics\n🎓 GO Tracking"),
        )
        welcome_panels = [
            Panel(body, title=title, border_style=colour)
            for title, colour, body in panel_specs
        ]

        self.console.print(Columns(welcome_panels))
        self.console.print("\n💡 NEW: Type 'params' to see NIRF framework, 'faculty' for faculty queries, 'trends' for trend analysis", style="yellow")

        # Blocks until the user leaves the prompt loop.
        self.interactive_query()

        # Closing summary for the session.
        self.console.print("\n📊 Final Advanced Session Statistics:", style="bold blue")
        self.show_session_stats()

def main():
    """Build the RAG system and run it, reporting any failure on stdout.

    A Ctrl-C is reported as a user interruption. Any other exception is
    printed together with its traceback. Neither is re-raised.
    """
    try:
        EnhancedNIRFRAG().run()
    except KeyboardInterrupt:
        print("\n⚠ Process interrupted by user")
    except Exception as error:
        # Imported lazily: only needed on the failure path.
        import traceback

        print(f"\n❌ Unexpected error: {error}")
        traceback.print_exc()

# Start the interactive system only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]