# In [None]:  (Jupyter cell marker left over from the notebook export)
#!/usr/bin/env python3
"""
Enhanced NIRF JSON uploader with structured content processing and uniform metadata application
Optimized for clean, normalized tabular data with consistent metadata across all subchunks
"""

import subprocess
import sys
import hashlib
import re
import json
import uuid
import os
import warnings
import gc
from pathlib import Path
from collections import defaultdict
from datetime import datetime
import pandas as pd
from difflib import SequenceMatcher

# Silence every Python warning process-wide, then set environment variables
# before the ML libraries are imported further down.
# NOTE(review): the variable names suggest they are read by TensorFlow
# (native log level), Hugging Face tokenizers (parallelism) and PyTorch
# (CUDA allocator split size) -- confirm against those libraries' docs.
warnings.filterwarnings('ignore')
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'TOKENIZERS_PARALLELISM': 'false',
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:128'
})

def install_packages():
    """Run ``pip install`` for every third-party dependency of this script.

    Each requirement is installed with the current interpreter's pip, with
    pip's own output discarded. One status line is printed per requirement;
    a failed install is reported and does not stop the remaining installs.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    for requirement in ('qdrant-client', 'sentence-transformers', 'torch', 'pandas', 'numpy'):
        try:
            subprocess.check_call(
                pip_install + [requirement],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except subprocess.CalledProcessError:
            print(f"❌ {requirement} - installation failed")
        else:
            print(f"✅ {requirement} installed")

# Import with auto-install: try the third-party imports once; if any of them
# is missing, run install_packages() and repeat the same imports. A failure
# in the second attempt is not caught here and propagates to the caller.
try:
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams, PointStruct
    from sentence_transformers import SentenceTransformer
    import pandas as pd
    import numpy as np
except ImportError as e:
    print(f"📦 Installing packages... ({e})")
    install_packages()
    from qdrant_client import QdrantClient
    from qdrant_client.models import Distance, VectorParams, PointStruct
    from sentence_transformers import SentenceTransformer
    import pandas as pd
    import numpy as np

class EnhancedNIRFUploader:
    def __init__(self, batch_size=20, chunk_size=512):
        """Set up uploader state and the static lookup tables used for parsing.

        No I/O happens here: the encoder and client start as ``None`` and
        ``init_encoder`` loads the encoder on demand.

        Args:
            batch_size: Stored on the instance as ``self.batch_size``.
            chunk_size: Stored on the instance as ``self.chunk_size``.
        """
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        # Hashes of normalized content already accepted (exact-duplicate check).
        self.processed_hashes = set()
        # hash -> normalized text, used for the fuzzy-duplicate comparison.
        self.normalized_content_cache = {}
        # Counters such as 'exact_duplicates' / 'similar_duplicates'.
        self.stats = defaultdict(int)
        self.encoder = None
        self.client = None
        self.collection_name = None
        self.file_metadata = {}

        # Core metadata fields to preserve uniformly
        # (only keys listed here are copied from caller-supplied metadata).
        self.core_metadata_fields = [
            'college_name', 'nirf_id', 'year', 'category', 'institution',
            'department', 'state', 'city', 'university_type', 'ranking_category',
            'rank','location','chunk_type','file_production_year'
        ]

        # Structured field schemas for different content types
        # NOTE(review): not referenced by the methods visible in this part of
        # the file; parse_structured_record uses its own key lists.
        self.field_schemas = {
            'faculty': ['S.No', 'Name', 'Age', 'Designation', 'Gender', 'Degree', 'Experience', 'Joining_Status', 'Joining_Date', 'Leaving_Date', 'Employment_Type'],
            'student': ['S.No', 'Name', 'Program', 'Year', 'Grade', 'Batch', 'Status', 'Roll_Number', 'Admission_Year'],
            'ranking': ['Rank', 'Institution', 'Score', 'Year', 'Category', 'State', 'Overall_Score', 'Parameter_Score'],
            'infrastructure': ['Facility', 'Capacity', 'Area_sqft', 'Year_Built', 'Status', 'Funding_Source', 'Maintenance_Status'],
            'research': ['Title', 'Author', 'Journal', 'Year', 'Citations', 'Impact_Factor', 'Category', 'DOI', 'Publisher'],
            'placement': ['Company', 'Package_LPA', 'Students_Placed', 'Year', 'Sector', 'Location', 'Job_Role', 'Placement_Type'],
            'academic': ['Course', 'Duration', 'Intake', 'Fees', 'Eligibility', 'Accreditation', 'AICTE_Approved'],
            'generic': ['Field1', 'Field2', 'Field3', 'Field4', 'Field5', 'Field6', 'Field7', 'Field8']
        }

        # Enhanced institution detection patterns
        # These are applied case-insensitively by extract_comprehensive_metadata.
        # Note that most of them capture only the text around the keyword
        # (e.g. the part after "IIT"), not the keyword itself.
        self.institution_patterns = [
            # IIT patterns
            r'indian\s+institute\s+of\s+technology[\s,]+([a-zA-Z\s]+)',
            r'iit[\s,]+([a-zA-Z\s]+)',

            # NIT patterns
            r'national\s+institute\s+of\s+technology[\s,]+([a-zA-Z\s]+)',
            r'nit[\s,]+([a-zA-Z\s]+)',

            # IIM patterns
            r'indian\s+institute\s+of\s+management[\s,]+([a-zA-Z\s]+)',
            r'iim[\s,]+([a-zA-Z\s]+)',

            # AIIMS patterns
            r'all\s+india\s+institute\s+of\s+medical\s+sciences[\s,]*([a-zA-Z\s]*)',
            r'aiims[\s,]*([a-zA-Z\s]*)',

            # IISc patterns
            r'indian\s+institute\s+of\s+science[\s,]*([a-zA-Z\s]*)',
            r'iisc[\s,]*([a-zA-Z\s]*)',

            # University patterns
            r'([a-zA-Z\s]+)\s+university',
            r'university\s+of\s+([a-zA-Z\s]+)',

            # College patterns
            r'([a-zA-Z\s]+)\s+college\s+of\s+([a-zA-Z\s]+)',
            r'([a-zA-Z\s]+)\s+college',

            # Institute patterns
            r'([a-zA-Z\s]+)\s+institute\s+of\s+([a-zA-Z\s]+)',
            r'([a-zA-Z\s]+)\s+institute',

            # School patterns
            r'([a-zA-Z\s]+)\s+school\s+of\s+([a-zA-Z\s]+)',
            r'([a-zA-Z\s]+)\s+school',

            # Generic patterns
            r'([a-zA-Z\s]+)\s+academy',
            r'([a-zA-Z\s]+)\s+polytechnic'

        ]

        # Year patterns, ordered from most to least specific. The year
        # extraction inspects the pattern text itself to assign a confidence,
        # so both the wording and the order of these entries matter.
        self.year_patterns = [
            # Academic year patterns
            r'academic\s+year[\s:]*(\d{4}[-/]\d{2,4})',
            r'ay[\s:]*(\d{4}[-/]\d{2,4})',
            r'session[\s:]*(\d{4}[-/]\d{2,4})',

            # Report year patterns
            r'report\s+year[\s:]*(\d{4})',
            r'annual\s+report[\s:]*(\d{4})',
            r'nirf\s+(\d{4})',

            # Data year patterns
            r'data\s+as\s+on[\s:]*(\d{4})',
            r'as\s+on[\s:]*(\d{1,2}[-/]\d{1,2}[-/](\d{4}))',

            # Standard year patterns
            r'year[\s:]*(\d{4})',
            r'(\d{4}[-/]\d{2,4})',  # Academic year format
            r'(20\d{2})'  # Simple 4-digit year
        ]


        # NIRF identifier patterns. The first one has no capture group, so
        # re.findall returns the whole match; the others capture the ID only.
        self.nirf_patterns = [
            r'IR-[A-Z]-[A-Z]-\d{3,4}', r'NIRF[\s-]*ID[\s-]:?[_\s-]([A-Z0-9-]+)',
            r'Institute[\s]+ID[\s]:?[_\s]([A-Z0-9-]+)', r'NIRF[\s]+Code[\s]:?[_\s]([A-Z0-9-]+)'
        ]

    def clean_institution_name(self, name):
        """Return *name* with tidy spacing and capitalisation.

        Whitespace runs are collapsed, a leading "the" is dropped, and each
        word is title-cased unless it is an all-uppercase acronym of two or
        more characters, which is kept verbatim. Falsy input yields "".
        """
        if not name:
            return ""

        collapsed = re.sub(r'\s+', ' ', name).strip()
        without_article = re.sub(r'^(the\s+)', '', collapsed, flags=re.IGNORECASE)
        # Keeps exactly one space in front of a trailing institution keyword.
        normalized = re.sub(
            r'\s+(college|university|institute|school|academy)$',
            r' \1',
            without_article,
            flags=re.IGNORECASE,
        )

        def recase(token):
            # Acronyms such as "IIT" stay untouched; everything else is title-cased.
            is_acronym = token.isupper() and len(token) > 1
            return token if is_acronym else token.title()

        return ' '.join(recase(token) for token in normalized.split())


    def init_encoder(self):
        """Initialize encoder with optimized settings"""
        if self.encoder is None:
            print("🔧 Loading sentence transformer...")
            import logging
            logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
            self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
            print("✅ Encoder ready")

    def normalize_text(self, text):
        """Return an ASCII-only, single-spaced version of *text*.

        Steps, in order:
          1. convert to ``str`` and strip;
          2. turn every whitespace run (including newlines and tabs) into one
             space, so that words separated only by a line break or tab are
             not glued together when control characters are removed;
          3. delete remaining control characters;
          4. replace non-ASCII runs with a space;
          5. replace punctuation other than ``| - . , ( ) : ;`` with a space;
          6. collapse whitespace again, because steps 4-5 can introduce
             consecutive spaces, and strip.

        Args:
            text: Any value; falsy values (``None``, ``""``, ``0``) give "".

        Returns:
            The normalized string.
        """
        if not text:
            return ""

        text = str(text).strip()
        # Whitespace must be handled before the control-character strip:
        # "\n" and "\t" are inside \x00-\x1f and would otherwise vanish.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)  # Remove control chars
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # ASCII only
        text = re.sub(r'[^\w\s\|\-\.\,\(\)\:\;]', ' ', text)  # Keep essential punctuation
        text = re.sub(r'\s+', ' ', text)  # Collapse spaces created above
        return text.strip()

    def detect_content_type(self, content):
        """Classify *content* as one of the known table types by keyword lookup.

        Matching is done by case-insensitive substring search in two passes:

        1. priority rules - a single keyword hit decides the type;
        2. scored rules - the first type (in the listed order) whose number
           of distinct keyword hits reaches its threshold wins.

        Returns 'generic' when no rule applies.
        """
        lowered = content.lower()

        # Pass 1: one hit is enough (checked in this order).
        priority_rules = (
            ('faculty', ['phd', 'professor', 'assistant professor', 'associate professor']),
            ('student', ['btech', 'mtech', 'bsc', 'msc', 'bachelor', 'master']),
            ('research', ['ieee', 'acm', 'springer', 'elsevier', 'scopus']),
            ('placement', ['tcs', 'infosys', 'wipro', 'microsoft', 'google']),
        )
        for label, keywords in priority_rules:
            if any(keyword in lowered for keyword in keywords):
                return label

        # Pass 2: (type, minimum number of keyword hits, keywords).
        scored_rules = (
            ('faculty', 3, ['designation', 'professor', 'assistant', 'associate', 'ph.d', 'experience', 'joining', 'faculty', 'staff']),
            ('student', 2, ['student', 'program', 'grade', 'batch', 'semester', 'roll', 'admission', 'enrolled']),
            ('ranking', 2, ['rank', 'score', 'category', 'nirf', 'ranking', 'position', 'overall']),
            ('infrastructure', 2, ['facility', 'capacity', 'area', 'laboratory', 'library', 'hostel', 'building', 'infrastructure']),
            ('research', 2, ['research', 'publication', 'journal', 'citation', 'paper', 'doi', 'impact factor']),
            ('placement', 2, ['placement', 'company', 'package', 'salary', 'recruited', 'job', 'employment']),
            ('academic', 2, ['course', 'program', 'degree', 'duration', 'intake', 'fees', 'eligibility', 'curriculum']),
        )
        for label, threshold, keywords in scored_rules:
            hits = len([keyword for keyword in keywords if keyword in lowered])
            if hits >= threshold:
                return label

        return 'generic'

    def structure_tabular_content(self, content):
        """Convert unstructured tabular content to structured JSON-like format with individual records"""
        if not content or len(content) < 20:
            return content, False

        # Detect content type and get appropriate schema
        content_type = self.detect_content_type(content)

        # Split content into lines and clean
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        if len(lines) < 2:
            return content, False

        individual_records = []

        # Handle pipe-separated tabular data
        if '|' in content:
            header_patterns = [
                r's\.?\s*no\.?', r'name', r'designation', r'professor', r'age', r'gender',
                r'degree', r'experience', r'joining', r'title', r'rank', r'company', r'package'
            ]
            processed_lines=0
            for line in lines:
                if '|' in line:
                    fields = [field.strip() for field in line.split('|') if field.strip()]

                    # Skip header rows
                    line_lower = line.lower()
                    is_header = any(pattern in line_lower for pattern in header_patterns)

                    # Skip separator lines (like |---|---|)
                    is_separator = all(re.match(r'^[\-\s]*$', field) for field in fields)

                    if is_header or is_separator:
                        continue

                    # Skip first few lines if they seem like headers
                    if processed_lines < 3:
                        # Check if this looks like data (has numbers, names, etc.)
                        has_data_indicators = any(
                            re.search(r'\d+', field) or
                            len(field) > 10 or
                            any(title in field.lower() for title in ['prof', 'dr', 'mr', 'ms'])
                            for field in fields
                        )
                        if not has_data_indicators:
                            processed_lines += 1
                            continue

                    # Process valid data rows
                    if len(fields) >= 2:
                        record = self.parse_structured_record(fields, content_type)
                        if record:
                            individual_records.append(record)
                            processed_lines+=1

        # Handle comma/tab separated data
        elif (',' in content and content.count(',') > content.count('|')) or '\t' in content:
            delimiter = '\t' if '\t' in content else ','

            processed_lines = 0
            for line in lines:
                if delimiter in line:
                    fields = [field.strip() for field in line.split(delimiter) if field.strip()]

                    # Skip header-like rows
                    line_lower = line.lower()
                    is_header = any(pattern in line_lower for pattern in ['s.no', 'name', 'designation', 'age'])

                    if is_header and processed_lines < 3:
                        processed_lines += 1
                        continue

                    if len(fields) >= 2:
                        record = self.parse_structured_record(fields, content_type)
                        if record:
                            individual_records.append(record)
                            processed_lines+=1

        if individual_records:
            header = f"=== {content_type.upper()} DATA ({len(individual_records)} records) ==="
            structured_content = header + '\n\n' + '\n\n'.join(individual_records)
            return structured_content, True

        return content, False

    def parse_structured_record(self, fields, content_type):
        """Serialise one table row as an indented JSON object.

        For 'faculty', 'student', 'research' and 'placement' the row values
        are mapped positionally onto a fixed key list; missing trailing
        values become 'N/A' and surplus values are ignored. Faculty values
        are additionally stripped of unexpected symbols. Any other type
        falls back to ``field_1 .. field_n`` keys over the non-empty values
        and requires at least two of them.

        Returns:
            The JSON string, or ``None`` when the generic fallback has fewer
            than two non-empty values.
        """
        layouts = {
            # Faculty structure: S.No, Name, Age, Designation, Gender, Degree, Experience, etc.
            'faculty': (
                's_no', 'name', 'age', 'designation', 'gender', 'degree',
                'experience', 'joining_status', 'joining_date', 'leaving_date',
                'employment_type',
            ),
            'student': ('s_no', 'name', 'program', 'year', 'grade', 'batch', 'roll_number'),
            'research': ('title', 'author', 'journal', 'year', 'citations', 'impact_factor'),
            'placement': ('company', 'package_lpa', 'students_placed', 'year', 'sector', 'location'),
        }

        keys = layouts.get(content_type)
        if keys is not None:
            if content_type == 'faculty':
                values = []
                for raw in fields:
                    # Remove unwanted characters and normalize spacing.
                    text = re.sub(r'[^\w\s\-\.\,\(\)\:\/]', '', str(raw).strip())
                    text = re.sub(r'\s+', ' ', text)
                    values.append(text or 'N/A')
            else:
                values = [str(raw).strip() for raw in fields]

            # Pad short rows; zip() below drops values beyond the key list.
            values += ['N/A'] * (len(keys) - len(values))
            return json.dumps(dict(zip(keys, values)), indent=2)

        # Generic fallback - structured key-value pairs over non-empty values.
        non_empty = [text for text in (str(raw).strip() for raw in fields) if text]
        if len(non_empty) < 2:
            return None

        numbered = {f'field_{position}': text for position, text in enumerate(non_empty, start=1)}
        return json.dumps(numbered, indent=2)

    def normalize_field_value(self, value):
        """Normalize a single cell value to a clean display string.

        Rules are applied in order; the first that matches decides the result:

        * ``None`` or a null marker (``--``, ``-``, empty, ``null``, ``none``,
          ``n/a``, ``na``, ``nil``) -> ``"N/A"``;
        * values starting with a ``d/m/y``-style date -> unchanged;
        * yes/no style flags -> capitalised;
        * plain numbers, optionally with a unit (lpa, cr, lakh, crore, %,
          sqft) -> unchanged;
        * e-mail addresses -> lower-cased;
        * phone-like strings of 10-15 characters -> unchanged;
        * anything else -> unexpected symbols removed, then title-cased when
          longer than two characters and not purely digits.

        Only ``None`` is treated as missing up front, so a numeric ``0``
        is preserved as ``"0"`` rather than being reported as "N/A". Text that
        is empty after symbol removal is also reported as "N/A".

        Args:
            value: Raw cell value of any type.

        Returns:
            The normalized string.
        """
        if value is None:
            return "N/A"

        value = str(value).strip()

        # Handle dates
        if re.match(r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}', value):
            return value

        lowered = value.lower()

        # Handle boolean-like values
        if lowered in ['yes', 'no', 'true', 'false', 'y', 'n', 'active', 'inactive']:
            return value.capitalize()

        # Handle empty dashes and null values (also covers the empty string)
        if lowered in ['--', '-', '', 'null', 'none', 'n/a', 'na', 'nil']:
            return "N/A"

        # Handle numeric values with units
        if re.match(r'^\d+\.?\d*\s*(lpa|cr|lakhs?|crores?|%|sqft|sq\.ft)?\s*$', lowered):
            return value

        # Handle email addresses
        if re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', value):
            return lowered

        # Handle phone numbers
        if re.match(r'^[\+\d\-\(\)\s]{10,15}$', value):
            return value

        # Clean text values
        value = re.sub(r'\s+', ' ', value)
        value = re.sub(r'[^\w\s\.\-\(\)&,]', '', value).strip()
        if not value:
            # Nothing meaningful survived the cleanup.
            return "N/A"

        # Apply title case for names and proper nouns
        if len(value) > 2 and not value.isdigit():
            return value.title()

        return value

    def extract_comprehensive_metadata(self, content, original_metadata=None):
        """Extract comprehensive metadata from content and preserve original metadata"""
        metadata = {}

        # Start with original metadata if provided
        if original_metadata:
            for key, value in original_metadata.items():
                if key in self.core_metadata_fields and value:
                    metadata[key] = self.normalize_text(str(value))

        # Extract NIRF ID
        if 'nirf_id' not in metadata:
            for pattern in self.nirf_patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    metadata['nirf_id'] = matches[0] if isinstance(matches[0], str) else matches[0][0]
                    break

        # Extract institution/college name
        if 'college_name' not in metadata and 'institution' not in metadata:
            # Try each pattern and extract the most relevant match
            best_match = None
            best_score = 0

            for pattern in self.institution_patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    for match in matches :
                        if isinstance(match,tuple):
                            institution_name = ' '.join([m for m in match if m]).strip()
                        else:
                            institution_name = match.strip()

                        # Score based on length and common keywords
                        score = len(institution_name)
                        if any(keyword in institution_name.lower() for keyword in
                               ['institute', 'university', 'college', 'iit', 'nit', 'iim']):
                            score += 50

                        if score > best_score:
                            best_score = score
                            best_match = institution_name
                            break

                    if best_match:
                        # Clean and format the institution name
                        institution_name = self.clean_institution_name(best_match)
                        metadata['college_name'] = institution_name
                        metadata['institution'] = institution_name

        # Extract year information
        if 'year' not in metadata:
            best_year = None
            best_confidence = 0

            for pattern in self.year_patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    for match in matches:
                        # Handle tuple matches
                        if isinstance(match, tuple):
                            year = match[-1]  # Get the last group (actual year)
                        else:
                            year = match

                        # Extract 4-digit year
                        year_match = re.search(r'(20\d{2})', str(year))
                        if year_match:
                            year_value = year_match.group(1)

                            # Calculate confidence based on pattern type
                            confidence = 1
                            if 'academic' in pattern or 'ay' in pattern:
                                confidence = 5  # High confidence for academic year
                            elif 'report' in pattern or 'nirf' in pattern:
                                confidence = 4  # High confidence for report year
                            elif 'data' in pattern or 'as on' in pattern:
                                confidence = 3  # Medium confidence for data year

                            if confidence > best_confidence:
                                best_confidence = confidence
                                best_year = year_value

            if best_year:
                metadata['year'] = best_year
            else:
                # Fallback: get most recent year from content
                year_matches = re.findall(r'(20\d{2})', content)
                if year_matches:
                    # Get the most recent year within reasonable range
                    valid_years = [y for y in year_matches if 2015 <= int(y) <= 2025]
                    if valid_years:
                        metadata['year'] = max(valid_years)









        # Extract category information
        if 'category' not in metadata:
            category_patterns = [
                r'category[:\s]+([A-Z0-9\s]+)', r'(engineering|medical|management|pharmacy|architecture)',
                r'(general|obc|sc|st|pwd)', r'(government|private|deemed|autonomous)'
            ]
            for pattern in category_patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    metadata['category'] = matches[0].strip().title()
                    break

        # Extract department/faculty information
        if 'department' not in metadata:
            dept_patterns = [
                r'department\s+of\s+([\w\s]+)', r'faculty\s+of\s+([\w\s]+)',
                r'school\s+of\s+([\w\s]+)', r'(computer\s+science|mechanical|electrical|civil|chemical)'
            ]
            for pattern in dept_patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    metadata['department'] = matches[0].strip().title()
                    break

        # Extract state/location information
        if 'state' not in metadata:
            state_patterns = [
                r'state[:\s]+([A-Za-z\s]+)', r'(maharashtra|karnataka|tamil nadu|delhi|gujarat|rajasthan|uttar pradesh|madhya pradesh|west bengal|kerala|andhra pradesh|telangana|punjab|haryana|bihar|odisha|jharkhand|chhattisgarh|assam|himachal pradesh|uttarakhand|goa|tripura|meghalaya|manipur|nagaland|mizoram|sikkim|arunachal pradesh|jammu and kashmir|ladakh)'
            ]
            for pattern in state_patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    metadata['state'] = matches[0].strip().title()
                    break

        # Extract university type
        if 'university_type' not in metadata:
            type_patterns = [
                r'(central\s+university|state\s+university|private\s+university|deemed\s+university|institute\s+of\s+national\s+importance)',
                r'(government|private|autonomous|affiliated)'
            ]
            for pattern in type_patterns:
                matches = re.findall(pattern, content, re.IGNORECASE)
                if matches:
                    metadata['university_type'] = matches[0].strip().title()
                    break
        # --- STEP 5: Inject user-provided metadata ---
        if hasattr(self, 'user_metadata') and self.user_metadata:
            for key, value in self.user_metadata.items():
                if value and key not in metadata:
                    metadata[key] = value

        # Ensure city and state are always present
        if 'city' not in metadata and hasattr(self, 'user_metadata') and self.user_metadata.get('city'):
            metadata['city'] = self.user_metadata['city']

        if 'state' not in metadata and hasattr(self, 'user_metadata') and self.user_metadata.get('state'):
            metadata['state'] = self.user_metadata['state']

        return metadata
    def calculate_content_similarity(self, content1, content2):
        """Return the ``SequenceMatcher`` ratio of two texts after canonicalising.

        Both texts are lower-cased, whitespace runs are collapsed to a single
        space and every character that is neither a word character nor
        whitespace is removed before the comparison.
        """
        def canonical(text):
            single_spaced = re.sub(r'\s+', ' ', text.lower())
            # Drop punctuation so it does not influence the ratio.
            return re.sub(r'[^\w\s]', '', single_spaced)

        matcher = SequenceMatcher(None, canonical(content1), canonical(content2))
        return matcher.ratio()

    def is_duplicate_content(self, content):
        """Advanced duplicate detection with semantic similarity"""
        if not content or len(content.strip()) < 10:
            return True

        # Create normalized hash
        normalized = self.normalize_text(content)

        # Skip if content is too short after normalization
        if len(normalized.strip()) < 10:
            return True

        content_hash = hashlib.md5(normalized.encode()).hexdigest()

        # Check exact duplicates
        if content_hash in self.processed_hashes:
            self.stats['exact_duplicates'] += 1
            return True

        # Check semantic similarity with existing content (sample check to avoid performance issues)
        # FIXED: Only check similarity if we have reasonable amount of content to compare
        if len(self.normalized_content_cache) > 0 and len(self.normalized_content_cache) < 100:
            for existing_hash, existing_content in self.normalized_content_cache.items():
                # FIXED: Increased similarity threshold to 0.95 to be more strict
                # and added minimum length check
                if (len(normalized) > 50 and len(existing_content) > 50 and
                    self.calculate_content_similarity(normalized, existing_content) > 0.95):
                    self.stats['similar_duplicates'] += 1
                    return True

        # Store for future comparisons (limit cache size)
        if len(self.normalized_content_cache) < 500:
            self.normalized_content_cache[content_hash] = normalized

        self.processed_hashes.add(content_hash)
        return False

    def create_logical_chunks(self, content):
        """Split *content* into chunks using a strategy chosen from its shape.

        The content is first passed through ``structure_tabular_content``;
        the section type is detected from the original text. Structured
        output goes to ``create_section_chunks``, everything else to
        ``create_generic_chunks``. Falsy content yields an empty list.
        """
        if not content:
            return []

        # Structure first, then classify the untouched original text.
        structured_content, is_structured = self.structure_tabular_content(content)
        section_type = self.detect_section_type(content)

        chunker = self.create_section_chunks if is_structured else self.create_generic_chunks
        return chunker(structured_content, section_type)

    def detect_section_type(self, content):
        """Pick the section type whose keywords occur most often in *content*.

        Each section type scores one point per keyword found in the
        lower-cased content. Keywords longer than two characters are matched
        as substrings. Two-letter degree codes (``ug``, ``pg``, ``be``,
        ``me``, ``ba``, ``ma``) must appear as whole words, because as
        substrings they occur inside everyday words ("name", "member",
        "male") and would pull unrelated text into the student sections.

        On a tie the section listed first in the table wins.

        Args:
            content: Text to classify; falsy input gives 'generic'.

        Returns:
            One of the section-type keys below, or 'generic' when no keyword
            matches.
        """
        if not content:
            return 'generic'

        content_lower = content.lower()
        # Section detection patterns with priority
        section_patterns = {
            'faculty': [
                'faculty', 'professor', 'assistant professor', 'associate professor',
                'teaching staff', 'academic staff', 'designation', 'phd', 'qualification'
            ],
            'student_ug': [
                'undergraduate', 'ug', 'bachelor', 'btech', 'be', 'bsc', 'bcom', 'ba',
                'ug enrollment', 'ug admission'
            ],
            'student_pg': [
                'postgraduate', 'pg', 'master', 'mtech', 'me', 'msc', 'mcom', 'ma',
                'pg enrollment', 'pg admission'
            ],
            'placement_ug': [
                'ug placement', 'undergraduate placement', 'bachelor placement',
                'btech placement', 'be placement'
            ],
            'placement_pg': [
                'pg placement', 'postgraduate placement', 'master placement',
                'mtech placement', 'me placement'
            ],
            'placement_general': [
                'placement', 'recruited', 'job', 'company', 'package', 'salary',
                'employment', 'career'
            ],
            'financial_library': [
                'library', 'books', 'journals', 'library expenditure', 'library budget'
            ],
            'financial_equipment': [
                'equipment', 'laboratory', 'instruments', 'machinery', 'lab equipment'
            ],
            'financial_general': [
                'expenditure', 'budget', 'financial', 'revenue', 'grants', 'funding'
            ],
            'research': [
                'research', 'publication', 'journal', 'patent', 'ipr', 'consultancy',
                'phd awards', 'research projects'
            ],
            'infrastructure': [
                'infrastructure', 'building', 'facility', 'area', 'hostel', 'campus'
            ],
            'disability': [
                'disability', 'pwd', 'differently abled', 'accessibility', 'inclusive'
            ]
        }

        # Tokenise once so whole-word checks for the short codes are O(1).
        words = set(re.findall(r'[a-z0-9]+', content_lower))

        def keyword_present(keyword):
            if len(keyword) <= 2:
                return keyword in words
            return keyword in content_lower

        # Count matches for each section type
        section_scores = {}
        for section_type, keywords in section_patterns.items():
            score = sum(1 for keyword in keywords if keyword_present(keyword))
            if score > 0:
                section_scores[section_type] = score

        # Return the section type with highest score
        if section_scores:
            return max(section_scores, key=section_scores.get)

        return 'generic'

    def create_section_chunks(self, structured_content, section_type):
        """Create chunks based on section type with specific strategies"""
        lines = [line.strip() for line in structured_content.split('\n') if line.strip()]
        chunks = []

        if section_type == 'faculty':
            # One chunk per faculty row
            chunks = self.chunk_faculty_data(lines)

        elif section_type in ['student_ug', 'student_pg']:
            # Chunk by UG/PG blocks
            chunks = self.chunk_student_data(lines, section_type)

        elif section_type in ['placement_ug', 'placement_pg', 'placement_general']:
            # Chunk per academic year or UG/PG split
            chunks = self.chunk_placement_data(lines, section_type)

        elif section_type in ['financial_library', 'financial_equipment', 'financial_general']:
            # Chunk per year and type
            chunks = self.chunk_financial_data(lines, section_type)

        elif section_type == 'research':
            # One chunk per yearly row or table
            chunks = self.chunk_research_data(lines)

        elif section_type in ['infrastructure', 'disability']:
            # One chunk for the entire block
            chunks = self.chunk_infrastructure_data(lines, section_type)

        else:
            # Generic chunking
            chunks = self.chunk_generic_data(lines)

        # Add chunk_type metadata to each chunk
        enhanced_chunks = []
        for i, chunk in enumerate(chunks):
            enhanced_chunk = {
                'content': chunk,
                'chunk_type': section_type,
                'section_index': i
            }
            enhanced_chunks.append(enhanced_chunk)

        return enhanced_chunks

    def chunk_faculty_data(self, lines):
        """Create chunks with 3-5 faculty records each for better readability"""
        chunks = []
        current_chunk_records = []
        faculty_records = []

        # First, collect all individual faculty records
        for line in lines:
            if line.startswith('==='):
                continue

            # Check if this is a valid faculty record (has pipe separators and reasonable length)
            if '|' in line and len(line) > 50:  # Minimum meaningful faculty record
                try:
                    # Parse the record to JSON format
                    fields = [field.strip() for field in line.split('|') if field.strip()]
                    if len(fields) >= 3:  # At least S.No, Name, Age
                        structured_record = self.parse_structured_record(fields, 'faculty')
                        if structured_record:
                            faculty_records.append(structured_record)
                except Exception as e:
                    # If parsing fails, keep the original line
                    faculty_records.append(line.strip())
        # Group faculty records into chunks of 3-5 records each
        records_per_chunk = 4
        for i in range(0, len(faculty_records), records_per_chunk):
            chunk_records = faculty_records[i:i + records_per_chunk]

            # Create a clean chunk with proper formatting
            chunk_header = f"=== FACULTY MEMBERS (Records {i+1}-{min(i+records_per_chunk, len(faculty_records))}) ==="
            chunk_content = chunk_header + "\n\n" + "\n\n".join(chunk_records)

            chunks.append(chunk_content)

        return chunks

    def chunk_student_data(self, lines, section_type):
        """Group student rows into blocks that start at a summary line.

        A line containing 'total', 'enrolled', 'admitted' or 'strength'
        (case-insensitive) closes the block collected so far and opens a new
        one; every other line is appended to the open block with ``" | "``.
        Lines starting with ``===`` are skipped.

        Args:
            lines: Section lines.
            section_type: e.g. ``'student_ug'``; the part after ``student_``
                is upper-cased and used as the chunk label.

        Returns:
            A list of strings of the form ``"<LABEL> Students: <block>"``.
        """
        label = section_type.replace('student_', '').upper()
        boundary_words = ('total', 'enrolled', 'admitted', 'strength')

        chunks = []
        block = ""
        for line in lines:
            if line.startswith('==='):
                continue

            text = line.strip()
            lowered = line.lower()
            opens_new_block = any(word in lowered for word in boundary_words)

            if opens_new_block:
                # Flush whatever was collected before this summary line
                if block:
                    chunks.append(f"{label} Students: {block}")
                block = text
            elif block:
                block = block + " | " + text
            else:
                block = text

        if block:
            chunks.append(f"{label} Students: {block}")

        return chunks

    def chunk_placement_data(self, lines, section_type):
        """Group placement rows by academic year into one chunk per year.

        The year is the first standalone four-digit ``20xx`` number on the
        line. Standalone means not embedded in a longer digit run, so a salary
        such as ``820000`` is not read as the year 2000. Rows without a year
        are collected under ``'unknown'``. Lines starting with ``===`` are
        skipped.

        Args:
            lines: Section lines.
            section_type: Placement section label; it is appended to the year
                to form the group key.

        Returns:
            A list of strings ``"Placement <year>_<section_type>: <rows joined by ' | '>"``,
            ordered by first appearance of each year.
        """
        # Digits on either side disqualify the match (amounts, IDs, phone numbers)
        year_pattern = re.compile(r'(?<!\d)20\d{2}(?!\d)')
        rows_by_key = defaultdict(list)

        for line in lines:
            if line.startswith('==='):
                continue

            year_match = year_pattern.search(line)
            year = year_match.group() if year_match else 'unknown'
            rows_by_key[f"{year}_{section_type}"].append(line.strip())

        return [
            f"Placement {key}: {' | '.join(rows)}"
            for key, rows in rows_by_key.items()
        ]

    def chunk_financial_data(self, lines, section_type):
        """Group financial rows by year into one chunk per year and type.

        The year is the first standalone four-digit ``20xx`` number on the
        line. Standalone means not embedded in a longer digit run, so an
        amount such as ``4520230`` is not read as the year 2023. Rows without
        a year are collected under ``'unknown'``. Lines starting with ``===``
        are skipped.

        Args:
            lines: Section lines.
            section_type: Financial section label (e.g. ``'financial_library'``);
                it is appended to the year to form the group key.

        Returns:
            A list of strings ``"Financial <year>_<section_type>: <rows joined by ' | '>"``,
            ordered by first appearance of each year.
        """
        # Digits on either side disqualify the match (rupee amounts are common here)
        year_pattern = re.compile(r'(?<!\d)20\d{2}(?!\d)')
        financial_blocks = defaultdict(list)

        for line in lines:
            if line.startswith('==='):
                continue

            year_match = year_pattern.search(line)
            year = year_match.group() if year_match else 'unknown'
            financial_blocks[f"{year}_{section_type}"].append(line.strip())

        return [
            f"Financial {key}: {' | '.join(rows)}"
            for key, rows in financial_blocks.items()
        ]

    def chunk_research_data(self, lines):
        """Group research rows by year into one chunk per year.

        The year is the first standalone four-digit ``20xx`` number on the
        line. Standalone means not embedded in a longer digit run, so a
        funding amount such as ``52024000`` is not read as the year 2024.
        Rows without a year are collected under ``'unknown'``. Lines starting
        with ``===`` are skipped.

        Args:
            lines: Section lines.

        Returns:
            A list of strings ``"Research <year>: <rows joined by ' | '>"``,
            ordered by first appearance of each year.
        """
        # Digits on either side disqualify the match (amounts, counts)
        year_pattern = re.compile(r'(?<!\d)20\d{2}(?!\d)')
        yearly_research = defaultdict(list)

        for line in lines:
            if line.startswith('==='):
                continue

            year_match = year_pattern.search(line)
            year = year_match.group() if year_match else 'unknown'
            yearly_research[year].append(line.strip())

        return [
            f"Research {year}: {' | '.join(rows)}"
            for year, rows in yearly_research.items()
        ]


    def chunk_infrastructure_data(self, lines, section_type):
        """Collapse an infrastructure/disability section into a single chunk.

        Args:
            lines: Section lines; those starting with ``===`` are dropped.
            section_type: Section label; its title-cased form prefixes the chunk.

        Returns:
            A one-element list ``["<Section_Type>: <rows joined by ' | '>"]``,
            or an empty list when no rows remain.
        """
        body_rows = [row.strip() for row in lines if not row.startswith('===')]
        if not body_rows:
            return []

        joined = " | ".join(body_rows)
        return [f"{section_type.title()}: {joined}"]

    def chunk_generic_data(self, lines):
        """Generic chunking for unidentified sections"""
        chunks = []
        current_chunk = ""

        for line in lines:
            if line.startswith('==='):
                continue

            if len(current_chunk) + len(line) <= self.chunk_size:
                current_chunk += " | " + line.strip() if current_chunk else line.strip()
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = line.strip()

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def create_generic_chunks(self, content, section_type):
        """Create generic chunks for unstructured content with section type"""
        sentences = re.split(r'[.!?]+', content)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            if len(current_chunk) + len(sentence) > self.chunk_size:
                if current_chunk.strip():
                    chunks.append({
                        'content': current_chunk.strip(),
                        'chunk_type': section_type,
                        'section_index': len(chunks)
                    })
                current_chunk = sentence
            else:
                current_chunk += (" " + sentence) if current_chunk else sentence

        if current_chunk.strip():
            chunks.append({
                'content': current_chunk.strip(),
                'chunk_type': section_type,
                'section_index': len(chunks)
            })

        return chunks

    def create_enhanced_payload(self, chunk_data, chunk_content, index, sub_index, base_metadata):
        """Create comprehensive payload with uniform metadata application"""
        content_type = self.detect_content_type(chunk_content)

        # Generate meaningful title based on content
        title = self.generate_meaningful_title(chunk_content, content_type, base_metadata)

        # Base payload with uniform metadata
        payload = {
            'content': chunk_content,
            'title': self.normalize_text(chunk_data.get('title', '')),
            'content_type': content_type,
            'chunk_index': index,
            'sub_chunk_index': sub_index,
            'processed_at': datetime.now().isoformat(),
            'content_hash': hashlib.md5(chunk_content.encode()).hexdigest()[:16],
            'total_subchunks': 1  # Will be updated after processing
        }

        # Apply uniform base metadata to all subchunks
        for key, value in base_metadata.items():
            payload[key] = value

        # Add chunk-specific metadata if not already present
        chunk_metadata = self.extract_comprehensive_metadata(chunk_content)
        for key, value in chunk_metadata.items():
            if key not in payload and value:
                payload[key] = value

        return payload

    def generate_meaningful_title(self, content, content_type, base_metadata):
        """Build a human-readable title ``"<institution> - <topic> - <year>"``.

        Institution comes from ``college_name`` then ``institution``; year from
        ``year`` then ``file_production_year``. A key that is present but empty
        or ``None`` is treated as missing, so the fallback is still consulted
        and the title never contains the text "None". Only the first three
        words of the institution name are used.

        UG/PG detection looks for the full words 'undergraduate'/'postgraduate'
        or for 'ug'/'pg' as standalone tokens (``_`` and punctuation count as
        separators), so words like "August" no longer read as UG.

        Args:
            content: Chunk text, used for counting faculty records and for
                UG/PG detection.
            content_type: Content category such as 'faculty', 'student',
                'placement', 'research', 'infrastructure' or 'ranking'.
            base_metadata: Metadata dict of the parent chunk.

        Returns:
            The title string.
        """
        institution = (
            base_metadata.get('college_name')
            or base_metadata.get('institution')
            or 'Institution'
        )
        year = (
            base_metadata.get('year')
            or base_metadata.get('file_production_year')
            or 'Unknown'
        )

        # Keep titles short: first three words of the institution name
        if institution != 'Institution':
            institution_short = ' '.join(institution.split()[0:3])
        else:
            institution_short = 'Institution'

        lowered = content.lower()

        def mentions(full_word, abbreviation):
            # Full word anywhere, or the abbreviation as a standalone token
            if full_word in lowered:
                return True
            token = rf'(?<![a-z0-9]){abbreviation}(?![a-z0-9])'
            return re.search(token, lowered) is not None

        if content_type == 'faculty':
            # Structured faculty records carry a "name": field each
            faculty_count = content.count('"name":')
            if faculty_count > 0:
                return f"{institution_short} - Faculty Members ({faculty_count} records) - {year}"
            return f"{institution_short} - Faculty Information - {year}"

        if content_type == 'student':
            if mentions('undergraduate', 'ug'):
                return f"{institution_short} - Undergraduate Students - {year}"
            if mentions('postgraduate', 'pg'):
                return f"{institution_short} - Postgraduate Students - {year}"
            return f"{institution_short} - Student Information - {year}"

        if content_type == 'placement':
            if mentions('undergraduate', 'ug'):
                return f"{institution_short} - UG Placements - {year}"
            if mentions('postgraduate', 'pg'):
                return f"{institution_short} - PG Placements - {year}"
            return f"{institution_short} - Placement Statistics - {year}"

        if content_type == 'research':
            return f"{institution_short} - Research Publications - {year}"

        if content_type == 'infrastructure':
            return f"{institution_short} - Infrastructure Facilities - {year}"

        if content_type == 'ranking':
            return f"{institution_short} - NIRF Ranking Data - {year}"

        return f"{institution_short} - {content_type.title()} Data - {year}"

    def process_chunk(self, chunk_data, index):
        """Process individual chunk with enhanced structuring and uniform metadata"""
        content = chunk_data.get('content', '')
        if not content or len(content.strip()) < 10:
            self.stats['empty_chunks'] += 1
            return []

        # Normalize content
        normalized_content = self.normalize_text(content)

        # Check for duplicates
        if self.is_duplicate_content(normalized_content):
            return []

        # Extract base metadata that will be applied uniformly to all subchunks
        base_metadata = self.extract_comprehensive_metadata(
            normalized_content,
            chunk_data.get('metadata', {})
        )

        # Create logical chunks
        logical_chunks = self.create_logical_chunks(normalized_content)
        if not logical_chunks:
            return []

        # Initialize encoder
        self.init_encoder()

        # Process each logical chunk
        points = []
        total_subchunks = len(logical_chunks)

        for sub_index, chunk_info in enumerate(logical_chunks):
            try:

                # Handle both old format (string) and new format (dict)
                if isinstance(chunk_info, dict):
                    chunk_content = chunk_info['content']
                    chunk_type = chunk_info['chunk_type']
                    section_index = chunk_info['section_index']
                else:
                    chunk_content = chunk_info
                    chunk_type = 'generic'
                    section_index = sub_index


                # Generate embedding
                embedding = self.encoder.encode([chunk_content])[0].tolist()
                if len(embedding) != 384:
                    continue

                # Create payload with uniform metadata
                payload = self.create_enhanced_payload(
                    chunk_data, chunk_content, index, sub_index, base_metadata
                )

                    # Add section-specific metadata
                payload['chunk_type'] = chunk_type
                payload['section_index'] = section_index
                payload['total_subchunks'] = total_subchunks

                # Create point
                point = PointStruct(
                    id=str(uuid.uuid4()),
                    vector=embedding,
                    payload=payload
                )

                points.append(point)
                self.stats['processed_chunks'] += 1

            except Exception as e:
                self.stats['encoding_errors'] += 1
                continue

        return points

    def batch_upload(self, points):
        """Upload points in batch with error handling"""
        if not points:
            return 0

        try:
            self.client.upsert(collection_name=self.collection_name, points=points)
            return len(points)
        except Exception as e:
            print(f"❌ Batch upload failed: {str(e)[:50]}...")
            # Try individual uploads
            success_count = 0
            for point in points:
                try:
                    self.client.upsert(collection_name=self.collection_name, points=[point])
                    success_count += 1
                except Exception:
                    continue
            return success_count

    def setup_collection(self):
        """Setup Qdrant collection with proper indexing"""
        try:
            collections = self.client.get_collections()
            if any(c.name == self.collection_name for c in collections.collections):
                print(f"ℹ Collection '{self.collection_name}' exists")
                return True

            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=384, distance=Distance.COSINE)
            )
            print(f"✅ Collection '{self.collection_name}' created")
            return True
        except Exception as e:
            print(f"❌ Collection setup failed: {e}")
            return False

    def upload_nirf_data(self):
        """Interactively load a NIRF chunks JSON file and upload it to Qdrant.

        Steps, in order:
          1. Get the JSON path: the Colab upload widget when ``google.colab``
             is importable, otherwise a prompt.
          2. Prompt for extra metadata (year, college, rank, location, city,
             state) and store it in ``self.user_metadata``.
          3. Prompt for the collection name, resolve the Qdrant connection
             settings, connect and make sure the collection exists.
          4. Load the JSON array, run each element through ``process_chunk``
             and upload the resulting points in batches of ``self.batch_size``.
          5. Print a summary and release the encoder.

        Connection settings are read from the ``QDRANT_URL`` and
        ``QDRANT_API_KEY`` environment variables. When no key is set it is
        asked for with a hidden prompt. The key is deliberately no longer
        embedded in the source.
        NOTE(review): the key that used to be hard-coded here should be
        rotated, since it has been committed in plain text.

        Returns:
            None. Every failure path prints a message and returns early.
        """
        # Get file path
        try:
            from google.colab import files
            print("📁 Upload your NIRF chunks JSON file:")
            uploaded = files.upload()
            if not uploaded:
                # The upload dialog was dismissed without choosing a file
                print("❌ No file uploaded")
                return
            file_path = next(iter(uploaded))
        except ImportError:
            file_path = input("Enter JSON file path: ").strip().strip('"').strip("'")

        if not Path(file_path).exists():
            print(f"❌ File not found: {file_path}")
            return

        # Get additional metadata from user
        print("\n📋 Please provide additional metadata:")
        file_year = input("Enter the year when this data was produced/collected (e.g., 2023, 2024): ").strip()
        college_name = input("Enter the college/institution name (leave blank if in file): ").strip()
        user_rank = input("Enter institution rank (e.g., 1, 25, 100): ").strip()
        user_location = input("Enter location/region (e.g., North India, South India): ").strip()
        user_city = input("Enter city name: ").strip()
        user_state = input("Enter state name: ").strip()

        # Blank answers are stored as None
        self.user_metadata = {
            'file_production_year': file_year or None,
            'user_provided_college': college_name or None,
            'rank': user_rank or None,
            'location': user_location or None,
            'city': user_city or None,
            'state': user_state or None
        }

        # Collection name
        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        default_name = f"nirf_structured_uniform_{timestamp}"
        self.collection_name = input(f"Enter collection name (default: '{default_name}'): ").strip() or default_name

        # A user-supplied college name is also stored under the core metadata keys
        if college_name:
            self.user_metadata['college_name'] = college_name
            self.user_metadata['institution'] = college_name

        # Likewise for the production year
        if file_year:
            self.user_metadata['year'] = file_year

        # Qdrant configuration: the URL keeps its previous default, the secret does not
        qdrant_url = os.environ.get(
            'QDRANT_URL',
            "https://b5651607-31ce-49ba-916d-c35c89d731d2.us-east4-0.gcp.cloud.qdrant.io"
        )
        api_key = os.environ.get('QDRANT_API_KEY', '').strip()
        if not api_key:
            import getpass
            api_key = getpass.getpass("Enter Qdrant API key: ").strip()
        if not api_key:
            print("❌ Connection failed: no Qdrant API key provided")
            return

        # Connect to Qdrant
        print("🔗 Connecting to Qdrant...")
        try:
            self.client = QdrantClient(url=qdrant_url, api_key=api_key)
            print("✅ Connected to Qdrant")
        except Exception as e:
            print(f"❌ Connection failed: {e}")
            return

        if not self.setup_collection():
            return

        # Load data
        try:
            print("📖 Loading NIRF data...")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, list):
                print("❌ Expected JSON array format")
                return

            print(f"📊 Processing {len(data)} chunks...")
        except Exception as e:
            print(f"❌ Failed to load JSON: {e}")
            return

        # Process data
        total_uploaded = 0
        batch_buffer = []

        for i, chunk_data in enumerate(data):
            try:
                points = self.process_chunk(chunk_data, i)
                batch_buffer.extend(points)

                # Upload when batch is full
                if len(batch_buffer) >= self.batch_size:
                    total_uploaded += self.batch_upload(batch_buffer)
                    batch_buffer = []
                    gc.collect()

                # Progress update
                if (i + 1) % 100 == 0:
                    print(f"📊 Progress: {i + 1}/{len(data)} chunks, {total_uploaded} uploaded")

            except Exception:
                # One bad chunk must not abort the whole run; it is only counted
                self.stats['processing_errors'] += 1
                continue

        # Upload remaining
        if batch_buffer:
            total_uploaded += self.batch_upload(batch_buffer)

        # Final summary
        print(f"\n🎯 Upload Summary:")
        print(f"📁 Collection: {self.collection_name}")
        print(f"✅ Successfully uploaded: {total_uploaded} structured embeddings")
        print(f"📊 Processing Stats:")
        print(f"   • Processed chunks: {self.stats['processed_chunks']}")
        print(f"   • Exact duplicates removed: {self.stats['exact_duplicates']}")
        print(f"   • Similar duplicates removed: {self.stats['similar_duplicates']}")
        print(f"   • Empty chunks skipped: {self.stats['empty_chunks']}")
        print(f"   • Encoding errors: {self.stats['encoding_errors']}")
        print(f"   • Processing errors: {self.stats['processing_errors']}")

        # Collection info
        try:
            collection_info = self.client.get_collection(self.collection_name)
            print(f"\n📊 Collection Status:")
            print(f"   Total points: {collection_info.points_count}")
            print(f"   ✅ Structured content with uniform metadata across all subchunks!")
        except Exception as e:
            print(f"⚠ Could not retrieve collection info: {e}")

        # Display metadata field distribution
        print(f"\n📋 Metadata Fields Applied:")
        print(f"   • Core fields: {', '.join(self.core_metadata_fields)}")
        print(f"   • Uniform application ensures consistent querying")
        print(f"   • All subchunks inherit parent metadata")

        # Cleanup: drop the model reference but keep the attribute, so a later
        # `self.encoder` access sees None instead of raising AttributeError
        if getattr(self, 'encoder', None) is not None:
            self.encoder = None
        gc.collect()
        print("🧹 Cleanup completed")

    def verify_metadata_consistency(self):
        """Verify that metadata is consistently applied across all points"""
        if not self.client or not self.collection_name:
            print("❌ No collection available for verification")
            return

        try:
            # Sample some points to verify metadata consistency
            sample_points = self.client.scroll(
                collection_name=self.collection_name,
                limit=100,
                with_payload=True
            )[0]

            metadata_stats = defaultdict(int)
            content_type_stats = defaultdict(int)
            chunk_type_stats = defaultdict(int)  # Add this


            for point in sample_points:
                payload = point.payload

                # Check core metadata fields
                for field in self.core_metadata_fields:
                    if field in payload and payload[field]:
                        metadata_stats[field] += 1

                # Check content type distribution
                if 'content_type' in payload:
                    content_type_stats[payload['content_type']] += 1

            print(f"\n🔍 Metadata Verification (Sample of {len(sample_points)} points):")
            print(f"📊 Core Metadata Coverage:")
            for field, count in metadata_stats.items():
                percentage = (count / len(sample_points)) * 100
                print(f"   • {field}: {count}/{len(sample_points)} ({percentage:.1f}%)")

            print(f"\n📈 Content Type Distribution:")
            for content_type, count in content_type_stats.items():
                percentage = (count / len(sample_points)) * 100
                print(f"   • {content_type}: {count} ({percentage:.1f}%)")

            print(f"\n🏷️ Section-wise Chunk Type Distribution:")
            for chunk_type, count in chunk_type_stats.items():
                percentage = (count / len(sample_points)) * 100
                print(f"   • {chunk_type}: {count} ({percentage:.1f}%)")


        except Exception as e:
            print(f"⚠ Verification failed: {e}")

    def create_sample_queries(self):
        """Print and return example queries for exercising the uploaded data.

        Each example is a dict with ``description``, ``query`` (free-text
        search string) and ``filter`` (payload conditions).

        NOTE(review): the filter dicts use Mongo-style operators (``$exists``,
        ``$gte``); nothing in this method translates them into Qdrant filter
        objects — confirm how callers are expected to apply them.

        Returns:
            The list of example query dicts.
        """
        # (description, query text, payload filter)
        specs = (
            (
                "Find faculty data for a specific college",
                "faculty professors teaching staff",
                {"chunk_type": "faculty", "college_name": {"$exists": True}},
            ),
            (
                "Search for UG placement information",
                "placement company package salary",
                {"chunk_type": "placement_ug", "year": {"$gte": "2020"}},
            ),
            (
                "Find PG student statistics",
                "postgraduate students enrollment",
                {"chunk_type": "student_pg"},
            ),
            (
                "Search for library financial data",
                "library expenditure budget books",
                {"chunk_type": "financial_library"},
            ),
            (
                "Find research publications by year",
                "research publication journal",
                {"chunk_type": "research", "year": "2023"},
            ),
            (
                "Search infrastructure facilities",
                "infrastructure building facility",
                {"chunk_type": "infrastructure"},
            ),
        )
        sample_queries = [
            {"description": description, "query": text, "filter": conditions}
            for description, text, conditions in specs
        ]

        print("\n🔍 Sample Queries for Testing:")
        for number, entry in enumerate(sample_queries, start=1):
            filter_json = json.dumps(entry['filter'], indent=6)
            print(f"\n{number}. {entry['description']}")
            print(f"   Query: '{entry['query']}'")
            print(f"   Filter: {filter_json}")

        return sample_queries

if __name__ == "__main__":
    # Interactive entry point: choose a preset, upload, then optionally verify
    # the stored metadata and show example queries.
    try:
        for banner_line in (
            "🚀 Enhanced NIRF Data Uploader - Structured Version with Uniform Metadata",
            "🔧 Clean Content | Reduced Redundancy | Normalized Tables | Uniform Metadata",
            "📋 Core Fields: college_name, nirf_id, year, category applied to all subchunks",
        ):
            print(banner_line)

        # Preset id -> (batch size, chunk size, description)
        configs = {
            "1": (15, 400, "Low - Fast processing, smaller chunks"),
            "2": (20, 512, "Balanced - Optimal for most cases"),
            "3": (25, 600, "High - Detailed processing, larger chunks")
        }

        print("\n⚙ Configuration Options:")
        for option, preset in configs.items():
            print(f"   {option}: {preset[2]} (batch={preset[0]}, chunk={preset[1]})")

        choice = input("\nSelect config (1/2/3, default=2): ").strip()
        if not choice:
            choice = "2"
        # Unrecognised answers fall back to the balanced sizes
        batch_size, chunk_size, desc = configs.get(choice, (20, 512, "Balanced"))

        print(f"✅ Selected: {desc}")
        print(f"   Batch size: {batch_size}")
        print(f"   Chunk size: {chunk_size}")

        uploader = EnhancedNIRFUploader(batch_size=batch_size, chunk_size=chunk_size)
        uploader.upload_nirf_data()

        # Both follow-up steps run unless the user explicitly answers "n"
        wants_verification = input("\nVerify metadata consistency? (y/n, default=y): ").strip().lower() != 'n'
        if wants_verification:
            uploader.verify_metadata_consistency()

        wants_samples = input("\nShow sample queries for testing? (y/n, default=y): ").strip().lower() != 'n'
        if wants_samples:
            uploader.create_sample_queries()

        print("\n🎉 Process completed successfully!")
        print("💡 All subchunks now have uniform metadata for consistent querying")

    except KeyboardInterrupt:
        print("\n⚠ Process interrupted by user")
    except Exception as e:
        print(f"\n❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        gc.collect()
        print("🔄 Final cleanup completed")

📦 Installing packages... (No module named 'qdrant_client')
✅ qdrant-client installed
✅ sentence-transformers installed
✅ torch installed
✅ pandas installed
✅ numpy installed
🚀 Enhanced NIRF Data Uploader - Structured Version with Uniform Metadata
🔧 Clean Content | Reduced Redundancy | Normalized Tables | Uniform Metadata
📋 Core Fields: college_name, nirf_id, year, category applied to all subchunks

⚙ Configuration Options:
   1: Low - Fast processing, smaller chunks (batch=15, chunk=400)
   2: Balanced - Optimal for most cases (batch=20, chunk=512)
   3: High - Detailed processing, larger chunks (batch=25, chunk=600)

Select config (1/2/3, default=2): 3
✅ Selected: High - Detailed processing, larger chunks
   Batch size: 25
   Chunk size: 600
📁 Upload your NIRF chunks JSON file:


Saving Engineering 2025_chunks.json to Engineering 2025_chunks.json

📋 Please provide additional metadata:
Enter the year when this data was produced/collected (e.g., 2023, 2024): 2025
Enter the college/institution name (leave blank if in file): 
Enter institution rank (e.g., 1, 25, 100): 1
Enter location/region (e.g., North India, South India): South India
Enter city name: chennai
Enter state name: Tamil Nadu
Enter collection name (default: 'nirf_structured_uniform_20250721_0617'): 282005
🔗 Connecting to Qdrant...
✅ Connected to Qdrant
ℹ Collection '282005' exists
📖 Loading NIRF data...
📊 Processing 75 chunks...
🔧 Loading sentence transformer...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Encoder ready

🎯 Upload Summary:
📁 Collection: 282005
✅ Successfully uploaded: 279 structured embeddings
📊 Processing Stats:
   • Processed chunks: 279
   • Exact duplicates removed: 0
   • Similar duplicates removed: 4
   • Empty chunks skipped: 0
   • Encoding errors: 0
   • Processing errors: 0

📊 Collection Status:
   Total points: 406
   ✅ Structured content with uniform metadata across all subchunks!

📋 Metadata Fields Applied:
   • Core fields: college_name, nirf_id, year, category, institution, department, state, city, university_type, ranking_category, rank, location, chunk_type, file_production_year
   • Uniform application ensures consistent querying
   • All subchunks inherit parent metadata
🧹 Cleanup completed

Verify metadata consistency? (y/n, default=y): y

🔍 Metadata Verification (Sample of 100 points):
📊 Core Metadata Coverage:
   • state: 100/100 (100.0%)
   • city: 100/100 (100.0%)
   • year: 80/100 (80.0%)
   • category: 80/100 (80.0%)
   • rank: 80/100 (80.0%)
 