# Welcome to Colab!

## Explore the Gemini API
The Gemini API gives you access to Gemini models created by Google DeepMind. Gemini models are built from the ground up to be multimodal, so you can reason seamlessly across text, images, code, and audio.

**How to get started?**
*  Go to [Google AI Studio](https://aistudio.google.com/) and log in with your Google account.
*  [Create an API key](https://aistudio.google.com/app/apikey).
* Use a quickstart for [Python](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started.ipynb), or call the REST API using [curl](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/rest/Prompting_REST.ipynb).

**Discover Gemini's advanced capabilities**
*  Play with Gemini [multimodal outputs](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Image-out.ipynb), mixing text and images in an iterative way.
*  Discover the [multimodal Live API](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started_LiveAPI.ipynb) (demo [here](https://aistudio.google.com/live)).
*  Learn how to [analyze images and detect items in your pictures](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Spatial_understanding.ipynb) using Gemini (bonus, there's a [3D version](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/Spatial_understanding_3d.ipynb) as well!).
*  Unlock the power of the [Gemini thinking model](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/quickstarts/Get_started_thinking.ipynb), capable of solving complex tasks with its inner thoughts.
      
**Explore complex use cases**
*  Use [Gemini grounding capabilities](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/Search_grounding_for_research_report.ipynb) to create a report on a company based on what the model can find on the internet.
*  Extract [invoices and form data from PDF](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/Pdf_structured_outputs_on_invoices_and_forms.ipynb) in a structured way.
*  Create [illustrations based on a whole book](https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/Book_illustration.ipynb) using Gemini's large context window and Imagen.

To learn more, check out the [Gemini cookbook](https://github.com/google-gemini/cookbook) or visit the [Gemini API documentation](https://ai.google.dev/docs/).


In [None]:
import os
import re
import json
from typing import List, Dict, Any, Optional, Tuple
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, VectorParams, PointStruct, Filter,
    FieldCondition, MatchValue
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import uuid
from datetime import datetime
from collections import defaultdict, Counter
import logging

# Configure logging.
# basicConfig only installs a root handler when the root logger has none yet,
# so in an environment that pre-configures logging this call has no effect.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Attach a dedicated stream handler with a timestamped format exactly once;
# the guard keeps a second execution of this code from stacking handlers.
# NOTE(review): this logger still propagates to the root logger, so when the
# root logger also has a handler each record may be printed twice - confirm
# whether that is intended.
if not logger.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

class EnhancedFacultyDataCleaner:
    def __init__(self, qdrant_url: str, api_key: str, collection_name: str):
        """Create the Qdrant client, the text splitter and all lookup tables.

        Args:
            qdrant_url: URL handed to ``QdrantClient``.
            api_key: API key handed to ``QdrantClient``.
            collection_name: Collection that the scroll calls of this class read.
        """
        self.client = QdrantClient(url=qdrant_url, api_key=api_key)
        self.collection_name = collection_name
        # Splitter used for re-chunking long content: 500-character chunks
        # with a 50-character overlap, trying the coarsest separator first.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", ". ", ", ", " "]
        )

        # Enhanced patterns based on snapshot analysis.
        # 'basic_entry' and 'pipe_separated' capture the same nine columns
        # (consumed by _parse_standard_match as id, name, age, designation,
        # gender, qualification, experience, phd_status, joining_date),
        # separated by whitespace or by '|' respectively.
        # 'structured_with_name' captures four labelled fields
        # (name, designation, gender, qualification).
        self.faculty_patterns = {
            'basic_entry': r'(\d+)\s+([A-Z\s]+?)\s+(\d+)\s+(Assistant Professor|Associate Professor|Professor|Lecturer)\s+(Male|Female)\s+(Ph\.?\s?D\.?|M\.?\s?Tech|B\.?\s?Tech|M\.?\s?Sc|B\.?\s?Sc|LLM)\s+(\d+)\s+(Yes|No)\s+(\d{2}-\d{2}-\d{4})',
            'pipe_separated': r'(\d+)\s*\|\s*([A-Z\s]+?)\s*\|\s*(\d+)\s*\|\s*(Assistant Professor|Associate Professor|Professor|Lecturer)\s*\|\s*(Male|Female)\s*\|\s*(Ph\.?\s?D\.?|M\.?\s?Tech|B\.?\s?Tech|M\.?\s?Sc|B\.?\s?Sc|LLM)\s*\|\s*(\d+)\s*\|\s*(Yes|No)\s*\|\s*(\d{2}-\d{2}-\d{4})',
            'structured_with_name': r'Name:\s*([A-Za-z\s]+?)\s*\|\s*Designation:\s*([^|]+)\s*\|\s*Gender:\s*([^|]+)\s*\|\s*Qualification:\s*([^|]+)'
        }

        # Location standardization from snapshot: lower-case city name ->
        # canonical city / state / region.
        # NOTE(review): not referenced by the methods visible in this part of
        # the file - confirm where it is used.
        self.location_mappings = {
            'aligarh': {'city': 'Aligarh', 'state': 'Uttar Pradesh', 'region': 'North India'},
            'delhi': {'city': 'Delhi', 'state': 'Delhi', 'region': 'North India'},
            'mumbai': {'city': 'Mumbai', 'state': 'Maharashtra', 'region': 'West India'},
            'kolkata': {'city': 'Kolkata', 'state': 'West Bengal', 'region': 'East India'},
            'chennai': {'city': 'Chennai', 'state': 'Tamil Nadu', 'region': 'South India'},
            'bangalore': {'city': 'Bangalore', 'state': 'Karnataka', 'region': 'South India'},
            'hyderabad': {'city': 'Hyderabad', 'state': 'Telangana', 'region': 'South India'},
        }

        # Qualification standardization: lower-cased spelling -> canonical label
        # (looked up by _standardize_record_fields).
        self.qualification_mappings = {
            'ph d': 'Ph.D', 'phd': 'Ph.D', 'ph.d': 'Ph.D', 'ph.d.': 'Ph.D',
            'm tech': 'M.Tech', 'm.tech': 'M.Tech', 'mtech': 'M.Tech',
            'b tech': 'B.Tech', 'b.tech': 'B.Tech', 'btech': 'B.Tech',
            'm sc': 'M.Sc', 'm.sc': 'M.Sc', 'msc': 'M.Sc',
            'b sc': 'B.Sc', 'b.sc': 'B.Sc', 'bsc': 'B.Sc',
            'llm': 'LLM', 'l.l.m': 'LLM', 'l.l.m.': 'LLM'
        }

        # Designation standardization: lower-cased short form -> full title
        # (looked up by _standardize_record_fields).
        self.designation_mappings = {
            'assistant': 'Assistant Professor',
            'associate': 'Associate Professor',
            'professor': 'Professor',
            'lecturer': 'Lecturer'
        }

        # Statistics tracking: counters and per-value distributions that the
        # processing methods update as they run.
        self.processing_stats = {
            'total_processed': 0,
            'successfully_parsed': 0,
            'improved_points': 0,
            'deleted_points': 0,
            'faculty_records_found': 0,
            'designation_distribution': defaultdict(int),
            'qualification_distribution': defaultdict(int),
            'processing_errors': []
        }

    def fetch_faculty_data_optimized(self) -> List[Dict[str, Any]]:
        """Scroll the collection with a payload filter and collect faculty points.

        Pages of 100 points are requested until the client reports no further
        offset. Every returned point is additionally checked with
        ``_is_faculty_data_enhanced`` before it is kept.

        Returns:
            A list of dicts with the keys ``'id'``, ``'payload'`` and
            ``'vector'``. If anything raises, the error is logged and the
            result of ``_fetch_faculty_data_fallback`` is returned instead.
        """
        logger.info("Fetching faculty data with optimized filters...")

        try:
            # The three conditions sit in the filter's `should` clause, one
            # per payload field that may mark a point as faculty data.
            faculty_filter = Filter(
                should=[
                    FieldCondition(key=field_name, match=MatchValue(value=expected))
                    for field_name, expected in (
                        ("content_type", "faculty"),
                        ("chunk_type", "faculty"),
                        ("category", "Faculty"),
                    )
                ]
            )

            collected = []
            page_offset = None
            has_more = True
            while has_more:
                page, page_offset = self.client.scroll(
                    collection_name=self.collection_name,
                    scroll_filter=faculty_filter,
                    limit=100,
                    offset=page_offset,
                    with_payload=True,
                    with_vectors=True
                )
                collected.extend(
                    {'id': item.id, 'payload': item.payload, 'vector': item.vector}
                    for item in page
                    if self._is_faculty_data_enhanced(item.payload)
                )
                # A missing next offset marks the last page.
                has_more = page_offset is not None

            logger.info(f"Found {len(collected)} faculty data points")
            return collected

        except Exception as e:
            logger.error(f"Error fetching data: {e}")
            # Fallback to the unfiltered scan
            return self._fetch_faculty_data_fallback()

    def _fetch_faculty_data_fallback(self) -> List[Dict[str, Any]]:
        """Scan the whole collection without a filter and keep faculty points.

        The collection is scrolled in pages of 1000 points until the client
        returns no further offset, so collections larger than one page are
        covered completely. Each point is classified with
        ``_is_faculty_data_enhanced``.

        Returns:
            A list of dicts with the keys ``'id'``, ``'payload'`` and
            ``'vector'`` for every point classified as faculty data.
        """
        logger.info("Using fallback method for data fetching...")

        faculty_points = []
        offset = None

        while True:
            points, offset = self.client.scroll(
                collection_name=self.collection_name,
                limit=1000,
                offset=offset,
                with_payload=True,
                with_vectors=True
            )

            for point in points:
                if self._is_faculty_data_enhanced(point.payload):
                    faculty_points.append({
                        'id': point.id,
                        'payload': point.payload,
                        'vector': point.vector
                    })

            # The scroll call returns None as next offset after the last page.
            if offset is None:
                break

        return faculty_points

    def _is_faculty_data_enhanced(self, payload: Dict[str, Any]) -> bool:
        """Enhanced faculty data detection using snapshot insights"""
        content = payload.get('content', '').lower()
        title = payload.get('title', '').lower()
        content_type = payload.get('content_type', '').lower()
        category = payload.get('category', '').lower()
        chunk_type = payload.get('chunk_type', '').lower()

        # Primary indicators from snapshot
        if content_type == 'faculty' or category == 'faculty' or chunk_type == 'faculty':
            return True

        # Secondary indicators
        faculty_keywords = [
            'faculty', 'professor', 'assistant professor', 'associate professor',
            'lecturer', 'phd', 'ph.d', 'ph d', 'male', 'female', 'department',
            'designation', 'qualification', 'joining', 'experience'
        ]

        # Pattern-based detection
        pattern_indicators = [
            r'\d+\s+[A-Z\s]+\s+\d+\s+(Assistant Professor|Associate Professor|Professor)',
            r'Name:\s*[A-Za-z\s]+\s*\|\s*Designation:',
            r'(Male|Female)\s+(Ph\.?\s?D\.?|M\.?\s?Tech)',
            r'\d{2}-\d{2}-\d{4}'  # Date pattern
        ]

        keyword_matches = sum(1 for keyword in faculty_keywords if keyword in content or keyword in title)
        pattern_matches = sum(1 for pattern in pattern_indicators if re.search(pattern, content, re.IGNORECASE))

        return keyword_matches >= 3 or pattern_matches >= 2

    def parse_faculty_entry_enhanced(self, text: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Enhanced parsing with multiple pattern matching"""
        faculty_records = []

        # Try different parsing strategies
        for pattern_name, pattern in self.faculty_patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)

            if matches:
                logger.info(f"Using pattern '{pattern_name}' - found {len(matches)} matches")

                for match in matches:
                    if pattern_name == 'structured_with_name':
                        record = self._parse_structured_match(match, metadata)
                    else:
                        record = self._parse_standard_match(match, metadata)

                    if record and self._validate_faculty_record(record):
                        faculty_records.append(record)
                break

        # If no patterns matched, try alternative parsing
        if not faculty_records:
            faculty_records = self._parse_alternative_methods(text, metadata)

        # Clean and enhance records
        faculty_records = self._clean_and_enhance_records(faculty_records, metadata)

        self.processing_stats['faculty_records_found'] += len(faculty_records)
        return faculty_records

    def _parse_standard_match(self, match: Tuple, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Parse standard faculty match pattern"""
        try:
            if len(match) >= 9:
                record = {
                    'id': match[0].strip(),
                    'name': self._clean_name(match[1]),
                    'age': match[2].strip(),
                    'designation': match[3].strip(),
                    'gender': match[4].strip(),
                    'qualification': match[5].strip(),
                    'experience': match[6].strip(),
                    'phd_status': match[7].strip(),
                    'joining_date': match[8].strip()
                }

                # Add metadata
                record.update(self._extract_metadata_info(metadata))
                return record
        except Exception as e:
            logger.error(f"Error parsing standard match: {e}")

        return {}

    def _parse_structured_match(self, match: Tuple, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Parse structured faculty match pattern"""
        try:
            record = {
                'name': self._clean_name(match[0]),
                'designation': match[1].strip(),
                'gender': match[2].strip(),
                'qualification': match[3].strip()
            }

            # Add metadata
            record.update(self._extract_metadata_info(metadata))
            return record
        except Exception as e:
            logger.error(f"Error parsing structured match: {e}")

        return {}

    def _parse_alternative_methods(self, text: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Alternative parsing methods for edge cases"""
        faculty_records = []

        # Method 1: Line-by-line parsing
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        for line in lines:
            if self._contains_faculty_indicators(line):
                record = self._extract_faculty_from_line(line, metadata)
                if record:
                    faculty_records.append(record)

        # Method 2: Chunk-based parsing for dense text
        if not faculty_records and len(text) > 500:
            chunks = self._split_dense_text(text)
            for chunk in chunks:
                record = self._extract_faculty_from_chunk(chunk, metadata)
                if record:
                    faculty_records.append(record)

        return faculty_records

    def _clean_and_enhance_records(self, records: List[Dict[str, Any]], metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Clean and enhance faculty records with standardization"""
        cleaned_records = []

        for record in records:
            # Skip invalid records
            if not self._validate_faculty_record(record):
                continue

            # Standardize fields
            record = self._standardize_record_fields(record)

            # Add computed fields
            record = self._add_computed_fields(record, metadata)

            # Update statistics
            self._update_processing_stats(record)

            cleaned_records.append(record)

        return cleaned_records

    def _standardize_record_fields(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Standardize record fields using mappings"""
        # Standardize qualification
        if 'qualification' in record:
            qual_lower = record['qualification'].lower().strip()
            record['qualification'] = self.qualification_mappings.get(qual_lower, record['qualification'])

        # Standardize designation
        if 'designation' in record:
            des_lower = record['designation'].lower().strip()
            record['designation'] = self.designation_mappings.get(des_lower, record['designation'])

        # Standardize gender
        if 'gender' in record:
            record['gender'] = record['gender'].capitalize()

        # Clean and validate experience
        if 'experience' in record:
            record['experience'] = self._calculate_realistic_experience(record)

        return record

    def _add_computed_fields(self, record: Dict[str, Any], metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Add computed fields based on metadata and analysis"""
        # Add career stage
        record['career_stage'] = self._determine_career_stage(record)

        # Add seniority score
        record['seniority_score'] = self._calculate_seniority_score(record)

        # Add location info from metadata
        if 'city' in metadata:
            record['city'] = metadata['city']
        if 'state' in metadata:
            record['state'] = metadata['state']
        if 'location' in metadata:
            record['region'] = metadata['location']

        # Add academic year info
        if 'year' in metadata:
            record['data_year'] = metadata['year']

        return record

    def _determine_career_stage(self, record: Dict[str, Any]) -> str:
        """Determine career stage based on designation and experience"""
        designation = record.get('designation', '').lower()
        experience = record.get('experience', '0')

        try:
            exp_years = int(experience) if experience.isdigit() else 0
        except:
            exp_years = 0

        if 'assistant' in designation:
            return 'Early Career' if exp_years < 5 else 'Mid Career'
        elif 'associate' in designation:
            return 'Mid Career' if exp_years < 15 else 'Senior Career'
        elif 'professor' in designation:
            return 'Senior Career' if exp_years < 20 else 'Distinguished Career'
        else:
            return 'Unknown'

    def _calculate_seniority_score(self, record: Dict[str, Any]) -> int:
        """Calculate seniority score for ranking"""
        score = 0

        # Designation points
        designation_points = {
            'Assistant Professor': 1,
            'Associate Professor': 2,
            'Professor': 3,
            'Lecturer': 0
        }
        score += designation_points.get(record.get('designation', ''), 0) * 30

        # Experience points
        try:
            experience = int(record.get('experience', '0')) if record.get('experience', '0').isdigit() else 0
            score += min(experience, 40)  # Cap at 40 years
        except:
            pass

        # Qualification points
        qualification_points = {
            'Ph.D': 20,
            'M.Tech': 15,
            'M.Sc': 10,
            'B.Tech': 5,
            'B.Sc': 2,
            'LLM': 15
        }
        score += qualification_points.get(record.get('qualification', ''), 0)

        return score

    def _calculate_realistic_experience(self, record: Dict[str, Any]) -> str:
        """Calculate realistic experience based on joining date"""
        experience = record.get('experience', '0')
        joining_date = record.get('joining_date', '')

        if not joining_date:
            return experience

        try:
            # Parse joining date
            day, month, year = joining_date.split('-')
            join_year = int(year)
            current_year = datetime.now().year

            # Calculate years from joining
            calculated_exp = current_year - join_year

            # Validate and adjust
            if calculated_exp < 0:
                return '0'
            elif calculated_exp > 50:
                return '50+'  # Cap unrealistic values
            else:
                return str(calculated_exp)

        except Exception:
            return experience

    def _validate_faculty_record(self, record: Dict[str, Any]) -> bool:
        """Validate faculty record for completeness and accuracy"""
        # Must have name and designation
        if not record.get('name') or not record.get('designation'):
            return False

        # Name validation
        name = record.get('name', '').strip()
        if len(name) < 2 or name.lower() in ['assistant', 'associate', 'professor', 'lecturer']:
            return False

        # Designation validation
        designation = record.get('designation', '').strip()
        valid_designations = ['Assistant Professor', 'Associate Professor', 'Professor', 'Lecturer']
        if designation not in valid_designations:
            return False

        return True

    def _extract_metadata_info(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Extract relevant metadata information"""
        extracted = {}

        # Location info
        if 'city' in metadata:
            extracted['city'] = metadata['city']
        if 'state' in metadata:
            extracted['state'] = metadata['state']
        if 'location' in metadata:
            extracted['region'] = metadata['location']

        # Academic info
        if 'year' in metadata:
            extracted['data_year'] = metadata['year']
        if 'file_production_year' in metadata:
            extracted['file_year'] = metadata['file_production_year']

        return extracted

    def _clean_name(self, name: str) -> str:
        """Clean and standardize faculty name"""
        # Remove extra whitespace and capitalize properly
        name = ' '.join(name.split())

        # Handle special cases
        name = re.sub(r'\s+', ' ', name)  # Multiple spaces to single space
        name = name.title()  # Proper capitalization

        return name.strip()

    def _contains_faculty_indicators(self, line: str) -> bool:
        """Check if line contains faculty indicators"""
        indicators = [
            'professor', 'lecturer', 'male', 'female', 'phd', 'ph.d',
            'assistant', 'associate', 'faculty', 'department'
        ]

        line_lower = line.lower()
        return any(indicator in line_lower for indicator in indicators)

    def _extract_faculty_from_line(self, line: str, metadata: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract faculty information from a single line"""
        # Implementation for line-based extraction
        # This is a simplified version - can be enhanced based on specific patterns

        record = {}

        # Extract basic information using regex
        name_match = re.search(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', line)
        if name_match:
            record['name'] = name_match.group(1)

        designation_match = re.search(r'(Assistant Professor|Associate Professor|Professor|Lecturer)', line, re.IGNORECASE)
        if designation_match:
            record['designation'] = designation_match.group(1)

        gender_match = re.search(r'(Male|Female)', line, re.IGNORECASE)
        if gender_match:
            record['gender'] = gender_match.group(1)

        if len(record) >= 2:  # At least name and designation
            record.update(self._extract_metadata_info(metadata))
            return record

        return None

    def _split_dense_text(self, text: str) -> List[str]:
        """Split dense text into manageable chunks"""
        # Split by common delimiters
        chunks = re.split(r'(?:\n|;|\|{2,})', text)
        return [chunk.strip() for chunk in chunks if len(chunk.strip()) > 20]

    def _extract_faculty_from_chunk(self, chunk: str, metadata: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract faculty information from a text chunk"""
        # Similar to line extraction but for larger chunks
        if self._contains_faculty_indicators(chunk):
            return self._extract_faculty_from_line(chunk, metadata)
        return None

    def _update_processing_stats(self, record: Dict[str, Any]):
        """Update processing statistics"""
        if 'designation' in record:
            self.processing_stats['designation_distribution'][record['designation']] += 1
        if 'qualification' in record:
            self.processing_stats['qualification_distribution'][record['qualification']] += 1

    def create_enhanced_content(self, faculty_records: List[Dict[str, Any]]) -> str:
        """Create enhanced structured content with better formatting"""
        if not faculty_records:
            return ""

        # Sort by seniority score for better organization
        sorted_records = sorted(faculty_records, key=lambda x: x.get('seniority_score', 0), reverse=True)

        if len(sorted_records) == 1:
            return self._format_single_faculty(sorted_records[0])
        else:
            return self._format_multiple_faculty(sorted_records)

    def _format_single_faculty(self, record: Dict[str, Any]) -> str:
        """Format single faculty member with comprehensive details"""
        parts = []

        # Basic information
        if record.get('name'):
            parts.append(f"Faculty Name: {record['name']}")
        if record.get('designation'):
            parts.append(f"Position: {record['designation']}")
        if record.get('gender'):
            parts.append(f"Gender: {record['gender']}")
        if record.get('qualification'):
            parts.append(f"Highest Qualification: {record['qualification']}")

        # Experience and career info
        if record.get('experience'):
            exp_text = f"{record['experience']} years" if record['experience'].isdigit() else record['experience']
            parts.append(f"Teaching Experience: {exp_text}")
        if record.get('career_stage'):
            parts.append(f"Career Stage: {record['career_stage']}")

        # Administrative info
        if record.get('phd_status'):
            parts.append(f"PhD Status: {record['phd_status']}")
        if record.get('joining_date'):
            parts.append(f"Date of Joining: {record['joining_date']}")

        # Location info
        location_parts = []
        if record.get('city'):
            location_parts.append(record['city'])
        if record.get('state'):
            location_parts.append(record['state'])
        if location_parts:
            parts.append(f"Location: {', '.join(location_parts)}")

        return " | ".join(parts)

    def _format_multiple_faculty(self, records: List[Dict[str, Any]]) -> str:
        """Format multiple faculty members with organized structure"""
        content_parts = []

        for i, record in enumerate(records, 1):
            faculty_info = []

            # Essential information
            if record.get('name'):
                faculty_info.append(f"Name: {record['name']}")
            if record.get('designation'):
                faculty_info.append(f"Position: {record['designation']}")
            if record.get('qualification'):
                faculty_info.append(f"Qualification: {record['qualification']}")

            # Experience and career
            if record.get('experience'):
                exp_text = record['experience']
                if exp_text.isdigit():
                    exp_text = f"{exp_text} years"
                faculty_info.append(f"Experience: {exp_text}")
            if record.get('career_stage'):
                faculty_info.append(f"Career Stage: {record['career_stage']}")

            # Administrative info
            if record.get('joining_date'):
                faculty_info.append(f"Joined: {record['joining_date']}")

            if faculty_info:
                content_parts.append(f"Faculty {i}: {' | '.join(faculty_info)}")

        return "\n\n".join(content_parts)

    def create_intelligent_title(self, faculty_records: List[Dict[str, Any]], metadata: Dict[str, Any]) -> str:
        """Build a descriptive title for a point from its faculty records.

        * No records: the metadata's ``title`` (default 'Faculty Information').
        * One record: ``Faculty Profile: <name>, <designation>`` plus the
          career stage in parentheses when set.
        * Several records with one designation:
          ``Faculty Directory: <n> <designation>s``.
        * Several designations: the two most common with their counts,
          followed by ``| +<k> more`` when further designations exist.

        For non-empty records a location suffix `` - <city>, <state>`` (or
        `` - <city>``) is appended when the metadata provides a city.

        Args:
            faculty_records: Faculty record dicts.
            metadata: Payload metadata (``title``, ``city``, ``state``).

        Returns:
            The title string.
        """
        if not faculty_records:
            return metadata.get('title', 'Faculty Information')

        # Location suffix; a state without a city is not shown.
        city = metadata.get('city')
        state = metadata.get('state')
        if city and state:
            location_context = f" - {city}, {state}"
        elif city:
            location_context = f" - {city}"
        else:
            location_context = ""

        if len(faculty_records) == 1:
            record = faculty_records[0]
            name = record.get('name', 'Faculty Member')
            designation = record.get('designation', 'Faculty')
            career_stage = record.get('career_stage', '')

            title = f"Faculty Profile: {name}, {designation}"
            if career_stage:
                title += f" ({career_stage})"
            return title + location_context

        # Two or more records from here on.
        total = len(faculty_records)
        designation_counts = Counter(record.get('designation', 'Unknown') for record in faculty_records)

        if len(designation_counts) == 1:
            designation = next(iter(designation_counts))
            # total >= 2 here, so the plural form always applies.
            title = f"Faculty Directory: {total} {designation}s"
        else:
            # At least two distinct designations: show the two most common.
            desc_parts = [
                f"{count} {designation}{'s' if count > 1 else ''}"
                for designation, count in designation_counts.most_common(2)
            ]
            title = f"Faculty Directory: {' | '.join(desc_parts)}"

            if len(designation_counts) > 2:
                title += f" | +{len(designation_counts) - 2} more"

        return title + location_context

    def process_and_update_collection_enhanced(self):
        """Enhanced main processing method with better error handling and statistics"""
        logger.info("Starting enhanced faculty data cleaning process...")

        # Fetch faculty data with optimization
        faculty_points = self.fetch_faculty_data_optimized()

        if not faculty_points:
            logger.warning("No faculty data found to process")
            return

        updated_points = []
        points_to_delete = []

        for point_data in faculty_points:

            self.processing_stats['total_processed'] +=1
            try:
                original_id = point_data['id']
                payload = point_data['payload']
                vector = point_data.get('vector',point_data['vector'])

                content = payload.get('content', '')
                title = payload.get('title', '')

                logger.info(f"Processing point {original_id}")
                logger.debug(f"Original title: {title}")
                logger.debug(f"Content preview: {content[:100]}...")

                # Enhanced parsing
                faculty_records = self.parse_faculty_entry_enhanced(content, payload)

                if faculty_records:
                    logger.info(f"Found {len(faculty_records)} valid faculty records")

                    # Show the enhanced faculty data
                    for record in faculty_records:
                        logger.info(f"  - {record.get('name', 'Unknown')} ({record.get('designation', 'Unknown')}) - {record.get('career_stage', 'Unknown Stage')}")

                    # Create enhanced content
                    structured_content = self.create_enhanced_content(faculty_records)

                    # Create intelligent title
                    new_title = self.create_intelligent_title(faculty_records, payload)

                    # Handle content chunking intelligently
                    if len(structured_content) > 1000:
                        documents = [Document(page_content=structured_content)]
                        chunks = self.text_splitter.split_documents(documents)

                        for i, chunk in enumerate(chunks):
                            new_point = PointStruct(
                                id=str(uuid.uuid4()),
                                payload={
                                    **payload,
                                    'content': chunk.page_content,
                                    'title': f"{new_title} - Part {i+1}" if len(chunks) > 1 else new_title,
                                    'chunk_index': i,
                                    'total_chunks': len(chunks),

                                    'content_type': 'faculty',
                                    'chunk_type': 'faculty',
                                    'category': 'Faculty',
                                    'faculty_count': len(faculty_records),
                                    'processing_timestamp': datetime.now().isoformat(),
                                    'enhanced_structure': True,
                                    'faculty_records': faculty_records,
                                    'quality_score': self._calculate_quality_score(faculty_records),
                                    'data_source': 'enhanced_processing'
                                },
                                vector=vector
                            )
                            updated_points.append(new_point)
                    else:
                        # Single chunk
                        new_point = PointStruct(
                            id=str(uuid.uuid4()),
                            payload={
                                **payload,
                                'content': structured_content,
                                'title': new_title,
                                'content_type': 'faculty',
                                'chunk_type': 'faculty',
                                'category': 'Faculty',
                                'faculty_count': len(faculty_records),
                                'processing_timestamp': datetime.now().isoformat(),
                                'enhanced_structure': True,
                                'faculty_records': faculty_records,
                                'quality_score': self._calculate_quality_score(faculty_records),
                                'data_source': 'enhanced_processing'
                            },
                            vector=vector
                        )
                        updated_points.append(new_point)

                    # Mark original for deletion
                    points_to_delete.append(original_id)
                    self.processing_stats['improved_points'] += 1

                    logger.info(f"✓ Enhanced point {original_id} -> {len(faculty_records)} records")

                else:
                    logger.warning(f"No valid faculty records found for point {original_id}")
                    # Keep original if no valid records found
                    continue

                self.processing_stats['successfully_parsed'] += 1

            except Exception as e:
                logger.error(f"Error processing point {point_data.get('id', 'unknown')}: {e}")
                self.processing_stats['processing_errors'].append(str(e))
                continue

        # Execute updates and deletions
        self._execute_collection_updates(updated_points, points_to_delete)

        # Generate and log final report
        self._generate_processing_report()

        logger.info("Enhanced faculty data cleaning process completed!")

    def _calculate_quality_score(self, faculty_records: List[Dict[str, Any]]) -> float:
        """Return the mean completeness percentage of the given faculty records.

        A record earns points for every field that holds a truthy value:
        20 each for ``name``, ``designation`` and ``qualification`` and
        10 each for ``experience``, ``gender``, ``joining_date`` and
        ``career_stage`` (100 points attainable in total).

        Args:
            faculty_records: Parsed faculty records (dicts) to score.

        Returns:
            The average of the per-record percentages, rounded to two
            decimal places; ``0.0`` when ``faculty_records`` is empty.
        """
        if not faculty_records:
            return 0.0

        # (field, points) pairs: the three essential fields first, then the
        # four additional ones.
        field_weights = (
            ('name', 20),
            ('designation', 20),
            ('qualification', 20),
            ('experience', 10),
            ('gender', 10),
            ('joining_date', 10),
            ('career_stage', 10),
        )
        attainable = sum(points for _, points in field_weights)

        percentage_sum = 0
        for entry in faculty_records:
            earned = sum(points for field, points in field_weights if entry.get(field))
            if attainable > 0:
                percentage_sum += (earned / attainable) * 100

        return round(percentage_sum / len(faculty_records), 2)

    def _execute_collection_updates(self, updated_points: List[PointStruct], points_to_delete: List[str]):
        """Upsert the enhanced points, then delete the original points.

        Args:
            updated_points: Points to upsert into ``self.collection_name``.
            points_to_delete: IDs of the points to delete from the same
                collection once the upsert has gone through.

        Raises:
            Exception: Whatever the client raises is logged and re-raised.
        """
        if not (updated_points or points_to_delete):
            logger.info("No updates to perform")
            return

        try:
            # The upsert runs first; if it raises, the delete below is never
            # attempted.
            if updated_points:
                logger.info(f"Upserting {len(updated_points)} enhanced points...")
                self.client.upsert(
                    collection_name=self.collection_name,
                    points=updated_points,
                )
                logger.info("✓ Successfully upserted enhanced points")

            if points_to_delete:
                logger.info(f"Deleting {len(points_to_delete)} original points...")
                self.client.delete(
                    collection_name=self.collection_name,
                    points_selector=points_to_delete,
                )
                # Recorded only after the delete call returned without raising.
                self.processing_stats['deleted_points'] = len(points_to_delete)
                logger.info("✓ Successfully deleted original points")

        except Exception as exc:
            logger.error(f"Error executing collection updates: {exc}")
            raise

    def _generate_processing_report(self):
        """Log a summary of ``self.processing_stats``.

        The report lists the counters, the success rate (parsed / processed,
        as a percentage), the designation and qualification distributions
        (most common first) and up to the first five processing errors.
        """
        report = self.processing_stats
        rule = "="*60

        logger.info("\n" + rule)
        logger.info("ENHANCED FACULTY DATA PROCESSING REPORT")
        logger.info(rule)

        # Counters
        logger.info(f"Total Points Processed: {report['total_processed']}")
        logger.info(f"Successfully Parsed: {report['successfully_parsed']}")
        logger.info(f"Points Enhanced: {report['improved_points']}")
        logger.info(f"Original Points Deleted: {report['deleted_points']}")
        logger.info(f"Faculty Records Found: {report['faculty_records_found']}")

        # max(..., 1) keeps the division defined when nothing was processed.
        processed = max(report['total_processed'], 1)
        success_rate = (report['successfully_parsed'] / processed) * 100
        logger.info(f"Success Rate: {success_rate:.2f}%")

        # Distributions, each printed only when it has entries.
        sections = (
            ("\nDesignation Distribution:", 'designation_distribution'),
            ("\nQualification Distribution:", 'qualification_distribution'),
        )
        for heading, key in sections:
            distribution = report[key]
            if not distribution:
                continue
            logger.info(heading)
            for value, occurrences in Counter(distribution).most_common():
                logger.info(f"  {value}: {occurrences}")

        # Errors: show at most five, then say how many were left out.
        failures = report['processing_errors']
        if failures:
            logger.info(f"\nProcessing Errors ({len(failures)}):")
            for message in failures[:5]:
                logger.info(f"  - {message}")
            omitted = len(failures) - 5
            if omitted > 0:
                logger.info(f"  ... and {omitted} more")

        logger.info(rule)

    def verify_enhancements(self) -> Dict[str, Any]:
        """Verify the quality of enhancements made.

        Fetches the faculty points via ``fetch_faculty_data_optimized`` and
        summarises every point whose payload has a truthy
        ``enhanced_structure`` flag. Points without the flag are ignored
        entirely: they are counted neither in ``total_enhanced_points`` nor in
        the divisor of ``average_quality_score``.

        Returns:
            A dict with the keys ``total_enhanced_points``,
            ``average_quality_score`` (rounded to two decimals, ``0.0`` when
            no enhanced point was found), ``faculty_records_total``,
            ``complete_profiles``, ``partial_profiles``,
            ``career_stage_distribution`` and ``location_coverage`` (both
            ``defaultdict(int)``) and ``verification_errors`` (one message per
            point whose inspection raised).
        """
        logger.info("Verifying enhancements...")

        # Fetch the faculty data; not every returned point need be enhanced.
        fetched_points = self.fetch_faculty_data_optimized()

        verification_stats = {
            'total_enhanced_points': 0,
            'average_quality_score': 0.0,
            'faculty_records_total': 0,
            'complete_profiles': 0,
            'partial_profiles': 0,
            'career_stage_distribution': defaultdict(int),
            'location_coverage': defaultdict(int),
            'verification_errors': []
        }

        # A profile is "complete" when all of these hold a truthy value.
        essential_fields = ('name', 'designation', 'qualification')

        total_quality_score = 0
        enhanced_count = 0

        for point_data in fetched_points:
            try:
                payload = point_data['payload']

                # Only points produced by the enhanced pipeline are verified.
                if not payload.get('enhanced_structure'):
                    continue
                enhanced_count += 1

                total_quality_score += payload.get('quality_score', 0.0)

                faculty_records = payload.get('faculty_records', [])
                verification_stats['faculty_records_total'] += len(faculty_records)

                for record in faculty_records:
                    if all(record.get(field) for field in essential_fields):
                        verification_stats['complete_profiles'] += 1
                    else:
                        verification_stats['partial_profiles'] += 1

                    career_stage = record.get('career_stage', 'Unknown')
                    verification_stats['career_stage_distribution'][career_stage] += 1

                    location = record.get('city', 'Unknown Location')
                    verification_stats['location_coverage'][location] += 1

            except Exception as e:
                # Keep going: one malformed point must not abort verification.
                verification_stats['verification_errors'].append(str(e))

        verification_stats['total_enhanced_points'] = enhanced_count

        # Average over the enhanced points only. Dividing by the number of
        # fetched points would drag the average down for every skipped point.
        if enhanced_count:
            verification_stats['average_quality_score'] = round(
                total_quality_score / enhanced_count, 2
            )

        # Log verification results
        logger.info("\n" + "="*50)
        logger.info("ENHANCEMENT VERIFICATION REPORT")
        logger.info("="*50)
        logger.info(f"Total Enhanced Points: {verification_stats['total_enhanced_points']}")
        logger.info(f"Average Quality Score: {verification_stats['average_quality_score']}")
        logger.info(f"Total Faculty Records: {verification_stats['faculty_records_total']}")
        logger.info(f"Complete Profiles: {verification_stats['complete_profiles']}")
        logger.info(f"Partial Profiles: {verification_stats['partial_profiles']}")

        if verification_stats['career_stage_distribution']:
            logger.info("\nCareer Stage Distribution:")
            for stage, count in Counter(verification_stats['career_stage_distribution']).most_common():
                logger.info(f"  {stage}: {count}")

        if verification_stats['location_coverage']:
            logger.info("\nLocation Coverage:")
            for location, count in Counter(verification_stats['location_coverage']).most_common():
                logger.info(f"  {location}: {count}")

        logger.info("="*50)

        return verification_stats

    def export_enhanced_data(self, output_file: str = "enhanced_faculty_data.json"):
        """Write the enhanced faculty points to a UTF-8 JSON file.

        Only points whose payload has a truthy ``enhanced_structure`` flag are
        exported. ``metadata.total_points`` holds the number of fetched
        points, whether or not they were exported.

        Args:
            output_file: Path of the JSON file to create or overwrite.

        Returns:
            ``output_file``, unchanged.
        """
        logger.info(f"Exporting enhanced faculty data to {output_file}...")

        fetched = self.fetch_faculty_data_optimized()

        export_data = {
            'metadata': {
                'export_timestamp': datetime.now().isoformat(),
                'total_points': len(fetched),
                'export_version': '1.0'
            },
            'faculty_data': []
        }
        entries = export_data['faculty_data']

        for point in fetched:
            info = point['payload']
            if not info.get('enhanced_structure'):
                continue

            entries.append({
                'id': point['id'],
                'title': info.get('title', ''),
                'content': info.get('content', ''),
                'faculty_records': info.get('faculty_records', []),
                'quality_score': info.get('quality_score', 0.0),
                'processing_timestamp': info.get('processing_timestamp', ''),
                # Location fields are read from the point payload.
                'location': {
                    part: info.get(part, '') for part in ('city', 'state', 'region')
                }
            })

        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        with open(output_file, 'w', encoding='utf-8') as handle:
            json.dump(export_data, handle, indent=2, ensure_ascii=False)

        logger.info(f"✓ Successfully exported {len(entries)} enhanced faculty records")
        return output_file


# Usage example and main execution
if __name__ == "__main__":
    import os

    # Configuration.
    # SECURITY: the API key must never be committed to source. It is read from
    # the QDRANT_API_KEY environment variable; any key that was previously
    # hard-coded in this file should be treated as leaked and rotated.
    QDRANT_URL = os.environ.get(
        "QDRANT_URL",
        "https://b5651607-31ce-49ba-916d-c35c89d731d2.us-east4-0.gcp.cloud.qdrant.io",
    )
    API_KEY = os.environ.get("QDRANT_API_KEY")
    COLLECTION_NAME = os.environ.get("QDRANT_COLLECTION", '282005')

    if not API_KEY:
        raise SystemExit(
            "QDRANT_API_KEY is not set; export it before running this script."
        )

    # Initialize enhanced cleaner
    cleaner = EnhancedFacultyDataCleaner(
        qdrant_url=QDRANT_URL,
        api_key=API_KEY,
        collection_name=COLLECTION_NAME
    )

    try:
        # Execute enhanced processing
        cleaner.process_and_update_collection_enhanced()

        # Verify enhancements
        verification_results = cleaner.verify_enhancements()

        # Export enhanced data
        export_file = cleaner.export_enhanced_data("enhanced_faculty_data.json")

        logger.info("All processing completed successfully!")
        logger.info(f"Enhanced data exported to: {export_file}")

    except Exception as e:
        # Log for the run record, then re-raise so the failure is not hidden.
        logger.error(f"Processing failed: {e}")
        raise

Streaming output truncated to the last 5000 lines.
2025-07-21 06:32:04,632 - __main__ - INFO -   - Nadeem Akhtar (Assistant Professor) - Mid Career
INFO:__main__:  - Nadeem Akhtar (Assistant Professor) - Mid Career
2025-07-21 06:32:04,633 - __main__ - INFO -   - Syed Fahad Anwar (Professor) - Senior Career
INFO:__main__:  - Syed Fahad Anwar (Professor) - Senior Career
2025-07-21 06:32:04,634 - __main__ - INFO - ✓ Enhanced point 11ca2279-ed34-4acf-800c-023e3834ce53 -> 2 records
INFO:__main__:✓ Enhanced point 11ca2279-ed34-4acf-800c-023e3834ce53 -> 2 records
2025-07-21 06:32:04,635 - __main__ - INFO - Processing point 1266ba28-66da-450f-b22b-5d2bfa558ec2
INFO:__main__:Processing point 1266ba28-66da-450f-b22b-5d2bfa558ec2
2025-07-21 06:32:04,636 - __main__ - INFO - Using pattern 'pipe_separated' - found 2 matches
INFO:__main__:Using pattern 'pipe_separated' - found 2 matches
2025-07-21 06:32:04,637 - __main__ - INFO - Found 2 valid faculty records
INFO:__main__:Found 2 valid