In [1]:
# %%
# Cell 1: Imports and Setup
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
import openai
from difflib import SequenceMatcher
import itertools

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


True

In [3]:
# %%
# Cell 2: OpenAI Setup and Utility (Updated for gpt-5-nano)
class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def num_tokens_from_string(string: str, model_name: str) -> int:
    """Get token count with fallback for unsupported models like gpt-5-nano"""
    try:
        encoding = tiktoken.encoding_for_model(model_name)
        return len(encoding.encode(string))
    except KeyError:
        # Fallback for unsupported models like gpt-5-nano
        if model_name.startswith('gpt-5-nano'):
            # Use o200k_base encoding as fallback for gpt-5-nano
            encoding = tiktoken.get_encoding("o200k_base")
            return len(encoding.encode(string))
        else:
            # For other unsupported models, use a reasonable approximation
            return len(string) // 4  # Rough approximation: 4 chars per token

client, model_type = initialize_openai()
credit_tracker = CreditTracker()


In [4]:
# %%
# Cell 3: Data Preprocessing Utilities

def extract_keywords_from_filename(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    return [part for i, part in enumerate(parts) if i > 2 and part != 'results' and not part.isdigit()]

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite',
        'fig', 'figure', 'figures', 'table', 'tables', 'chart', 'charts',
        'published', 'journal', 'conference', 'proceedings', 'vol', 'volume', 'pp', 'page', 'pages', 'doi'}
    return stop_words.union(scientific_terms)

def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=True):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    
    if remove_numbers:
        # Define patterns to preserve (contingency analysis terms)
        preserve_patterns = [
            r'\bn-\d+\b',           # N-1, N-2, N-3, etc.
            r'\b\d+-\d+\b',         # patterns like 1-2, 2-3 (if needed)
        ]
        
        # Store protected terms with temporary replacements
        protected_terms = {}
        temp_counter = 0
        
        for pattern in preserve_patterns:
            matches = re.findall(pattern, text)
            for match in set(matches):  # Remove duplicates
                temp_placeholder = f"__PROTECTED_{temp_counter}__"
                protected_terms[temp_placeholder] = match
                text = text.replace(match, temp_placeholder)
                temp_counter += 1
        
        # Now remove all remaining numbers
        text = re.sub(r'\d+', '', text)
        
        # Restore protected terms
        for placeholder, original_term in protected_terms.items():
            text = text.replace(placeholder, original_term)
    
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [t for t in tokens if len(t) >= min_word_length and t not in stop_words and len(t) > 1 and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except:
        pass
    return ' '.join(tokens)


def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    return df[df[processed_col].str.strip() != '']

def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
        'Medicine','Business','Environmental Science','Chemistry','Materials Science',
        'Geography','Biology','Geology','Political Science','Psychology','Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        return [f if f in valid_fields else "Unknown" for f in fields] or ["Unknown"]
    return ["Unknown"]


In [5]:
# %%
# Cell 4: Data Loading & Cleaning

#filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filename = "semantic_scholar_2025_09_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)
logger.info(f"Loaded and preprocessed {len(df)} papers")


2025-09-30 10:01:30,620 - INFO - Loaded and preprocessed 30917 papers


In [None]:
# %%
# %%
# Cell 5: Enhanced Method Detection Functions (COMPLETE CORRECTED VERSION)

def extract_candidate_terms(df, text_col='processed_text', max_features=20000):
    """Extract candidate terms from processed text using CountVectorizer."""
    vectorizer = CountVectorizer(
        ngram_range=(1, 4), max_df=0.95, min_df=2, max_features=max_features, token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    return [term for term, freq in sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)]

def parse_llm_python_list(output_text):
    """Improved parsing function for LLM outputs"""
    import re
    import ast
    
    content = output_text.strip()
    content = re.sub(r'```(?:python|json)?\n?', '', content)# Remove code block markers
    content = re.sub(r'```', '', content)# Remove closing code block markers
    
    list_patterns = [
        r'\[([^\]]+)\]',  # Standard list format
        r'List:\s*\[([^\]]+)\]',  # List: [items]
        r'Result:\s*\[([^\]]+)\]'  # Result: [items]
    ]
    
    for pattern in list_patterns:
        match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
        if match:
            try:
                return ast.literal_eval('[' + match.group(1) + ']')
            except:
                items = [item.strip().strip("'\"") for item in match.group(1).split(',')]
                return [item for item in items if item.strip()]
    
    lines = content.split('\n')
    items = []
    for line in lines:
        line = line.strip()
        if line and not line.startswith('#') and not line.startswith('//'): # Ignore comments
            line = re.sub(r'^\d+\.?\s*[-*]?\s*', '', line) # Remove leading numbers/bullets
            line = line.strip("'\"") # Remove surrounding quotes
            if line:
                items.append(line)
    
    return items

def get_method_phrases_enhanced(
    corpus_terms, client, model_type, credit_tracker, prompt,
    n_runs=3, temp=None, top_p=None, show_progress=True, batch_size=500
):
    """Enhanced method extraction with configurable prompt."""
    import collections
    from math import ceil

    all_phrases_sets = []
    n_batches = ceil(len(corpus_terms) / batch_size)
    
    for batch_idx in range(n_batches):
        batch_terms = corpus_terms[batch_idx * batch_size : (batch_idx + 1) * batch_size]

        # Format the prompt with current batch terms
        formatted_prompt = prompt.format(candidate_terms=batch_terms)

        for i in range(n_runs):
            try:
                api_params = {
                    "model": model_type,
                    "messages": [
                        {"role": "system", "content": "You are a comprehensive research method extraction expert. Your primary goal is maximum coverage of specific technical methods."},
                        {"role": "user", "content": formatted_prompt}
                    ],
                }
                if model_type.startswith('gpt-5-nano'):
                    api_params["max_completion_tokens"] = 8000
                else:
                    if temp is not None: api_params["temperature"] = temp
                    if top_p is not None: api_params["top_p"] = top_p
                    api_params["max_tokens"] = 8000

                response = client.chat.completions.create(**api_params)
                content = response.choices[0].message.content # Extract content
                phrases = parse_llm_python_list(content) # Extract list of phrases
                phrases = [p.lower().strip() for p in phrases if p.strip() and len(p.strip()) > 2]# Filter short/empty 
                all_phrases_sets.append(set(phrases)) # Store as set to avoid duplicates
                credit_tracker.update(num_tokens_from_string(content, model_type))
                if show_progress:
                    print(f"BATCH {batch_idx+1}/{n_batches}, run {i+1}: found {len(phrases)}")
                    print(f"  Sample: {phrases[:10]}")
            except Exception as e:
                logger.error(f"Error in LLM call for batch {batch_idx+1}, run {i+1}: {e}")
                all_phrases_sets.append(set())

    all_flat = [p for s in all_phrases_sets for p in s] # Flatten and deduplicate
    counts = collections.Counter(all_flat) # Count occurrences
    sorted_methods = sorted(counts.keys(), key=lambda x: (-counts[x], x)) # Sort by frequency then alphabetically
    print(f"\nTotal unique phrases: {len(counts)}") 
    print(f"Most frequent (top 10): {sorted_methods[:10]}")
    return sorted_methods, counts


def filter_generic_phrases(phrases, min_specificity_words=2):
    """Remove generic phrases using comprehensive blacklist."""
    
    # Comprehensive blacklist of generic terms
    generic_blacklist = {
        # Domain generic
        'energy', 'power', 'system', 'network', 'control', 'data',
        
        # Method generic  
        'analysis', 'method', 'approach', 'technique', 'procedure', 
        'framework', 'model', 'design', 'optimization', 'algorithm',
        
        # Process generic
        'application', 'implementation', 'development', 'evaluation',
        'assessment', 'review', 'study', 'research', 'investigation'
    }
    
    # Patterns to exclude (domain + generic combinations)
    generic_patterns = [
        r'^(energy|power|system|network|electrical)\s+(analysis|method|approach|design|optimization)$',
        r'^(control|data|signal)\s+(analysis|method|processing)$',
        r'^(system|network)\s+(optimization|design|analysis)$'
    ]
    
    filtered_phrases = []
    
    for phrase in phrases:
        phrase_lower = phrase.lower().strip()
        words = phrase_lower.split()
        
        # Skip if too generic (most words are in blacklist) 
        generic_word_count = sum(1 for word in words if word in generic_blacklist) # Count generic words
        if generic_word_count >= len(words) - min_specificity_words: # Allow up to two generic words
            continue
            
        # Skip if matches generic patterns
        if any(re.match(pattern, phrase_lower) for pattern in generic_patterns):
            continue
            
        # Skip obvious generic combinations
        if len(words) == 2 and all(word in generic_blacklist for word in words):
            continue
            
        filtered_phrases.append(phrase)
    
    return filtered_phrases

def load_method_phrases_from_csv(filename="extracted_method_phrases.csv"):
    """Load method phrases from CSV with cleaning"""
    try:
        filepath = os.path.join(SAVE_DIR, filename)
        df = pd.read_csv(filepath)
        method_phrases = df['Method Phrase'].tolist()
        method_counts = df['Count'].tolist()
        
        # CRITICAL: Clean CSV artifacts before returning
        method_phrases = prefilter_obvious_duplicates(method_phrases)
        
        # Rebuild counts for cleaned phrases (set to 1 if not available)
        if len(method_counts) != len(method_phrases):
            method_counts = [1] * len(method_phrases)
            
        return method_phrases, method_counts
    except Exception as e:
        logger.warning(f"Failed to load method phrases: {e}")
        return None, None

def validate_method_groups_enhanced(groups, original_batch):
    """Enhanced validation to catch problematic groupings."""
    validated_groups = {}
    
    # Define forbidden groupings (methods that should never be grouped)
    forbidden_pairs = [
        ("linear programming", "nonlinear programming"),
        ("first order", "second order"), 
        ("generation shift", "injection shift"),
        ("lstm", "gru"),
        ("genetic algorithm", "particle swarm"),
        ("saifi", "saidi"),  # Different reliability indices
        ("form", "sorm"),    # Different reliability methods
    ]
    
    for canonical, variants in groups.items():
        # Check for forbidden groupings
        valid_group = True
        for forbidden in forbidden_pairs:
            variants_text = " ".join(variants).lower()
            canonical_text = canonical.lower()
            
            if ((forbidden[0] in variants_text or forbidden[0] in canonical_text) and 
                (forbidden[1] in variants_text or forbidden[1] in canonical_text)):
                valid_group = False
                logger.warning(f"Splitting forbidden grouping: {canonical}")
                break
        
        if valid_group:
            # Ensure canonical is the longest/most descriptive term (not abbreviation)
            full_forms = [v for v in variants if len(v) > 5 and ' ' in v]  # Prefer multi-word terms
            if full_forms:
                canonical = max(full_forms, key=len)
            else:
                canonical = max(variants, key=len)
            
            validated_groups[canonical] = variants
        else:
            # Split into individual methods
            for variant in variants:
                validated_groups[variant] = [variant]
    
    return validated_groups


def save_method_phrases_to_csv(method_phrases, method_counts, filename="extracted_method_phrases.csv"):
    """Save method phrases to CSV file."""
    filename = os.path.join(SAVE_DIR, filename)
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Method Phrase", "Count"])
        if hasattr(method_counts, 'items'):
            for phrase, count in method_counts.items():
                clean_phrase = phrase.strip().replace('\n', ' ')
                writer.writerow([clean_phrase, count])
        else:
            for phrase, count in zip(method_phrases, method_counts):
                clean_phrase = phrase.strip().replace('\n', ' ')
                writer.writerow([clean_phrase, count])
    print(f"✓ Saved method phrases to {filename}")


def prefilter_obvious_duplicates(method_list, similarity_threshold=0.95):
    """Remove obvious near-duplicates before LLM processing to improve efficiency."""
    from difflib import SequenceMatcher
    
    filtered_methods = []
    seen_methods = set()
    
    for method in sorted(method_list, key=len):  # Process shorter methods first
        method_lower = method.lower().strip()
        
        is_duplicate = False
        for seen in seen_methods:
            similarity = SequenceMatcher(None, method_lower, seen).ratio()
            if similarity >= similarity_threshold:
                is_duplicate = True
                break
        
        if not is_duplicate:
            filtered_methods.append(method)
            seen_methods.add(method_lower)
    
    print(f"Pre-filtering: {len(method_list)} → {len(filtered_methods)} methods ({len(method_list) - len(filtered_methods)} obvious duplicates removed)")
    return filtered_methods

def are_methods_truly_similar(method_variants):
    """Check if methods in a group are truly the same technique by analyzing core words."""
    if len(method_variants) <= 1:
        return True
    
    # Extract core words (remove common qualifiers that indicate different techniques)
    qualifiers = {'improved', 'enhanced', 'adaptive', 'advanced', 'modified', 'sequential', 'parallel', 
                 'distributed', 'hybrid', 'multi', 'bi', 'tri', 'sub', 'quasi'}
    
    core_words_sets = []
    for method in method_variants:
        words = set(method.lower().split())
        core_words = words - qualifiers
        core_words_sets.append(core_words)
    
    # Check if core words overlap significantly across all variants
    if len(core_words_sets) < 2:
        return True
    
    base_core = core_words_sets[0]  # FIXED: was incorrectly `core_words_sets`
    for other_core in core_words_sets[1:]:
        if not base_core or not other_core:  # Handle empty sets
            continue
        overlap = len(base_core & other_core) / len(base_core | other_core) if (base_core | other_core) else 0
        if overlap < 0.7:  # Less than 70% overlap in core words
            return False
    
    return True

"""def validate_method_groups(groups, original_batch):
   #SIMPLIFIED validation that preserves LLM groupings
    validated = {}
    original_batch_lower = [m.lower() for m in original_batch]
    
    for canonical, variants in groups.items():
        if not isinstance(variants, list):
            variants = [variants]
        
        # Keep variants that exist in original batch (case insensitive)
        clean_variants = []
        for variant in variants:
            variant_lower = str(variant).strip().lower()
            if variant_lower in original_batch_lower:
                # FIXED: Keep original casing from original_batch
                original_idx = original_batch_lower.index(variant_lower)
                clean_variants.append(original_batch[original_idx])
        
        if clean_variants:
            # Use provided canonical name (don't change it)
            canonical_clean = canonical.lower()  # Normalize casing only
            validated[canonical_clean] = clean_variants
    
    print(f"  Validation preserved {len(validated)} groups from LLM")
    return validated
"""

def have_common_core_terms(method1, method2):
    """Check if two methods share meaningful core terms beyond stop words."""
    stop_words = {'and', 'or', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
    
    words1 = set(method1.split()) - stop_words
    words2 = set(method2.split()) - stop_words
    
    if len(words1) == 0 or len(words2) == 0:
        return False
    
    # Require at least 50% overlap in core terms
    overlap = len(words1 & words2) / min(len(words1), len(words2))
    return overlap >= 0.5

def fallback_similarity_grouping(method_batch, similarity_threshold=0.85):
    """Fallback grouping using string similarity when LLM fails."""
    from difflib import SequenceMatcher
    
    groups = {}
    processed = set()
    
    for method in sorted(method_batch, key=len):
        if method in processed:
            continue
        
        # Find similar methods using both string similarity and semantic checks
        similar_methods = [method]
        method_lower = method.lower()
        
        for other_method in method_batch:
            if other_method != method and other_method not in processed:
                other_lower = other_method.lower()
                similarity = SequenceMatcher(None, method_lower, other_lower).ratio()
                
                if similarity >= similarity_threshold:
                    # Additional check: ensure they're not just coincidentally similar
                    if have_common_core_terms(method_lower, other_lower):
                        similar_methods.append(other_method)
                        processed.add(other_method)
        
        canonical = min(similar_methods, key=len)  # Use shortest as canonical
        groups[canonical] = similar_methods
        processed.add(method)
    
    return groups

def post_process_method_groups(variant_groups):
    """MINIMAL post-processing that preserves LLM consolidation work"""
    final_groups = {}
    
    for canonical, variants in variant_groups.items():
        # Remove duplicates but keep groups intact
        clean_variants = list(set(variants))
        
        # Only split if canonical name is obviously generic (very restrictive)
        truly_generic = ['method', 'analysis', 'approach', 'technique'] 
        if any(canonical.lower() == generic for generic in truly_generic):
            # Only split if canonical is EXACTLY one of these generic terms
            for variant in clean_variants:
                final_groups[variant] = [variant]
        else:
            # PRESERVE the group as-is
            final_groups[canonical] = clean_variants
    
    print(f"  Post-processing preserved {len(final_groups)} groups")
    return final_groups


def build_method_variant_groups_enhanced(
    method_list, client, model_type, credit_tracker, prompt,
    batch_size=50, top_p=None, temp=None
):
    """Enhanced method grouping with configurable prompt."""
    variant_groups = {}
    processed_methods = set()
    
    # Step 1: Pre-filter obvious duplicates to improve LLM efficiency
    method_list = prefilter_obvious_duplicates(method_list)
    
    # Step 2: Process methods in batches using configurable prompting
    for i in range(0, len(method_list), batch_size):
        batch = method_list[i:i + batch_size]
        batch = [m for m in batch if m not in processed_methods]
        
        if not batch:
            continue
            
        # Format the prompt with current batch of methods
        formatted_prompt = prompt.format(method_list=batch)

        try:
            # Configure API parameters for different model types
            api_params = {
                "model": model_type,
                "messages": [
                    {"role": "system", "content": "You are a scientific method classification expert. Group only true variants while preserving distinct techniques."},
                    {"role": "user", "content": formatted_prompt}
                ]
            }
            
            if model_type.startswith('gpt-5-nano'):
                api_params["max_completion_tokens"] = 3000
            else:
                if temp is not None: api_params["temperature"] = temp
                if top_p is not None:  api_params["top_p"] = top_p
                api_params["max_tokens"] = 3000
            
            # Make LLM call and process response
            response = client.chat.completions.create(**api_params)
            logger.info(f"LLM raw response: {response}")
            content = response.choices[0].message.content
            logger.info(f"LLM content: {content}")
            credit_tracker.update(num_tokens_from_string(content, model_type))
            
            print(f"✓ Batch {i//batch_size + 1} LLM response received: {len(content)} characters")
            
            # Parse the dictionary response with error handling
            try:
                content = content.strip()
                if content.startswith('```'):
                    content = re.sub(r'```(?:python|json)?\n?', '', content)
                    content = re.sub(r'```$', '', content)
                
                dict_match = re.search(r'\{.*\}', content, re.DOTALL)
                if dict_match:
                    groups = ast.literal_eval(dict_match.group(0))
                    
                    # Validate and clean the groups using enhanced validation
                    validated_groups = validate_method_groups_enhanced(groups, batch)
                    variant_groups.update(validated_groups)
                    processed_methods.update(batch)
                    
                    print(f"✓ Processed batch {i//batch_size + 1}: {len(validated_groups)} groups created")
                else:
                    print(f"⚠️ No dictionary found in LLM response for batch {i//batch_size + 1}")
                    fallback_groups = fallback_similarity_grouping(batch)
                    variant_groups.update(fallback_groups)
                    processed_methods.update(batch)
                    
            except Exception as e:
                logger.warning(f"Failed to parse LLM response, using fallback similarity grouping: {e}")
                fallback_groups = fallback_similarity_grouping(batch)
                variant_groups.update(fallback_groups)
                processed_methods.update(batch)
                
        except Exception as e:
            logger.error(f"LLM call failed, using fallback: {e}")
            fallback_groups = fallback_similarity_grouping(batch)
            variant_groups.update(fallback_groups)
            processed_methods.update(batch)

    # Step 3: Post-process to ensure quality and remove inappropriate groupings
    final_groups = post_process_method_groups(variant_groups)
    
    logger.info(f"Created {len(final_groups)} method variant groups from {len(method_list)} original methods")
    return final_groups


def create_variant_mapping(variant_groups):
    """Create mapping from any variant to its canonical form for method consolidation."""
    variant_to_canonical = {}
    canonical_to_variants = {}
    
    for canonical, variants in variant_groups.items():
        canonical_to_variants[canonical] = variants
        for variant in variants:
            variant_to_canonical[variant.lower()] = canonical
    
    return variant_to_canonical, canonical_to_variants

def consolidate_variant_scores(scores, method_names, variant_to_canonical):
    """
    Consolidate method scores to prevent double-counting of variants.
    Uses MAXIMUM score among variants (not sum) to avoid inflating scores.
    """
    canonical_methods = list(set(variant_to_canonical.values()))
    canonical_scores = np.zeros((scores.shape[0], len(canonical_methods)))  # FIXED: was scores.shape
    
    canonical_to_idx = {method: i for i, method in enumerate(canonical_methods)}
    
    for j, method_name in enumerate(method_names):
        method_lower = method_name.lower()
        canonical = variant_to_canonical.get(method_lower, method_name)
        
        if canonical in canonical_to_idx:
            canonical_idx = canonical_to_idx[canonical]
            # Use MAXIMUM score among variants (not sum) to prevent double-counting
            canonical_scores[:, canonical_idx] = np.maximum(
                canonical_scores[:, canonical_idx], 
                scores[:, j]
            )
    
    return canonical_scores, canonical_methods

# ================================================================
# METHOD SCORING FUNCTIONS
# ================================================================

def compute_enhanced_tfidf_scores(processed_texts, method_variants_dict, ngram_range=(1, 4), min_df=1, max_df=0.95):
    """Compute TF-IDF scores for all method variants"""
    all_variants = []
    for variants in method_variants_dict.values():
        all_variants.extend(variants)
    
    existing_variants = []
    for variant in all_variants:
        variant_pattern = r'\b' + re.escape(variant.lower()) + r'\b'
        found = False
        for text in processed_texts[:1000]:  # Sample check for efficiency
            if re.search(variant_pattern, text.lower()):
                existing_variants.append(variant)
                found = True
                break
        if not found and len(existing_variants) < 5000:
            for text in processed_texts:
                if re.search(variant_pattern, text.lower()):
                    existing_variants.append(variant)
                    break
    
    print(f"Found {len(existing_variants)} variants that exist in corpus out of {len(all_variants)} total")
    
    if not existing_variants:
        logger.warning("No method variants found in corpus!")
        return np.zeros((len(processed_texts), 1)), ['no_methods_found']
    
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=existing_variants,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        norm='l2',
        token_pattern=r'\b[\w-]+\b'
    )
    
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)
        scores = tfidf_matrix.toarray()
        feature_names = tfidf_vectorizer.get_feature_names_out()
        return scores, feature_names
    except Exception as e:
        logger.error(f"TF-IDF computation failed: {e}")
        return np.zeros((len(processed_texts), len(existing_variants))), existing_variants

def compute_enhanced_lda_scores(processed_texts, method_variants_dict, ngram_range=(1, 4), n_topics=None, max_iter=20):
    """Compute LDA scores for method variants. FIXED VERSION"""
    all_variants = []
    for variants in method_variants_dict.values():
        all_variants.extend(variants)

    if n_topics is None:
        n_topics = min(len(all_variants), 100)

    vectorizer = CountVectorizer(
        vocabulary=all_variants,
        ngram_range=ngram_range,
        token_pattern=r'\b[\w-]+\b'
    )

    try:
        doc_term_matrix = vectorizer.fit_transform(processed_texts)
        feature_names = vectorizer.get_feature_names_out()
        
        print(f"  LDA Debug: doc_term_matrix.shape = {doc_term_matrix.shape}")
        print(f"  LDA Debug: len(all_variants) = {len(all_variants)}")
        print(f"  LDA Debug: len(feature_names) = {len(feature_names)}")
        
        if n_topics >= 2 and doc_term_matrix.shape[1] > 0:
            # FIXED: Use min to prevent n_topics > n_features
            actual_topics = min(n_topics, doc_term_matrix.shape[1], len(all_variants))
            
            lda = LatentDirichletAllocation(
                n_components=actual_topics,
                learning_method='batch',
                random_state=42,
                max_iter=max_iter
            )
            
            # Fit LDA and get topic distributions
            doc_topic_matrix = lda.fit_transform(doc_term_matrix)
            topic_term_matrix = lda.components_  # Shape: (n_topics, n_features)
            
            print(f"  LDA Debug: doc_topic_matrix.shape = {doc_topic_matrix.shape}")
            print(f"  LDA Debug: topic_term_matrix.shape = {topic_term_matrix.shape}")
            
            # FIXED: Convert topic-document matrix back to document-term space
            # We want scores for each method variant in each document
            # Method: multiply doc-topic scores by topic-term weights
            lda_scores = np.dot(doc_topic_matrix, topic_term_matrix)
            
            print(f"  LDA Debug: final lda_scores.shape = {lda_scores.shape}")
            
            # Ensure correct dimensions
            if lda_scores.shape[1] != len(all_variants):
                print(f"  LDA Warning: Score matrix columns ({lda_scores.shape[1]}) != variants ({len(all_variants)})")
                # Pad or truncate to match expected dimensions
                if lda_scores.shape[1] < len(all_variants):
                    padding = np.zeros((lda_scores.shape[0], len(all_variants) - lda_scores.shape[1]))
                    lda_scores = np.hstack([lda_scores, padding])
                else:
                    lda_scores = lda_scores[:, :len(all_variants)]
                    
                print(f"  LDA Debug: adjusted lda_scores.shape = {lda_scores.shape}")
            
        else:
            print(f"  LDA: Creating zero matrix due to insufficient topics/features")
            lda_scores = np.zeros((len(processed_texts), len(all_variants)))

        return lda_scores, feature_names
        
    except Exception as e:
        logger.error(f"LDA computation failed: {e}")
        print(f"LDA Error details: {e}")
        return np.zeros((len(processed_texts), len(all_variants))), all_variants

    
def compute_enhanced_compound_scores(df, method_variants_dict, processed_col='processed_text', window=150):
    """Enhanced compound scoring that handles variants"""
    n_docs = len(df)
    all_variants = []
    for variants in method_variants_dict.values():
        all_variants.extend(variants)
    
    n_methods = len(all_variants)
    scores = np.zeros((n_docs, n_methods), dtype=np.float32)
    docs = df[processed_col].fillna('').str.lower().tolist()
    
    for j, variant in enumerate(all_variants):
        variant_l = variant.lower()
        
        for i, text in enumerate(docs):
            if variant_l in text:
                scores[i, j] = 1.0
            elif len(variant_l.split()) > 1:
                words = variant_l.split()
                if all(word in text for word in words):
                    scores[i, j] = 0.7
            elif len(variant_l) <= 5 and variant_l.upper() in text.upper():
                scores[i, j] = 0.8
    
    return scores, all_variants

def assign_methods_improved(df, scores, method_names, top_n=5, min_score=0.005):
    """Improved method assignment with better diagnostics."""
    n_papers, n_methods = scores.shape
    
    # Initialize method columns
    for i in range(top_n):
        df[f'Method_{i+1}'] = ''
        df[f'Method_{i+1}_Score'] = 0.0
    
    df['Primary_Method'] = ''
    df['Primary_Method_Score'] = 0.0
    df['Method_Confidence'] = 'Low'
    df['Total_Method_Score'] = 0.0
    
    assigned_count = 0
    
    for paper_idx in range(n_papers):
        paper_scores = scores[paper_idx, :]
        
        top_indices = np.argsort(paper_scores)[::-1][:top_n]
        top_scores = paper_scores[top_indices]
        
        valid_mask = top_scores >= min_score
        valid_indices = top_indices[valid_mask]
        valid_scores = top_scores[valid_mask]
        
        if len(valid_indices) > 0:
            assigned_count += 1
            
            # FIXED: Use [0] to get first element, not entire array
            df.loc[paper_idx, 'Primary_Method'] = method_names[valid_indices[0]]
            df.loc[paper_idx, 'Primary_Method_Score'] = valid_scores[0]
            df.loc[paper_idx, 'Total_Method_Score'] = valid_scores.sum()
            
            if valid_scores[0] > 0.1:
                df.loc[paper_idx, 'Method_Confidence'] = 'High'
            elif valid_scores[0] > 0.05:
                df.loc[paper_idx, 'Method_Confidence'] = 'Medium'
            
            for i, (idx, score) in enumerate(zip(valid_indices, valid_scores)):
                if i < top_n:
                    df.loc[paper_idx, f'Method_{i+1}'] = method_names[idx]
                    df.loc[paper_idx, f'Method_{i+1}_Score'] = score
    
    logger.info(f"  Assigned methods to {assigned_count}/{n_papers} papers ({100*assigned_count/n_papers:.1f}%)")
    return df

def aggressive_fallback_grouping(method_list, similarity_threshold=0.75):
    """
    Enhanced fallback grouping with aggressive similarity matching and pattern recognition.
    This will handle cases where LLM API fails.
    """
    from difflib import SequenceMatcher
    
    groups = {}
    processed = set()
    
    # First pass: Handle obvious patterns
    pattern_groups = handle_common_patterns(method_list)
    for canonical, variants in pattern_groups.items():
        groups[canonical] = variants
        processed.update(variants)
    
    # Second pass: Similarity-based grouping for remaining methods
    remaining_methods = [m for m in method_list if m not in processed]
    
    for method in sorted(remaining_methods, key=len):
        if method in processed:
            continue
            
        group = [method]
        method_lower = method.lower()
        method_tokens = set(method_lower.split())
        
        for other_method in remaining_methods:
            if other_method == method or other_method in processed:
                continue
                
            other_lower = other_method.lower()
            other_tokens = set(other_lower.split())
            
            # Multiple similarity checks
            string_sim = SequenceMatcher(None, method_lower, other_lower).ratio()
            token_overlap = len(method_tokens & other_tokens) / len(method_tokens | other_tokens) if (method_tokens | other_tokens) else 0
            
            # Check for containment (one is substring of other)
            containment = method_lower in other_lower or other_lower in method_lower
            
            # Group if any condition met
            if (string_sim >= similarity_threshold or 
                token_overlap >= 0.6 or 
                containment):
                group.append(other_method)
                processed.add(other_method)
        
        # Use shortest name as canonical
        canonical = min(group, key=lambda x: (len(x), x))
        groups[canonical] = group
        processed.add(method)
    
    return groups

def handle_common_patterns(method_list):
    """Handle common method name patterns that should be grouped together."""
    import re
    
    pattern_groups = {}
    processed = set()
    
    # Common abbreviation patterns
    abbreviation_patterns = [
        (r'^ga$', r'genetic algorithm.*'),
        (r'^pso$', r'particle swarm optimization.*'),
        (r'^abc$', r'.*bee colony.*'),
        (r'^gwo$', r'grey wolf.*'),
        (r'^opf$', r'optimal power flow.*'),
        (r'^milp$', r'.*integer.*linear.*programming.*'),
        (r'^dnn.*', r'.*neural network.*'),
        (r'^cnn$', r'convolutional neural network.*'),
        (r'^rnn$', r'.*neural network rnn.*'),
        (r'^svm$', r'support vector machine.*'),
        (r'^pca$', r'principal component analysis.*'),
    ]
    
    # Method variant patterns
    variant_patterns = [
        # Neural network variants
        (r'.*neural network.*', ['neural network', 'bp neural network', 'neural network algorithm', 'artificial neural network']),
        # Genetic algorithm variants  
        (r'.*genetic algorithm.*', ['genetic algorithm', 'genetic algorithm ga', 'adaptive genetic algorithm']),
        # Monte Carlo variants
        (r'.*monte carlo.*', ['monte carlo simulation', 'sequential monte carlo', 'carlo simulation result']),
        # Particle swarm variants
        (r'.*particle swarm.*', ['particle swarm optimization', 'binary particle swarm', 'improved particle swarm optimization']),
        # Random forest variants
        (r'.*random forest.*', ['random forest', 'random forest rf', 'random forest algorithm']),
        # Machine learning variants
        (r'.*machine learning.*', ['machine learning', 'ensemble learning']),
    ]
    
    # Group by abbreviation patterns
    for abbrev_pattern, full_pattern in abbreviation_patterns:
        abbrev_matches = []
        full_matches = []
        
        for method in method_list:
            if method in processed:
                continue
            if re.search(abbrev_pattern, method.lower()):
                abbrev_matches.append(method)
            elif re.search(full_pattern, method.lower()):
                full_matches.append(method)
        
        if abbrev_matches and full_matches:
            all_matches = abbrev_matches + full_matches
            canonical = min(full_matches, key=len) if full_matches else min(abbrev_matches, key=len)
            pattern_groups[canonical] = all_matches
            processed.update(all_matches)
    
    # Group by variant patterns
    for base_pattern, known_variants in variant_patterns:
        matches = []
        for method in method_list:
            if method in processed:
                continue
            if (re.search(base_pattern, method.lower()) or 
                method.lower() in [v.lower() for v in known_variants]):
                matches.append(method)
        
        if len(matches) > 1:
            canonical = min(matches, key=len)
            pattern_groups[canonical] = matches
            processed.update(matches)
    
    return pattern_groups


In [10]:
# %%
# Cell 6: Method Scoring Functions (Enhanced)

def compute_enhanced_tfidf_scores(processed_texts, method_variants_dict, ngram_range=(1, 4), min_df=1, max_df=0.95):
    """Compute TF-IDF scores for all method variants"""
    # Get all variants
    all_variants = []
    for variants in method_variants_dict.values():
        all_variants.extend(variants)
    
    # Create vocabulary from actual variants that exist in corpus
    existing_variants = []
    for variant in all_variants:
        # Check if variant appears in any document
        variant_pattern = r'\b' + re.escape(variant.lower()) + r'\b'
        found = False
        for text in processed_texts:  # Sample check for efficiency
            if re.search(variant_pattern, text.lower()):
                existing_variants.append(variant)
                found = True
                break
        if not found and len(existing_variants) < 5000:  # Keep checking if we don't have too many
            for text in processed_texts:
                if re.search(variant_pattern, text.lower()):
                    existing_variants.append(variant)
                    break
    
    print(f"Found {len(existing_variants)} variants that exist in corpus out of {len(all_variants)} total")
    
    if not existing_variants:
        logger.warning("No method variants found in corpus!")
        return np.zeros((len(processed_texts), 1)), ['no_methods_found']
    
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=existing_variants,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        norm='l2',
        token_pattern=r'\b[\w-]+\b'
    )
    
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)
        scores = tfidf_matrix.toarray()
        feature_names = tfidf_vectorizer.get_feature_names_out()
        return scores, feature_names
    except Exception as e:
        logger.error(f"TF-IDF computation failed: {e}")
        return np.zeros((len(processed_texts), len(existing_variants))), existing_variants
    
def compute_enhanced_tfidf_scores_adapted(processed_texts, method_variants_dict, ngram_range=(1, 4), min_df=1, max_df=0.9):
    """Compute adapted TF-IDF scores for method variants with binary TF weighting"""

    
    # Extract all variants from method_variants_dict - THIS IS WHERE IT'S USED
    all_variants = []
    for variants in method_variants_dict.values():
        all_variants.extend(variants)
    
    # Create vocabulary from actual variants that exist in corpus
    existing_variants = []
    for variant in all_variants:
        variant_pattern = r'\b' + re.escape(variant.lower()) + r'\b'
        found = False
        for text in processed_texts:
            if re.search(variant_pattern, text.lower()):
                existing_variants.append(variant)
                found = True
                break
        if not found and len(existing_variants) < 5000:
            for text in processed_texts:
                if re.search(variant_pattern, text.lower()):
                    existing_variants.append(variant)
                    break
    
    print(f"Found {len(existing_variants)} variants that exist in corpus out of {len(all_variants)} total")
    
    if not existing_variants:
        logger.warning("No method variants found in corpus!")
        return np.zeros((len(processed_texts), 1)), ['no_methods_found']
    
    # Step 1: Binary term counting
    count_vectorizer = CountVectorizer(
        vocabulary=existing_variants,  # Uses the filtered variants from method_variants_dict
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        binary=True,  # KEY CHANGE: Binary presence/absence
        token_pattern=r'\b[\w-]+\b'
    )
    
    count_matrix = count_vectorizer.fit_transform(processed_texts)
    
    # Step 2: Apply IDF weighting to binary counts
    tfidf_transformer = TfidfTransformer(
        norm='l2',           # Keep document length normalization
        use_idf=True,        # Keep IDF weighting for specificity
        smooth_idf=True,     # Prevents division by zero
        sublinear_tf=False   # Not needed with binary weighting
    )
    
    tfidf_matrix = tfidf_transformer.fit_transform(count_matrix)
    
    scores = tfidf_matrix.toarray()
    feature_names = count_vectorizer.get_feature_names_out()
    return scores, feature_names



def compute_enhanced_lda_scores(processed_texts, method_variants_dict, ngram_range=(1, 4), n_topics=None, max_iter=20):
    """Compute LDA scores for method variants."""
    all_variants = []
    for variants in method_variants_dict.values():
        all_variants.extend(variants)

    if n_topics is None:
        n_topics = min(len(all_variants), 100)

    vectorizer = CountVectorizer(
        vocabulary=all_variants,
        ngram_range=ngram_range,
        token_pattern=r'\b[\w-]+\b'
    )

    try:
        doc_term_matrix = vectorizer.fit_transform(processed_texts)
        feature_names = vectorizer.get_feature_names_out()

        if n_topics >= 2 and doc_term_matrix.shape[1] > 0:
            lda = LatentDirichletAllocation(
                n_components=min(n_topics, doc_term_matrix.shape[1]),
                learning_method='batch',
                random_state=42,
                max_iter=max_iter
            )
            lda_matrix = lda.fit_transform(doc_term_matrix)
        else:
            lda_matrix = np.zeros((doc_term_matrix.shape[0], len(all_variants)))

        return lda_matrix, feature_names
    except Exception as e:
        logger.error(f"LDA computation failed: {e}")
        return np.zeros((len(processed_texts), len(all_variants))), all_variants
    
def compute_enhanced_compound_scores(df, method_variants_dict, processed_col='processed_text', window=150):
    """Enhanced compound scoring with proximity matching and filtering common stop/context words"""
    import re
    from itertools import product
    
    n_docs = len(df)
    all_variants = []
    for variants in method_variants_dict.values():
        all_variants.extend(variants)
    
    n_methods = len(all_variants)
    scores = np.zeros((n_docs, n_methods), dtype=np.float32)
    docs = df[processed_col].fillna('').str.lower().tolist()
    
    # Common distracting words that indicate figure/table/diagram context
    forbidden_context_words = {
        "figure", "table", "diagram", "plot", "chart", "graph", "section", 
        "appendix", "equation", "formula", "example", "case", "study", 
        "shown", "presented", "illustrated", "depicts", "shows"
    }
    
    for j, variant in enumerate(all_variants):
        variant_l = variant.lower().strip()
        variant_words = variant_l.split()
        
        for i, text in enumerate(docs):
            # 1. Exact phrase match (highest priority)
            pattern = r'\b' + re.escape(variant_l) + r'\b'
            if re.search(pattern, text):
                scores[i, j] = 1.0
                continue
            
            # 2. Hyphenated form matching
            hyphenated_pattern = r'\b' + re.escape(variant_l.replace(' ', '-')) + r'\b'
            if re.search(hyphenated_pattern, text):
                scores[i, j] = 1.0
                continue
            
            # 3. Pure abbreviation matching (≤4 chars, single word)
            if len(variant_l) <= 4 and variant_l.isalpha() and len(variant_words) == 1:
                abbrev_pattern = r'\b' + variant_l.upper() + r'\b'
                if re.search(abbrev_pattern, text.upper()):
                    scores[i, j] = 0.9
                continue
            
            # 4. Proximity matching for compound terms (2-5 words only)
            if 2 <= len(variant_words) <= 5:
                word_positions = []
                
                # Find positions of each word
                for word in variant_words:
                    # Skip very common words that might cause false positives
                    if word in {"of", "the", "a", "an", "and", "or", "in", "on", "at", "to", "for"}:
                        continue
                        
                    word_pattern = r'\b' + re.escape(word) + r'\b'
                    matches = [m.start() for m in re.finditer(word_pattern, text)]
                    if not matches:
                        word_positions = []
                        break
                    word_positions.append(matches)
                
                # Check if all words found and proximity conditions met
                if word_positions and len(word_positions) >= len(variant_words) - 1:  # Allow missing 1 stop word
                    found_valid_match = False
                    
                    for pos_tuple in product(*word_positions):
                        min_pos = min(pos_tuple)
                        max_pos = max(pos_tuple)
                        span = max_pos - min_pos
                        
                        # Check if within window and no forbidden context words nearby
                        if span <= window:
                            # Extract snippet around the match
                            snippet_start = max(0, min_pos - 50)
                            snippet_end = min(len(text), max_pos + 50)
                            snippet = text[snippet_start:snippet_end]
                            
                            # Check for forbidden context words in the snippet
                            has_forbidden_context = any(
                                fw in snippet for fw in forbidden_context_words
                            )
                            
                            if not has_forbidden_context:
                                scores[i, j] = 0.7  # Lower score for proximity match
                                found_valid_match = True
                                break
                    
                    if found_valid_match:
                        continue
    
    return scores, all_variants




def aggregate_variant_scores_to_canonical(scores, variant_names, variant_to_canonical):
    """Aggregate variant scores back to canonical method names"""
    canonical_methods = list(set(variant_to_canonical.values()))
    canonical_scores = np.zeros((scores.shape[0], len(canonical_methods)))
    
    canonical_to_idx = {method: i for i, method in enumerate(canonical_methods)}
    
    for j, variant in enumerate(variant_names):
        canonical = variant_to_canonical.get(variant.lower(), variant)
        if canonical in canonical_to_idx:
            canonical_idx = canonical_to_idx[canonical]
            canonical_scores[:, canonical_idx] += scores[:, j]  # Sum scores for variants
    
    return canonical_scores, canonical_methods

def prefilter_obvious_duplicates(method_list):
    """Clean CSV artifacts and normalize method names before LLM processing."""
    cleaned_methods = []
    for method in method_list:
        # Remove CSV artifacts (trailing quotes, commas, escaped characters)
        cleaned = method.strip().strip('"').strip("'").rstrip(',').strip()
        # Remove escaped quotes
        cleaned = cleaned.replace('\\"', '').replace("\\'", '')
        # Remove empty or very short entries
        if len(cleaned) > 1:
            cleaned_methods.append(cleaned)
    
    return list(set(cleaned_methods))  # Remove exact duplicates


def assign_top_methods_enhanced(
    df, canonical_scores, canonical_methods, variant_scores, variant_names,
    top_n=5, min_score=0.005
):
    """Enhanced method assignment with granular variant tracking"""
    
    # Assign top canonical methods
    for rank in range(top_n):
        top_method = []
        top_score = []
        top_variants = []
        confidence = []

        for i, row in enumerate(canonical_scores):
            if np.allclose(row, row):  # All equal
                top_method.append("")
                top_score.append(0.0)
                top_variants.append("")
                confidence.append("")
                continue

            idxs = np.argsort(row)[::-1]
            if rank < len(idxs):
                method_idx = idxs[rank]
                method = canonical_methods[method_idx]
                score = row[method_idx]
                
                if score >= min_score:
                    # Find contributing variants
                    variant_contributions = []
                    for v_idx, variant in enumerate(variant_names):
                        if variant_scores[i, v_idx] > 0:
                            # Check if this variant belongs to the current canonical method
                            variant_canonical = variant_to_canonical.get(variant.lower(), variant)
                            if variant_canonical == method:
                                variant_contributions.append(f"{variant}({variant_scores[i, v_idx]:.2f})")
                    
                    top_method.append(method)
                    top_score.append(score)
                    top_variants.append("; ".join(variant_contributions[:3]))  # Top 3 variants
                    confidence.append("confident" if score > min_score * 2 else "low_confidence")
                else:
                    top_method.append("")
                    top_score.append(0.0)
                    top_variants.append("")
                    confidence.append("")
            else:
                top_method.append("")
                top_score.append(0.0)
                top_variants.append("")
                confidence.append("")

        df[f'Top_{rank+1}_Method'] = top_method
        df[f'Top_{rank+1}_Score'] = top_score
        df[f'Top_{rank+1}_Variants'] = top_variants
        df[f'Top_{rank+1}_Confidence'] = confidence

    # Set primary columns
    df['Primary_Method'] = df['Top_1_Method']
    df['Primary_Method_Score'] = df['Top_1_Score']
    df['Primary_Method_Variants'] = df['Top_1_Variants']
    df['Method_Confidence'] = df['Top_1_Confidence']

    return df
def save_method_phrases_to_csv(method_phrases, method_counts, filename="extracted_method_phrases.csv"):
    filename = os.path.join(SAVE_DIR, filename)
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Method Phrase", "Count"])
        # If method_counts is a Counter, convert to dict
        if hasattr(method_counts, 'items'):
            for phrase, count in method_counts.items():
                clean_phrase = phrase.strip().replace('\n', ' ')
                writer.writerow([clean_phrase, count])
        else:
            # fallback: zipped lists
            for phrase, count in zip(method_phrases, method_counts):
                clean_phrase = phrase.strip().replace('\n', ' ')
                writer.writerow([clean_phrase, count])
    print(f"✓ Saved method phrases to {filename}")
    
def assign_methods_improved(df, scores, method_names, top_n=5, min_score=0.005):
    """
    Improved method assignment with better diagnostics.
    """
    n_papers, n_methods = scores.shape
    
    # Initialize method columns
    for i in range(top_n):
        df[f'Method_{i+1}'] = ''
        df[f'Method_{i+1}_Score'] = 0.0
    
    df['Primary_Method'] = ''
    df['Primary_Method_Score'] = 0.0
    df['Method_Confidence'] = 'None'
    df['Total_Method_Score'] = 0.0
    
    assigned_count = 0
    
    for paper_idx in range(n_papers):
        paper_scores = scores[paper_idx, :]
        
        # Get top methods for this paper
        top_indices = np.argsort(paper_scores)[::-1][:top_n]
        top_scores = paper_scores[top_indices]
        
        # Filter by minimum score
        valid_mask = top_scores >= min_score
        valid_indices = top_indices[valid_mask]
        valid_scores = top_scores[valid_mask]
        
        if len(valid_indices) > 0:
            assigned_count += 1
            
            # Assign primary method
            df.loc[paper_idx, 'Primary_Method'] = method_names[valid_indices[0]]
            df.loc[paper_idx, 'Primary_Method_Score'] = valid_scores[0]
            df.loc[paper_idx, 'Total_Method_Score'] = valid_scores.sum()
            
            # Assign confidence based on top score
            if valid_scores[0] > 0.7:
                df.loc[paper_idx, 'Method_Confidence'] = 'Super-High'
            elif valid_scores[0] > 0.5:
                df.loc[paper_idx, 'Method_Confidence'] = 'High'
            elif valid_scores[0] > 0.2:
                df.loc[paper_idx, 'Method_Confidence'] = 'Medium'
            elif valid_scores[0] > 0.05:
                df.loc[paper_idx, 'Method_Confidence'] = 'Low'
            
            # Assign all valid methods
            for i, (idx, score) in enumerate(zip(valid_indices, valid_scores)):
                if i < top_n:
                    df.loc[paper_idx, f'Method_{i+1}'] = method_names[idx]
                    df.loc[paper_idx, f'Method_{i+1}_Score'] = score
    
    logger.info(f"  Assigned methods to {assigned_count}/{n_papers} papers ({100*assigned_count/n_papers:.1f}%)")
    return df

In [11]:
# %%
# %%
# Cell 7: Enhanced Topic Analysis Functions with Multi-N-gram Support

def run_lda_topic_modeling(df, num_topics=10, num_words=25):
    tokenized_texts = df['processed_text'].apply(lambda x: x.split()).tolist()
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    phrased = []
    for doc in tokenized_texts:
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased.append(' '.join(combined))
    vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b[\w_-]+\b', max_df=0.95, min_df=2, max_features=10000)
    doc_term_matrix = vectorizer.fit_transform(phrased)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        topic_keywords[topic_idx] = {'top_words': top_words}
    return lda_model, vectorizer, topic_distributions, topic_keywords

def assign_papers_to_topics(topic_distributions):
    paper_classifications = []
    for idx, dist in enumerate(topic_distributions):
        top_2_topics = np.argsort(dist)[-2:][::-1]
        primary_score = dist[top_2_topics]
        other_topics_sum = sum(dist) - primary_score
        dominance_ratio = primary_score / (other_topics_sum + 1e-10)
        paper_classifications.append({
            'paper_idx': idx,
            'primary_topic': top_2_topics[0],
            'secondary_topic': top_2_topics[1],
            'primary_score': primary_score,
            'dominance_ratio': dominance_ratio
        })
    return paper_classifications

def string_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

def topic_name_llm_robust(
    lda_keywords, tfidf_ngrams, top_titles,
    client, model_type, credit_tracker,
    initial_iterations=3, max_iterations=10, similarity_threshold=0.7,
    temp=None, top_p=None 
):
    prompt = (
        "Based on the following keywords and n-grams from LDA and TF-IDF, plus top paper titles, provide a concise topic name "
        "(bigram or trigram, single word if very specific):\n"
        f"LDA: {', '.join(lda_keywords)}\n"
        f"TFIDF: {', '.join(tfidf_ngrams)}\n"
        f"TITLES: {', '.join(top_titles)}\n"
        "Return ONLY the topic name."
    )
    
    iterations = initial_iterations
    from collections import Counter
    
    while iterations <= max_iterations:
        generated_names = []
        for _ in range(iterations):
            try:
                # Build API parameters dictionary properly
                api_params = {
                    "model": model_type,
                    "messages": [
                        {"role": "system", "content": "You are a science topic-naming assistant."},
                        {"role": "user", "content": prompt}
                    ]
                }
                
                # Add parameters conditionally (FIXED SYNTAX)
                if temp is not None:
                    api_params["temperature"] = temp
                if top_p is not None:
                    api_params["top_p"] = top_p
                
                # Handle different model types
                if model_type.startswith('gpt-5-nano'):
                    api_params["max_completion_tokens"] = 100
                else:
                    api_params["max_tokens"] = 100
                
                response = client.chat.completions.create(**api_params)
                content = response.choices[0].message.content.strip()
                
                # Track token usage
                if hasattr(response, 'usage') and response.usage:
                    credit_tracker.update(response.usage.total_tokens)
                else:
                    # Fallback token estimation
                    credit_tracker.update(num_tokens_from_string(prompt + content, model_type))
                
                if content:
                    generated_names.append(content)
                    
            except Exception as e:
                print(f"⚠️ Error in LLM call: {e}")
                continue
        
        # Check for consensus (FIXED LOGIC)
        if generated_names:  # Only proceed if we have generated names
            for i, name in enumerate(generated_names):
                matches = [other for j, other in enumerate(generated_names)
                          if i != j and string_similarity(name, other) >= similarity_threshold]
                if len(matches) >= len(generated_names) // 2:
                    print(f"Topic name stabilized after {iterations} iterations: {name}")
                    return name
        
        # Increase iterations and try again
        iterations += 2
        print(f"No majority topic name found, increasing iterations to {iterations}.")
    
    # Fallback: return most common name (FIXED RETURN LOGIC)
    if generated_names:
        most_common = Counter(generated_names).most_common(1)
        if most_common:
            best_name = most_common[0][0]  # Extract the actual name, not the tuple
            print(f"Returning most common topic name after {max_iterations} iterations: {best_name}")
            return best_name
    
    # Ultimate fallback
    print("⚠️ Failed to generate any topic names, using fallback")
    return "Unknown Topic"

def get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10):
    dominant_papers = [p for p in paper_classifications if p['primary_topic'] == topic_idx]
    paper_infos = [
        (df.iloc[p['paper_idx']]['citationCount'] if 'citationCount' in df.columns else 0, df.iloc[p['paper_idx']]['title'])
        for p in dominant_papers if not pd.isna(df.iloc[p['paper_idx']]['title'])
    ]
    # Correctly sort by citation count (descending)
    top_titles = [title for _, title in sorted(paper_infos, key=lambda x: -x[0])[:n_titles]]
    return top_titles

def get_top_tfidf_ngrams_per_topic_enhanced(df, topic_col='Primary_Topic_Index', text_col='processed_text', 
                                          top_k=15, min_df=2, max_df=0.8):
    """
    Enhanced function to extract top TF-IDF keywords, bigrams, and trigrams for each topic.
    
    Parameters:
    - df: DataFrame with topic assignments and text
    - topic_col: Column name for topic indices
    - text_col: Column name for processed text
    - top_k: Number of top n-grams to extract per topic per type
    - min_df: Minimum document frequency for TF-IDF
    - max_df: Maximum document frequency for TF-IDF
    
    Returns:
    - Dictionary with structure: {topic_id: {'keywords': {...}, 'bigrams': {...}, 'trigrams': {...}}}
    """
    
    print("🔍 Extracting topic-specific n-grams...")
    
    # Ensure text column exists and is not empty
    if text_col not in df.columns:
        print(f"❌ Text column '{text_col}' not found in DataFrame")
        return {}
    
    # Remove rows with missing text or topic assignments
    df_clean = df.dropna(subset=[text_col, topic_col]).copy()
    print(f"📊 Processing {len(df_clean)} documents across {df_clean[topic_col].nunique()} topics")
    
    # Initialize result structure
    topic_ngrams = {}
    
    # Get unique topics
    unique_topics = sorted(df_clean[topic_col].dropna().unique())
    
    # Define n-gram configurations
    ngram_configs = {
        'keywords': (1, 1),    # Unigrams
        'bigrams': (2, 2),     # Bigrams  
        'trigrams': (3, 3)     # Trigrams
    }
    
    for ngram_type, (min_n, max_n) in ngram_configs.items():
        print(f"📈 Processing {ngram_type} ({min_n}-{max_n} grams)...")
        
        try:
            # Create TF-IDF vectorizer for this n-gram type
            vectorizer = TfidfVectorizer(
                ngram_range=(min_n, max_n),
                min_df=min_df,
                max_df=max_df,
                stop_words='english',
                lowercase=True,
                token_pattern=r'\b[a-zA-Z][a-zA-Z0-9]*\b'  # Only alphanumeric tokens starting with letter
            )
            
            # Fit on all documents
            tfidf_matrix = vectorizer.fit_transform(df_clean[text_col])
            feature_names = vectorizer.get_feature_names_out()
            
            print(f"  ✅ Created {len(feature_names)} {ngram_type} features")
            
            # Extract top terms for each topic
            for topic_idx in unique_topics:
                topic_idx = int(topic_idx)
                
                # Initialize topic entry if not exists
                if topic_idx not in topic_ngrams:
                    topic_ngrams[topic_idx] = {}
                
                # Get documents for this topic
                doc_indices = df_clean[df_clean[topic_col] == topic_idx].index
                topic_doc_positions = [df_clean.index.get_loc(idx) for idx in doc_indices]
                
                if len(topic_doc_positions) == 0:
                    topic_ngrams[topic_idx][ngram_type] = {}
                    continue
                
                # Calculate mean TF-IDF scores for this topic
                topic_tfidf = tfidf_matrix[topic_doc_positions].mean(axis=0).A1
                
                # Get top terms
                top_indices = topic_tfidf.argsort()[-top_k:][::-1]
                top_terms = {
                    feature_names[i]: float(topic_tfidf[i]) 
                    for i in top_indices 
                    if topic_tfidf[i] > 0
                }
                
                topic_ngrams[topic_idx][ngram_type] = top_terms
                
        except Exception as e:
            print(f"  ❌ Error processing {ngram_type}: {e}")
            # Initialize empty entries for all topics for this n-gram type
            for topic_idx in unique_topics:
                topic_idx = int(topic_idx)
                if topic_idx not in topic_ngrams:
                    topic_ngrams[topic_idx] = {}
                topic_ngrams[topic_idx][ngram_type] = {}
    
    # Print summary
    print(f"\n📊 N-gram Extraction Summary:")
    for topic_idx in sorted(topic_ngrams.keys()):
        topic_data = topic_ngrams[topic_idx]
        print(f"  Topic {topic_idx}:")
        for ngram_type in ['keywords', 'bigrams', 'trigrams']:
            count = len(topic_data.get(ngram_type, {}))
            print(f"    {ngram_type}: {count} terms")
    
    return topic_ngrams

# Legacy function for backward compatibility
def get_top_tfidf_ngrams_per_topic(df, tfidf_matrix, feature_names, topic_col='Primary_Topic_Index', top_k=10):
    """
    Legacy function - now calls the enhanced version for keywords only.
    """
    print("⚠️  Using legacy function - consider switching to get_top_tfidf_ngrams_per_topic_enhanced")
    
    result = get_top_tfidf_ngrams_per_topic_enhanced(
        df, topic_col=topic_col, text_col='processed_text', top_k=top_k
    )
    
    if not result:
        return {}
    
    # Convert to legacy format (keywords only)
    legacy_result = {}
    for topic_idx, topic_data in result.items():
        keywords = topic_data.get('keywords', {})
        # Convert to list of tuples format
        legacy_result[topic_idx] = [(term, score) for term, score in keywords.items()]
    
    return legacy_result

def get_author_stats(paper_classifications, df_field, n_top=5):
    top_papers = {}
    author_topic_stats = {}
    
    for topic in set(p['primary_topic'] for p in paper_classifications):
        topic_papers = [p for p in paper_classifications if p['primary_topic'] == topic]
        
        # Fix: Handle various numpy array cases for dominance_ratio
        for p in topic_papers:
            dominance_ratio = p['dominance_ratio']
            
            if isinstance(dominance_ratio, np.ndarray):
                if dominance_ratio.size == 1:
                    p['dominance_ratio'] = float(dominance_ratio.item())
                else:
                    # Take the first element if it's a multi-element array
                    p['dominance_ratio'] = float(dominance_ratio.flat[0])
            elif hasattr(dominance_ratio, 'item'):
                p['dominance_ratio'] = float(dominance_ratio.item())
            else:
                p['dominance_ratio'] = float(dominance_ratio)
            
            # Also fix primary_score if needed
            primary_score = p['primary_score']
            if isinstance(primary_score, np.ndarray):
                if primary_score.size == 1:
                    p['primary_score'] = float(primary_score.item())
                else:
                    p['primary_score'] = float(primary_score.flat[0])
            elif hasattr(primary_score, 'item'):
                p['primary_score'] = float(primary_score.item())
            else:
                p['primary_score'] = float(primary_score)
        
        topic_papers.sort(key=lambda x: x['dominance_ratio'], reverse=True)
        top_papers[topic] = []
        
        for p in topic_papers[:n_top]:
            paper_idx = p['paper_idx']
            try:
                authors = df_field.iloc[paper_idx]['authors']
                if isinstance(authors, str):
                    try: 
                        authors = ast.literal_eval(authors)
                    except (ValueError, SyntaxError): 
                        authors = []
                if isinstance(authors, list):
                    author_list = []
                    for author in authors:
                        if isinstance(author, dict):
                            author_list.append({'name': author.get('name', 'Unknown'), 'id': author.get('authorId', 'Unknown')})
                else: 
                    author_list = []
                    
                top_papers[topic].append({
                    'paperId': df_field.iloc[paper_idx].get('paperId',''),
                    'title': df_field.iloc[paper_idx].get('title',''),
                    'authors': author_list,
                    'score': float(p['primary_score']),
                    'dominance_ratio': float(p['dominance_ratio'])
                })
            except Exception as e: 
                continue
                
    return top_papers, author_topic_stats


In [12]:
# %%
# %%
# Cell 8: Enhanced Utility Functions for Saving with Topic N-grams

def save_term_frequencies(df, suffix_string, save_dir=SAVE_DIR, max_keywords=50000):
    """Save .json containing keywords, bigrams, trigrams with their counts for later visualization."""
    freq_data = {}
    processed_text = df['processed_text'].fillna('').astype(str)
    
    for n in range(1, 4):
        vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english', max_features=max_keywords)
        matrix = vectorizer.fit_transform(processed_text)
        terms = vectorizer.get_feature_names_out()
        freqs = matrix.sum(axis=0).A1
        
        # Fix: Access the frequency (x[1]) for sorting, not the whole tuple (x)
        freq_dict = {term: int(freq) for term, freq in sorted(zip(terms, freqs), key=lambda x: -x[1])}
        
        if n == 1: 
            freq_data['keywords'] = freq_dict
        elif n == 2: 
            freq_data['bigrams'] = freq_dict
        elif n == 3: 
            freq_data['trigrams'] = freq_dict
    
    out_fn = os.path.join(save_dir, f'term_frequencies_{suffix_string}.json')
    with open(out_fn, 'w', encoding='utf-8') as f:
        json.dump(freq_data, f, indent=2)
    print(f"✓ Saved term frequency summary to {out_fn}")
    return out_fn

def save_author_and_venue_frequencies(df, suffix_string, save_dir=SAVE_DIR):
    if 'authors' in df.columns:
        authors_all = []
        for item in df['authors']:
            if isinstance(item, str) and item.strip():
                try:
                    obj = eval(item) if (item.strip().startswith("[") or item.strip().startswith("{")) else item.strip()
                except Exception:
                    obj = item.strip()
            else:
                obj = item
            if isinstance(obj, list):
                for author in obj:
                    if isinstance(author, dict) and 'name' in author:
                        authors_all.append(author['name'])
                    elif isinstance(author, str):
                        authors_all.append(author)
            elif isinstance(obj, dict) and 'name' in obj:
                authors_all.append(obj['name'])
            elif isinstance(obj, str):
                authors_all.append(obj)
        author_counts = pd.Series(authors_all).value_counts().reset_index()
        author_counts.columns = ['Author', 'Frequency']
        author_fn = os.path.join(save_dir, f"semantic_scholar_{suffix_string}_author_analysis.csv")
        author_counts.to_csv(author_fn, sep=';', encoding='utf-8', index=False)
        print(f"✓ Saved author frequencies: {author_fn}")
    else:
        print("No 'authors' column found in DF: skipping author frequencies.")
        
    if 'venue' in df.columns:
        venue_counts = df['venue'].value_counts().reset_index()
        venue_counts.columns = ['Venue', 'Frequency']
        venue_fn = os.path.join(save_dir, f"semantic_scholar_{suffix_string}_venue_frequencies.csv")
        venue_counts.to_csv(venue_fn, sep=';', encoding='utf-8', index=False)
        print(f"✓ Saved venue frequencies: {venue_fn}")
    else:
        print("No 'venue' column found in DF: skipping venue frequencies.")

def save_topic_analysis_outputs(
    df, lda_model, lda_vectorizer, topic_distributions, topic_keywords, topic_names, topic_ngrams,
    author_stats, top_papers, tfidf_ngrams, suffix_string
):
    topic_metadata = {
        "topics": {int(k): v for k,v in topic_keywords.items()},
        "topic_names": {int(k): v for k,v in topic_names.items()},
        "topic_ngrams": {int(k): v for k,v in topic_ngrams.items()},
    }
    with open(os.path.join(SAVE_DIR, f"topics_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump(topic_metadata, f, indent=2)
    with open(os.path.join(SAVE_DIR, f"topic_names_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k):v for k,v in topic_names.items()}, f, indent=2)
    np.save(os.path.join(SAVE_DIR, f"topic_distributions_{suffix_string}.npy"), topic_distributions)
    import joblib
    joblib.dump(lda_model, os.path.join(SAVE_DIR, f"lda_model_{suffix_string}.joblib"))
    joblib.dump(lda_vectorizer, os.path.join(SAVE_DIR, f"lda_vectorizer_{suffix_string}.joblib"))
    with open(os.path.join(SAVE_DIR, f"top_papers_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k): v for k, v in top_papers.items()}, f, ensure_ascii=False, indent=2, default=str)
    pd.DataFrame.from_dict(author_stats, orient='index').to_csv(
        os.path.join(SAVE_DIR, f"author_stats_{suffix_string}.csv"))
    
    # ENHANCED: Save topic-specific TF-IDF n-grams in the new format
    if topic_ngrams and isinstance(list(topic_ngrams.values())[0], dict):
        # New format with keywords/bigrams/trigrams structure
        with open(os.path.join(SAVE_DIR, f"topic_specific_tfidf_ngrams_{suffix_string}.json"), "w", encoding="utf-8") as f:
            json.dump({int(k): v for k, v in topic_ngrams.items()}, f, indent=2, ensure_ascii=False)
        print(f"✓ Saved enhanced topic-specific TF-IDF n-grams to topic_specific_tfidf_ngrams_{suffix_string}.json")
    else:
        # Legacy format fallback
        with open(os.path.join(SAVE_DIR, f"topic_specific_tfidf_ngrams_{suffix_string}.json"), "w", encoding="utf-8") as f:
            json.dump({int(k):[(term,float(score)) for term,score in v] for k,v in topic_ngrams.items()}, f, indent=2)
        print(f"✓ Saved legacy topic-specific TF-IDF n-grams to topic_specific_tfidf_ngrams_{suffix_string}.json")

def diagnostics_enhanced(df, canonical_scores, variant_scores, canonical_methods, variant_names):
    n_docs, n_canonical = canonical_scores.shape
    n_variants = variant_scores.shape[1]
    
    print("=== ENHANCED DIAGNOSTICS ===")
    print(f"Total documents: {n_docs}")
    print(f"Canonical methods: {n_canonical}")
    print(f"Method variants: {n_variants}")
    print(f"Canonical coverage: {(canonical_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(canonical_scores>0).any(axis=1).mean():.1f}%)")
    print(f"Variant coverage: {(variant_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(variant_scores>0).any(axis=1).mean():.1f}%)")
    
    if 'Primary_Method' in df.columns:
        print("\nMethod distribution (top 10):")
        method_dist = df['Primary_Method'].value_counts().head(10)
        for method, count in method_dist.items():
            if method:  # Skip empty strings
                print(f"  {method}: {count}")
    
    if 'Method_Confidence' in df.columns:
        print("\nConfidence distribution:")
        conf_dist = df['Method_Confidence'].value_counts()
        for conf, count in conf_dist.items():
            if conf:  # Skip empty strings
                print(f"  {conf}: {count}")
    
    print(f"\nCanonical methods sample: {canonical_methods[:5]}")
    print(f"Variant methods sample: {variant_names[:10]}")
    print(f"\nCanonical scores stats: mean={canonical_scores.mean():.3f}, std={canonical_scores.std():.3f}")
    print(f"Variant scores stats: mean={variant_scores.mean():.3f}, std={variant_scores.std():.3f}")


In [13]:
def align_scores_robust(scores, current_features, target_features):
    
    #Enhanced alignment with dimension safety checks and detailed error handling.
    
    if not target_features:
        return np.array([]).reshape(scores.shape[0], 0)
    
    # SAFETY CHECK: Verify dimensions match expectations
    expected_cols = len(current_features)
    actual_cols = scores.shape[1]
    
    if expected_cols != actual_cols:
        print(f"⚠️  DIMENSION MISMATCH DETECTED:")
        print(f"    Expected columns: {expected_cols} (from feature names)")
        print(f"    Actual columns: {actual_cols} (from score matrix)")
        print(f"    Using actual matrix dimensions for safety")
        
        # Use only the features that actually exist in the matrix
        safe_current_features = current_features[:actual_cols]
        print(f"    Truncated feature list: {len(safe_current_features)} features")
    else:
        safe_current_features = current_features
    
    # Initialize aligned matrix with zeros
    aligned_scores = np.zeros((scores.shape[0], len(target_features)))
    current_to_idx = {feat: i for i, feat in enumerate(safe_current_features)}
    
    # Map existing features to aligned positions with bounds checking
    found_features = 0
    skipped_features = 0
    
    for j, feat in enumerate(target_features):
        if feat in current_to_idx:
            source_idx = current_to_idx[feat]
            
            # BOUNDS CHECK: Ensure source index is valid
            if source_idx < scores.shape[1]:
                aligned_scores[:, j] = scores[:, source_idx]
                found_features += 1
            else:
                print(f"⚠️  Skipping feature '{feat}': index {source_idx} >= {scores.shape[1]}")
                skipped_features += 1
    
    print(f"    ✓ Aligned {found_features}/{len(target_features)} features")
    if skipped_features > 0:
        print(f"    ⚠️  Skipped {skipped_features} features due to bounds issues")
    
    return aligned_scores

def normalize_scores(scores):
    #Normalize scores to  range per matrix for fair weighting.[1]
    if scores.max() == 0:
        return scores
    return scores / scores.max()
def enhanced_method_diagnostics(df, scores, method_names, variant_groups):
    
    #Comprehensive diagnostics for method assignment quality and consolidation effectiveness.
    
    print("\n" + "="*80)
    print("COMPREHENSIVE METHOD DETECTION DIAGNOSTICS")
    print("="*80)
    
    # Basic assignment statistics
    n_papers = len(df)
    assigned_papers = (df['Primary_Method'] != '').sum()
    assignment_rate = 100 * assigned_papers / n_papers
    
    print(f"\n📊 ASSIGNMENT OVERVIEW:")
    print(f"  Total papers processed: {n_papers:,}")
    print(f"  Papers with methods assigned: {assigned_papers:,} ({assignment_rate:.1f}%)")
    print(f"  Papers without methods: {n_papers - assigned_papers:,} ({100-assignment_rate:.1f}%)")
    
    # Score distribution analysis
    print(f"\n📈 SCORE DISTRIBUTION ANALYSIS:")
    print(f"  Final score matrix shape: {scores.shape}")
    print(f"  Total canonical methods: {len(method_names)}")
    print(f"  Score range: [{scores.min():.4f}, {scores.max():.4f}]")
    print(f"  Mean score: {scores.mean():.4f}")
    print(f"  Standard deviation: {scores.std():.4f}")
    
    # Score threshold analysis
    thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]
    for threshold in thresholds:
        count = (scores > threshold).sum()
        print(f"  Scores > {threshold}: {count:,} ({100*count/scores.size:.2f}% of all scores)")
    
    # Method popularity and assignment quality
    if assigned_papers > 0:
        print(f"\n🔥 TOP ASSIGNED METHODS:")
        method_counts = df['Primary_Method'].value_counts()
        
        for i, (method, count) in enumerate(method_counts.head(15).items()):
            if method:  # Skip empty strings
                percentage = 100 * count / assigned_papers
                # Check if method was consolidated from variants
                variants = variant_groups.get(method, [method])
                variant_info = f" (from {len(variants)} variants)" if len(variants) > 1 else ""
                print(f"  {i+1:2d}. {method}: {count:,} papers ({percentage:.1f}%){variant_info}")
    
    # Confidence distribution analysis
    if 'Method_Confidence' in df.columns:
        print(f"\n🎯 CONFIDENCE DISTRIBUTION:")
        conf_counts = df['Method_Confidence'].value_counts()
        for conf, count in conf_counts.items():
            percentage = 100 * count / n_papers
            print(f"  {conf}: {count:,} ({percentage:.1f}%)")
    
    # Consolidation effectiveness analysis
    print(f"\n🔧 CONSOLIDATION EFFECTIVENESS:")
    total_variants = sum(len(variants) for variants in variant_groups.values())
    consolidated_groups = len([v for v in variant_groups.values() if len(v) > 1])
    
    print(f"  Total method variants processed: {total_variants:,}")
    print(f"  Final canonical methods: {len(variant_groups):,}")
    print(f"  Groups with multiple variants: {consolidated_groups:,}")
    print(f"  Consolidation ratio: {total_variants/len(variant_groups):.2f}:1")
    
    # Quality assessment and recommendations
    print(f"\n⚠️  QUALITY ASSESSMENT:")
    
    if assignment_rate < 50:
        print(f"  ⚠️  Low assignment rate ({assignment_rate:.1f}%) - consider:")
        print(f"      -  Lowering MIN_ASSIGN_SCORE (current: {MIN_ASSIGN_SCORE})")
        print(f"      -  Reviewing method extraction quality")
        print(f"      -  Checking text preprocessing effectiveness")
    else:
        print(f"  ✅ Good assignment rate ({assignment_rate:.1f}%)")
    
    if scores.max() < 0.1:
        print(f"  ⚠️  Low maximum scores ({scores.max():.4f}) - scoring method may need adjustment")
    else:
        print(f"  ✅ Reasonable maximum scores ({scores.max():.4f})")
    
    zero_score_methods = (scores.max(axis=0) == 0).sum()
    if zero_score_methods > 0:
        zero_percentage = 100 * zero_score_methods / len(method_names)
        print(f"  ⚠️  {zero_score_methods} methods ({zero_percentage:.1f}%) have zero scores across all papers")
        print(f"      Consider reviewing method extraction or scoring parameters")
    else:
        print(f"  ✅ All methods have non-zero scores in at least some papers")
    
    print("\n" + "="*80)
    return {
        'assignment_rate': assignment_rate,
        'total_papers': n_papers,
        'assigned_papers': assigned_papers,
        'score_stats': {
            'min': scores.min(),
            'max': scores.max(),
            'mean': scores.mean(),
            'std': scores.std()
        }
    }


### Topic and author analysis

In [14]:
# %%
# %%
# Cell 9: Enhanced Topic Analysis Workflow with Multi-N-gram Support

NUM_TOPICS = 10
NUM_TOPIC_WORDS = 20
TOPIC_LLM_ITER_INIT = 3
TOPIC_LLM_ITER_MAX = 9
TOPIC_LLM_SIM_THRESH = 0.8
TOPIC_LLM_TEMP = 0.2 #TOPIC_LLM_TOP_P = 0.2 # Adjust only one of these


current_date = datetime.now().strftime("%Y_%m_%d")
keyword_str = '_'.join(extract_keywords_from_filename(filename)) if 'filename' in locals() else ""
suffix_string = f"{current_date}_{keyword_str}"

# Save basic term frequencies and author/venue analysis
save_term_frequencies(df, suffix_string)
save_author_and_venue_frequencies(df, suffix_string)

logger.info("Starting topic modeling workflow...")  
lda_model, lda_vectorizer, topic_distributions, topic_keywords = run_lda_topic_modeling(
    df, num_topics=NUM_TOPICS, num_words=NUM_TOPIC_WORDS)
logger.info("✓ LDA topic modeling completed.")

paper_classifications = assign_papers_to_topics(topic_distributions)
df['Primary_Topic_Index'] = [int(p['primary_topic']) for p in paper_classifications]
df['Primary_Score'] = [p['primary_score'] for p in paper_classifications]
df['Dominance_Ratio'] = [p['dominance_ratio'] for p in paper_classifications]

logger.info("✓ Papers assigned to topics based on LDA distributions.")

# ENHANCED: Use the new multi-n-gram extraction function
logger.info("Extracting enhanced topic-specific TF-IDF n-grams...")
topic_ngrams = get_top_tfidf_ngrams_per_topic_enhanced(
    df, topic_col='Primary_Topic_Index', text_col='processed_text', top_k=15, min_df=2, max_df=0.8
)
logger.info("✓ Extracted enhanced topic-specific TF-IDF n-grams for naming.")

# Generate topic names using enhanced n-grams
topic_names = {}
for topic_idx, keywords in topic_keywords.items():
    lda_ngrams = keywords['top_words'][:NUM_TOPIC_WORDS]
    
    # ENHANCED: Use keywords from the new structure
    topic_data = topic_ngrams.get(topic_idx, {})
    tfidf_keywords = list(topic_data.get('keywords', {}).keys())[:NUM_TOPIC_WORDS]
    
    top_titles = get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10)
    topic_name = topic_name_llm_robust(
        lda_ngrams, tfidf_keywords, top_titles,
        client, model_type, credit_tracker,
        initial_iterations=TOPIC_LLM_ITER_INIT,
        max_iterations=TOPIC_LLM_ITER_MAX,
        similarity_threshold=TOPIC_LLM_SIM_THRESH,
        temp=TOPIC_LLM_TEMP#, top_p=TOPIC_LLM_TOP_P
    )
    topic_names[topic_idx] = topic_name
    logger.info(f"Topic {topic_idx}: {topic_name if topic_name else 'Unnamed'}")

df['Primary_Topic'] = df['Primary_Topic_Index'].map(lambda x: topic_names.get(x, f"Topic_{x}"))
logger.info("✓ Topic naming and assignment completed.")

top_papers, author_stats = get_author_stats(paper_classifications, df, n_top=5)

# ENHANCED: Save with the new n-grams structure
save_topic_analysis_outputs(df, lda_model, lda_vectorizer, topic_distributions, 
                           topic_keywords, topic_names, topic_ngrams, author_stats, 
                           top_papers, topic_ngrams, suffix_string)

print("\nSample topics and names:")
print({k: topic_names[k] for k in list(topic_names)[:5]})

# Show sample of enhanced n-grams structure
if topic_ngrams:
    print(f"\n📊 Enhanced N-grams Structure Sample:")
    sample_topic = list(topic_ngrams.keys())[0]
    sample_data = topic_ngrams[sample_topic]
    topic_name = topic_names.get(sample_topic, f"Topic {sample_topic}")
    print(f"Topic {sample_topic} ({topic_name}):")
    for ngram_type in ['keywords', 'bigrams', 'trigrams']:
        terms = sample_data.get(ngram_type, {})
        if terms:
            top_terms = list(terms.items())[:5]
            print(f"  {ngram_type}: {top_terms}")


✓ Saved term frequency summary to Saved_files_new\term_frequencies_2025_09_30_reliability_resilience_power_systems.json


2025-09-30 10:14:55,076 - INFO - Starting topic modeling workflow...


✓ Saved author frequencies: Saved_files_new\semantic_scholar_2025_09_30_reliability_resilience_power_systems_author_analysis.csv
✓ Saved venue frequencies: Saved_files_new\semantic_scholar_2025_09_30_reliability_resilience_power_systems_venue_frequencies.csv


2025-09-30 10:14:55,384 - INFO - collecting all words and their counts
2025-09-30 10:14:55,384 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-09-30 10:14:56,881 - INFO - PROGRESS: at sentence #10000, processed 1525966 words and 864319 word types
2025-09-30 10:14:58,315 - INFO - PROGRESS: at sentence #20000, processed 3006723 words and 1468449 word types
2025-09-30 10:14:59,863 - INFO - PROGRESS: at sentence #30000, processed 4445145 words and 1949442 word types
2025-09-30 10:15:00,001 - INFO - collected 1990025 token types (unigram + bigrams) from a corpus of 4575260 words and 30917 sentences
2025-09-30 10:15:00,002 - INFO - merged Phrases<1990025 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-09-30 10:15:00,005 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1990025 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 4.62s', 'datetime': '2025-09-30T10:15:00.005445', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 202

🔍 Extracting topic-specific n-grams...
📊 Processing 30917 documents across 10 topics
📈 Processing keywords (1-1 grams)...
  ✅ Created 32526 keywords features
📈 Processing bigrams (2-2 grams)...
  ✅ Created 436411 bigrams features
📈 Processing trigrams (3-3 grams)...


2025-09-30 10:17:50,657 - INFO - ✓ Extracted enhanced topic-specific TF-IDF n-grams for naming.


  ✅ Created 264787 trigrams features

📊 N-gram Extraction Summary:
  Topic 0:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 1:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 2:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 3:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 4:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 5:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 6:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 7:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 8:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms
  Topic 9:
    keywords: 15 terms
    bigrams: 15 terms
    trigrams: 15 terms


2025-09-30 10:17:52,918 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:17:54,663 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:17:55,975 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:17:55,978 - INFO - Topic 0: Wireless Communication Systems


Topic name stabilized after 3 iterations: Wireless Communication Systems


2025-09-30 10:17:57,598 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:17:59,394 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:00,011 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:00,018 - INFO - Topic 1: Power System Reliability


Topic name stabilized after 3 iterations: Power System Reliability


2025-09-30 10:18:03,567 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:06,883 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:09,882 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:09,888 - INFO - Topic 2: Renewable Energy Systems


Topic name stabilized after 3 iterations: Renewable Energy Systems


2025-09-30 10:18:11,020 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:12,365 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:13,152 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:13,156 - INFO - Topic 3: Electric Vehicle Battery Systems


Topic name stabilized after 3 iterations: Electric Vehicle Battery Systems


2025-09-30 10:18:14,138 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:14,814 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:16,310 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:16,317 - INFO - Topic 4: Energy Resilience Management


Topic name stabilized after 3 iterations: Energy Resilience Management


2025-09-30 10:18:17,974 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:18,925 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:19,401 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:19,408 - INFO - Topic 5: Thermal Energy Systems


Topic name stabilized after 3 iterations: Thermal Energy Systems


2025-09-30 10:18:20,412 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:21,298 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:21,976 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:21,992 - INFO - Topic 6: Renewable Energy Optimization


Topic name stabilized after 3 iterations: Renewable Energy Optimization


2025-09-30 10:18:23,305 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:24,415 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:25,450 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:25,458 - INFO - Topic 7: Water Energy Systems


Topic name stabilized after 3 iterations: Water Energy Systems


2025-09-30 10:18:26,504 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:26,926 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:27,403 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:27,444 - INFO - Topic 8: Smart Network Reliability


Topic name stabilized after 3 iterations: Smart Network Reliability


2025-09-30 10:18:28,860 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:29,428 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:30,117 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 10:18:30,122 - INFO - Topic 9: Power System Control
2025-09-30 10:18:30,131 - INFO - ✓ Topic naming and assignment completed.


Topic name stabilized after 3 iterations: Power System Control
✓ Saved enhanced topic-specific TF-IDF n-grams to topic_specific_tfidf_ngrams_2025_09_30_reliability_resilience_power_systems.json

Sample topics and names:
{0: 'Wireless Communication Systems', 1: 'Power System Reliability', 2: 'Renewable Energy Systems', 3: 'Electric Vehicle Battery Systems', 4: 'Energy Resilience Management'}

📊 Enhanced N-grams Structure Sample:
Topic 0 (Wireless Communication Systems):
  keywords: [('communication', 0.05952357402952956), ('channel', 0.05415651052964458), ('network', 0.05195755025204625), ('wireless', 0.043335504838006085), ('user', 0.03508300912112406)]
  bigrams: [('power allocation', 0.014523518301132025), ('wireless communication', 0.012700154332130124), ('multiple access', 0.010141318320924015), ('data rate', 0.009929391975903307), ('base station', 0.009328527408284647)]
  trigrams: [('bit error rate', 0.009837525248469871), ('input multiple output', 0.0097201250798645), ('multiple

### Method analysis

#### old prompt definition

In [None]:
#old prompt versions
"""You are a comprehensive research methodology expert analyzing power systems literature.

TASK: Extract ONLY, but ALL phrases that can be strictly defined as a specific research method, algorithm, and/or techniques from these candidate terms.
GOAL: Find all the qualified phrases in this batch. Be thorough and comprehensive. Generic terms or terms that are describing the objectiv or result of analysis does not qualify

Candidate terms: {candidate_terms}

COMPREHENSIVE EXTRACTION CRITERIA:
✅ INCLUDE specific methods, algorithms and/or techniques in these categories:
1. Named algorithms: genetic algorithm, differential evolution...
2. Mathematical methods: monte carlo simulation, linear programming...
3. Machine learning: neural network, support vector machine...
4. Analysis techniques: fault tree analysis, load flow analysis...
5. Power system analysis methods: unit commitment, optimal power flow...
6. Power system indicator terms: LOLE, EENS, PTDF...

Distinguish between actuall analysis-method, algorithms and/or technique phrases and objectives of an anlysis.
Discard typicall generic phrases or phrases that describe the objective(goal) rather than the method, algoritm and/or technique, e.g.  'Risk assesment', 'Capacity utilization', 'Control Strategy' 
Keep method, algorithm and/or technique phrases used in connection to such objectives

TARGET: Extract only specific research methods, algorithm and/or techniques that are qualified by the above criteriasfrom this batch.
Review the list before finalizing to make sure it only contains qualified terms are 
Return as Python list with comprehensive coverage:"""

"You are a comprehensive research methodology expert analyzing power systems literature.\n\nTASK: Extract ONLY, but ALL phrases that can be strictly defined as a specific research method, algorithm, and/or techniques from these candidate terms.\nGOAL: Find all the qualified phrases in this batch. Be thorough and comprehensive. Generic terms or terms that are describing the objectiv or result of analysis does not qualify\n\nCandidate terms: {candidate_terms}\n\nCOMPREHENSIVE EXTRACTION CRITERIA:\n✅ INCLUDE specific methods, algorithms and/or techniques in these categories:\n1. Named algorithms: genetic algorithm, differential evolution...\n2. Mathematical methods: monte carlo simulation, linear programming...\n3. Machine learning: neural network, support vector machine...\n4. Analysis techniques: fault tree analysis, load flow analysis...\n5. Power system analysis methods: unit commitment, optimal power flow...\n6. Power system indicator terms: LOLE, EENS, PTDF...\n\nDistinguish between

#### Prompt definition

In [20]:
# Define your extraction prompt
extraction_prompt = improved_extraction_prompt = """You are a research methodology extraction expert specializing in power systems literature history, seminal work and recend advances.

TASK: From the candidate terms below, extract ALL terms that represent specific, named research methodologies, algorithms, or techniques.

Candidate terms: {candidate_terms}

QUALIFICATION CRITERIA - A term qualifies if it is:
✅ A NAMED algorithm, method, or technique with specific definition
✅ A SPECIFIC mathematical/computational approach 
✅ A WELL-DEFINED analysis technique with established procedure
✅ A RECOGNIZED power systems methodology

INCLUDE these categories:
1. Named algorithms: "genetic algorithm", "particle swarm optimization", "differential evolution"
2. Mathematical methods: "monte carlo simulation", "linear programming", "quadratic programming" 
3. Machine learning: "neural network", "support vector machine", "random forest", "k-means clustering"
4. Analysis techniques: "fault tree analysis", "load flow analysis", "modal analysis", "sensitivity analysis"
5. Power system methods: "unit commitment", "optimal power flow", "economic dispatch", "state estimation"
6. Power system indicators: "LOLE", "EENS", "PTDF", "LODF" (specific technical acronyms)

EXCLUDE these patterns:
❌ Generic nouns: "method", "analysis", "approach", "technique", "framework", "system", "procedure"
❌ Generic combinations: "energy analysis", "power method", "system approach", "network optimization"
❌ Process descriptions: "optimization process", "analysis procedure", "design methodology" 
❌ Objective descriptions: "performance improvement", "efficiency enhancement", "cost reduction"
❌ Research activities: "literature review", "case study", "experimental analysis", "field research"

DECISION PROCESS:
1. Scan each candidate term
2. Ask: "Is this a specific, named methodology that a researcher could implement?"
3. If yes and fits categories above → INCLUDE
4. If generic or describes outcome/process → EXCLUDE

EXAMPLES:
✅ INCLUDE: "monte carlo simulation" (specific technique), "unit commitment" (specific power systems method)
❌ EXCLUDE: "energy analysis" (generic), "optimization approach" (too broad), "case study" (research activity)

Extract ALL qualifying terms from the candidate list. Be comprehensive within the qualification criteria, when in doubt, include the term.

Return as a clean Python list: ["term1", "term2", "term3", ...]"""


# Define your grouping prompt  
grouping_prompt = """You are an expert in power systems analysis methods. Your task is to group variants of the same core technique.

Methods to analyze: {method_list}

STRICT GROUPING RULES:

✅ GROUP these cases (same core method):
- Abbreviation + full form: e.g. opf" with "optimal power flow" and "copt" with "capacity outage probability table"
- Method + descriptor: "neural network" with "artificial neural network" 
- Acronym + full name: e.g. "anfis" with "adaptive neuro fuzzy inference system"
- Slight variations: "monte carlo" with "monte carlo simulation"
- Same indices: "saifi" with "system average interruption frequency index"

❌ NEVER GROUP these cases (different methods):
- Different algorithm types: "linear programming" ≠ "nonlinear programming"
- Different orders/levels: "first order" ≠ "second order reliability method"  
- Different shift factors: "generation shift factors" ≠ "injection shift factors"
- Different optimization algorithms: "genetic algorithm" ≠ "particle swarm"
- Different neural networks: "lstm" ≠ "gru"

CANONICAL SELECTION:
- Always use the FULL descriptive name as the canonical form
- Never use abbreviations as canonical forms
- Example: Use "optimal power flow" not "opf"

Return a Python dictionary where:
- Keys are FULL canonical method names (never abbreviations)
- Values are lists of ALL variants including the canonical form itself

Example format:
{{
  "optimal power flow": ["optimal power flow", "opf", "power flow optimization"],
  "monte carlo simulation": ["monte carlo simulation", "monte carlo", "monte-carlo"]
}}
"""

In [21]:
# %%
# %%
# Cell 10A: Method Extraction and Grouping Phase (LLM-based)
#Runs LLM calls for method extraction and grouping



# =============================================================================
# CONFIGURATION PARAMETERS - Adjust these for optimal method detection
# =============================================================================
MAX_FEATURES = 30000                    # Maximum features for candidate term extraction
BATCH_SIZE = 5000                       # Batch size for LLM processing
METHOD_LLM_N_RUNS = 3                   # Number of LLM runs for method extraction
VARIANT_GROUP_BATCH_SIZE = 5000         # Batch size for method variant grouping
TEMP = 0.2 #TOP_P = 0.2 #Adjust only one of these, not both....
logger.info("=== Starting Method Extraction and Grouping Phase ===")
save_method_phrases=False # initalizing setpoint for saving if new file is created
save_method_groups= False # initalizing setpoint for saving if new file is created

# =============================================================================
# STEP 1: LOAD OR EXTRACT METHOD PHRASES FROM CORPUS
# =============================================================================
logger.info("Step 1: Loading or extracting method phrases...")

# Try to load existing method phrases from previous runs
try:
    method_phrases, method_counts = load_method_phrases_from_csv(filename="extracted_method_phrases.csv")
except (FileNotFoundError, TypeError):
    method_phrases, method_counts = None, None

# If no existing phrases found or too few, extract new ones using LLM
if (method_phrases is None) or (len(method_phrases) < 3):
    logger.info("  1a: Extracting candidate terms from processed text...")
    
    # Extract candidate n-grams (1-4 grams) from the corpus using CountVectorizer
    candidate_terms = extract_candidate_terms(df, text_col='processed_text', max_features=MAX_FEATURES)
    logger.info(f"  ✓ Extracted {len(candidate_terms)} candidate terms")
    print(f"  Sample candidate terms: {candidate_terms[:10]}")
    
    logger.info("  1b: Using LLM to identify research methods from candidate terms...")
    
    # Use LLM to intelligently identify research methods from candidate terms
    method_phrases, method_counts = get_method_phrases_enhanced(
        candidate_terms,
        client,
        model_type,
        credit_tracker,
        prompt=extraction_prompt,
        n_runs=METHOD_LLM_N_RUNS,
        batch_size=BATCH_SIZE,
        #top_p=TOP_P, % use only one of TOP_P or TEMP
        temp=TEMP
    )
    method_phrases = filter_generic_phrases(method_phrases)
    save_method_phrases = True # enable saving of new file
    #save_method_phrases_to_csv(method_phrases, method_counts)
else:
    logger.info(f"  ✓ Loaded {len(method_phrases)} method phrases from existing CSV")

# Validate that method extraction was successful
if not method_phrases:
    logger.error("No method phrases extracted! Check your LLM configuration and prompts.")
    raise RuntimeError("Method extraction failed - no phrases found")

logger.info(f"✓ Method phrase extraction complete: {len(method_phrases)} phrases")
print(f"  Sample methods: {method_phrases[:10]}")
# Save method phrases (can be edited)

# =============================================================================
# STEP 2: ENHANCED METHOD VARIANT CONSOLIDATION
# =============================================================================
logger.info("Step 2: Building enhanced method variant groups with consolidation...")

# check if file exist
variant_groups_path= os.path.join(SAVE_DIR,"method_variant_groups.json") 
if os.path.exists(variant_groups_path):
    logger.info(f"Found existing variant groups file: {variant_groups_path}")
    logger.info(" Loading existing groups instead of regenerating...")

    with open(variant_groups_path, 'r', encoding='utf-8') as f:
        canonical_to_variants=json.load(f)
    variant_groups=canonical_to_variants

    logger.info(f"Loaded {len(canonical_to_variants)} existing method groups")
    logger.info(f"Sample method groups{list(canonical_to_variants.items())[:10]}")

else: 
    logger.info(" No existing variant groups found. Running LLM grouping")
    # Use enhanced LLM-based variant grouping to consolidate similar methods
    variant_groups = build_method_variant_groups_enhanced(
    method_phrases, 
    client, 
    model_type, 
    credit_tracker, 
    prompt=grouping_prompt, 
    #top_p=TOP_P,
    temp=TEMP,
    batch_size=VARIANT_GROUP_BATCH_SIZE
    ) if method_phrases else {}
    save_method_groups = True
    # Create fallback mapping if LLM-based grouping fails completely
    if not variant_groups and method_phrases:
        logger.info("  LLM grouping failed completely - using enhanced aggressive fallback grouping...")
        variant_groups = aggressive_fallback_grouping(method_phrases, similarity_threshold=0.75)
        logger.info(f"  ✓ Aggressive fallback created {len(variant_groups)} groups from {len(method_phrases)} methods")



# Create bidirectional mappings for efficient lookup during scoring
variant_to_canonical, canonical_to_variants = create_variant_mapping(variant_groups)
logger.info(f"✓ Created {len(canonical_to_variants)} canonical methods with {len(variant_to_canonical)} total variants")

# =============================================================================
# SAVE FOR MANUAL EDITING
# =============================================================================
logger.info("Saving method phrases and groups for manual review and editing...")

if save_method_phrases: # Save method phrases (can be edited)
    save_method_phrases_to_csv(method_phrases, method_counts, filename="extracted_method_phrases.csv")
if save_method_groups: # Save variant groups (can be edited)
    
    with open(os.path.join(SAVE_DIR, "method_variant_groups.json"), 'w') as f:
        json.dump(canonical_to_variants, f, indent=2, ensure_ascii=False)

# Display consolidation results
print(f"\n📊 Method Extraction and Grouping Results:")
print(f"  Original methods: {len(method_phrases) if method_phrases else 0}")
print(f"  Consolidated methods: {len(canonical_to_variants)}")
reduction = len(method_phrases) - len(canonical_to_variants) if method_phrases else 0
print(f"  Reduction: {reduction} methods ({100*reduction/len(method_phrases):.1f}% reduction)" if method_phrases and len(method_phrases) > 0 else "")

print(f"\n⏸️  EDITING CHECKPOINT:")
print(f"  📝 Edit extracted_method_phrases.csv to add/remove/rename methods")
print(f"  📝 Edit method_variant_groups.json to adjust groupings")
print(f"  ✅ Files saved to: {SAVE_DIR}")
print(f"  ➡️  Run Cell 10B when editing is complete")

logger.info("✅ Method extraction and grouping phase completed successfully!")
logger.info(f"💰 Credit usage so far: {credit_tracker.get_stats()}")


2025-09-30 13:38:25,213 - INFO - === Starting Method Extraction and Grouping Phase ===
2025-09-30 13:38:25,213 - INFO - Step 1: Loading or extracting method phrases...
2025-09-30 13:38:25,218 - INFO -   ✓ Loaded 386 method phrases from existing CSV
2025-09-30 13:38:25,218 - INFO - ✓ Method phrase extraction complete: 386 phrases
2025-09-30 13:38:25,219 - INFO - Step 2: Building enhanced method variant groups with consolidation...
2025-09-30 13:38:25,220 - INFO -  No existing variant groups found. Running LLM grouping


  Sample methods: ['deep learning approach', 'genetic programming', 'wavelet transform dwt', 'simulated annealing', 'multi-fidelity task', 'two-stage stochastic', 'hybrid system modeling', 'multi-agent', 'bayesian optimization', 'multi-fidelity activity']


2025-09-30 13:39:22,432 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-30 13:39:22,577 - INFO - LLM raw response: ChatCompletion(id='chatcmpl-CLSyUk0mjQIrH5v2vd2o19kQLlNvi', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='```python\n{\n    "deep learning approach": ["deep learning approach", "deep learning algorithm", "deep learning model", "deep learning-based", "deep learning"],\n    "genetic programming": ["genetic programming", "genetic algorithm", "genetic algorithm based", "non-dominated sorting genetic algorithm", "non-dominated sorting genetic"],\n    "wavelet transform dwt": ["wavelet transform dwt", "discrete wavelet transform dwt", "discrete wavelet transform"],\n    "simulated annealing": ["simulated annealing"],\n    "multi-fidelity task": ["multi-fidelity task", "multi-fidelity activity", "multi-fidelity investigation", "multi-fidelity application", "multi-fidelity ana

✓ Batch 1 LLM response received: 14146 characters
⚠️ No dictionary found in LLM response for batch 1


2025-09-30 13:39:24,186 - INFO - Created 334 method variant groups from 386 original methods
2025-09-30 13:39:24,187 - INFO - ✓ Created 334 canonical methods with 386 total variants
2025-09-30 13:39:24,187 - INFO - Saving method phrases and groups for manual review and editing...
2025-09-30 13:39:24,191 - INFO - ✅ Method extraction and grouping phase completed successfully!
2025-09-30 13:39:24,193 - INFO - 💰 Credit usage so far: {'total_tokens': 139138, 'total_cost': 0.0209}


  Post-processing preserved 334 groups

📊 Method Extraction and Grouping Results:
  Original methods: 386
  Consolidated methods: 334
  Reduction: 52 methods (13.5% reduction)

⏸️  EDITING CHECKPOINT:
  📝 Edit extracted_method_phrases.csv to add/remove/rename methods
  📝 Edit method_variant_groups.json to adjust groupings
  ✅ Files saved to: Saved_files_new
  ➡️  Run Cell 10B when editing is complete


#### added function to compare extracted terms and grouped terms after editing

In [46]:
#one-time operation to compare groups and terms.clear
#compare the terms in the json file "method_variant_groups" with the terms in the csv file "extract_method_phrases"
# list all the terms that are not found in extracted_method_phrases and add them

def compare_variant_groups_with_extracted_terms(variant_groups_file, extracted_phrases_file, output_file=None):
    """
    Compare method variant groups with extracted method phrases to find missing terms
    
    Parameters:
    - variant_groups_file: path to JSON file with method variant groups
    - extracted_phrases_file: path to CSV file with extracted method phrases  
    - output_file: optional path to save detailed analysis CSV
    
    Returns:
    - dictionary with analysis results
    """
    
    # Load data
    print("Loading data...")
    with open(variant_groups_file, 'r', encoding='utf-8') as f:
        method_variant_groups = json.load(f)
    
    df_extracted = pd.read_csv(extracted_phrases_file)
    
    # Normalize terms for comparison (lowercase, strip whitespace)
    def normalize_term(term):
        if pd.isna(term):
            return ""
        return str(term).strip().lower()
    
    # Create sets for comparison
    extracted_terms = set(df_extracted['Method Phrase'].apply(normalize_term))
    extracted_terms.discard("")  # Remove empty strings
    
    # Get all terms from variant groups
    variant_terms = set()
    term_to_group_mapping = {}  # Track which group each term belongs to
    
    for group_key, variants in method_variant_groups.items():
        # Add the group key itself
        normalized_key = normalize_term(group_key)
        if normalized_key:
            variant_terms.add(normalized_key)
            term_to_group_mapping[normalized_key] = group_key
        
        # Add all variants in the group
        for variant in variants:
            normalized_variant = normalize_term(variant)
            if normalized_variant:
                variant_terms.add(normalized_variant)
                term_to_group_mapping[normalized_variant] = group_key
    
    # Find differences
    missing_from_extracted = sorted(variant_terms - extracted_terms)
    missing_from_variants = sorted(extracted_terms - variant_terms)
    common_terms = sorted(extracted_terms & variant_terms)
    
    # Summary statistics
    results = {
        'total_variant_groups': len(method_variant_groups),
        'total_unique_variant_terms': len(variant_terms),
        'total_extracted_terms': len(extracted_terms),
        'common_terms_count': len(common_terms),
        'missing_from_extracted_count': len(missing_from_extracted),
        'missing_from_variants_count': len(missing_from_variants),
        'missing_from_extracted': missing_from_extracted,
        'missing_from_variants': missing_from_variants,
        'common_terms': common_terms,
        'coverage_percentage': (len(common_terms) / len(variant_terms) * 100) if len(variant_terms) > 0 else 0
    }
    
    # Print results
    print(f"\n{'='*60}")
    print(f"COMPARISON RESULTS")
    print(f"{'='*60}")
    print(f"Total variant groups: {results['total_variant_groups']}")
    print(f"Total unique terms in variant groups: {results['total_unique_variant_terms']}")
    print(f"Total extracted terms: {results['total_extracted_terms']}")
    print(f"Common terms: {results['common_terms_count']}")
    print(f"Terms in variants but missing from extracted: {results['missing_from_extracted_count']}")
    print(f"Terms in extracted but missing from variants: {results['missing_from_variants_count']}")
    print(f"Coverage of variant terms in extracted: {results['coverage_percentage']:.1f}%")
    
    return results


def check_duplicates_in_json_groups(json_file):
    """
    CORRECTED: Don't count group name as duplicate when it appears in its own variants
    """
    
    with open(json_file, 'r', encoding='utf-8') as f:
        method_variant_groups = json.load(f)
    
    def normalize_term(term):
        return str(term).strip().lower() if not pd.isna(term) else ""
    
    # Check for duplicate group keys
    normalized_keys = [normalize_term(key) for key in method_variant_groups.keys()]
    key_counts = Counter(normalized_keys)
    duplicate_keys = {k: v for k, v in key_counts.items() if v > 1}
    
    # Check for terms appearing in multiple groups
    # CORRECTED: Don't count group name appearing in its own variants
    term_to_groups = defaultdict(set)
    
    for group_key, variants in method_variant_groups.items():
        normalized_key = normalize_term(group_key)
        normalized_variants = [normalize_term(v) for v in variants if normalize_term(v)]
        
        # Add all variants to the mapping
        for variant in variants:
            normalized_variant = normalize_term(variant)
            if normalized_variant:
                term_to_groups[normalized_variant].add(group_key)
        
        # Only add the group key if it's NOT already in its variants
        if normalized_key and normalized_key not in normalized_variants:
            term_to_groups[normalized_key].add(group_key)
    
    # Find terms that appear in multiple groups
    cross_group_duplicates = {}
    for term, groups in term_to_groups.items():
        if len(groups) > 1:
            cross_group_duplicates[term] = list(groups)
    
    return {
        'duplicate_group_keys': duplicate_keys,
        'cross_group_duplicates': cross_group_duplicates
    }

def comprehensive_duplicate_check(json_file, csv_file, output_file=None, clean_csv=False):
    """Complete duplicate analysis with corrected logic"""
    
    json_results = check_duplicates_in_json_groups(json_file)
    csv_results = check_duplicates_in_csv_terms(csv_file)  # Previous function unchanged
    
    print("SUMMARY (CORRECTED):")
    print(f"JSON - Duplicate group keys: {len(json_results['duplicate_group_keys'])}")
    print(f"JSON - Terms in multiple groups: {len(json_results['cross_group_duplicates'])}")
    print(f"CSV - Duplicate terms: {len(csv_results['duplicate_terms'])}")
    
    return {'json_results': json_results, 'csv_results': csv_results}




In [49]:
groups='Saved_files_new\method_variant_groups.json'
phrases='Saved_files_new\extracted_method_phrases.csv'
results = compare_variant_groups_with_extracted_terms(
        groups, 
        phrases
    )
    
    # Print missing terms
print(f"\nMISSING TERMS ({len(results['missing_from_variants'])}):")
for i, term in enumerate(results['missing_from_extracted']):
       print(f"{i+1:3d}. {term}")
# controll for duplicates
# Basic check: 
comprehensive_duplicate_check(groups, phrases)
# With cleaning: comprehensive_duplicate_check('groups.json', 'terms.csv', clean_csv=True)


Loading data...

COMPARISON RESULTS
Total variant groups: 300
Total unique terms in variant groups: 393
Total extracted terms: 393
Common terms: 393
Terms in variants but missing from extracted: 0
Terms in extracted but missing from variants: 0
Coverage of variant terms in extracted: 100.0%

MISSING TERMS (0):
SUMMARY (CORRECTED):
JSON - Duplicate group keys: 0
JSON - Terms in multiple groups: 0
CSV - Duplicate terms: 0


{'json_results': {'duplicate_group_keys': {}, 'cross_group_duplicates': {}},
 'csv_results': {'total_terms': 393,
  'duplicate_terms': {},
  'duplicate_details': {}}}

#### method detection and assignment

In [43]:
# %%
# %%
# Cell 10B: Scoring and Assignment Phase (Uses Edited Terms)

# =============================================================================
# CONFIGURATION PARAMETERS FOR SCORING
# =============================================================================
TFIDF_WEIGHT = 0.65                      # Weight for TF-IDF scoring in final combination
LDA_WEIGHT = 0.05                        # Weight for LDA scoring in final combination  
COMPOUND_WEIGHT = 0.30                   # Weight for compound scoring in final combination
TOP_METHODS_PER_PAPER = 10               # Number of top methods to assign per paper
MIN_ASSIGN_SCORE = 0.01                  # Minimum score threshold for method assignment

logger.info("=== Starting Scoring and Assignment Phase ===")

# =============================================================================
# LOAD EDITED METHODS AND GROUPS
# =============================================================================
logger.info("Loading method phrases and groups (potentially edited)...")

# Load method phrases (potentially edited by user)
method_phrases, method_counts = load_method_phrases_from_csv(filename="extracted_method_phrases.csv")
logger.info(f"✓ Loaded {len(method_phrases)} method phrases")

# Load variant groups (potentially edited by user)
with open(os.path.join(SAVE_DIR, "method_variant_groups.json"), 'r') as f:
    canonical_to_variants = json.load(f)
logger.info(f"✓ Loaded {len(canonical_to_variants)} method groups")

# Recreate mappings
variant_to_canonical, _ = create_variant_mapping(canonical_to_variants)

# =============================================================================
# COMPUTE MULTIPLE SCORING MATRICES FOR ROBUST METHOD DETECTION
# =============================================================================
logger.info("Computing enhanced scoring matrices using multiple approaches...")

# Convert DataFrame text to list for processing
processed_texts = df['processed_text'].fillna('').tolist()

# 3a: TF-IDF Scoring - Captures term frequency and document importance
logger.info("  Computing TF-IDF scores for method variants...")

#standard (non-boolean) tfidf implementation
#tfidf_scores, tfidf_feature_names = compute_enhanced_tfidf_scores(
#    processed_texts, canonical_to_variants
#)
tfidf_scores, tfidf_feature_names = compute_enhanced_tfidf_scores_adapted(
    processed_texts, canonical_to_variants, max_df=0.98,min_df=1
)

logger.info(f"  ✓ TF-IDF: {tfidf_scores.shape} with {len(tfidf_feature_names)} features")

# 3b: LDA Scoring - Captures topic-based method associations
logger.info("  Computing LDA scores for method variants...")  
method_vocab = list(canonical_to_variants.keys())
lda_scores, lda_feature_names = compute_enhanced_lda_scores(
    processed_texts, canonical_to_variants, n_topics=len(method_vocab)
)
logger.info(f"  ✓ LDA: {lda_scores.shape} with {len(lda_feature_names)} features")

# 3c: Compound Scoring - Captures exact phrase matches and partial matches
logger.info("  Computing compound scores for method variants...")
compound_scores, compound_feature_names = compute_enhanced_compound_scores(
    df, canonical_to_variants
)
logger.info(f"  ✓ Compound: {compound_scores.shape} with {len(compound_feature_names)} features")

# =============================================================================
# FEATURE ALIGNMENT AND HARMONIZATION
# =============================================================================
logger.info("Aligning and harmonizing features across scoring methods...")

# Create union of all features to preserve maximum method coverage
all_features = set(tfidf_feature_names) | set(lda_feature_names) | set(compound_feature_names)
all_features = sorted(list(all_features))  # Sort for consistency

logger.info(f"  Feature alignment statistics:")
logger.info(f"    Total unique features: {len(all_features)}")
logger.info(f"    TF-IDF features: {len(tfidf_feature_names)}")
logger.info(f"    LDA features: {len(lda_feature_names)}")  
logger.info(f"    Compound features: {len(compound_feature_names)}")

# Use your existing align_scores_robust function
tfidf_aligned = align_scores_robust(tfidf_scores, tfidf_feature_names, all_features)
lda_aligned = align_scores_robust(lda_scores, lda_feature_names, all_features)
compound_aligned = align_scores_robust(compound_scores, compound_feature_names, all_features)

logger.info(f"✓ Feature alignment complete: {tfidf_aligned.shape}")

# =============================================================================
# SCORE NORMALIZATION AND CONSOLIDATION
# =============================================================================
logger.info("Normalizing scores and applying variant consolidation...")



# Normalize each scoring matrix to ensure fair contribution to final scores
tfidf_normalized = normalize_scores(tfidf_aligned)
lda_normalized = normalize_scores(lda_aligned)
compound_normalized = normalize_scores(compound_aligned)

# Combine normalized scores using weighted average
combined_scores = (
    TFIDF_WEIGHT * tfidf_normalized + 
    LDA_WEIGHT * lda_normalized + 
    COMPOUND_WEIGHT * compound_normalized
)

logger.info(f"✓ Score combination complete: {combined_scores.shape}")
logger.info(f"  Combined score range: [{combined_scores.min():.4f}, {combined_scores.max():.4f}]")

# Apply variant consolidation to prevent double-counting
if variant_to_canonical:
    logger.info("  Applying variant score consolidation to prevent double-counting...")
    
    # Consolidate variant scores using maximum (not sum) to avoid inflating scores
    final_scores, canonical_methods = consolidate_variant_scores(
        combined_scores, all_features, variant_to_canonical
    )
    logger.info(f"  ✓ Consolidated {len(all_features)} methods to {len(canonical_methods)} canonical methods")
else:
    # No consolidation needed - use combined scores as-is
    final_scores = combined_scores
    canonical_methods = all_features
    logger.info("  No variant consolidation applied (no variant mappings found)")

logger.info(f"✓ Final consolidated scores: {final_scores.shape}")
logger.info(f"  Final score range: [{final_scores.min():.4f}, {final_scores.max():.4f}]")

# =============================================================================
# METHOD ASSIGNMENT TO PAPERS
# =============================================================================
logger.info("Assigning methods to papers using consolidated scores...")

# Assign top methods to each paper using the consolidated scores
df = assign_methods_improved(
    df, final_scores, canonical_methods, 
    top_n=TOP_METHODS_PER_PAPER, 
    min_score=MIN_ASSIGN_SCORE
)

# Additional diagnostic: Verify no double-counting occurred
assigned_methods = df[df['Primary_Method'] != '']['Primary_Method'].tolist()
method_assignment_counts = pd.Series(assigned_methods).value_counts()

print(f"\n🔍 Final Assignment Verification:")
print(f"  Papers assigned methods: {len(assigned_methods)}")
print(f"  Unique methods assigned: {len(method_assignment_counts)}")
print(f"  Top assigned methods:")

for method, count in method_assignment_counts.head(10).items():
    # Check if this method has variants that were consolidated
    variants = canonical_to_variants.get(method, [method])
    if len(variants) > 1:
        print(f"    {method}: {count} papers (consolidated from: {variants})")
    else:
        print(f"    {method}: {count} papers")

# =============================================================================
# SAVE RESULTS AND METADATA
# =============================================================================
logger.info("Saving results and metadata...")

# Save method variant mappings for future reference and transparency
with open(os.path.join(SAVE_DIR, f"method_variant_groups_{suffix_string}.json"), 'w') as f:
    json.dump(canonical_to_variants, f, indent=2)

# Save consolidated score matrix for analysis and debugging
pd.DataFrame(final_scores, columns=canonical_methods).to_csv(
    os.path.join(SAVE_DIR, f"consolidated_method_scores_{suffix_string}.csv")
)

# Save final enhanced dataframe with method assignments
enhanced_analysis_filename = f"enhanced_method_analysis_{suffix_string}.csv"
df.to_csv(os.path.join(SAVE_DIR, enhanced_analysis_filename), index=False)

logger.info(f"✓ Results saved:")
logger.info(f"  Enhanced analysis: {enhanced_analysis_filename}")
logger.info(f"  Method variant groups: method_variant_groups_{suffix_string}.json")
logger.info(f"  Consolidated scores: consolidated_method_scores_{suffix_string}.csv")

# Run diagnostics
diagnostic_results = enhanced_method_diagnostics(df, final_scores, canonical_methods, canonical_to_variants)

print(f"\n✅ Enhanced Method Detection Pipeline Completed Successfully!")
print(f"📁 All results saved to: {SAVE_DIR}")
print(f"📊 Assignment Rate: {diagnostic_results['assignment_rate']:.1f}%")
print(f"🔧 Methods Consolidated: {len(method_phrases) if method_phrases else 0} → {len(canonical_methods)}")

logger.info("Enhanced method detection pipeline with consolidation completed successfully!")
logger.info(f"Credit usage: {credit_tracker.get_stats()}")


2025-09-30 15:41:45,367 - INFO - === Starting Scoring and Assignment Phase ===
2025-09-30 15:41:45,368 - INFO - Loading method phrases and groups (potentially edited)...
2025-09-30 15:41:45,375 - INFO - ✓ Loaded 402 method phrases
2025-09-30 15:41:45,378 - INFO - ✓ Loaded 305 method groups
2025-09-30 15:41:45,379 - INFO - Computing enhanced scoring matrices using multiple approaches...
2025-09-30 15:41:45,390 - INFO -   Computing TF-IDF scores for method variants...


Found 364 variants that exist in corpus out of 399 total


2025-09-30 15:43:14,757 - INFO -   ✓ TF-IDF: (30917, 364) with 364 features
2025-09-30 15:43:14,758 - INFO -   Computing LDA scores for method variants...
2025-09-30 15:44:22,737 - INFO -   ✓ LDA: (30917, 305) with 399 features
2025-09-30 15:44:22,738 - INFO -   Computing compound scores for method variants...
2025-09-30 15:58:02,546 - INFO -   ✓ Compound: (30917, 399) with 399 features
2025-09-30 15:58:02,547 - INFO - Aligning and harmonizing features across scoring methods...
2025-09-30 15:58:02,548 - INFO -   Feature alignment statistics:
2025-09-30 15:58:02,549 - INFO -     Total unique features: 399
2025-09-30 15:58:02,550 - INFO -     TF-IDF features: 364
2025-09-30 15:58:02,550 - INFO -     LDA features: 399
2025-09-30 15:58:02,550 - INFO -     Compound features: 399


    ✓ Aligned 364/399 features
⚠️  DIMENSION MISMATCH DETECTED:
    Expected columns: 399 (from feature names)
    Actual columns: 305 (from score matrix)
    Using actual matrix dimensions for safety
    Truncated feature list: 305 features
    ✓ Aligned 305/399 features


2025-09-30 15:58:03,172 - INFO - ✓ Feature alignment complete: (30917, 399)
2025-09-30 15:58:03,173 - INFO - Normalizing scores and applying variant consolidation...


    ✓ Aligned 399/399 features


2025-09-30 15:58:03,632 - INFO - ✓ Score combination complete: (30917, 399)
2025-09-30 15:58:03,649 - INFO -   Combined score range: [0.0000, 0.9502]
2025-09-30 15:58:03,651 - INFO -   Applying variant score consolidation to prevent double-counting...
2025-09-30 15:58:03,891 - INFO -   ✓ Consolidated 399 methods to 305 canonical methods
2025-09-30 15:58:03,892 - INFO - ✓ Final consolidated scores: (30917, 305)
2025-09-30 15:58:03,905 - INFO -   Final score range: [0.0000, 0.9502]
2025-09-30 15:58:03,906 - INFO - Assigning methods to papers using consolidated scores...
2025-09-30 15:58:32,310 - INFO -   Assigned methods to 18465/30917 papers (59.7%)
2025-09-30 15:58:32,445 - INFO - Saving results and metadata...



🔍 Final Assignment Verification:
  Papers assigned methods: 18465
  Unique methods assigned: 293
  Top assigned methods:
    energy storage system esss: 1281 papers (consolidated from: ['energy storage system', 'distributed energy storage system', 'ess'])
    capacity factor: 640 papers
    distributed generation: 570 papers
    voltage control: 450 papers
    neural network: 389 papers (consolidated from: ['neural network model', 'back propagation', 'neural network based', 'neural network anns', 'artificial neural network', 'anns', 'neural network ann', 'neural network', 'neural network algorithm'])
    power system fault: 384 papers
    voltage stability: 326 papers (consolidated from: ['voltage stability', 'voltage stability index'])
    demand response: 307 papers (consolidated from: ['demand response', 'demand response dr'])
    renewable energy paper: 286 papers
    particle swarm optimization: 285 papers (consolidated from: ['particle swarm optimization pso', 'pso', 'particle s

2025-09-30 15:58:51,096 - INFO - ✓ Results saved:
2025-09-30 15:58:51,098 - INFO -   Enhanced analysis: enhanced_method_analysis_2025_09_30_reliability_resilience_power_systems.csv
2025-09-30 15:58:51,098 - INFO -   Method variant groups: method_variant_groups_2025_09_30_reliability_resilience_power_systems.json
2025-09-30 15:58:51,100 - INFO -   Consolidated scores: consolidated_method_scores_2025_09_30_reliability_resilience_power_systems.csv



COMPREHENSIVE METHOD DETECTION DIAGNOSTICS

📊 ASSIGNMENT OVERVIEW:
  Total papers processed: 30,917
  Papers with methods assigned: 18,465 (59.7%)
  Papers without methods: 12,452 (40.3%)

📈 SCORE DISTRIBUTION ANALYSIS:
  Final score matrix shape: (30917, 305)
  Total canonical methods: 305
  Score range: [0.0000, 0.9502]
  Mean score: 0.0025
  Standard deviation: 0.0386
  Scores > 0.001: 76,943 (0.82% of all scores)
  Scores > 0.005: 66,931 (0.71% of all scores)
  Scores > 0.01: 63,197 (0.67% of all scores)
  Scores > 0.05: 47,557 (0.50% of all scores)
  Scores > 0.1: 47,557 (0.50% of all scores)

🔥 TOP ASSIGNED METHODS:
   2. energy storage system esss: 1,281 papers (6.9%) (from 3 variants)
   3. capacity factor: 640 papers (3.5%)
   4. distributed generation: 570 papers (3.1%)
   5. voltage control: 450 papers (2.4%)
   6. neural network: 389 papers (2.1%) (from 9 variants)
   7. power system fault: 384 papers (2.1%)
   8. voltage stability: 326 papers (1.8%) (from 2 variants)
   9

2025-09-30 15:58:51,383 - INFO - Enhanced method detection pipeline with consolidation completed successfully!
2025-09-30 15:58:51,385 - INFO - Credit usage: {'total_tokens': 139138, 'total_cost': 0.0209}



✅ Enhanced Method Detection Pipeline Completed Successfully!
📁 All results saved to: Saved_files_new
📊 Assignment Rate: 59.7%
🔧 Methods Consolidated: 402 → 305


In [None]:

# old "one-fell swoup"executing of method assignment
"""
# %%
# Cell 10: Complete Enhanced Method Extraction and Assignment Workflow - REFACTORED

# =============================================================================
# CONFIGURATION PARAMETERS - Adjust these for optimal method detection
# =============================================================================
MAX_FEATURES = 10000                    # Maximum features for candidate term extraction
TFIDF_WEIGHT = 0.45                      # Weight for TF-IDF scoring in final combination
LDA_WEIGHT = 0.25                        # Weight for LDA scoring in final combination  
COMPOUND_WEIGHT = 0.30                   # Weight for compound scoring in final combination
TOP_METHODS_PER_PAPER = 10              # Number of top methods to assign per paper
MIN_ASSIGN_SCORE = 0.02                # Minimum score threshold for method assignment
BATCH_SIZE = 5000                       # Batch size for LLM processing
METHOD_LLM_N_RUNS = 3                   # Number of LLM runs for method extraction
VARIANT_GROUP_BATCH_SIZE = 5000           # Batch size for method variant grouping
TOP_P=0.92
TEMP=0.15

logger.info("=== Starting Enhanced Method Detection Pipeline with Consolidation ===")

# =============================================================================
# STEP 1: LOAD OR EXTRACT METHOD PHRASES FROM CORPUS
# =============================================================================
logger.info("Step 1: Loading or extracting method phrases...")

# Try to load existing method phrases from previous runs
try:
    method_phrases, method_counts = load_method_phrases_from_csv(filename="extracted_method_phrases.csv")
except (FileNotFoundError, TypeError):
    method_phrases, method_counts = None, None

# If no existing phrases found or too few, extract new ones using LLM
if (method_phrases is None) or (len(method_phrases) < 3):
    logger.info("  1a: Extracting candidate terms from processed text...")
    
    # Extract candidate n-grams (1-4 grams) from the corpus using CountVectorizer
    candidate_terms = extract_candidate_terms(df, text_col='processed_text', max_features=MAX_FEATURES)
    logger.info(f"  ✓ Extracted {len(candidate_terms)} candidate terms")
    print(f"  Sample candidate terms: {candidate_terms[:10]}")
    
    logger.info("  1b: Using LLM to identify research methods from candidate terms...")
    
    # Use LLM to intelligently identify research methods from candidate terms
    # This filters out generic terms and focuses on actual research methodologies
    # Call the functions with your prompts
    
    method_phrases, method_counts = get_method_phrases_enhanced(
        candidate_terms,
        client,
        model_type,
        credit_tracker,
        prompt=extraction_prompt,
        n_runs=METHOD_LLM_N_RUNS,
        batch_size=BATCH_SIZE,
        top_p=TOP_P,
        temp=TEMP
    )
    method_phrases = filter_generic_phrases(method_phrases)
    # Save extracted phrases for future use
    save_method_phrases_to_csv(method_phrases, method_counts)
else:
    logger.info(f"  ✓ Loaded {len(method_phrases)} method phrases from existing CSV")

# Validate that method extraction was successful
if not method_phrases:
    logger.error("No method phrases extracted! Check your LLM configuration and prompts.")
    raise RuntimeError("Method extraction failed - no phrases found")

logger.info(f"✓ Method phrase extraction complete: {len(method_phrases)} phrases")
print(f"  Sample methods: {method_phrases[:10]}")
# Apply after LLM extraction:


# =============================================================================
# STEP 2: ENHANCED METHOD VARIANT CONSOLIDATION
# =============================================================================
logger.info("Step 2: Building enhanced method variant groups with consolidation...")

# Use enhanced LLM-based variant grouping to consolidate similar methods
variant_groups = build_method_variant_groups_enhanced(
    method_phrases, 
    client, 
    model_type, 
    credit_tracker, 
    prompt=grouping_prompt, 
    top_p=TOP_P,
    temp=TEMP,
    batch_size=VARIANT_GROUP_BATCH_SIZE
) if method_phrases else {}

# Create fallback mapping if LLM-based grouping fails completely
if not variant_groups and method_phrases:
    logger.info("  LLM grouping failed completely - using enhanced aggressive fallback grouping...")
    variant_groups = aggressive_fallback_grouping(method_phrases, similarity_threshold=0.75)
    logger.info(f"  ✓ Aggressive fallback created {len(variant_groups)} groups from {len(method_phrases)} methods")

# Create bidirectional mappings for efficient lookup during scoring
variant_to_canonical, canonical_to_variants = create_variant_mapping(variant_groups)
logger.info(f"✓ Created {len(canonical_to_variants)} canonical methods with {len(variant_to_canonical)} total variants")


# Display consolidation results
print(f"\n📊 Method Consolidation Results:")
print(f"  Original methods: {len(method_phrases) if method_phrases else 0}")
print(f"  Consolidated methods: {len(canonical_to_variants)}")
reduction = len(method_phrases) - len(canonical_to_variants) if method_phrases else 0
print(f"  Reduction: {reduction} methods ({100*reduction/len(method_phrases):.1f}% reduction)" if method_phrases and len(method_phrases) > 0 else "")

print("\nSample variant groups (showing groups with multiple variants):")
sample_count = 0
for canonical, variants in canonical_to_variants.items():
    if len(variants) > 1 and sample_count < 5:  # Only show groups with multiple variants
        print(f"  {canonical}: {variants}")
        sample_count += 1

# =============================================================================
# STEP 3: COMPUTE MULTIPLE SCORING MATRICES FOR ROBUST METHOD DETECTION
# =============================================================================
logger.info("Step 3: Computing enhanced scoring matrices using multiple approaches...")

# Convert DataFrame text to list for processing
processed_texts = df['processed_text'].fillna('').tolist()

# 3a: TF-IDF Scoring - Captures term frequency and document importance
logger.info("  3a: Computing TF-IDF scores for method variants...")
tfidf_scores, tfidf_feature_names = compute_enhanced_tfidf_scores(
    processed_texts, canonical_to_variants
)
logger.info(f"  ✓ TF-IDF: {tfidf_scores.shape} with {len(tfidf_feature_names)} features")

# 3b: LDA Scoring - Captures topic-based method associations
logger.info("  3b: Computing LDA scores for method variants...")  
method_vocab = list(canonical_to_variants.keys())
lda_scores, lda_feature_names = compute_enhanced_lda_scores(
    processed_texts, canonical_to_variants, n_topics=len(method_vocab)
)
logger.info(f"  ✓ LDA: {lda_scores.shape} with {len(lda_feature_names)} features")

# 3c: Compound Scoring - Captures exact phrase matches and partial matches
logger.info("  3c: Computing compound scores for method variants...")
compound_scores, compound_feature_names = compute_enhanced_compound_scores(
    df, canonical_to_variants
)
logger.info(f"  ✓ Compound: {compound_scores.shape} with {len(compound_feature_names)} features")

# =============================================================================
# STEP 4: FEATURE ALIGNMENT AND HARMONIZATION
# =============================================================================
logger.info("Step 4: Aligning and harmonizing features across scoring methods...")

# Create union of all features to preserve maximum method coverage
# This ensures we don't lose methods that appear in only one scoring approach
all_features = set(tfidf_feature_names) | set(lda_feature_names) | set(compound_feature_names)
all_features = sorted(list(all_features))  # Sort for consistency

logger.info(f"  Feature alignment statistics:")
logger.info(f"    Total unique features: {len(all_features)}")
logger.info(f"    TF-IDF features: {len(tfidf_feature_names)}")
logger.info(f"    LDA features: {len(lda_feature_names)}")  
logger.info(f"    Compound features: {len(compound_feature_names)}")

def align_scores_robust(scores, current_features, target_features):
    
    #Enhanced alignment with dimension safety checks and detailed error handling.
    
    if not target_features:
        return np.array([]).reshape(scores.shape[0], 0)
    
    # SAFETY CHECK: Verify dimensions match expectations
    expected_cols = len(current_features)
    actual_cols = scores.shape[1]
    
    if expected_cols != actual_cols:
        print(f"⚠️  DIMENSION MISMATCH DETECTED:")
        print(f"    Expected columns: {expected_cols} (from feature names)")
        print(f"    Actual columns: {actual_cols} (from score matrix)")
        print(f"    Using actual matrix dimensions for safety")
        
        # Use only the features that actually exist in the matrix
        safe_current_features = current_features[:actual_cols]
        print(f"    Truncated feature list: {len(safe_current_features)} features")
    else:
        safe_current_features = current_features
    
    # Initialize aligned matrix with zeros
    aligned_scores = np.zeros((scores.shape[0], len(target_features)))
    current_to_idx = {feat: i for i, feat in enumerate(safe_current_features)}
    
    # Map existing features to aligned positions with bounds checking
    found_features = 0
    skipped_features = 0
    
    for j, feat in enumerate(target_features):
        if feat in current_to_idx:
            source_idx = current_to_idx[feat]
            
            # BOUNDS CHECK: Ensure source index is valid
            if source_idx < scores.shape[1]:
                aligned_scores[:, j] = scores[:, source_idx]
                found_features += 1
            else:
                print(f"⚠️  Skipping feature '{feat}': index {source_idx} >= {scores.shape[1]}")
                skipped_features += 1
    
    print(f"    ✓ Aligned {found_features}/{len(target_features)} features")
    if skipped_features > 0:
        print(f"    ⚠️  Skipped {skipped_features} features due to bounds issues")
    
    return aligned_scores


# Align all scoring matrices to the unified feature space
tfidf_aligned = align_scores_robust(tfidf_scores, tfidf_feature_names, all_features)
lda_aligned = align_scores_robust(lda_scores, lda_feature_names, all_features)
compound_aligned = align_scores_robust(compound_scores, compound_feature_names, all_features)

logger.info(f"✓ Feature alignment complete: {tfidf_aligned.shape}")

# =============================================================================
# STEP 5: SCORE NORMALIZATION AND CONSOLIDATION
# =============================================================================
logger.info("Step 5: Normalizing scores and applying variant consolidation...")

def normalize_scores(scores):
    #Normalize scores to  range per matrix for fair weighting.[1]
    if scores.max() == 0:
        return scores
    return scores / scores.max()

# Normalize each scoring matrix to ensure fair contribution to final scores
tfidf_normalized = normalize_scores(tfidf_aligned)
lda_normalized = normalize_scores(lda_aligned)
compound_normalized = normalize_scores(compound_aligned)

# Combine normalized scores using weighted average
combined_scores = (
    TFIDF_WEIGHT * tfidf_normalized + 
    LDA_WEIGHT * lda_normalized + 
    COMPOUND_WEIGHT * compound_normalized
)

logger.info(f"✓ Score combination complete: {combined_scores.shape}")
logger.info(f"  Combined score range: [{combined_scores.min():.4f}, {combined_scores.max():.4f}]")

# Apply variant consolidation to prevent double-counting
if variant_to_canonical:
    logger.info("  Applying variant score consolidation to prevent double-counting...")
    
    # Consolidate variant scores using maximum (not sum) to avoid inflating scores
    final_scores, canonical_methods = consolidate_variant_scores(
        combined_scores, all_features, variant_to_canonical
    )
    logger.info(f"  ✓ Consolidated {len(all_features)} methods to {len(canonical_methods)} canonical methods")
    
    # Display consolidation statistics
    print(f"\n🔍 Score Consolidation Check:")
    print(f"  Methods before consolidation: {len(all_features)}")
    print(f"  Methods after consolidation: {len(canonical_methods)}")
    print(f"  Consolidation prevented potential double-counting of {len(all_features) - len(canonical_methods)} method variants")
    
else:
    # No consolidation needed - use combined scores as-is
    final_scores = combined_scores
    canonical_methods = all_features
    logger.info("  No variant consolidation applied (no variant mappings found)")

logger.info(f"✓ Final consolidated scores: {final_scores.shape}")
logger.info(f"  Final score range: [{final_scores.min():.4f}, {final_scores.max():.4f}]")

# =============================================================================
# STEP 6: METHOD ASSIGNMENT TO PAPERS
# =============================================================================
logger.info("Step 6: Assigning methods to papers using consolidated scores...")

# Assign top methods to each paper using the consolidated scores
# This creates columns Method_1, Method_2, etc. plus Primary_Method
df = assign_methods_improved(
    df, final_scores, canonical_methods, 
    top_n=TOP_METHODS_PER_PAPER, 
    min_score=MIN_ASSIGN_SCORE
)

# Additional diagnostic: Verify no double-counting occurred
assigned_methods = df[df['Primary_Method'] != '']['Primary_Method'].tolist()
method_assignment_counts = pd.Series(assigned_methods).value_counts()

print(f"\n🔍 Final Assignment Verification:")
print(f"  Papers assigned methods: {len(assigned_methods)}")
print(f"  Unique methods assigned: {len(method_assignment_counts)}")
print(f"  Top assigned methods:")

for method, count in method_assignment_counts.head(10).items():
    # Check if this method has variants that were consolidated
    variants = canonical_to_variants.get(method, [method])
    if len(variants) > 1:
        print(f"    {method}: {count} papers (consolidated from: {variants})")
    else:
        print(f"    {method}: {count} papers")

# =============================================================================
# STEP 7: SAVE RESULTS AND METADATA
# =============================================================================
logger.info("Step 7: Saving results and metadata...")

# Save method variant mappings for future reference and transparency
with open(os.path.join(SAVE_DIR, f"method_variant_groups_{suffix_string}.json"), 'w') as f:
    json.dump(canonical_to_variants, f, indent=2)

# Save consolidated score matrix for analysis and debugging
pd.DataFrame(final_scores, columns=canonical_methods).to_csv(
    os.path.join(SAVE_DIR, f"consolidated_method_scores_{suffix_string}.csv")
)

# Save final enhanced dataframe with method assignments
enhanced_analysis_filename = f"enhanced_method_analysis_{suffix_string}.csv"
df.to_csv(os.path.join(SAVE_DIR, enhanced_analysis_filename), index=False)

logger.info(f"✓ Results saved:")
logger.info(f"  Enhanced analysis: {enhanced_analysis_filename}")
logger.info(f"  Method variant groups: method_variant_groups_{suffix_string}.json")
logger.info(f"  Consolidated scores: consolidated_method_scores_{suffix_string}.csv")

# =============================================================================
# STEP 8: COMPREHENSIVE DIAGNOSTICS AND QUALITY ASSESSMENT
# =============================================================================
logger.info("Step 8: Running comprehensive diagnostics...")

def enhanced_method_diagnostics(df, scores, method_names, variant_groups):
    
    #Comprehensive diagnostics for method assignment quality and consolidation effectiveness.
    
    print("\n" + "="*80)
    print("COMPREHENSIVE METHOD DETECTION DIAGNOSTICS")
    print("="*80)
    
    # Basic assignment statistics
    n_papers = len(df)
    assigned_papers = (df['Primary_Method'] != '').sum()
    assignment_rate = 100 * assigned_papers / n_papers
    
    print(f"\n📊 ASSIGNMENT OVERVIEW:")
    print(f"  Total papers processed: {n_papers:,}")
    print(f"  Papers with methods assigned: {assigned_papers:,} ({assignment_rate:.1f}%)")
    print(f"  Papers without methods: {n_papers - assigned_papers:,} ({100-assignment_rate:.1f}%)")
    
    # Score distribution analysis
    print(f"\n📈 SCORE DISTRIBUTION ANALYSIS:")
    print(f"  Final score matrix shape: {scores.shape}")
    print(f"  Total canonical methods: {len(method_names)}")
    print(f"  Score range: [{scores.min():.4f}, {scores.max():.4f}]")
    print(f"  Mean score: {scores.mean():.4f}")
    print(f"  Standard deviation: {scores.std():.4f}")
    
    # Score threshold analysis
    thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]
    for threshold in thresholds:
        count = (scores > threshold).sum()
        print(f"  Scores > {threshold}: {count:,} ({100*count/scores.size:.2f}% of all scores)")
    
    # Method popularity and assignment quality
    if assigned_papers > 0:
        print(f"\n🔥 TOP ASSIGNED METHODS:")
        method_counts = df['Primary_Method'].value_counts()
        
        for i, (method, count) in enumerate(method_counts.head(15).items()):
            if method:  # Skip empty strings
                percentage = 100 * count / assigned_papers
                # Check if method was consolidated from variants
                variants = variant_groups.get(method, [method])
                variant_info = f" (from {len(variants)} variants)" if len(variants) > 1 else ""
                print(f"  {i+1:2d}. {method}: {count:,} papers ({percentage:.1f}%){variant_info}")
    
    # Confidence distribution analysis
    if 'Method_Confidence' in df.columns:
        print(f"\n🎯 CONFIDENCE DISTRIBUTION:")
        conf_counts = df['Method_Confidence'].value_counts()
        for conf, count in conf_counts.items():
            percentage = 100 * count / n_papers
            print(f"  {conf}: {count:,} ({percentage:.1f}%)")
    
    # Consolidation effectiveness analysis
    print(f"\n🔧 CONSOLIDATION EFFECTIVENESS:")
    total_variants = sum(len(variants) for variants in variant_groups.values())
    consolidated_groups = len([v for v in variant_groups.values() if len(v) > 1])
    
    print(f"  Total method variants processed: {total_variants:,}")
    print(f"  Final canonical methods: {len(variant_groups):,}")
    print(f"  Groups with multiple variants: {consolidated_groups:,}")
    print(f"  Consolidation ratio: {total_variants/len(variant_groups):.2f}:1")
    
    # Quality assessment and recommendations
    print(f"\n⚠️  QUALITY ASSESSMENT:")
    
    if assignment_rate < 50:
        print(f"  ⚠️  Low assignment rate ({assignment_rate:.1f}%) - consider:")
        print(f"      -  Lowering MIN_ASSIGN_SCORE (current: {MIN_ASSIGN_SCORE})")
        print(f"      -  Reviewing method extraction quality")
        print(f"      -  Checking text preprocessing effectiveness")
    else:
        print(f"  ✅ Good assignment rate ({assignment_rate:.1f}%)")
    
    if scores.max() < 0.1:
        print(f"  ⚠️  Low maximum scores ({scores.max():.4f}) - scoring method may need adjustment")
    else:
        print(f"  ✅ Reasonable maximum scores ({scores.max():.4f})")
    
    zero_score_methods = (scores.max(axis=0) == 0).sum()
    if zero_score_methods > 0:
        zero_percentage = 100 * zero_score_methods / len(method_names)
        print(f"  ⚠️  {zero_score_methods} methods ({zero_percentage:.1f}%) have zero scores across all papers")
        print(f"      Consider reviewing method extraction or scoring parameters")
    else:
        print(f"  ✅ All methods have non-zero scores in at least some papers")
    
    print("\n" + "="*80)
    return {
        'assignment_rate': assignment_rate,
        'total_papers': n_papers,
        'assigned_papers': assigned_papers,
        'score_stats': {
            'min': scores.min(),
            'max': scores.max(),
            'mean': scores.mean(),
            'std': scores.std()
        }
    }

# Run comprehensive diagnostics
diagnostic_results = enhanced_method_diagnostics(df, final_scores, canonical_methods, canonical_to_variants)

# =============================================================================
# STEP 9: DISPLAY SAMPLE RESULTS FOR VERIFICATION
# =============================================================================
print("\n" + "="*80)
print("SAMPLE RESULTS FOR VERIFICATION")
print("="*80)

# Define columns to display in sample results
sample_cols = ['Primary_Method', 'Primary_Method_Score', 'Method_Confidence', 'Total_Method_Score']
available_cols = [col for col in sample_cols if col in df.columns]

# Show sample of papers WITH methods assigned
assigned_mask = df['Primary_Method'] != ''
if assigned_mask.sum() > 0:
    print(f"\n📄 SAMPLE PAPERS WITH METHODS ASSIGNED (first 10):")
    sample_assigned = df[assigned_mask][available_cols].head(10)
    print(sample_assigned.to_string(index=False))
    
    # Show distribution of assigned methods
    print(f"\n📊 METHOD ASSIGNMENT DISTRIBUTION:")
    for i in range(1, min(4, TOP_METHODS_PER_PAPER + 1)):  # Show top 3 method columns
        col_name = f'Method_{i}'
        if col_name in df.columns:
            non_empty = df[df[col_name] != ''][col_name].value_counts()
            print(f"  {col_name} - {len(non_empty)} unique methods assigned to {non_empty.sum()} papers")

# Show sample of papers WITHOUT methods for diagnostic purposes
unassigned_mask = df['Primary_Method'] == ''
if unassigned_mask.sum() > 0:
    print(f"\n❌ SAMPLE PAPERS WITHOUT METHODS (first 5 for diagnostic):")
    unassigned_sample = df[unassigned_mask].head(5)
    
    if 'processed_text' in df.columns:
        for idx, row in unassigned_sample.iterrows():
            text_preview = row.get('processed_text', '')[:150] + "..." if len(str(row.get('processed_text', ''))) > 150 else row.get('processed_text', '')
            print(f"  Paper {idx}: {text_preview}")

# Final completion message
print(f"\n✅ Enhanced Method Detection Pipeline Completed Successfully!")
print(f"📁 All results saved to: {SAVE_DIR}")
print(f"📊 Assignment Rate: {diagnostic_results['assignment_rate']:.1f}%")
print(f"🔧 Methods Consolidated: {len(method_phrases) if method_phrases else 0} → {len(canonical_methods)}")

logger.info("Enhanced method detection pipeline with consolidation completed successfully!")
logger.info(f"Credit usage: {credit_tracker.get_stats()}")
"""



In [None]:
def verify_method_consolidation(df, canonical_to_variants, original_method_list):
    """
    Comprehensive verification that consolidation worked properly.
    """
    print("="*80)
    print("🔍 METHOD CONSOLIDATION VERIFICATION")
    print("="*80)
    
    # Extract assigned methods from DataFrame
    assigned_methods = set(df[df['Primary_Method'] != '']['Primary_Method'].unique())
    
    # Extract canonical methods from groups
    canonical_methods = set(canonical_to_variants.keys()) 
    
    # Extract all original methods (for comparison)
    original_methods = set(original_method_list)
    
    print(f"\n📊 COUNTS:")
    print(f"  Original methods: {len(original_methods)}")
    print(f"  Canonical groups: {len(canonical_methods)}")
    print(f"  Assigned methods: {len(assigned_methods)}")
    
    # Check 1: Are assigned methods from canonical set?
    non_canonical_assigned = assigned_methods - canonical_methods
    if non_canonical_assigned:
        print(f"\n❌ PROBLEM: {len(non_canonical_assigned)} assigned methods are NOT canonical:")
        for method in list(non_canonical_assigned)[:10]:
            print(f"    '{method}'")
    else:
        print(f"\n✅ SUCCESS: All assigned methods are canonical groups")
    
    # Check 2: Are any original redundant methods still assigned?
    all_variants = set()
    for variants in canonical_to_variants.values():
        all_variants.update(variants)
    
    redundant_assigned = assigned_methods & (original_methods - canonical_methods)
    if redundant_assigned:
        print(f"\n❌ PROBLEM: {len(redundant_assigned)} redundant methods still assigned:")
        for method in list(redundant_assigned)[:10]:
            print(f"    '{method}' (should be consolidated)")
    else:
        print(f"\n✅ SUCCESS: No redundant methods in assignments")
    
    # Check 3: Show consolidation examples
    print(f"\n🔍 CONSOLIDATION EXAMPLES:")
    consolidation_examples = 0
    for canonical, variants in canonical_to_variants.items():
        if len(variants) > 1 and canonical in assigned_methods:
            print(f"  '{canonical}' consolidated: {variants}")
            consolidation_examples += 1
            if consolidation_examples >= 5:
                break
    
    # Check 4: Show assignment distribution
    print(f"\n📈 TOP ASSIGNED METHODS (canonical):")
    method_counts = df[df['Primary_Method'] != '']['Primary_Method'].value_counts()
    for i, (method, count) in enumerate(method_counts.head(10).items()):
        variants = canonical_to_variants.get(method, [method])
        variant_info = f" (from {len(variants)} variants)" if len(variants) > 1 else ""
        print(f"  {i+1:2d}. {method}: {count} papers{variant_info}")
    
    print("="*80)
    return len(non_canonical_assigned) == 0 and len(redundant_assigned) == 0

# Run verification
verification_passed = verify_method_consolidation(df, canonical_to_variants, method_phrases)


🔍 METHOD CONSOLIDATION VERIFICATION

📊 COUNTS:
  Original methods: 165
  Canonical groups: 91
  Assigned methods: 86

✅ SUCCESS: All assigned methods are canonical groups

✅ SUCCESS: No redundant methods in assignments

🔍 CONSOLIDATION EXAMPLES:
  'total harmonic distortion thd' consolidated: ['total harmonic distortion thd', 'total harmonic distortion']
  'deep reinforcement learning drl' consolidated: ['reinforcement learning drl', 'deep reinforcement learning drl']
  'loss of load probability' consolidated: ['lolp', 'loss of load probability']
  'system average interruption duration index' consolidated: ['system average interruption duration index', 'caidi']
  'monte carlo simulation' consolidated: ['monte carlo simulation', 'monte-carlo simulation']

📈 TOP ASSIGNED METHODS (canonical):
   1. artificial neural network: 566 papers (from 4 variants)
   2. monte carlo simulation: 439 papers (from 2 variants)
   3. particle swarm optimization: 433 papers (from 4 variants)
   4. genetic 

In [None]:
def spot_check_paper_assignments(df, canonical_to_variants, n_samples=10):
    """
    Show sample paper assignments with their consolidated method info.
    """
    print("🔍 SPOT CHECK: Sample Paper Method Assignments")
    print("-" * 60)
    
    # Get papers with assigned methods
    assigned_papers = df[df['Primary_Method'] != ''].head(n_samples)
    
    for idx, row in assigned_papers.iterrows():
        primary_method = row['Primary_Method']
        score = row.get('Primary_Method_Score', 'N/A')
        
        # Check if this method has variants
        variants = canonical_to_variants.get(primary_method, [primary_method])
        
        print(f"\nPaper {idx}:")
        print(f"  Assigned Method: '{primary_method}'")
        print(f"  Score: {score}")
        print(f"  Variants in Group: {variants}")
        print(f"  Group Size: {len(variants)} methods")
        
        # Show if consolidation occurred
        if len(variants) > 1:
            print(f"  ✅ CONSOLIDATED: {len(variants)-1} variants merged")
        else:
            print(f"  ℹ️  INDIVIDUAL: No variants to consolidate")

# Run spot check
spot_check_paper_assignments(df, canonical_to_variants)


🔍 SPOT CHECK: Sample Paper Method Assignments
------------------------------------------------------------

Paper 45:
  Assigned Method: 'load frequency control'
  Score: 0.14345188046100252
  Variants in Group: ['load frequency control']
  Group Size: 1 methods
  ℹ️  INDIVIDUAL: No variants to consolidate

Paper 54:
  Assigned Method: 'optimal utilization'
  Score: 0.14345188046100252
  Variants in Group: ['optimal utilization']
  Group Size: 1 methods
  ℹ️  INDIVIDUAL: No variants to consolidate

Paper 57:
  Assigned Method: 'probabilistic reliability'
  Score: 0.14345188046100252
  Variants in Group: ['probabilistic reliability']
  Group Size: 1 methods
  ℹ️  INDIVIDUAL: No variants to consolidate

Paper 121:
  Assigned Method: 'mean time between failures'
  Score: 0.5855398034893327
  Variants in Group: ['mtbf', 'mean time between failures']
  Group Size: 2 methods
  ✅ CONSOLIDATED: 1 variants merged

Paper 127:
  Assigned Method: 'probabilistic reliability'
  Score: 0.143451880461

In [None]:
def verify_score_matrix_methods(canonical_methods, final_scores):
    """
    Check that the final score matrix columns correspond to canonical methods.
    """
    print("🔍 SCORE MATRIX VERIFICATION")
    print("-" * 40)
    
    print(f"Score matrix shape: {final_scores.shape}")
    print(f"Canonical methods count: {len(canonical_methods)}")
    
    if final_scores.shape[1] == len(canonical_methods):
        print("✅ Score matrix columns match canonical method count")
    else:
        print("❌ Dimension mismatch between scores and canonical methods")
    
    print(f"\nFirst 10 canonical methods in score matrix:")
    for i, method in enumerate(canonical_methods[:10]):
        print(f"  {i:2d}. {method}")

# Run verification  
verify_score_matrix_methods(canonical_methods, final_scores)


🔍 SCORE MATRIX VERIFICATION
----------------------------------------
Score matrix shape: (28934, 91)
Canonical methods count: 91
✅ Score matrix columns match canonical method count

First 10 canonical methods in score matrix:
   0. stochastic optimization
   1. N-1
   2. interior point method
   3. wavelet transform
   4. injection shift factors
   5. multi-input multiple output
   6. first order reliability method
   7. optimal utilization
   8. optimal dispatch
   9. optimal power allocation


In [None]:
def show_before_after_comparison(original_methods, canonical_to_variants):
    """
    Show before/after consolidation comparison.
    """
    print("🔄 BEFORE/AFTER CONSOLIDATION COMPARISON")
    print("=" * 80)
    
    print(f"BEFORE: {len(original_methods)} original methods")
    print(f"AFTER:  {len(canonical_to_variants)} canonical groups")
    reduction = len(original_methods) - len(canonical_to_variants)
    reduction_pct = 100 * reduction / len(original_methods)
    print(f"REDUCTION: {reduction} methods ({reduction_pct:.1f}%)")
    
    print(f"\n📋 CONSOLIDATION EXAMPLES:")
    consolidation_count = 0
    for canonical, variants in canonical_to_variants.items():
        if len(variants) > 1:
            print(f"\n  GROUP: '{canonical}'")
            print(f"    Consolidated: {variants}")
            consolidation_count += 1
            if consolidation_count >= 8:
                break
    
    print(f"\n📊 SUMMARY:")
    multi_variant_groups = sum(1 for v in canonical_to_variants.values() if len(v) > 1)
    print(f"  Groups with multiple variants: {multi_variant_groups}")
    print(f"  Single-method groups: {len(canonical_to_variants) - multi_variant_groups}")

# Run comparison
show_before_after_comparison(method_phrases, canonical_to_variants)


🔄 BEFORE/AFTER CONSOLIDATION COMPARISON
BEFORE: 165 original methods
AFTER:  91 canonical groups
REDUCTION: 74 methods (44.8%)

📋 CONSOLIDATION EXAMPLES:

  GROUP: 'total harmonic distortion thd'
    Consolidated: ['total harmonic distortion thd', 'total harmonic distortion']

  GROUP: 'deep reinforcement learning drl'
    Consolidated: ['reinforcement learning drl', 'deep reinforcement learning drl']

  GROUP: 'loss of load probability'
    Consolidated: ['lolp', 'loss of load probability']

  GROUP: 'system average interruption duration index'
    Consolidated: ['system average interruption duration index', 'caidi']

  GROUP: 'monte carlo simulation'
    Consolidated: ['monte carlo simulation', 'monte-carlo simulation']

  GROUP: 'system average interruption frequency index'
    Consolidated: ['saifi', 'system average interruption frequency index']

  GROUP: 'particle swarm optimization'
    Consolidated: ['particle swarm optimization', 'pso algorithm', 'optimization pso algorithm', 

In [None]:
for n in method_phrases:
    print(n)

stochastic optimization
N-1
interior point method
wavelet transform
N-2
fault tree
multi-input multi-output
principal component analysis
injection shift factors
multi-input multiple output
first order reliability method
scopf
optimal utilization
wams
optimal dispatch
optimal power allocation
anfis
successive interference cancellation
tabu search
caidi
lole
two-step stochastic
unit commitment
kalman filter
mttf
particle swarm optimization
fast decoupled power flow
fault tree analysis
cost-benefit analysis
lodf
quadratic programming
regression model
hosting capacity assesment
arima
sequential quadratic programming
time-dependent
dynamic line rating
pem
support vector machine
genetic algorithm ga
q-learning
mean time to repair
neural network ann
hierarchical level ii
time-domain
load flow analysis
sliding mode control
differential evolution
sensitivity analysis
state estimation
vector autoregression
point estimate method
pmu
adaptive neuro-fuzzy
saidi
dynamic voltage
neural network
mixed 