In [1]:
# %%
# Cell 1: Imports and Setup
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
import openai
from difflib import SequenceMatcher

# Output directory for artifacts written by this notebook; created if it does not exist yet.
SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)
# Configure root logging at INFO level with timestamp/level prefixes, then grab this module's logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Quietly download the NLTK resources named below (tokenizer models, stop-word lists, WordNet).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


True

In [57]:
# %%
# Cell 2: OpenAI Setup and Utility (Updated for gpt-5-nano)
class CreditTracker:
    """Running tally of LLM token usage and the cost estimate derived from it."""

    def __init__(self):
        # Flat price applied to every recorded token (per 1000 tokens).
        self.cost_per_1k_tokens = 0.00015
        self.total_tokens = 0
        self.total_cost = 0

    def update(self, tokens):
        """Add ``tokens`` to the running total and accumulate their cost."""
        added_cost = (tokens / 1000) * self.cost_per_1k_tokens
        self.total_tokens += tokens
        self.total_cost += added_cost

    def get_stats(self):
        """Return a dict with the token total and the cost rounded to 4 decimals."""
        stats = {"total_tokens": self.total_tokens}
        stats["total_cost"] = round(self.total_cost, 4)
        return stats

def initialize_openai():
    """Build an OpenAI client from the ``[LLM]`` section of ``config_LLM.txt``.

    Returns:
        tuple: ``(client, model_type)`` where ``client`` is an ``openai.OpenAI``
        instance created with the configured ``OPENAI_API_KEY`` and
        ``model_type`` is the configured ``MODEL_TYPE`` value.
    """
    parser = configparser.ConfigParser()
    parser.read('config_LLM.txt')
    llm_section = parser['LLM']
    api_key = llm_section.get('OPENAI_API_KEY')
    model_type = llm_section.get('MODEL_TYPE')
    return openai.OpenAI(api_key=api_key), model_type

def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count the tokens in ``string`` for ``model_name``.

    Uses the tiktoken encoding registered for the model. When tiktoken raises
    ``KeyError``, models whose name starts with ``gpt-5-nano`` are counted with
    the ``o200k_base`` encoding; any other model falls back to a rough estimate
    of one token per four characters.
    """
    try:
        return len(tiktoken.encoding_for_model(model_name).encode(string))
    except KeyError:
        if not model_name.startswith('gpt-5-nano'):
            # Rough approximation: 4 characters per token.
            return len(string) // 4
        fallback_encoding = tiktoken.get_encoding("o200k_base")
        return len(fallback_encoding.encode(string))

# Module-level LLM handles: the OpenAI client and model name read from the config file,
# plus a fresh CreditTracker (starts at zero tokens / zero cost).
client, model_type = initialize_openai()
credit_tracker = CreditTracker()


In [3]:
# %%
# Cell 3: Data Preprocessing Utilities

def extract_keywords_from_filename(filename):
    """Derive search keywords from an underscore-separated file name.

    The extension and directory are dropped, the stem is split on ``_``, the
    first three parts are skipped, and the remaining parts are kept unless they
    are purely numeric or equal to ``'results'``.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    keywords = []
    for position, token in enumerate(stem.split('_')):
        if position <= 2:
            continue
        if token == 'results' or token.isdigit():
            continue
        keywords.append(token)
    return keywords

def get_custom_stop_words(search_keywords=None):
    """Return the stop-word set used for preprocessing.

    Starts from NLTK's English stop words, removes every search keyword (and
    each whitespace-separated word of a keyword, lowercased) so those terms
    survive filtering, then adds a fixed set of publication boilerplate terms.
    """
    protected = set()
    for keyword in (search_keywords or []):
        lowered = keyword.lower()
        protected.add(lowered)
        protected.update(lowered.split())

    english = set(stopwords.words('english'))
    scientific_terms = {'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite',
        'fig', 'figure', 'figures', 'table', 'tables', 'chart', 'charts',
        'published', 'journal', 'conference', 'proceedings', 'vol', 'volume', 'pp', 'page', 'pages', 'doi'}
    return (english - protected) | scientific_terms

def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=True):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase; drop URLs and e-mail-like tokens; optionally drop digits;
    remove every character that is not a word character, whitespace or hyphen;
    turn runs of two or more hyphens into a space; tokenize; filter out stop
    words, numeric tokens and short tokens; lemmatize.

    Args:
        text: Raw text. Values that are not ``str``/``int``/``float`` (e.g.
            ``None``) and float NaN yield an empty string.
        search_keywords: Keywords that must not be treated as stop words.
        min_word_length: Minimum token length; tokens shorter than two
            characters are always dropped regardless of this value.
        remove_numbers: When true, all digit runs are removed before tokenizing.

    Returns:
        str: The processed text (possibly empty).
    """
    if not isinstance(text, (str, int, float)):
        return ''
    # A float NaN would otherwise be stringified into the bogus token "nan".
    if isinstance(text, float) and text != text:
        return ''

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)

    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    # The original applied both `>= min_word_length` and `> 1`; this is the same bound.
    shortest_allowed = max(min_word_length, 2)
    tokens = [
        t for t in tokens
        if len(t) >= shortest_allowed and t not in stop_words and not t.isdigit()
    ]

    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except Exception as exc:
        # Best effort: keep the un-lemmatized tokens, but report the failure
        # instead of hiding it behind a bare `except: pass`.
        logging.getLogger(__name__).warning("Lemmatization skipped: %s", exc)
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    """Add a processed-text column and drop rows whose processed text is empty.

    ``df`` is modified in place: ``text_col`` is filled/cast to ``str`` and
    ``processed_col`` is added with the output of ``preprocess_text``.

    Returns:
        pandas.DataFrame: An independent copy of the rows whose processed text
        is non-blank. Returning a copy (rather than a filtered slice) lets
        callers assign new columns to the result without chained-assignment
        warnings. The original index labels are preserved, so the index may
        have gaps.
    """
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    has_content = df[processed_col].str.strip() != ''
    return df.loc[has_content].copy()

def clean_fields_of_study(s):
    """Normalize a fields-of-study value into a list of known field names.

    Args:
        s: Either the string form of a list (e.g. ``"['Physics', 'Biology']"``),
            an already-parsed list/tuple of names, or a missing value.

    Returns:
        list[str]: One entry per input field; names outside the known set are
        replaced by ``"Unknown"``. Missing, empty or unsupported input yields
        ``["Unknown"]``.
    """
    # NOTE(review): 'Com' looks like a truncated entry — kept as-is, confirm intent.
    valid_fields = {'Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
        'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science',
        'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com'}

    if isinstance(s, (list, tuple)):
        # Already parsed (e.g. the cleaning step is applied a second time):
        # previously this path raised inside `pd.isna` for multi-element lists.
        fields = [str(field).strip() for field in s]
    elif isinstance(s, str):
        if s == '[]':
            return ["Unknown"]
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
    else:
        # NaN, None and any other non-text value.
        return ["Unknown"]

    cleaned = [field if field in valid_fields else "Unknown" for field in fields]
    return cleaned or ["Unknown"]


In [4]:
# %%
# Cell 4: Data Loading & Cleaning

# Input CSV (semicolon-separated), read from "Saved_files".
# NOTE(review): this differs from SAVE_DIR ("Saved_files_new") — presumably the input
# and output directories are meant to be separate; confirm.
filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
# Title and abstract are concatenated into one text field (missing values become '').
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
# Search keywords come from the file name; they are passed on to the text preprocessing.
search_keywords = extract_keywords_from_filename(filename)
# Adds 'processed_text' and keeps only rows whose processed text is non-empty.
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)
logger.info(f"Loaded and preprocessed {len(df)} papers")


2025-08-22 15:52:23,604 - INFO - Loaded and preprocessed 28934 papers


In [None]:
# %%
# Cell 5: Enhanced Method Detection Functions (Updated for gpt-5-nano)

def extract_candidate_terms(df, text_col='processed_text', max_features=20000):
    """Return corpus n-grams (1 to 4 words) ordered by total count, highest first.

    Terms are counted with a ``CountVectorizer`` that keeps hyphenated tokens,
    ignores terms present in more than 95% of documents or in fewer than two
    documents, and caps the vocabulary at ``max_features``.
    """
    documents = df[text_col].fillna('')
    counter = CountVectorizer(
        token_pattern=r'\b[\w-]+\b',
        ngram_range=(1, 4),
        min_df=2,
        max_df=0.95,
        max_features=max_features,
    )
    counts = counter.fit_transform(documents)
    vocabulary = counter.get_feature_names_out()
    totals = counts.sum(axis=0).A1
    ranked = sorted(zip(vocabulary, totals), key=lambda pair: pair[1], reverse=True)
    return [term for term, _total in ranked]

def parse_llm_python_list(output_text):
    """Extract a flat list of strings from free-form LLM output.

    Parsing strategy:
      1. Markdown code fences are removed.
      2. If the text contains ``[``, the span up to the first ``]`` (or to the
         end of the text when the list was cut off) is parsed as a Python
         literal; if that fails it is split on commas and quotes are stripped.
      3. Otherwise each non-comment line is treated as one item after leading
         numbering / bullet markers, trailing commas and quotes are removed.

    Args:
        output_text: Raw model output; ``None`` or empty text yields ``[]``.

    Returns:
        list[str]: Parsed items (at most 1500 for the line-based fallback).
        An explicit empty list (``[]``) in the output yields ``[]``.
    """
    if not output_text:
        return []

    # Remove markdown code fences (with or without a language tag).
    content = re.sub(r'```(?:python|json)?\n?', '', output_text.strip())

    start = content.find('[')
    if start != -1:
        end = content.find(']', start + 1)
        # A missing closing bracket usually means the reply was truncated;
        # parse whatever was produced instead of discarding it.
        inner = content[start + 1:end] if end != -1 else content[start + 1:]
        if not inner.strip():
            return []
        try:
            parsed = ast.literal_eval('[' + inner + ']')
            # Coerce so callers can rely on string methods for every item.
            return [str(item) for item in parsed]
        except Exception:
            # Not a valid literal (unquoted items, truncated string, ...):
            # split on commas and strip surrounding quotes.
            items = [item.strip().strip("'\"") for item in inner.split(',')]
            return [item for item in items if item.strip()]

    # No list syntax at all: read one item per line.
    marker = re.compile(r'^(?:\d+[.)]?\s+|\d+[.)]|[-*\u2022]\s+)+')
    items = []
    for line in content.split('\n'):
        line = line.strip()
        if not line or line.startswith('#') or line.startswith('//'):
            continue
        # Drop leading "1.", "2)", "-", "*" style markers, then trailing commas/quotes.
        line = marker.sub('', line).rstrip(',').strip().strip("'\"")
        if line:
            items.append(line)

    return items[:1500]  # Limit to prevent huge lists

def get_method_phrases_enhanced(
    corpus_terms,
    client,
    model_type,
    credit_tracker,
    n_runs=3,
    temp=0.1,
    top_p=0.9,
    show_progress=True,
    batch_size=500
):
    """
    Ask the LLM which corpus terms are methods, batch by batch.

    ``corpus_terms`` is split into batches of ``batch_size``; every batch is sent
    to the model ``n_runs`` times and each reply is parsed with
    ``parse_llm_python_list``. A phrase's count is the number of replies (over
    all batches and runs) that contained it.

    Args:
        corpus_terms: Candidate terms to classify.
        client: OpenAI client exposing ``chat.completions.create``.
        model_type: Model name. For names starting with ``gpt-5-nano`` only
            ``max_completion_tokens`` is sent; otherwise ``temperature``,
            ``top_p`` and ``max_tokens`` are sent.
        credit_tracker: Object with ``update(tokens)``; receives the total
            token usage reported by the API, or an estimate over prompt and
            reply when the response carries no usage information.
        n_runs: Number of repeated calls per batch.
        temp, top_p: Sampling parameters for non gpt-5-nano models.
        show_progress: Print per-call progress when true.
        batch_size: Number of terms per prompt.

    Returns:
        tuple: ``(sorted_methods, counts)`` — phrases sorted by descending
        count then alphabetically, and the ``Counter`` of phrase counts.
        Failed calls are logged and contribute nothing.
    """
    from math import ceil

    all_phrases_sets = []

    n_batches = ceil(len(corpus_terms) / batch_size)
    for batch_idx in range(n_batches):
        batch_terms = corpus_terms[batch_idx * batch_size : (batch_idx + 1) * batch_size]

        # Compose prompt for just this batch
        prompt = f"""You are analyzing scientific papers about power systems reliability and resilience.

From these terms: {batch_terms}

Extract ALL terms that represent:
1. Specific algorithms (e.g., genetic algorithm, particle swarm optimization)
2. Mathematical methods (e.g., monte carlo simulation, linear programming)
3. Analysis techniques (e.g., fault tree analysis, load flow analysis)
4. Optimization methods (e.g., unit commitment, optimal power flow)
5. Modeling approaches (e.g., neural network, markov chain)
6. Simulation methods (e.g., time series analysis, stochastic programming)
DO include: e.g. 'monte carlo simulation', 'unit commitment', 'load flow analysis', 'genetic algorithm', 'neural network', 'stochastic optimization', 'reinforcement learning', 'fault tree analysis'.\n
DO NOT include generic terms like 'framework', 'analysis', 'system', 'method', 'procedure', 'approach', 'application', 'performance', 'review', 'assesment', by themselves or in combination with only other generic terms
INCLUDE abbreviations, variants (e.g., OPF/optimal power flow) and compounds.

Return as a Python list, one method per item.
"""

        for i in range(n_runs):
            try:
                api_params = {
                    "model": model_type,
                    "messages": [{"role": "user", "content": prompt}],
                }
                if model_type.startswith('gpt-5-nano'):
                    api_params["max_completion_tokens"] = 5000
                else:
                    api_params["temperature"] = temp
                    api_params["top_p"] = top_p
                    api_params["max_tokens"] = 5000

                response = client.chat.completions.create(**api_params)
                # Treat a missing reply body as an empty reply rather than an error.
                content = response.choices[0].message.content or ""
                phrases = parse_llm_python_list(content)
                phrases = [
                    p.lower().strip() for p in phrases
                    if isinstance(p, str) and len(p.strip()) > 2
                ]
                all_phrases_sets.append(set(phrases))

                # Prefer the usage reported by the API (prompt + completion);
                # counting only the reply text misses the much larger prompt.
                usage = getattr(response, "usage", None)
                used_tokens = getattr(usage, "total_tokens", None)
                if used_tokens is None:
                    used_tokens = (
                        num_tokens_from_string(prompt, model_type)
                        + num_tokens_from_string(content, model_type)
                    )
                credit_tracker.update(used_tokens)

                if show_progress:
                    print(f"BATCH {batch_idx+1}/{n_batches}, run {i+1}: found {len(phrases)}")
                    print(f"  Sample: {phrases[:10]}")
            except Exception as e:
                logger.error(f"Error in LLM call for batch {batch_idx+1}, run {i+1}: {e}")
                all_phrases_sets.append(set())

    # Combine and count results: one vote per reply that contained the phrase.
    counts = Counter(p for phrase_set in all_phrases_sets for p in phrase_set)
    sorted_methods = sorted(counts.keys(), key=lambda x: (-counts[x], x))
    print(f"\nTotal unique phrases: {len(counts)}")
    print(f"Most frequent (top 10): {sorted_methods[:10]}")
    return sorted_methods, counts


def load_method_phrases_from_csv(filename="extracted_method_phrases.csv"):
    """Load method phrases and their counts from a CSV file inside ``SAVE_DIR``.

    The file must have ``Method Phrase`` and ``Count`` columns.

    Returns:
        tuple: ``(method_phrases, method_counts)`` as two parallel lists, or
        ``(None, None)`` (after logging a warning) when the file does not exist.
    """
    path = os.path.join(SAVE_DIR, filename)
    if not os.path.exists(path):
        logger.warning(f"File {path} not found.")
        return None, None

    with open(path, encoding='utf-8') as handle:
        pairs = [(row["Method Phrase"], int(row["Count"])) for row in csv.DictReader(handle)]

    method_phrases = [phrase for phrase, _count in pairs]
    method_counts = [count for _phrase, count in pairs]
    print(f"✓ Loaded {len(method_phrases)} method phrases from {path}")
    return method_phrases, method_counts

def _parse_variant_groups(content):
    """Parse ``{canonical: [variants]}`` from an LLM reply; return None if no dict is present."""
    text = re.sub(r'```(?:python|json)?\n?', '', content.strip())
    dict_match = re.search(r'\{.*\}', text, re.DOTALL)
    if not dict_match:
        return None
    raw = dict_match.group(0)
    try:
        # JSON first: replies may contain JSON-only literals (true/false/null).
        parsed = json.loads(raw)
    except ValueError:
        parsed = ast.literal_eval(raw)
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a dictionary, got {type(parsed).__name__}")

    groups = {}
    for canonical, variants in parsed.items():
        if isinstance(variants, str):
            variants = [variants]
        groups[str(canonical)] = [str(variant) for variant in variants]
    return groups


def build_method_variant_groups(method_list, client, model_type, credit_tracker, batch_size=50):
    """Groups method variants together while keeping them as separate entries.

    ``method_list`` is sent to the LLM in batches of ``batch_size``; the model is
    asked for a dictionary mapping a canonical method name to its variants.

    Every method of a batch ends up in the result: if the call fails, the reply
    cannot be parsed, contains no dictionary, or simply omits a method, that
    method becomes its own single-entry group (previously such methods could be
    dropped silently).

    Args:
        method_list: Method phrases to group.
        client: OpenAI client exposing ``chat.completions.create``.
        model_type: Model name; names starting with ``gpt-5-nano`` get
            ``max_completion_tokens`` instead of sampling parameters.
        credit_tracker: Object with ``update(tokens)``; receives the usage
            reported by the API, or an estimate from the reply text otherwise.
        batch_size: Number of methods per prompt.

    Returns:
        dict[str, list[str]]: canonical name -> list of variants.
    """
    log = logging.getLogger(__name__)
    variant_groups = {}
    processed_methods = set()

    for i in range(0, len(method_list), batch_size):
        batch = [m for m in method_list[i:i + batch_size] if m not in processed_methods]
        if not batch:
            continue

        prompt = f"""Group these method terms by their semantic similarity. Methods that refer to the same core technique should be grouped together, but keep specific variants separate when they represent different approaches.

Methods: {batch}

For each group, identify:
1. A canonical name (most complete/descriptive)
2. All variants and abbreviations

Example format:
{{
  "optimal power flow": ["optimal power flow", "opf", "ac opf", "dc opf"],
  "security constrained optimal power flow": ["security constrained optimal power flow", "scopf", "scuc"],
  "monte carlo simulation": ["monte carlo simulation", "monte carlo", "mc simulation"]
}}

Return as a Python dictionary."""

        groups = {}
        try:
            # Build API parameters for gpt-5-nano compatibility
            api_params = {
                "model": model_type,
                "messages": [
                    {"role": "system", "content": "You are a scientific method classification expert."},
                    {"role": "user", "content": prompt}
                ]
            }

            # gpt-5-nano specific parameters
            if model_type.startswith('gpt-5-nano'):
                api_params["max_completion_tokens"] = 1500
            else:
                api_params["temperature"] = 0.1
                api_params["top_p"] = 0.9
                api_params["max_tokens"] = 1500

            response = client.chat.completions.create(**api_params)
            content = response.choices[0].message.content or ""

            # Prefer API-reported usage; fall back to estimating from the reply text.
            usage = getattr(response, "usage", None)
            used_tokens = getattr(usage, "total_tokens", None)
            if used_tokens is None:
                used_tokens = num_tokens_from_string(content, model_type)
            credit_tracker.update(used_tokens)

            try:
                parsed = _parse_variant_groups(content)
                if parsed is None:
                    log.warning("Failed to parse variant groups from batch: no dictionary found in response")
                else:
                    groups = parsed
            except Exception as e:
                log.warning(f"Failed to parse variant groups from batch: {e}")

        except Exception as e:
            log.error(f"Error in variant grouping: {e}")

        variant_groups.update(groups)

        # Fallback: any method the reply did not cover becomes its own group.
        covered = set()
        for canonical, variants in groups.items():
            covered.add(canonical.lower())
            covered.update(variant.lower() for variant in variants)
        for method in batch:
            if str(method).lower() not in covered:
                variant_groups.setdefault(method, [method])
        processed_methods.update(batch)

    log.info(f"Created {len(variant_groups)} method variant groups")
    return variant_groups

def create_variant_mapping(variant_groups):
    """Build lookup tables between variants and canonical method names.

    Returns:
        tuple: ``(variant_to_canonical, canonical_to_variants)``. The first maps
        each lowercased variant to its canonical name (a variant listed under
        several groups maps to the last one); the second maps each canonical
        name to its list of variants as given.
    """
    canonical_to_variants = dict(variant_groups.items())
    variant_to_canonical = {
        variant.lower(): canonical
        for canonical, variants in variant_groups.items()
        for variant in variants
    }
    return variant_to_canonical, canonical_to_variants


In [55]:
# %%
# Cell 6: Method Scoring Functions (Enhanced)

def compute_enhanced_tfidf_scores(processed_texts, method_variants_dict, ngram_range=(1, 4), min_df=1, max_df=0.95):
    """Compute TF-IDF scores for all method variants.

    Variants are de-duplicated first: a vocabulary with repeated terms makes the
    vectorizer raise, which previously ended in the all-zero fallback. Only
    variants found in the corpus (whole-word regex match) are used as the
    vocabulary. While fewer than 1000 variants have been accepted the whole
    corpus is searched; afterwards only the first 100 documents are checked.

    Args:
        processed_texts: Iterable of preprocessed document strings.
        method_variants_dict: Mapping canonical name -> list of variants.
        ngram_range, min_df, max_df: Passed to ``TfidfVectorizer``.

    Returns:
        tuple: ``(scores, feature_names)`` where ``scores`` is a dense
        documents x variants array. If no variant occurs in the corpus, a
        single zero column named ``'no_methods_found'`` is returned; if
        vectorizing fails, zeros for the existing variants are returned.
    """
    unique_variants = list(dict.fromkeys(
        variant for variants in method_variants_dict.values() for variant in variants
    ))

    # Lowercase the corpus once instead of once per (variant, document) pair.
    lowered_texts = [text.lower() for text in processed_texts]
    sample_texts = lowered_texts[:100]  # Sample check for efficiency

    existing_variants = []
    for variant in unique_variants:
        pattern = re.compile(r'\b' + re.escape(variant.lower()) + r'\b')
        search_space = lowered_texts if len(existing_variants) < 1000 else sample_texts
        if any(pattern.search(text) for text in search_space):
            existing_variants.append(variant)

    print(f"Found {len(existing_variants)} variants that exist in corpus out of {len(unique_variants)} total")

    if not existing_variants:
        logger.warning("No method variants found in corpus!")
        return np.zeros((len(processed_texts), 1)), ['no_methods_found']

    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=existing_variants,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        norm='l2',
        token_pattern=r'\b[\w-]+\b'
    )

    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)
        scores = tfidf_matrix.toarray()
        feature_names = tfidf_vectorizer.get_feature_names_out()
        return scores, feature_names
    except Exception as e:
        logger.error(f"TF-IDF computation failed: {e}")
        return np.zeros((len(processed_texts), len(existing_variants))), existing_variants

def compute_enhanced_lda_scores(processed_texts, method_variants_dict, ngram_range=(1, 3), n_topics=None, max_iter=20):
    """Compute LDA scores for method variants.

    An LDA model is fitted on the document/variant count matrix. The returned
    score of a variant in a document is the variant's probability under the
    document's topic mixture (document-topic weights times the normalized
    topic-term weights), so the columns line up with the returned variant
    names. Previously the raw document-topic matrix was returned next to the
    variant names, and the zero fallback built an invalid array shape.

    Args:
        processed_texts: Iterable of preprocessed document strings.
        method_variants_dict: Mapping canonical name -> list of variants
            (duplicates are removed because the vectorizer rejects them).
        ngram_range: N-gram range used to count variants; variants with more
            words than the upper bound are never counted.
        n_topics: Number of topics; defaults to ``min(#variants, 100)``.
        max_iter: Maximum LDA iterations.

    Returns:
        tuple: ``(scores, feature_names)`` with ``scores`` of shape
        documents x variants. Zeros are returned when fewer than two topics
        are requested or when the computation fails.
    """
    all_variants = list(dict.fromkeys(
        variant for variants in method_variants_dict.values() for variant in variants
    ))
    n_docs = len(processed_texts)
    if not all_variants:
        return np.zeros((n_docs, 0)), all_variants

    if n_topics is None:
        n_topics = min(len(all_variants), 100)

    vectorizer = CountVectorizer(
        vocabulary=all_variants,
        ngram_range=ngram_range,
        token_pattern=r'\b[\w-]+\b'
    )

    try:
        doc_term_matrix = vectorizer.fit_transform(processed_texts)
        feature_names = vectorizer.get_feature_names_out()

        if n_topics >= 2 and doc_term_matrix.shape[1] > 0:
            lda = LatentDirichletAllocation(
                n_components=min(n_topics, doc_term_matrix.shape[1]),
                learning_method='batch',
                random_state=42,
                max_iter=max_iter
            )
            doc_topic = lda.fit_transform(doc_term_matrix)
            # Normalize each topic's term weights into a distribution, then
            # project the topic mixture back onto the variant vocabulary.
            topic_term = lda.components_ / lda.components_.sum(axis=1, keepdims=True)
            lda_matrix = doc_topic @ topic_term
        else:
            lda_matrix = np.zeros((doc_term_matrix.shape[0], len(feature_names)))

        return lda_matrix, feature_names
    except Exception as e:
        logger.error(f"LDA computation failed: {e}")
        return np.zeros((n_docs, len(all_variants))), all_variants
    
def compute_enhanced_compound_scores(df, method_variants_dict, processed_col='processed_text', window=150):
    """Enhanced compound scoring that handles variants.

    For every (document, variant) pair the score is:
      * ``1.0`` if the variant occurs in the document as a whole phrase,
      * ``0.7`` if the variant has several words and all of them occur in the
        document as whole words (in any position),
      * ``0.0`` otherwise.

    Matching is done on whole words; plain substring tests produced false
    positives such as "ga" inside "regarding" or "opf" inside "scopf".

    Args:
        df: DataFrame holding the documents.
        method_variants_dict: Mapping canonical name -> list of variants.
        processed_col: Column with the preprocessed text.
        window: Unused; kept for backward compatibility.

    Returns:
        tuple: ``(scores, all_variants)`` where ``scores`` is a float32 array
        of shape documents x variants, columns ordered like ``all_variants``.
    """
    all_variants = [variant for variants in method_variants_dict.values() for variant in variants]
    docs = df[processed_col].fillna('').str.lower().tolist()
    # Word sets allow a cheap subset test before running the phrase regex.
    doc_words = [set(re.findall(r'\w+', text)) for text in docs]
    scores = np.zeros((len(docs), len(all_variants)), dtype=np.float32)

    for j, variant in enumerate(all_variants):
        variant_l = variant.lower().strip()
        if not variant_l:
            continue
        phrase_re = re.compile(r'(?<!\w)' + re.escape(variant_l) + r'(?!\w)')
        words = set(re.findall(r'\w+', variant_l))
        is_compound = len(variant_l.split()) > 1

        for i, text in enumerate(docs):
            # A whole-phrase hit implies every word of the variant is present,
            # so documents lacking any of them can be skipped outright.
            if words and not words <= doc_words[i]:
                continue
            if phrase_re.search(text):
                scores[i, j] = 1.0
            elif is_compound and words:
                scores[i, j] = 0.7

    return scores, all_variants

def aggregate_variant_scores_to_canonical(scores, variant_names, variant_to_canonical):
    """Aggregate variant scores back to canonical method names.

    Args:
        scores: 2-D array, one row per document and one column per entry of
            ``variant_names``.
        variant_names: Variant label of each score column.
        variant_to_canonical: Mapping lowercased variant -> canonical name.

    Returns:
        tuple: ``(canonical_scores, canonical_methods)``. Each canonical column
        is the sum of the columns of its variants. ``canonical_methods`` lists
        the canonical names in first-appearance order of the mapping's values,
        so the column order is reproducible between runs. Variants whose
        canonical name is not among the mapping's values are ignored.
    """
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set would reorder columns from run to run).
    canonical_methods = list(dict.fromkeys(variant_to_canonical.values()))
    canonical_to_idx = {method: i for i, method in enumerate(canonical_methods)}
    # Rows follow the documents: shape[0], not the whole shape tuple.
    canonical_scores = np.zeros((scores.shape[0], len(canonical_methods)))

    for j, variant in enumerate(variant_names):
        canonical = variant_to_canonical.get(variant.lower(), variant)
        canonical_idx = canonical_to_idx.get(canonical)
        if canonical_idx is not None:
            canonical_scores[:, canonical_idx] += scores[:, j]  # Sum scores for variants

    return canonical_scores, canonical_methods

def assign_top_methods_enhanced(
    df, canonical_scores, canonical_methods, variant_scores, variant_names,
    top_n=5, min_score=0.01, variant_to_canonical=None
):
    """Enhanced method assignment with granular variant tracking.

    For each document the ``top_n`` highest-scoring canonical methods are
    written to ``Top_<k>_Method`` / ``_Score`` / ``_Variants`` / ``_Confidence``
    columns of ``df`` (modified in place). Documents whose canonical scores are
    all equal, and ranks whose score is below ``min_score``, get empty values
    (``""`` / ``0.0``). The ``Primary_*`` and ``Method_Confidence`` columns
    mirror rank 1.

    Args:
        df: DataFrame with one row per row of ``canonical_scores``.
        canonical_scores: documents x canonical-methods score array.
        canonical_methods: Name of each canonical score column.
        variant_scores: documents x variants score array.
        variant_names: Name of each variant score column.
        top_n: Number of ranks to record.
        min_score: Minimum score for a method to be reported; scores above
            twice this value are labelled ``"confident"``, others
            ``"low_confidence"``.
        variant_to_canonical: Mapping lowercased variant -> canonical name.
            When omitted, a module-level ``variant_to_canonical`` is used if it
            exists (the previous implicit behaviour), else an empty mapping.

    Returns:
        pandas.DataFrame: The same ``df`` object with the new columns.
    """
    if variant_to_canonical is None:
        variant_to_canonical = globals().get('variant_to_canonical', {})

    canonical_scores = np.asarray(canonical_scores)
    n_docs = len(canonical_scores)

    # Column indices of the variants belonging to each canonical method.
    variants_by_method = {}
    for v_idx, variant in enumerate(variant_names):
        canonical = variant_to_canonical.get(variant.lower(), variant)
        variants_by_method.setdefault(canonical, []).append(v_idx)

    top_method = [[""] * n_docs for _ in range(top_n)]
    top_score = [[0.0] * n_docs for _ in range(top_n)]
    top_variants = [[""] * n_docs for _ in range(top_n)]
    confidence = [[""] * n_docs for _ in range(top_n)]

    for i, row in enumerate(canonical_scores):
        # Skip rows without any ranking information (all scores equal).
        # The former check compared the row with itself and skipped every row.
        if row.size == 0 or np.allclose(row, row[0]):
            continue

        order = np.argsort(row)[::-1]  # computed once per document, reused for every rank
        for rank in range(min(top_n, len(order))):
            method_idx = order[rank]
            score = row[method_idx]
            if not score >= min_score:
                continue

            method = canonical_methods[method_idx]
            contributions = [
                (variant_scores[i, v_idx], variant_names[v_idx])
                for v_idx in variants_by_method.get(method, [])
                if variant_scores[i, v_idx] > 0
            ]
            # Strongest contributors first; ties keep their original order.
            contributions.sort(key=lambda pair: -pair[0])

            top_method[rank][i] = method
            top_score[rank][i] = float(score)
            top_variants[rank][i] = "; ".join(
                f"{name}({value:.2f})" for value, name in contributions[:3]  # Top 3 variants
            )
            confidence[rank][i] = "confident" if score > min_score * 2 else "low_confidence"

    for rank in range(top_n):
        df[f'Top_{rank+1}_Method'] = top_method[rank]
        df[f'Top_{rank+1}_Score'] = top_score[rank]
        df[f'Top_{rank+1}_Variants'] = top_variants[rank]
        df[f'Top_{rank+1}_Confidence'] = confidence[rank]

    # Set primary columns
    df['Primary_Method'] = df['Top_1_Method']
    df['Primary_Method_Score'] = df['Top_1_Score']
    df['Primary_Method_Variants'] = df['Top_1_Variants']
    df['Method_Confidence'] = df['Top_1_Confidence']

    return df


In [None]:
# %%
# Cell 7: Topic Modeling and Topic Naming Functions (existing functions kept)

def run_lda_topic_modeling(df, num_topics=10, num_words=25):
    """Fit an LDA topic model on ``df['processed_text']`` enriched with phrases.

    Each document is extended with the bigram and trigram tokens (joined by
    ``_``) detected by gensim ``Phrases`` models, vectorized with unigram
    counts, and passed to ``LatentDirichletAllocation``.

    Returns:
        tuple: ``(lda_model, vectorizer, topic_distributions, topic_keywords)``
        where ``topic_keywords[topic_idx]['top_words']`` lists the
        ``num_words`` highest-weighted terms of that topic.
    """
    token_lists = [text.split() for text in df['processed_text']]
    bigram_model = Phrases(token_lists, min_count=10, threshold=50, delimiter='_')
    trigram_model = Phrases(bigram_model[token_lists], threshold=50, delimiter='_')

    def _with_phrases(tokens):
        # Original tokens first, then only the merged (underscore-joined) phrase tokens.
        pairs = [tok for tok in bigram_model[tokens] if '_' in tok]
        triples = [tok for tok in trigram_model[bigram_model[tokens]] if '_' in tok]
        return ' '.join(tokens + pairs + triples)

    phrased_docs = [_with_phrases(tokens) for tokens in token_lists]

    vectorizer = CountVectorizer(
        ngram_range=(1, 1),
        token_pattern=r'\b[\w_-]+\b',
        max_df=0.95,
        min_df=2,
        max_features=10000,
    )
    doc_term_matrix = vectorizer.fit_transform(phrased_docs)

    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)

    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {
        topic_idx: {'top_words': [feature_names[i] for i in weights.argsort()[:-num_words-1:-1]]}
        for topic_idx, weights in enumerate(lda_model.components_)
    }
    return lda_model, vectorizer, topic_distributions, topic_keywords

def assign_papers_to_topics(topic_distributions):
    """Assign each paper its primary and secondary topic.

    Args:
        topic_distributions: Iterable of per-paper topic weight vectors.

    Returns:
        list[dict]: One dict per paper with
            ``paper_idx`` - position of the paper in ``topic_distributions``,
            ``primary_topic`` / ``secondary_topic`` - indices of the two
            highest-weighted topics (``secondary_topic`` is ``None`` when there
            is only one topic),
            ``primary_score`` - weight of the primary topic (a float; formerly
            a two-element array because both top indices were used),
            ``dominance_ratio`` - primary weight divided by the summed weight
            of all other topics.
    """
    paper_classifications = []
    for idx, dist in enumerate(topic_distributions):
        dist = np.asarray(dist, dtype=float)
        top_2_topics = np.argsort(dist)[-2:][::-1]
        primary_topic = int(top_2_topics[0])
        secondary_topic = int(top_2_topics[1]) if top_2_topics.size > 1 else None
        # Score of the primary topic only.
        primary_score = float(dist[primary_topic])
        other_topics_sum = float(dist.sum()) - primary_score
        dominance_ratio = primary_score / (other_topics_sum + 1e-10)
        paper_classifications.append({
            'paper_idx': idx,
            'primary_topic': primary_topic,
            'secondary_topic': secondary_topic,
            'primary_score': primary_score,
            'dominance_ratio': dominance_ratio
        })
    return paper_classifications

def string_similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def topic_name_llm_robust(
    lda_keywords, tfidf_ngrams, top_titles,
    client, model_type, credit_tracker,
    initial_iterations=3, max_iterations=10, similarity_threshold=0.7,
    temp=0, top_p=1.0
):
    """Ask the LLM for a topic name until its answers agree.

    The same prompt is sent ``iterations`` times per round. A name is accepted
    as soon as at least half (integer division) of the round's answers are
    similar to it (``string_similarity >= similarity_threshold``). Otherwise the
    number of calls per round grows by two until it exceeds ``max_iterations``;
    then the most common answer of the last round is returned.

    Args:
        lda_keywords, tfidf_ngrams, top_titles: String lists inserted into the prompt.
        client: OpenAI client exposing ``chat.completions.create``.
        model_type: Model name. As in the other LLM helpers of this notebook,
            ``temperature`` / ``top_p`` are not sent for ``gpt-5-nano`` models.
        credit_tracker: Object with ``update(tokens)`` or ``None``; receives the
            usage reported by the API, or an estimate from the reply text.
        initial_iterations: Calls in the first round.
        max_iterations: Upper bound for the calls per round.
        similarity_threshold: Minimum similarity for two names to count as agreeing.
        temp, top_p: Sampling parameters for non gpt-5-nano models.

    Returns:
        str: The chosen topic name, or ``""`` if the model never returned text.
        (The fallback used to return ``Counter.most_common``'s list of
        ``(name, count)`` tuples instead of the name.)
    """
    prompt = (
        "Based on the following keywords and n-grams from LDA and TF-IDF, plus top paper titles, provide a concise topic name "
        "(bigram or trigram, single word if very specific):\n"
        f"LDA: {', '.join(lda_keywords)}\n"
        f"TFIDF: {', '.join(tfidf_ngrams)}\n"
        f"TITLES: {', '.join(top_titles)}\n"
        "Return ONLY the topic name."
    )
    api_params = {
        "model": model_type,
        "messages": [
            {"role": "system", "content": "You are a science topic-naming assistant."},
            {"role": "user", "content": prompt}
        ],
    }
    if not model_type.startswith('gpt-5-nano'):
        api_params["temperature"] = temp
        api_params["top_p"] = top_p

    iterations = initial_iterations
    generated_names = []  # stays empty if the loop body never runs
    while iterations <= max_iterations:
        generated_names = []
        for _ in range(iterations):
            response = client.chat.completions.create(**api_params)
            content = (response.choices[0].message.content or "").strip()

            if credit_tracker is not None:
                usage = getattr(response, "usage", None)
                used_tokens = getattr(usage, "total_tokens", None)
                if used_tokens is None:
                    used_tokens = num_tokens_from_string(content, model_type)
                credit_tracker.update(used_tokens)

            if content:
                generated_names.append(content)

        for i, name in enumerate(generated_names):
            matches = [other for j, other in enumerate(generated_names)
                       if i != j and string_similarity(name, other) >= similarity_threshold]
            if len(matches) >= len(generated_names)//2:
                print(f"Topic name stabilized after {iterations} iterations: {name}")
                return name
        iterations += 2
        print(f"No majority topic name found, increasing iterations to {iterations}.")

    # most_common(1) yields [(name, count)]; unwrap it so callers always get a string.
    most_common = Counter(generated_names).most_common(1)
    fallback_name = most_common[0][0] if most_common else ""
    print(f"Returning most common topic name after {max_iterations} iterations: {fallback_name}")
    return fallback_name

def get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10):
    """Return the titles of the most-cited papers whose primary topic is ``topic_idx``.

    Args:
        df: DataFrame with a ``title`` column and optionally ``citationCount``;
            rows are addressed by position via each classification's ``paper_idx``.
        paper_classifications: Dicts with ``paper_idx`` and ``primary_topic``.
        topic_idx: Topic to select.
        n_titles: Maximum number of titles to return.

    Returns:
        list: Up to ``n_titles`` titles sorted by citation count, highest
        first. Papers without a title are skipped. Missing or non-numeric
        citation counts are treated as 0 (NaN keys used to leave the list
        effectively unsorted); ties keep their input order.
    """
    has_citations = 'citationCount' in df.columns
    ranked = []
    for paper in paper_classifications:
        if paper['primary_topic'] != topic_idx:
            continue
        row = df.iloc[paper['paper_idx']]
        title = row['title']
        if pd.isna(title):
            continue
        citations = pd.to_numeric(row['citationCount'], errors='coerce') if has_citations else 0
        if pd.isna(citations):
            citations = 0
        ranked.append((citations, title))

    # Sort by citation count (descending); list.sort is stable.
    ranked.sort(key=lambda pair: -pair[0])
    return [title for _citations, title in ranked[:n_titles]]


def get_top_tfidf_ngrams_per_topic(df, tfidf_matrix, feature_names, topic_col='Primary_Topic_Index', top_k=10):
    """Return the highest mean-TF-IDF terms for every topic.

    Args:
        df: DataFrame whose rows correspond, in order, to the rows of
            ``tfidf_matrix``.
        tfidf_matrix: documents x terms matrix (dense or sparse).
        feature_names: Term label of each matrix column.
        topic_col: Column of ``df`` holding each document's topic index.
        top_k: Maximum number of terms per topic.

    Returns:
        dict[int, list[tuple]]: topic index -> ``(term, mean_tfidf)`` pairs in
        descending order of mean TF-IDF; terms with a mean of 0 are omitted.
    """
    tfidf_ngrams = {}
    for topic_idx in df[topic_col].dropna().unique():
        topic_idx = int(topic_idx)
        # Select matrix rows by position, not by index label: the DataFrame
        # index may have gaps after filtering and then no longer matches the
        # matrix row numbers.
        doc_positions = np.flatnonzero((df[topic_col] == topic_idx).to_numpy())
        if len(doc_positions) == 0:
            continue
        topic_tfidf = np.asarray(tfidf_matrix[doc_positions].mean(axis=0)).ravel()
        top_indices = topic_tfidf.argsort()[-top_k:][::-1]
        top_terms = [(feature_names[i], topic_tfidf[i]) for i in top_indices if topic_tfidf[i] > 0]
        tfidf_ngrams[topic_idx] = top_terms
    return tfidf_ngrams

def get_author_stats(paper_classifications, df_field, n_top=5):
    """Collect the most topic-dominant papers (with their authors) per topic.

    For every primary topic the papers are ordered by ``dominance_ratio``
    (highest first) and the first ``n_top`` are described. As a side effect the
    ``dominance_ratio`` and ``primary_score`` entries of the classification
    dicts are converted to plain floats in place.

    Returns:
        tuple: ``(top_papers, author_topic_stats)``. ``top_papers`` maps each
        topic to a list of dicts with ``paperId``, ``title``, ``authors``
        (list of ``{'name', 'id'}``), ``score`` and ``dominance_ratio``.
        ``author_topic_stats`` is always an empty dict here. Papers whose row
        cannot be read are skipped.
    """
    def _as_float(value):
        # numpy arrays: the single element, or the first one for larger arrays.
        if isinstance(value, np.ndarray):
            return float(value.item()) if value.size == 1 else float(value.flat[0])
        if hasattr(value, 'item'):
            return float(value.item())
        return float(value)

    def _author_entries(raw):
        # Authors may arrive as the string form of a list of dicts.
        if isinstance(raw, str):
            try:
                raw = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                raw = []
        if not isinstance(raw, list):
            return []
        return [
            {'name': entry.get('name', 'Unknown'), 'id': entry.get('authorId', 'Unknown')}
            for entry in raw
            if isinstance(entry, dict)
        ]

    top_papers = {}
    author_topic_stats = {}

    for topic in {paper['primary_topic'] for paper in paper_classifications}:
        members = [paper for paper in paper_classifications if paper['primary_topic'] == topic]

        for paper in members:
            paper['dominance_ratio'] = _as_float(paper['dominance_ratio'])
            paper['primary_score'] = _as_float(paper['primary_score'])

        members.sort(key=lambda paper: paper['dominance_ratio'], reverse=True)
        top_papers[topic] = []

        for paper in members[:n_top]:
            position = paper['paper_idx']
            try:
                author_list = _author_entries(df_field.iloc[position]['authors'])
                top_papers[topic].append({
                    'paperId': df_field.iloc[position].get('paperId', ''),
                    'title': df_field.iloc[position].get('title', ''),
                    'authors': author_list,
                    'score': float(paper['primary_score']),
                    'dominance_ratio': float(paper['dominance_ratio'])
                })
            except Exception:
                continue

    return top_papers, author_topic_stats



In [13]:
# %%
# Cell 8: Utility Functions for Saving

def save_term_frequencies(df, suffix_string, save_dir=SAVE_DIR, max_keywords=5000):
    """Save unigram, bigram and trigram counts of ``df['processed_text']`` as JSON.

    Args:
        df: DataFrame with a ``processed_text`` column; missing values are
            treated as empty strings.
        suffix_string: Text inserted into the output file name
            (``term_frequencies_<suffix_string>.json``).
        save_dir: Directory the JSON file is written to (created if missing).
        max_keywords: Upper bound on the number of terms kept per n-gram order.

    Returns:
        Path of the written JSON file. The file holds three sections,
        ``keywords``, ``bigrams`` and ``trigrams``, each mapping a term to its
        total count, ordered from most to least frequent.
    """
    section_by_order = {1: 'keywords', 2: 'bigrams', 3: 'trigrams'}
    processed_text = df['processed_text'].fillna('').astype(str)

    freq_data = {}
    for n, section in section_by_order.items():
        vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english', max_features=max_keywords)
        try:
            matrix = vectorizer.fit_transform(processed_text)
        except ValueError as exc:
            # scikit-learn reports "empty vocabulary" when no n-gram of this
            # order survives tokenisation (e.g. only very short or stop-word-only
            # texts). Record an empty section instead of aborting the export.
            if 'empty vocabulary' not in str(exc):
                raise
            freq_data[section] = {}
            continue

        terms = vectorizer.get_feature_names_out()
        # np.asarray(...).ravel() flattens the column sums whether the sparse
        # sum comes back as np.matrix or as a plain ndarray.
        freqs = np.asarray(matrix.sum(axis=0)).ravel()

        # Sort on the count (x[1]), highest first.
        ranked = sorted(zip(terms, freqs), key=lambda x: -x[1])
        freq_data[section] = {term: int(freq) for term, freq in ranked}

    os.makedirs(save_dir, exist_ok=True)
    out_fn = os.path.join(save_dir, f'term_frequencies_{suffix_string}.json')
    with open(out_fn, 'w', encoding='utf-8') as f:
        json.dump(freq_data, f, indent=2)
    print(f"✓ Saved term frequency summary to {out_fn}")
    return out_fn


def _author_names_from_cell(cell):
    """Return the author names contained in one raw value of the ``authors`` column.

    Strings that look like a Python list/dict literal are parsed with
    ``ast.literal_eval`` (never ``eval``: the text comes from data files and
    must not be executed). Anything that cannot be parsed is kept as a plain
    name. Blank strings and unsupported types yield no names.
    """
    if isinstance(cell, str):
        text = cell.strip()
        if not text:
            return []
        parsed = text
        if text.startswith(("[", "{")):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal: fall back to treating the text as a name.
                parsed = text
    else:
        parsed = cell

    if isinstance(parsed, list):
        names = []
        for author in parsed:
            if isinstance(author, dict) and 'name' in author:
                names.append(author['name'])
            elif isinstance(author, str):
                names.append(author)
        return names
    if isinstance(parsed, dict) and 'name' in parsed:
        return [parsed['name']]
    if isinstance(parsed, str):
        return [parsed]
    return []


def save_author_and_venue_frequencies(df, suffix_string, save_dir=SAVE_DIR):
    """Write author and venue frequency tables for ``df`` as ';'-separated CSV files.

    Args:
        df: DataFrame that may contain an ``authors`` column (lists, dicts,
            plain names, or their string representations) and/or a ``venue``
            column. A missing column is reported and skipped.
        suffix_string: Text inserted into both output file names.
        save_dir: Directory the CSV files are written to.

    Writes ``semantic_scholar_<suffix>_author_analysis.csv`` (columns
    ``Author``, ``Frequency``) and ``semantic_scholar_<suffix>_venue_frequencies.csv``
    (columns ``Venue``, ``Frequency``). Returns ``None``.
    """
    if 'authors' in df.columns:
        authors_all = []
        for item in df['authors']:
            authors_all.extend(_author_names_from_cell(item))
        # dtype=object keeps an empty result well-defined.
        author_counts = pd.Series(authors_all, dtype=object).value_counts().reset_index()
        author_counts.columns = ['Author', 'Frequency']
        author_fn = os.path.join(save_dir, f"semantic_scholar_{suffix_string}_author_analysis.csv")
        author_counts.to_csv(author_fn, sep=';', encoding='utf-8', index=False)
        print(f"✓ Saved author frequencies: {author_fn}")
    else:
        print("No 'authors' column found in DF: skipping author frequencies.")

    if 'venue' in df.columns:
        venue_counts = df['venue'].value_counts().reset_index()
        venue_counts.columns = ['Venue', 'Frequency']
        venue_fn = os.path.join(save_dir, f"semantic_scholar_{suffix_string}_venue_frequencies.csv")
        venue_counts.to_csv(venue_fn, sep=';', encoding='utf-8', index=False)
        print(f"✓ Saved venue frequencies: {venue_fn}")
    else:
        print("No 'venue' column found in DF: skipping venue frequencies.")

def save_topic_analysis_outputs(
    df, lda_model, lda_vectorizer, topic_distributions, topic_keywords, topic_names, topic_ngrams,
    author_stats, top_papers, tfidf_ngrams, suffix_string
):
    """Persist every artefact of the topic-analysis step into ``SAVE_DIR``.

    All file names end in ``_<suffix_string>`` plus their extension.

    Args:
        df: Accepted for interface compatibility; not read by this function.
        lda_model: Fitted model, stored with joblib as ``lda_model_*.joblib``.
        lda_vectorizer: Fitted vectorizer, stored as ``lda_vectorizer_*.joblib``.
        topic_distributions: Array saved with ``np.save`` as
            ``topic_distributions_*.npy``.
        topic_keywords: Mapping topic id -> keyword data (``topics_*.json``).
        topic_names: Mapping topic id -> name (``topics_*.json`` and
            ``topic_names_*.json``).
        topic_ngrams: Mapping topic id -> n-gram data (``topics_*.json``).
        author_stats: Dict of per-author records, written row-wise to
            ``author_stats_*.csv``.
        top_papers: Mapping topic id -> paper records (``top_papers_*.json``).
        tfidf_ngrams: Mapping topic id -> iterable of ``(term, score)`` pairs,
            written to ``topic_specific_tfidf_ngrams_*.json``. ``None`` falls
            back to ``topic_ngrams``.
        suffix_string: Suffix shared by all output file names.
    """
    # Local import: joblib is not among the notebook's top-level imports.
    import joblib

    # The original body ignored `tfidf_ngrams` and serialised `topic_ngrams`
    # into the TF-IDF file; honour the parameter (callers passing the same
    # object for both arguments get identical output).
    if tfidf_ngrams is None:
        tfidf_ngrams = topic_ngrams

    def _artifact(stem, ext):
        # Build "<SAVE_DIR>/<stem>_<suffix_string>.<ext>".
        return os.path.join(SAVE_DIR, f"{stem}_{suffix_string}.{ext}")

    def _write_json(stem, payload, **dump_kwargs):
        # Dump `payload` as indented UTF-8 JSON.
        with open(_artifact(stem, "json"), "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, **dump_kwargs)

    # Keys are cast to int first so numpy integer ids become valid JSON keys.
    names_by_topic = {int(k): v for k, v in topic_names.items()}

    _write_json("topics", {
        "topics": {int(k): v for k, v in topic_keywords.items()},
        "topic_names": names_by_topic,
        "topic_ngrams": {int(k): v for k, v in topic_ngrams.items()},
    })
    _write_json("topic_names", names_by_topic)

    np.save(_artifact("topic_distributions", "npy"), topic_distributions)
    joblib.dump(lda_model, _artifact("lda_model", "joblib"))
    joblib.dump(lda_vectorizer, _artifact("lda_vectorizer", "joblib"))

    # default=str stringifies any value json cannot encode natively.
    _write_json(
        "top_papers",
        {int(k): v for k, v in top_papers.items()},
        ensure_ascii=False,
        default=str,
    )

    pd.DataFrame.from_dict(author_stats, orient='index').to_csv(_artifact("author_stats", "csv"))

    _write_json(
        "topic_specific_tfidf_ngrams",
        {int(k): [(term, float(score)) for term, score in v] for k, v in tfidf_ngrams.items()},
    )

def diagnostics_enhanced(df, canonical_scores, variant_scores, canonical_methods, variant_names):
    """Print a plain-text summary of the method-scoring results.

    Args:
        df: DataFrame; the ``Primary_Method`` and ``Method_Confidence`` columns
            are summarised when present.
        canonical_scores: 2-D score array (rows = documents, columns =
            canonical methods).
        variant_scores: Score array whose second axis indexes method variants.
        canonical_methods: Sequence of canonical method names (first 5 shown).
        variant_names: Sequence of variant names (first 10 shown).

    Returns:
        None. Everything is written to stdout.
    """

    def _report_counts(heading, counts):
        # One "  label: count" row per entry, leaving out falsy labels such as ''.
        print(heading)
        for label, total in counts.items():
            if not label:
                continue
            print(f"  {label}: {total}")

    doc_total, canonical_total = canonical_scores.shape
    variant_total = variant_scores.shape[1]

    # A document is "covered" when at least one of its scores is positive.
    canonical_hit = (canonical_scores > 0).any(axis=1)
    variant_hit = (variant_scores > 0).any(axis=1)

    print("=== ENHANCED DIAGNOSTICS ===")
    print(f"Total documents: {doc_total}")
    print(f"Canonical methods: {canonical_total}")
    print(f"Method variants: {variant_total}")
    print(f"Canonical coverage: {canonical_hit.sum()}/{doc_total} ({100*canonical_hit.mean():.1f}%)")
    print(f"Variant coverage: {variant_hit.sum()}/{doc_total} ({100*variant_hit.mean():.1f}%)")

    available = df.columns
    if 'Primary_Method' in available:
        _report_counts("\nMethod distribution (top 10):", df['Primary_Method'].value_counts().head(10))
    if 'Method_Confidence' in available:
        _report_counts("\nConfidence distribution:", df['Method_Confidence'].value_counts())

    print(f"\nCanonical methods sample: {canonical_methods[:5]}")
    print(f"Variant methods sample: {variant_names[:10]}")
    print(f"\nCanonical scores stats: mean={canonical_scores.mean():.3f}, std={canonical_scores.std():.3f}")
    print(f"Variant scores stats: mean={variant_scores.mean():.3f}, std={variant_scores.std():.3f}")


In [23]:
# %%
# Cell 9: Topic Analysis Workflow
#
# NOTE(review): the whole workflow between the triple quotes below is a bare
# string literal, so none of it executes. Only the five statements after the
# closing quotes run. They read `paper_classifications`, `df`, `lda_model`,
# `lda_vectorizer`, `topic_distributions`, `topic_keywords`, `topic_names`,
# `topic_ngrams` and `suffix_string`, none of which this cell assigns any more.
# Unless another cell defines them, they must be left over from an earlier
# execution, and the cell raises NameError on a fresh kernel.
# When re-enabling, delete the quotes rather than exec-ing the string: inside
# this non-raw literal the `\b` / `\w` in `token_pattern` are handled as string
# escapes.
"""
NUM_TOPICS = 12
NUM_TOPIC_WORDS = 15
TOPIC_LLM_ITER_INIT = 3
TOPIC_LLM_ITER_MAX = 9
TOPIC_LLM_SIM_THRESH = 0.72
TOPIC_LLM_TEMP = 1
TOPIC_LLM_TOP_P = 1.0

current_date = datetime.now().strftime("%Y_%m_%d")
keyword_str = '_'.join(extract_keywords_from_filename(filename)) if 'filename' in locals() else ""
suffix_string = f"{current_date}_{keyword_str}"
save_term_frequencies(df, suffix_string)
save_author_and_venue_frequencies(df, suffix_string)

logger.info("Starting topic modeling workflow...")  
lda_model, lda_vectorizer, topic_distributions, topic_keywords = run_lda_topic_modeling(
    df, num_topics=NUM_TOPICS, num_words=NUM_TOPIC_WORDS)
logger.info("✓ LDA topic modeling completed.")

paper_classifications = assign_papers_to_topics(topic_distributions)
df['Primary_Topic_Index'] = [int(p['primary_topic']) for p in paper_classifications]
df['Primary_Score'] = [p['primary_score'] for p in paper_classifications]
df['Dominance_Ratio'] = [p['dominance_ratio'] for p in paper_classifications]

logger.info("✓ Papers assigned to topics based on LDA distributions.")

topic_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3), min_df=2, max_df=0.95, token_pattern=r'\b[\w_-]+\b'
)
topic_tfidf_matrix = topic_tfidf_vectorizer.fit_transform(df['processed_text'])
topic_tfidf_feature_names = topic_tfidf_vectorizer.get_feature_names_out()

topic_ngrams = get_top_tfidf_ngrams_per_topic(
    df, topic_tfidf_matrix, topic_tfidf_feature_names, topic_col='Primary_Topic_Index', top_k=10)

logger.info("✓ Extracted topic-specific TF-IDF n-grams for naming.")

topic_names = {}
for topic_idx, keywords in topic_keywords.items():
    lda_ngrams = keywords['top_words'][:NUM_TOPIC_WORDS]
    tfidf_ng = [ngram for ngram, _ in topic_ngrams.get(topic_idx, [])][:NUM_TOPIC_WORDS]
    top_titles = get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10)
    topic_name = topic_name_llm_robust(
        lda_ngrams, tfidf_ng, top_titles,
        client, model_type, credit_tracker,
        initial_iterations=TOPIC_LLM_ITER_INIT,
        max_iterations=TOPIC_LLM_ITER_MAX,
        similarity_threshold=TOPIC_LLM_SIM_THRESH,
        temp=TOPIC_LLM_TEMP, top_p=TOPIC_LLM_TOP_P
    )
    topic_names[topic_idx] = topic_name
    logger.info(f"Topic {topic_idx}: {topic_name if topic_name else 'Unnamed'}")

df['Primary_Topic'] = df['Primary_Topic_Index'].map(lambda x: topic_names.get(x, f"Topic_{x}"))
logger.info("✓ Topic naming and assignment completed.")
"""
# Live part of the cell: collect the top 5 papers per topic plus author
# statistics, then write all topic artefacts to disk.
top_papers, author_stats = get_author_stats(paper_classifications, df, n_top=5)

# `topic_ngrams` is passed twice on purpose or by oversight: once as the
# `topic_ngrams` argument and once in the `tfidf_ngrams` position.
save_topic_analysis_outputs(df, lda_model, lda_vectorizer, topic_distributions, topic_keywords, topic_names, topic_ngrams, author_stats, top_papers, topic_ngrams, suffix_string)
print("\nSample topics and names:")
# Show at most the first five topic ids with their names.
print({k: topic_names[k] for k in list(topic_names)[:5]})



Sample topics and names:
{0: 'Photovoltaic Power Systems', 1: 'Optical Wireless Communications', 2: 'Sensor Networks', 3: 'Smart Grid Sensor Networks', 4: 'Power system control'}


In [70]:
# %%
# Cell 10: Enhanced Method Extraction and Assignment Workflow (Updated for gpt-5-nano)
#
# Steps 1-5a: load or extract method phrases, group them into variants, score
# every variant three ways (TF-IDF, LDA, compound) and decide on the shared
# feature list that the score matrices are aligned to further down.

# Parameters
MAX_FEATURES = 15000
TFIDF_WEIGHT = 0.4
LDA_WEIGHT = 0.3
COMPOUND_WEIGHT = 0.3
TOP_METHODS_PER_PAPER = 5
MIN_ASSIGN_SCORE = 0.01
BATCH_SIZE = 2000

# LLM parameters (gpt-5-nano compatible)
METHOD_LLM_N_RUNS = 3

# Cache file for extracted method phrases. This is the location
# save_method_phrases_to_csv() writes to by default (SAVE_DIR + default name).
# The previous value, os.path.join(SAVE_DIR, "Saved_files_new\extracted_..."),
# repeated the directory and contained the invalid escape "\e", so the cache
# could never be found.
# NOTE(review): assumes load_method_phrases_from_csv opens `filename` as given;
# confirm it does not prepend SAVE_DIR itself.
METHOD_PHRASES_CSV = os.path.join(SAVE_DIR, "extracted_method_phrases.csv")

logger.info("=== Starting Enhanced Method Detection Pipeline ===")

method_phrases, method_counts = load_method_phrases_from_csv(filename=METHOD_PHRASES_CSV)

if (method_phrases is None) or (len(method_phrases) < 3):
    # No usable cache (missing or fewer than 3 phrases): run the extraction.
    # 1. Extract candidate terms
    logger.info("Step 1: Extracting candidate terms...")
    candidate_terms = extract_candidate_terms(df, text_col='processed_text', max_features=MAX_FEATURES)
    logger.info(f"✓ Extracted {len(candidate_terms)} candidate terms")
    print(f"Sample candidate terms: {candidate_terms[:10]}")
    # 2. Enhanced LLM method extraction (gpt-5-nano compatible)
    logger.info("Step 2: Enhanced LLM method extraction...")
    method_phrases, method_counts = get_method_phrases_enhanced(
        candidate_terms,
        client,
        model_type,
        credit_tracker,
        n_runs=METHOD_LLM_N_RUNS,
        batch_size=BATCH_SIZE
    )
    # Persist the result so the next run can skip the LLM calls.
    save_method_phrases_to_csv(method_phrases, method_counts)
else:
    logger.info(f"Loaded {len(method_phrases)} method phrases from existing CSV")


if not method_phrases:
    logger.error("No method phrases extracted! Check your LLM configuration and prompts.")
    raise RuntimeError("Method extraction failed - no phrases found")

logger.info(f"✓ Extracted {len(method_phrases)} method phrases")
print(f"Sample methods: {method_phrases[:10]}")

# 3. Build variant groups while keeping granular methods
logger.info("Step 3: Building method variant groups...")
variant_groups = build_method_variant_groups(
    method_phrases, client, model_type, credit_tracker, batch_size=50
)

# Create mappings
variant_to_canonical, canonical_to_variants = create_variant_mapping(variant_groups)
logger.info(f"✓ Created {len(canonical_to_variants)} canonical methods with {len(variant_to_canonical)} total variants")

print("\nSample variant groups:")
for canonical, variants in list(canonical_to_variants.items())[:5]:
    print(f"  {canonical}: {variants}")

# 4. Compute scores for all variants
logger.info("Step 4: Computing enhanced scoring matrices...")

logger.info("  4a: TF-IDF scoring...")
tfidf_scores, tfidf_feature_names = compute_enhanced_tfidf_scores(
    df['processed_text'], canonical_to_variants
)
logger.info(f"  ✓ TF-IDF: {tfidf_scores.shape}")

logger.info("  4b: LDA scoring...")
lda_scores, lda_feature_names = compute_enhanced_lda_scores(
    df['processed_text'], canonical_to_variants, n_topics=50
)
logger.info(f"  ✓ LDA: {lda_scores.shape}")

logger.info("  4c: Compound scoring...")
compound_scores, compound_feature_names = compute_enhanced_compound_scores(
    df, canonical_to_variants
)
logger.info(f"  ✓ Compound: {compound_scores.shape}")

# 5. Combine variant scores
logger.info("Step 5: Combining variant scores...")
# Ensure all score matrices have the same feature order.
# Keep the features present in all three scorers, in TF-IDF order. A plain
# list(set & set & set) iterates in hash order, which varies between
# interpreter runs and reshuffled the columns of every saved score matrix.
_lda_feature_set = set(lda_feature_names)
_compound_feature_set = set(compound_feature_names)
common_features = [
    feat for feat in dict.fromkeys(tfidf_feature_names)  # dict.fromkeys de-duplicates, order-preserving
    if feat in _lda_feature_set and feat in _compound_feature_set
]
logger.info(f"  Common features across all methods: {len(common_features)}")

if len(common_features) == 0:
    logger.warning("No common features found - using TF-IDF features as reference")
    common_features = tfidf_feature_names

# Align score matrices
def align_scores(scores, current_features, target_features):
    """Re-order the columns of ``scores`` to follow ``target_features``.

    Args:
        scores: 2-D array whose columns correspond to ``current_features``.
        current_features: Feature name of each existing column.
        target_features: Feature names wanted in the result, in order.

    Returns:
        ``scores`` itself when both feature sequences are already equal;
        otherwise a new float array with one column per target feature, where
        features absent from ``current_features`` are left at zero.
    """
    source_names = list(current_features)
    wanted_names = list(target_features)
    if source_names == wanted_names:
        return scores

    # Name -> column position in the incoming matrix.
    column_of = {name: position for position, name in enumerate(source_names)}
    result = np.zeros((scores.shape[0], len(wanted_names)))

    for target_col, name in enumerate(wanted_names):
        source_col = column_of.get(name)
        if source_col is not None:
            result[:, target_col] = scores[:, source_col]

    return result

# Steps 5b-10: align the three score matrices, blend them, aggregate to
# canonical methods, assign methods to papers, save everything and report.

# Put each score matrix on the `common_features` column order so the
# element-wise weighted sum below adds matching variants together.
tfidf_aligned = align_scores(tfidf_scores, tfidf_feature_names, common_features)
lda_aligned = align_scores(lda_scores, lda_feature_names, common_features)
compound_aligned = align_scores(compound_scores, compound_feature_names, common_features)

# Combined variant scores: weighted sum using the *_WEIGHT constants defined
# at the top of this cell.
combined_variant_scores = (TFIDF_WEIGHT * tfidf_aligned + 
                          LDA_WEIGHT * lda_aligned + 
                          COMPOUND_WEIGHT * compound_aligned)

logger.info(f"  ✓ Combined variant scores: {combined_variant_scores.shape}")

# 6. Aggregate to canonical methods
logger.info("Step 6: Aggregating to canonical methods...")
canonical_scores, canonical_methods = aggregate_variant_scores_to_canonical(
    combined_variant_scores, common_features, variant_to_canonical
)
logger.info(f"✓ Canonical scores: {canonical_scores.shape} for {len(canonical_methods)} methods")

# 7. Assign methods to papers
# NOTE(review): `df` is rebound to the helper's return value, so re-running
# this cell feeds the already-annotated frame back in.
logger.info("Step 7: Assigning methods to papers...")
df = assign_top_methods_enhanced(
    df, canonical_scores, canonical_methods, 
    combined_variant_scores, common_features,
    top_n=TOP_METHODS_PER_PAPER, min_score=MIN_ASSIGN_SCORE
)
logger.info("✓ Method assignment completed")

# 8. Save results
# All files go to SAVE_DIR and carry `suffix_string`, which this cell does not
# assign; it has to exist in the kernel already.
logger.info("Step 8: Saving results...")

# Save variant groups
with open(os.path.join(SAVE_DIR, f"method_variant_groups_{suffix_string}.json"), 'w') as f:
    json.dump(canonical_to_variants, f, indent=2)

# Save score matrices (row index is written as the first CSV column)
pd.DataFrame(canonical_scores, columns=canonical_methods).to_csv(
    os.path.join(SAVE_DIR, f"canonical_method_scores_{suffix_string}.csv")
)

pd.DataFrame(combined_variant_scores, columns=common_features).to_csv(
    os.path.join(SAVE_DIR, f"variant_method_scores_{suffix_string}.csv")
)

# Save final dataframe (without the row index)
df.to_csv(os.path.join(SAVE_DIR, f"enhanced_method_analysis_{suffix_string}.csv"), index=False)

# 9. Diagnostics
logger.info("Step 9: Running diagnostics...")
diagnostics_enhanced(df, canonical_scores, combined_variant_scores, canonical_methods, common_features)

# 10. Display results
print("\n=== SAMPLE RESULTS ===")
sample_cols = ['Primary_Method', 'Primary_Method_Score', 'Primary_Method_Variants', 'Method_Confidence']
# Only show the columns that are actually present in `df`.
available_cols = [col for col in sample_cols if col in df.columns]
print(df[available_cols].head(10))

print(f"\n=== METHOD STATISTICS ===")
if 'Primary_Method' in df.columns:
    method_stats = df['Primary_Method'].value_counts()
    # A paper counts as "assigned" when Primary_Method is not the empty string.
    # NOTE(review): a NaN entry also compares unequal to '' and would be
    # counted as assigned - confirm unassigned papers are stored as ''.
    print(f"Papers with methods assigned: {(df['Primary_Method'] != '').sum()}/{len(df)} ({100*(df['Primary_Method'] != '').mean():.1f}%)")
    print(f"Top 10 methods:")
    for method, count in method_stats.head(10).items():
        if method:
            print(f"  {method}: {count}")

print(f"\n✓ Enhanced method detection pipeline completed!")
print(f"✓ Results saved to {SAVE_DIR}")
logger.info("Enhanced method detection pipeline completed successfully!")


2025-08-23 11:57:07,199 - INFO - === Starting Enhanced Method Detection Pipeline ===
2025-08-23 11:57:07,201 - INFO - Step 1: Extracting candidate terms...


KeyboardInterrupt: 

In [None]:
#save method phrases from list to csv
import re
import os
import csv
def save_method_phrases_to_csv(method_phrases, method_counts, filename="extracted_method_phrases.csv"):
    """Save method phrases and their counts to a CSV file inside ``SAVE_DIR``.

    Args:
        method_phrases: Iterable of phrases; each is converted to text and has
            every run of whitespace (including newlines) collapsed to one space.
        method_counts: Either a sequence of counts parallel to
            ``method_phrases`` or a dict/Counter keyed by phrase (phrases
            missing from the dict are written with count 0).
        filename: File name relative to ``SAVE_DIR``.

    Writes a header row ``Method Phrase,Count`` followed by one row per phrase.
    Returns ``None``.
    """
    # Save to the shared output directory.
    filename = os.path.join(SAVE_DIR, filename)

    if isinstance(method_counts, dict):
        # Iterating a dict yields its keys, so zipping it with the phrases
        # would put phrases in the Count column; look each count up instead.
        rows = [(phrase, method_counts.get(phrase, 0)) for phrase in method_phrases]
    else:
        rows = list(zip(method_phrases, method_counts))

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Method Phrase", "Count"])
        for phrase, count in rows:
            # \s+ already matches newlines, so one substitution normalises all
            # whitespace; str() guards against non-string phrases.
            clean_phrase = re.sub(r'\s+', ' ', str(phrase)).strip()
            writer.writerow([clean_phrase, count])
    print(f"✓ Saved method phrases to {filename}")
# Usage: write the `method_phrases` / `method_counts` currently defined in the
# kernel to the default file name inside SAVE_DIR. Both names must already
# exist when this line runs.
save_method_phrases_to_csv(method_phrases, method_counts)

✓ Saved method phrases to extracted_method_phrases.csv
