In [1]:
#Imports and Setup
# %%
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
import openai
import random
from difflib import SequenceMatcher
import joblib

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)


True

In [2]:
#OpenAI Setup and Credit Tracking
# %%
class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))

client, model_type = initialize_openai()
credit_tracker = CreditTracker()


In [3]:
#Utility Functions
# %%
def extract_keywords_from_filename(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    keywords = []
    for i, part in enumerate(parts):
        if i > 2 and part != 'results':
            keywords.append(part)
    keywords = [kw for kw in keywords if not kw.isdigit()]
    return keywords

def keywords_to_filename_part(keywords):
    return '_'.join([kw.lower().replace(' ', '_') for kw in keywords])

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al','ref','reference','references','cited','cite',
        'fig','figure','figures','table','tables','chart','charts',
        'published','journal','conference','proceedings','vol','volume','pp','page','pages','doi'}
    return stop_words.union(scientific_terms)

def string_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()


In [4]:
 #Text Processing Functions
# %%
def preprocess_text(text, search_keywords, min_word_length=2, remove_numbers=False):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if len(token) >= min_word_length]
    
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if len(token) > 1 and not token.isdigit()]
    
    lemmatizer = WordNetLemmatizer()
    
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1].upper()
        tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                   "N": nltk.corpus.wordnet.NOUN,
                   "V": nltk.corpus.wordnet.VERB,
                   "R": nltk.corpus.wordnet.ADV}
        return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)
    
    try:
        tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    except:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    tokens = [token for token in tokens if token.strip()]
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    return df[df[processed_col].str.strip() != '']

def detect_phrases(df, text_col='processed_text'):
    tokenized_texts = df[text_col].apply(lambda x: x.split()).tolist()
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    
    phrased_texts = []
    for doc in tokenized_texts:
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased_texts.append(' '.join(combined))
    
    return phrased_texts


In [5]:
# Data Loading and Preprocessing
filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)

def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
                    'Medicine','Business','Environmental Science','Chemistry','Materials Science',
                    'Geography','Biology','Geology','Political Science','Psychology','Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        cleaned_fields = []
        for field in fields:
            if field in valid_fields:
                cleaned_fields.append(field)
            else:
                cleaned_fields.append("Unknown")
        return cleaned_fields if cleaned_fields else ["Unknown"]
    return ["Unknown"]

df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)
logger.info(f"Loaded and preprocessed {len(df)} papers")


2025-08-20 10:08:46,034 - INFO - Loaded and preprocessed 28934 papers


In [6]:
# Topic Modeling with LDA
def extract_topic_keywords(lda_model, feature_names, num_words=10):
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        word_weights = [(feature_names[i], topic[i]) for i in top_indices]
        topic_keywords[topic_idx] = {
            'top_words': top_words,
            'word_weights': word_weights
        }
    return topic_keywords

def model_topics(df, num_topics=10, num_words=100):
    """LDA analysis with unigrams + phrases"""
    phrased_texts = detect_phrases(df)
    
    vectorizer = CountVectorizer(
        ngram_range=(1, 1),
        token_pattern=r'\b[\w_-]+\b',
        max_df=0.95,
        min_df=2,
        max_features=10000
    )
    
    doc_term_matrix = vectorizer.fit_transform(phrased_texts)
    
    logger.info(f"Fitting LDA model with {num_topics} topics")
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = extract_topic_keywords(lda_model, feature_names, num_words)
    
    return lda_model, vectorizer, topic_distributions, df, topic_keywords

def classify_papers(topic_distributions, df_field):
    paper_classifications = []
    
    for idx, dist in enumerate(topic_distributions):
        top_2_topics = np.argsort(dist)[-2:][::-1]
        
        primary_score = dist[top_2_topics[0]]
        other_topics_sum = sum(dist) - primary_score
        dominance_ratio = primary_score / (other_topics_sum + 1e-10)
        
        paper_classifications.append({
            'paper_idx': idx,
            'primary_topic': top_2_topics[0],
            'secondary_topic': top_2_topics[1],
            'primary_score': primary_score,
            'dominance_ratio': dominance_ratio
        })
    
    return paper_classifications

lda_model, vectorizer, topic_distributions, df_topic, topic_keywords = model_topics(df, num_topics=10, num_words=25)
paper_classifications = classify_papers(topic_distributions, df)

# Add topic assignments to dataframe
df['Primary_Topic_Index'] = [int(p['primary_topic']) for p in paper_classifications]
df['Primary_Score'] = [p['primary_score'] for p in paper_classifications]
df['Dominance_Ratio'] = [p['dominance_ratio'] for p in paper_classifications]

logger.info("✓ Topic modeling and paper classification completed")


2025-08-20 10:08:46,422 - INFO - collecting all words and their counts
2025-08-20 10:08:46,424 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-08-20 10:08:47,993 - INFO - PROGRESS: at sentence #10000, processed 1528760 words and 873903 word types
2025-08-20 10:08:49,703 - INFO - PROGRESS: at sentence #20000, processed 3012255 words and 1486274 word types
2025-08-20 10:08:51,086 - INFO - collected 1928697 token types (unigram + bigrams) from a corpus of 4300658 words and 28934 sentences
2025-08-20 10:08:51,086 - INFO - merged Phrases<1928697 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-08-20 10:08:51,086 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1928697 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 4.66s', 'datetime': '2025-08-20T10:08:51.086440', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 2025, 12:41:34) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-0

In [7]:
# Topic Naming with LLM
def generate_topic_name_multiple_advanced(lda_keywords, tfidf_ngrams, top_titles, client, model_type, credit_tracker, search_keywords=None, initial_iterations=3, max_iterations=10, similarity_threshold=0.7):
    iterations = initial_iterations
    while iterations <= max_iterations:
        generated_names = []
        for _ in range(iterations):
            name = generate_topic_name_advanced(lda_keywords, tfidf_ngrams, top_titles, client, model_type, credit_tracker, search_keywords=search_keywords)
            if name:
                generated_names.append(name)
        
        # Check for dominant topic
        for i, name in enumerate(generated_names):
            similar_names = [other_name for j, other_name in enumerate(generated_names) 
                             if i != j and string_similarity(name, other_name) >= similarity_threshold]
            if len(similar_names) >= len(generated_names) // 2:
                return name
        
        iterations += 2
        print(f"No clear common topic name found. Increasing iterations to {iterations}.")
    
    from collections import Counter
    return Counter(generated_names).most_common(1)[0][0]

def generate_topic_name_advanced(lda_keywords, tfidf_ngrams, top_titles, client, model_type, credit_tracker, search_keywords=None):
    prompt = f"""Based on the following keywords and n-grams from LDA and TF-IDF analysis and the titles of the most cited papers with this topic as dominant, provide a concise, specific, and descriptive topic name (preferably a bigram or trigram, but a single word is allowed if most fitting):

LDA keywords and n-grams:
{', '.join(lda_keywords)}

TF-IDF n-grams:
{', '.join(tfidf_ngrams)}

Most cited paper titles:
{'; '.join(top_titles)}

Concise topic name:"""
    
    try:
        tokens = num_tokens_from_string(prompt, model_type)
        credit_tracker.update(tokens)
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": "You are a helpful scientific assistant that generates concise topic names based on keywords and paper titles."},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content.strip()
        credit_tracker.update(num_tokens_from_string(content, model_type))
        return content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=20):
    dominant_papers = [p for p in paper_classifications if p['primary_topic'] == topic_idx]
    paper_infos = [
        (df.iloc[p['paper_idx']]['citationCount'], df.iloc[p['paper_idx']]['title'])
        for p in dominant_papers
        if not pd.isna(df.iloc[p['paper_idx']]['title'])
    ]
    top_titles = [title for _, title in sorted(paper_infos, key=lambda x: -x[0])[:n_titles]]
    return top_titles

def get_top_tfidf_ngrams_per_topic(df, tfidf_matrix, feature_names, topic_col='Primary_Topic_Index', top_k=10):
    tfidf_ngrams = {}
    for topic_idx in df[topic_col].dropna().unique():
        topic_idx = int(topic_idx)
        doc_indices = df[df[topic_col] == topic_idx].index
        if len(doc_indices) == 0:
            continue
        topic_tfidf = np.asarray(tfidf_matrix[doc_indices].mean(axis=0)).ravel()
        top_indices = topic_tfidf.argsort()[-top_k:][::-1]
        top_terms = [(feature_names[i], topic_tfidf[i]) for i in top_indices if topic_tfidf[i] > 0]
        tfidf_ngrams[topic_idx] = top_terms
    return tfidf_ngrams

# Generate TF-IDF n-grams for topic naming
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    token_pattern=r'\b[\w_-]+\b'
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_ngrams = get_top_tfidf_ngrams_per_topic(
    df, tfidf_matrix, feature_names,
    topic_col='Primary_Topic_Index', top_k=10
)

# Generate topic names using LLM
topic_names = {}
for topic_idx, keywords in topic_keywords.items():
    lda_ngrams = keywords['top_words'][:20]
    tfidf_ng = [ngram for ngram, _ in tfidf_ngrams.get(topic_idx, [])][:20]
    top_titles = get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10)
    
    topic_name = generate_topic_name_multiple_advanced(
        lda_ngrams, tfidf_ng, top_titles, client, model_type, credit_tracker, search_keywords=search_keywords
    )
    if topic_name:
        topic_names[topic_idx] = topic_name
    logger.info(f"Topic {topic_idx}: {topic_name if topic_name else 'Unnamed'}")

# Add topic names to dataframe
df['Primary_Topic'] = df['Primary_Topic_Index'].map(lambda x: topic_names.get(x, f"Topic_{x}"))

logger.info("✓ Topic naming completed")


2025-08-20 10:11:50,843 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:11:51,811 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:11:52,542 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:11:52,550 - INFO - Topic 0: Smart Grid Communication Systems
2025-08-20 10:11:53,503 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:11:54,569 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:11:55,749 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:11:55,756 - INFO - Topic 1: Renewable Energy Storage Systems
2025-08-20 10:11:57,013 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:11:57,906 - INFO - HTTP Request: POST https:/

In [8]:
# %%

def extract_candidate_terms(df, text_col='processed_text', max_features=1000):
    """Extract candidate keywords and n-grams from processed text for LLM prompt testing"""
    vectorizer = CountVectorizer(
        ngram_range=(1, 3),
        max_df=0.95,
        min_df=2,
        max_features=max_features,
        token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    # Sort terms by frequency descending
    sorted_terms = sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)
    return [term for term, freq in sorted_terms]

def extract_candidate_terms_enhanced(df, text_col='processed_text', max_features=100000):
    """Enhanced candidate term extraction with 1-5 word n-grams and specificity filters"""
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(
        ngram_range=(1, 5),
        max_df=0.98,
        min_df=2,
        max_features=max_features,
        token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1

    sorted_terms = sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)

    generic_words = {
        'method', 'approach', 'technique', 'analysis', 'system', 'model',
        'process', 'procedure', 'strategy', 'framework', 'methodology', 'study',
        'research', 'paper', 'work', 'application', 'use', 'data', 'result',
        'performance', 'evaluation', 'assessment'
    }
    filtered_terms = []
    for term, freq in sorted_terms:
        words = term.split()
        if len(words) == 1:
            if len(term) > 6 and term not in generic_words:
                filtered_terms.append(term)
        else:
            significant_words = [w for w in words if len(w) > 4 and w not in generic_words]
            if significant_words and freq >= max(1, len(words) - 2):
                filtered_terms.append(term)
    logger.info(f"Filtered {len(filtered_terms)} specific terms from {len(sorted_terms)} total candidates")
    return filtered_terms

def extract_candidate_terms(df, text_col='processed_text', max_features=1000):
    """Extract candidate keywords and n-grams from processed text for LLM prompt testing"""
    vectorizer = CountVectorizer(
        ngram_range=(1, 3),
        max_df=0.95,
        min_df=2,
        max_features=max_features,
        token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    # Sort terms by frequency descending
    sorted_terms = sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)
    return [term for term, freq in sorted_terms]

def get_method_phrases(corpus_terms, client, model_type, credit_tracker):
    """LLM filters candidate n-grams to only methods/techniques, excluding general concepts"""
    import ast
    sample_terms = ', '.join(corpus_terms[:50])
    prompt = f"""Here are the most frequent terms from a corpus of scientific papers:
{sample_terms}

Based on these, this is a corpus about power systems/electrical engineering and reliability analysis.

From the full list of terms: {', '.join(corpus_terms)}

Extract ONLY the terms that represent specific methodologies, techniques, or named technical approaches,
NOT general topics or components. 

Do not include generic phrases ("analysis", "system", "generation"), company/product names, equipment, or phenomena.

Do include: things like "monte carlo simulation", "optimal power flow", "state estimation", "fault tree analysis", "genetic algorithm", "unit commitment", "markov chain monte carlo", etc.

Return as a single Python list of strings, no code blocks or extra output.
"""

    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "user", "content": prompt}]
    )

    content = response.choices[0].message.content
    try:
        return ast.literal_eval(content)
    except Exception:
        # Robust fallback parsing if not pure Python list
        content = content.replace('[','').replace(']','').replace('"','').replace("'",'')
        return [term.strip() for term in content.split(',') if len(term.strip()) > 2]

def get_method_abbreviation_dict(method_phrases, client, model_type, credit_tracker, batch_size=100):
    """
    Uses OpenAI to match each method phrase to a list of abbreviations/variants, in manageable batches.
    Returns {canonical: [abbr, alias, ...]}
    """
    import ast, re
    results = {}
    for i in range(0, len(method_phrases), batch_size):
        batch = method_phrases[i:i+batch_size]
        prompt = f"""For each of the following phrases, extract ONLY specific named computational, statistical, or engineering methods, techniques, or algorithms. Examples: 'monte carlo simulation', 'optimal power flow', 'support vector machine', 'markov chain monte carlo', 'unit commitment', 'fault tree analysis', 'finite element method', etc.

DO NOT return categories or families such as 'Power system analysis methods', 'Statistical methods', or 'Reliability analysis techniques'.

For each method, provide all common abbreviations, acronyms, and alternative names as used in engineering and scientific literature.

Format your response strictly as a Python dictionary, with only method names as keys and all abbreviations/aliases/alternatives as a list (see below):

{{
  "optimal power flow": ["opf", "ac opf", "dc opf", "optimal power flow analysis"],
  "monte carlo simulation": ["monte carlo", "mcs", "mc simulation"],
  "unit commitment": ["uc", "unit commit", "unit commitment problem"],
  "automatic generation control": ["agc"],
  "markov chain monte carlo": ["mcmc"],
  "fault tree analysis": ["fta"],
  "finite element method": ["fem", "finite element analysis", "fea"],
  "principal component analysis": ["pca"]
}}

Important requirements:
- DO NOT use umbrella categories of methods.
- Each dictionary key must be a real, specific, technical method phrase.
- Do NOT include any explanations, categories, or code blocks.
- Only output the dictionary.
Methods:
{chr(10).join(batch)}
"""
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": "You are a scientific abbreviation expert."},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content.strip()
        # print(content)  # DEBUG if desired
        start, end = content.find('{'), content.rfind('}')+1
        method_dict = {}
        if start >= 0 and end > start:
            try:
                method_dict = ast.literal_eval(content[start:end])
            except Exception as e:
                logger.warning(f"Failed to parse dictionary from LLM batch: {e}")
        results.update(method_dict)
    logger.info(f"LLM mapped {len(results)} methods to abbreviations/variants.")
    return results

def expand_method_abbreviations_with_llm(method_dict, client, model_type, credit_tracker):
    """Use LLM to expand abbreviations for any methods missing comprehensive variants"""
    methods_needing_expansion = [m for m, v in method_dict.items() if len(v) < 2]
    if not methods_needing_expansion:
        logger.info("All methods have sufficient abbreviation variants")
        return method_dict
    logger.info(f"Expanding abbreviations for {len(methods_needing_expansion)} methods")
    expansion_prompt = f"""For the following scientific/engineering methods, provide ALL common abbreviations, acronyms, and alternative names.

Methods to expand:
{', '.join(methods_needing_expansion[:100])}

For each method, provide a comprehensive list including:
- Official abbreviations/acronyms
- Common shortened forms
- Alternative spellings (US/UK variants like optimization/optimisation)
- Related terms that refer to the very same method
- Field-specific variations

Format your response EXACTLY as a valid Python dictionary, for example:
{{
  "reinforcement learning": ["rl", "deep reinforcement learning", "drl", "reinforcement learning algorithm"],
  "wavelet transform": ["wt", "discrete wavelet transform", "dwt", "continuous wavelet transform", "cwt", "wavelet analysis"],
  "optimization": ["optimisation", "optimize", "optimise", "optimal", "optimization algorithm"]
}}

Important requirements:
- Your ENTIRE reply must be a single Python dictionary. 
- Do NOT include explanations, commentary, introductions, or code blocks.
- Do NOT use triple backticks or headings of any kind.
- The first character you output must be '{{' and the last must be '}}'.
- If you have only one method, your dictionary should still use the same format.

Return ONLY the dictionary, with keys as method names and values as lists of abbreviations/variants."""
    try:
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": "You are an expert in scientific terminology..."},
                {"role": "user", "content": expansion_prompt}
            ]
        )
        content = response.choices.message.content.strip()
        credit_tracker.update(len(content.split()))
        if content.startswith('```python'):
            content = content.replace('``````', '')
        elif content.startswith('```'):
            content = content.replace('```', '')
        start = content.find('{')
        end = content.rfind('}') + 1
        if start >= 0 and end > start:
            content = content[start:end]
        expansion_dict = ast.literal_eval(content)
        for method, new_variants in expansion_dict.items():
            if method in method_dict:
                all_variants = list(set(method_dict[method] + new_variants))
                method_dict[method] = all_variants
        logger.info(f"Successfully expanded abbreviations for {len(expansion_dict)} methods")
    except Exception as e:
        logger.warning(f"Failed to expand abbreviations: {e}")
    return method_dict

def build_abbr_to_canonical_map(method_dict):
    abbr_map = {}
    for canonical, variants in method_dict.items():
        abbr_map[canonical.lower()] = canonical
        for v in variants:
            abbr_map[v.lower()] = canonical
    return abbr_map

import re
def standardize_methods_in_text(text, abbr_to_canonical):
    # Sort by descending length, so longest patterns are replaced first
    sorted_vars = sorted(abbr_to_canonical, key=lambda x: -len(x))
    for var in sorted_vars:
        pattern = r'\b' + re.escape(var) + r'\b'  # whole word, case-insensitive
        text = re.sub(pattern, abbr_to_canonical[var], text, flags=re.IGNORECASE)
    return text


In [9]:
# partiall workflow, use rahter the new one further down...
# Extract method phrases (customize max_features as token budget allows)
candidate_terms = extract_candidate_terms_enhanced(df, text_col='processed_text', max_features=10000)
top_cands = candidate_terms[:500]  # or whatever fits in prompt batch

# LLM abbreviation mapping (in manageable batches if necessary)
method_dict = get_method_abbreviation_dict(top_cands, client, model_type, credit_tracker, batch_size=100)

# Build alias→canonical map
abbr_to_canonical_map = build_abbr_to_canonical_map(method_dict)

# Standardize all texts before actual analysis
df['standardized_text'] = df['processed_text'].apply(
    lambda t: standardize_methods_in_text(t, abbr_to_canonical_map)
)

2025-08-20 10:13:12,413 - INFO - Filtered 7659 specific terms from 10000 total candidates
2025-08-20 10:13:19,830 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:13:22,750 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:13:27,299 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:13:33,041 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:13:36,209 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:13:36,209 - INFO - LLM mapped 97 methods to abbreviations/variants.


In [10]:
#  Multi-Label Method Scoring (Enhanced with Primary Method Mapping)
def compute_method_scores_streamlined_enhanced(df, vocab, method_to_primary_map, processed_col='processed_text', 
                                             w_tfidf=0.6, w_compound=0.4, top_k=3):
    """
    Compute combined scores for all methods with streamlined output and primary method mapping.
    """
    n_docs = len(df)
    n_methods = len(vocab)
    
    logger.info(f"Computing method scores for {n_docs} documents and {n_methods} methods")
    
    # TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=vocab, ngram_range=(1, 5), min_df=1, max_df=0.999,  # Extended to 5-grams
        norm='l2', token_pattern=r'\b[\w_-]+\b'
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[processed_col])
    tfidf_scores = tfidf_matrix.toarray()
    
    # Compound/proximity scores
    compound_scores = compute_compound_scores_enhanced(df, vocab, processed_col)
    
    # Combined scores
    combined_scores = w_tfidf * tfidf_scores + w_compound * compound_scores
    
    # Group scores by primary method (combine abbreviations and variants)
    primary_methods = list(set(method_to_primary_map.values()))
    primary_combined_scores = np.zeros((n_docs, len(primary_methods)))
    primary_method_index = {method: i for i, method in enumerate(primary_methods)}
    
    # Aggregate scores for each primary method
    for j, variant in enumerate(vocab):
        primary_method = method_to_primary_map.get(variant, variant)
        if primary_method in primary_method_index:
            primary_idx = primary_method_index[primary_method]
            primary_combined_scores[:, primary_idx] = np.maximum(
                primary_combined_scores[:, primary_idx], 
                combined_scores[:, j]
            )
    
    # Prepare ALL new columns at once to avoid fragmentation
    logger.info("Preparing all new columns for efficient addition...")
    new_columns = {}
    
    # 1. Primary method score columns (aggregated)
    for i, method in enumerate(primary_methods):
        safe_name = f"method_{method.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '').replace(',', '')}"
        if len(safe_name) > 50:
            safe_name = safe_name[:50]
        new_columns[safe_name] = primary_combined_scores[:, i]
    
    # 2. Top-k columns based on primary methods
    tfidf_primary_scores = np.zeros((n_docs, len(primary_methods)))
    compound_primary_scores = np.zeros((n_docs, len(primary_methods)))
    
    # Aggregate TF-IDF and compound scores for primary methods
    for j, variant in enumerate(vocab):
        primary_method = method_to_primary_map.get(variant, variant)
        if primary_method in primary_method_index:
            primary_idx = primary_method_index[primary_method]
            tfidf_primary_scores[:, primary_idx] = np.maximum(
                tfidf_primary_scores[:, primary_idx], 
                tfidf_scores[:, j]
            )
            compound_primary_scores[:, primary_idx] = np.maximum(
                compound_primary_scores[:, primary_idx], 
                compound_scores[:, j]
            )
    
    tfidf_topk_idx = np.argsort(tfidf_primary_scores, axis=1)[:, -top_k:][:, ::-1]
    compound_topk_idx = np.argsort(compound_primary_scores, axis=1)[:, -top_k:][:, ::-1]
    
    for k in range(top_k):
        new_columns[f'tfidf_top_{k+1}_method'] = [primary_methods[idx[k]] for idx in tfidf_topk_idx]
        new_columns[f'tfidf_top_{k+1}_score'] = [tfidf_primary_scores[i, idx[k]] for i, idx in enumerate(tfidf_topk_idx)]
        new_columns[f'compound_top_{k+1}_method'] = [primary_methods[idx[k]] for idx in compound_topk_idx]
        new_columns[f'compound_top_{k+1}_score'] = [compound_primary_scores[i, idx[k]] for i, idx in enumerate(compound_topk_idx)]
    
    # Add all columns at once using efficient concatenation
    logger.info(f"Adding {len(new_columns)} columns efficiently...")
    new_df = pd.DataFrame(new_columns, index=df.index)
    df = pd.concat([df, new_df], axis=1)
    
    logger.info(f"✓ Added {len(primary_methods)} primary method columns and {top_k*4} top-k columns efficiently")
    return df, primary_combined_scores, tfidf_primary_scores, compound_primary_scores, primary_methods

def compute_compound_scores_enhanced(df, vocab, processed_col='processed_text', window=300, ratio_thresh=0.5):
    """Enhanced compound/proximity scores with better phrase detection."""
    n_docs = len(df)
    n_terms = len(vocab)
    scores = np.zeros((n_docs, n_terms), dtype=np.float32)
    
    docs = df[processed_col].fillna('').str.lower().tolist()
    
    logger.info(f"Computing enhanced compound scores for {n_terms} method phrases...")
    
    for j, phrase in enumerate(vocab):
        if j % 10 == 0:
            logger.info(f"Processing phrase {j+1}/{n_terms}")
            
        phrase_l = phrase.lower()
        phrase_words = [w for w in phrase_l.split() if len(w) > 0]
        sig_words = [w for w in phrase_words if len(w) > 3]
        
        for i, text in enumerate(docs):
            # Exact phrase match
            if phrase_l in text:
                scores[i, j] = 1.0
                continue
            
            # Multi-word phrase processing
            if len(phrase_words) > 1:
                present = sum(1 for w in sig_words if w in text) if sig_words else 0
                coverage = present / len(sig_words) if sig_words else 0.0
                
                # Enhanced proximity check with flexible ordering
                prox_hit = False
                for k in range(len(phrase_words)-1):
                    w1, w2 = phrase_words[k], phrase_words[k+1]
                    pos = text.find(w1)
                    if pos >= 0:
                        # Check both directions within window
                        nearby = text[max(0, pos-window//2):pos+window]
                        if w2 in nearby:
                            prox_hit = True
                            break
                
                # Score based on coverage and proximity
                if coverage >= ratio_thresh or prox_hit:
                    base_score = 0.6 + 0.4 * coverage
                    if prox_hit:
                        base_score = min(1.0, base_score * 1.2)  # Bonus for proximity
                    scores[i, j] = max(scores[i, j], base_score)
            else:
                # Single technical term with context bonus
                if len(phrase_l) > 6 and phrase_l in text:
                    # Check for technical context (nearby technical words)
                    pos = text.find(phrase_l)
                    context = text[max(0, pos-50):pos+50]
                    technical_indicators = ['algorithm', 'method', 'analysis', 'model', 'system', 'technique']
                    context_bonus = 1.1 if any(ind in context for ind in technical_indicators) else 1.0
                    scores[i, j] = max(scores[i, j], min(1.0, 0.7 * context_bonus))
    
    logger.info("✓ Enhanced compound scores computed")
    return scores

def assign_primary_method_and_confidence_enhanced(df, combined_scores, primary_methods, 
                                                th_super=0.85, th_high=0.6, th_low=0.2):
    """Assign primary method and confidence level using primary method aggregation."""
    n_docs = len(df)
    assigned_methods = []
    confidences = []
    
    for i in range(n_docs):
        scores = combined_scores[i]
        max_idx = np.argmax(scores)
        max_score = scores[max_idx]
        best_method = primary_methods[max_idx]
        
        if max_score >= th_super:
            confidence = 'super_high'
        elif max_score >= th_high:
            confidence = 'high'
        elif max_score >= th_low:
            confidence = 'low'
        else:
            confidence = 'not_detected'
            best_method = 'LowConfidence'
        
        assigned_methods.append(best_method)
        confidences.append(confidence)
    
    # Add these final columns efficiently
    final_columns = {
        'Primary_Method': assigned_methods,
        'Method_Confidence': confidences
    }
    
    final_df = pd.DataFrame(final_columns, index=df.index)
    df = pd.concat([df, final_df], axis=1)
    
    return df
"""
# Execute enhanced method scoring with primary method mapping
logger.info("Starting enhanced method scoring with primary method mapping...")
df, primary_combined_scores, tfidf_primary_scores, compound_primary_scores, primary_methods = compute_method_scores_streamlined_enhanced(
    df, method_phrases_aug, method_to_primary_map, processed_col='standardized_text', 
    w_tfidf=0.6, w_compound=0.4, top_k=3
)

df = assign_primary_method_and_confidence_enhanced(
    df, primary_combined_scores, primary_methods, 
    th_super=0.85, th_high=0.6, th_low=0.2
)

logger.info("✓ Enhanced method scoring completed without fragmentation warnings")
"""



In [11]:
# 1. Extract broad candidate n-grams from the corpus
candidate_terms = extract_candidate_terms(df, text_col='processed_text', max_features=10000)

# 2. Use LLM to filter for method/technique phrases only
method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)

# 3. Use LLM to build abbreviation/synonym dictionary
method_dict = get_method_abbreviation_dict(method_phrases, client, model_type, credit_tracker, batch_size=100)

abbr_to_canonical_map = build_abbr_to_canonical_map(method_dict)

# 4. Standardize text: replace all abbreviations/variants with full canonical names
df['standardized_text'] = df['processed_text'].apply(
    lambda t: standardize_methods_in_text(t, abbr_to_canonical_map)
)

# 5. Build method vocabulary for analysis
method_vocabulary = sorted(method_dict.keys())

# 6. TF-IDF and compound analysis WITH NO AGGREGATION
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    vocabulary=method_vocabulary,
    ngram_range=(1, 3),
    min_df=1,
    max_df=0.999,
    norm='l2',
    token_pattern=r'\b[\w\s_-]+\b'
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['standardized_text'])
tfidf_scores = tfidf_matrix.toarray()

compound_scores = compute_compound_scores_enhanced(df, method_vocabulary, processed_col='standardized_text')

# 7. Combine scores, and assign top/method columns as you like!
w_tfidf = 0.6
w_compound = 0.4
combined_scores = w_tfidf * tfidf_scores + w_compound * compound_scores

def assign_top_method_and_confidence(df, combined_scores, method_vocabulary, th_super=0.85, th_high=0.6, th_low=0.2):
    n_docs = len(df)
    top_methods = []
    confidences = []
    for i in range(n_docs):
        idx = np.argmax(combined_scores[i])
        max_score = combined_scores[i, idx]
        method = method_vocabulary[idx]
        if max_score >= th_super:
            conf = 'super_high'
        elif max_score >= th_high:
            conf = 'high'
        elif max_score >= th_low:
            conf = 'low'
        else:
            conf = 'not_detected'
            method = 'LowConfidence'
        top_methods.append(method)
        confidences.append(conf)
    df['Primary_Method'] = top_methods
    df['Method_Confidence'] = confidences
    return df

df = assign_top_method_and_confidence(df, combined_scores, method_vocabulary)


2025-08-20 10:16:12,350 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:16:23,624 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 10:16:23,645 - INFO - LLM mapped 55 methods to abbreviations/variants.
2025-08-20 10:17:33,470 - INFO - Computing enhanced compound scores for 55 method phrases...
2025-08-20 10:17:33,470 - INFO - Processing phrase 1/55
2025-08-20 10:17:35,607 - INFO - Processing phrase 11/55
2025-08-20 10:17:37,642 - INFO - Processing phrase 21/55
2025-08-20 10:17:39,658 - INFO - Processing phrase 31/55
2025-08-20 10:17:41,502 - INFO - Processing phrase 41/55
2025-08-20 10:17:42,886 - INFO - Processing phrase 51/55
2025-08-20 10:17:43,847 - INFO - ✓ Enhanced compound scores computed


In [12]:
print(method_vocabulary)

['adaptive control', 'automatic generation control', 'clustering algorithm', 'cognitive radio', 'contingency analysis', 'decision support', 'deep reinforcement learning', 'detection technique', 'distributed consensus', 'dynamic line rating', 'dynamic programming', 'emergency response modeling', 'energy management strategy', 'fault tree analysis', 'feedback control', 'forensic analysis', 'fuzzy logic', 'genetic algorithm', 'grid optimization', 'historical data analysis', 'interference management', 'linear programming', 'load flow analysis', 'load forecasting', 'machine learning', 'markov chain monte carlo', 'model predictive control', 'monte carlo simulation', 'multi-agent systems', 'multi-objective optimization', 'network resource allocation', 'neural network', 'optimal power flow', 'optimal scheduling', 'particle swarm optimization', 'path planning', 'performance assessment', 'power system optimization', 'reliability analysis', 'risk assessment', 'sensitivity analysis', 'signal proces

In [13]:
# Author Analysis
def get_top_papers(paper_classifications, df_field, n_top=5):
    """Get top papers per topic with author analysis"""
    top_papers = {}
    author_topic_stats = {}
    
    for topic in set(p['primary_topic'] for p in paper_classifications):
        topic_papers = [p for p in paper_classifications if p['primary_topic'] == topic]
        topic_papers.sort(key=lambda x: x['dominance_ratio'], reverse=True)
        top_papers[topic] = []
        
        for p in topic_papers[:n_top]:
            paper_idx = p['paper_idx']
            try:
                authors = df_field.iloc[paper_idx]['authors']
                
                if isinstance(authors, str):
                    try:
                        authors = ast.literal_eval(authors)
                    except (ValueError, SyntaxError):
                        authors = []
                
                if isinstance(authors, list):
                    author_list = []
                    for author in authors:
                        if isinstance(author, dict):
                            author_list.append({
                                'name': author.get('name', 'Unknown'),
                                'id': author.get('authorId', 'Unknown')
                            })
                else:
                    author_list = []
                
                top_papers[topic].append({
                    'paperId': df_field.iloc[paper_idx]['paperId'],
                    'title': df_field.iloc[paper_idx]['title'],
                    'abstract': df_field.iloc[paper_idx]['abstract'],
                    'authors': author_list,
                    'score': float(p['primary_score']),
                    'dominance_ratio': float(p['dominance_ratio'])
                })
                
                # Author statistics
                for author in author_list:
                    author_id = author['id']
                    if author_id not in author_topic_stats:
                        author_topic_stats[author_id] = {
                            'name': author['name'],
                            'topics': {},
                            'total_papers': 0,
                            'top_papers': 0
                        }
                    
                    if topic not in author_topic_stats[author_id]['topics']:
                        author_topic_stats[author_id]['topics'][topic] = {
                            'paper_count': 0,
                            'avg_dominance': 0,
                            'top_papers': []
                        }
                    
                    author_stats = author_topic_stats[author_id]['topics'][topic]
                    author_stats['paper_count'] += 1
                    author_stats['avg_dominance'] = (
                        (author_stats['avg_dominance'] * (author_stats['paper_count'] - 1) + 
                         float(p['dominance_ratio'])) / author_stats['paper_count']
                    )
                    author_stats['top_papers'].append({
                        'title': df_field.iloc[paper_idx]['title'],
                        'dominance_ratio': float(p['dominance_ratio'])
                    })
                    
                    author_topic_stats[author_id]['total_papers'] += 1
                    author_topic_stats[author_id]['top_papers'] += 1
                    
            except Exception as e:
                logger.warning(f"Error processing paper {paper_idx}: {e}")
                continue
    
    return top_papers, author_topic_stats

def save_author_analysis(author_stats, filename):
    """Save author analysis to CSV"""
    author_data = []
    for author_id, stats in author_stats.items():
        if stats['total_papers'] >= 2:  # Only authors with multiple papers
            for topic, topic_stats in stats['topics'].items():
                author_data.append({
                    'author_id': author_id,
                    'author_name': stats['name'],
                    'topic': topic,
                    'paper_count': topic_stats['paper_count'],
                    'avg_dominance': topic_stats['avg_dominance'],
                    'total_papers': stats['total_papers']
                })
    
    author_df = pd.DataFrame(author_data)
    author_df.to_csv(filename, index=False)
    logger.info(f"Author analysis saved to {filename}")

# Perform author analysis
top_papers, author_stats = get_top_papers(paper_classifications, df, n_top=5)
logger.info("✓ Author analysis completed")


2025-08-20 10:17:44,193 - INFO - ✓ Author analysis completed


In [14]:
def comprehensive_diagnostics_granular(
    df,
    combined_scores,
    tfidf_scores,
    compound_scores,
    method_vocabulary,
    method_dict=None
):
    n_docs = len(df)
    n_methods = len(method_vocabulary)

    print("=== METHOD ASSIGNMENT DIAGNOSTICS ===")
    print(f"Total documents: {n_docs}")
    print(f"Total unique methods (canonical): {n_methods}")

    if method_dict is not None:
        n_variants = sum(1 + len(variants) for variants in method_dict.values())
        print(f"Total method variants (including abbreviations): {n_variants}")

    # Coverage statistics
    tfidf_nonzero = (tfidf_scores > 0).any(axis=1).sum()
    compound_nonzero = (compound_scores > 0).any(axis=1).sum()
    combined_nonzero = (combined_scores > 0).any(axis=1).sum()
    print("\nCoverage:")
    print(f"  TF-IDF coverage: {tfidf_nonzero}/{n_docs} ({100*tfidf_nonzero/n_docs:.1f}%)")
    print(f"  Compound coverage: {compound_nonzero}/{n_docs} ({100*compound_nonzero/n_docs:.1f}%)")
    print(f"  Combined coverage: {combined_nonzero}/{n_docs} ({100*combined_nonzero/n_docs:.1f}%)")

    # Confidence distribution (robust, always force to Series)
    if 'Method_Confidence' in df.columns:
        col = df['Method_Confidence']
        # Ensure it's a Series, not a DataFrame (should be 1D)
        if isinstance(col, pd.DataFrame):
            col = col.iloc[:, 0]
        conf_dist = col.value_counts()
        print("\nConfidence distribution:")
        for conf, count in conf_dist.items():
            print(f"  {conf}: {count} ({100*count/n_docs:.1f}%)")

    # Top assigned methods
    if 'Primary_Method' in df.columns:
        col = df['Primary_Method']
        if isinstance(col, pd.DataFrame):
            col = col.iloc[:, 0]
        method_dist = col.value_counts().head(20)
        print("\nTop 20 assigned methods:")
        for method, count in method_dist.items():
            print(f"  {method}: {count}")

    # Show sample methods
    print("\nSample methods (from canonical method vocabulary):")
    for method in method_vocabulary[:10]:
        print(f"  {method}")

    if method_dict is not None:
        print("\nAbbreviation mapping examples:")
        for i, (canonical, variants) in enumerate(list(method_dict.items())[:5]):
            print(f"  {canonical}: {', '.join(variants[:5])}")

    # Score statistics
    print("\nScore statistics (all methods):")
    print(f"  Combined scores - Mean: {combined_scores.mean():.4f}, Std: {combined_scores.std():.4f}")
    print(f"  TF-IDF scores - Mean: {tfidf_scores.mean():.4f}, Std: {tfidf_scores.std():.4f}")
    print(f"  Compound scores - Mean: {compound_scores.mean():.4f}, Std: {compound_scores.std():.4f}")


In [15]:
comprehensive_diagnostics_granular(
    df,
    combined_scores,
    tfidf_scores,
    compound_scores,
    method_vocabulary,
    method_dict=method_dict
)


=== METHOD ASSIGNMENT DIAGNOSTICS ===
Total documents: 28934
Total unique methods (canonical): 55
Total method variants (including abbreviations): 118

Coverage:
  TF-IDF coverage: 0/28934 (0.0%)
  Compound coverage: 28423/28934 (98.2%)
  Combined coverage: 28423/28934 (98.2%)

Confidence distribution:
  low: 28423 (98.2%)
  not_detected: 511 (1.8%)

Top 20 assigned methods:
  power system optimization: 9708
  distributed consensus: 1746
  optimal power flow: 1634
  automatic generation control: 1289
  energy management strategy: 1269
  dynamic line rating: 1251
  historical data analysis: 1003
  system reliability evaluation: 977
  grid optimization: 935
  network resource allocation: 674
  adaptive control: 654
  reliability analysis: 599
  detection technique: 585
  load flow analysis: 561
  LowConfidence: 511
  decision support: 469
  genetic algorithm: 446
  performance assessment: 436
  monte carlo simulation: 408
  machine learning: 350

Sample methods (from canonical method voc

In [16]:
# Save Results and Supporting Files
# %%
def save_supporting_files_enhanced(lda_model, vectorizer, topic_distributions, suffix_string, 
                                 author_stats, top_papers, topic_keywords, tfidf_ngrams, method_dict):
    """Save all supporting analysis files including method dictionary"""
    
    # Save topic terms
    topic_terms_serializable = {}
    for topic_idx, keywords in topic_keywords.items():
        topic_terms_serializable[int(topic_idx)] = {
            'top_words': keywords['top_words'],
            'word_weights': [(word, float(weight)) for word, weight in keywords['word_weights']]
        }
    
    topic_filename = os.path.join(SAVE_DIR, f"lda_topic_terms_{suffix_string}.json")
    with open(topic_filename, 'w', encoding='utf-8') as f:
        json.dump(topic_terms_serializable, f, ensure_ascii=False, indent=2)
    
    # Save TF-IDF terms
    tfidf_filename = os.path.join(SAVE_DIR, f"tfidf_topic_terms_{suffix_string}.json")
    with open(tfidf_filename, 'w', encoding='utf-8') as f:
        json.dump({int(k): [(term, float(score)) for term, score in v] 
                  for k, v in tfidf_ngrams.items()}, f, ensure_ascii=False, indent=2)
    
    # Save method dictionary with abbreviations
    method_dict_filename = os.path.join(SAVE_DIR, f"method_abbreviations_{suffix_string}.json")
    with open(method_dict_filename, 'w', encoding='utf-8') as f:
        json.dump(method_dict, f, ensure_ascii=False, indent=2)
    
    # Save LDA components
    lda_filename = os.path.join(SAVE_DIR, f"lda_model_{suffix_string}.joblib")
    joblib.dump(lda_model, lda_filename)
    
    vectorizer_filename = os.path.join(SAVE_DIR, f"vectorizer_{suffix_string}.joblib")
    joblib.dump(vectorizer, vectorizer_filename)
    
    distributions_filename = os.path.join(SAVE_DIR, f"topic_distributions_{suffix_string}.npy")
    np.save(distributions_filename, topic_distributions)
    
    # Save top papers
    top_papers_filename = os.path.join(SAVE_DIR, f"top_papers_{suffix_string}.json")
    with open(top_papers_filename, 'w', encoding='utf-8') as f:
        json.dump({int(k): v for k, v in top_papers.items()}, f, ensure_ascii=False, indent=2, default=str)
    
    # Save author analysis
    author_filename = os.path.join(SAVE_DIR, f"author_analysis_{suffix_string}.csv")
    save_author_analysis(author_stats, author_filename)
    
    logger.info("✓ All enhanced supporting files saved")

# Save main results
current_date = datetime.now().strftime("%Y_%m_%d")
keyword_str = keywords_to_filename_part(search_keywords) if search_keywords else ""
suffix_string = f"{current_date}{keyword_str}"

output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_enhanced_analysis.csv")
df.to_csv(output_filename, sep=';', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')

# Save supporting files
save_supporting_files_enhanced(
    lda_model=lda_model,
    vectorizer=vectorizer, 
    topic_distributions=topic_distributions,
    suffix_string=suffix_string,
    author_stats=author_stats,
    top_papers=top_papers,
    topic_keywords=topic_keywords,
    tfidf_ngrams=tfidf_ngrams,
    method_dict=method_dict
)

print(f"\n🎉 Enhanced analysis completed successfully!")
print(f"Main results saved to: {output_filename}")
print(f"Method abbreviation dictionary saved with {len(method_dict)} primary methods")
print(f"Total method variants: {sum(len(variants) for variants in method_dict.values())}")
print(f"API token usage: {credit_tracker.get_stats()}")

# Display sample results
display_cols = ['Primary_Method', 'Method_Confidence', 'Primary_Topic', 
                'tfidf_top_1_method', 'tfidf_top_1_score', 
                'compound_top_1_method', 'compound_top_1_score']
available_cols = [col for col in display_cols if col in df.columns]
print(f"\nSample enhanced results:")
print(df[available_cols].head(10))

# Show some method examples
print(f"\nMethod abbreviation examples:")
for i, (primary, variants) in enumerate(list(method_dict.items())[:5]):
    print(f"  {primary}: {', '.join(variants)}")


2025-08-20 10:17:48,604 - INFO - Author analysis saved to Saved_files_new\author_analysis_2025_08_20reliability_resilience_power_systems.csv
2025-08-20 10:17:48,606 - INFO - ✓ All enhanced supporting files saved



🎉 Enhanced analysis completed successfully!
Main results saved to: Saved_files_new\semantic_scholar_2025_08_20reliability_resilience_power_systems_enhanced_analysis.csv
Method abbreviation dictionary saved with 55 primary methods
Total method variants: 63
API token usage: {'total_tokens': 8890, 'total_cost': 0.0013}

Sample enhanced results:
                  Primary_Method Method_Confidence  \
0           reliability analysis               low   
1  system reliability evaluation               low   
2               load forecasting               low   
3               decision support               low   
4      power system optimization               low   
5      power system optimization               low   
6      power system optimization               low   
7      power system optimization               low   
8          distributed consensus               low   
9      power system optimization               low   

                    Primary_Topic  
0         Power Converte