In [19]:
# %%
# Cell 1: Imports and Setup
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
import openai
from difflib import SequenceMatcher

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [20]:
# %%
# Cell 2: OpenAI Setup and Utility
class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))

client, model_type = initialize_openai()
credit_tracker = CreditTracker()

In [21]:
# %%
# Cell 3: Data Preprocessing Utilities

def extract_keywords_from_filename(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    return [part for i, part in enumerate(parts) if i > 2 and part != 'results' and not part.isdigit()]

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite',
        'fig', 'figure', 'figures', 'table', 'tables', 'chart', 'charts',
        'published', 'journal', 'conference', 'proceedings', 'vol', 'volume', 'pp', 'page', 'pages', 'doi'}
    return stop_words.union(scientific_terms)

def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=True):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [t for t in tokens if len(t) >= min_word_length and t not in stop_words and len(t) > 1 and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except:
        pass
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    return df[df[processed_col].str.strip() != '']

def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
        'Medicine','Business','Environmental Science','Chemistry','Materials Science',
        'Geography','Biology','Geology','Political Science','Psychology','Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        return [f if f in valid_fields else "Unknown" for f in fields] or ["Unknown"]
    return ["Unknown"]

In [22]:
# %%
# Cell 4: Data Loading & Cleaning

filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)
logger.info(f"Loaded and preprocessed {len(df)} papers")


KeyboardInterrupt: 

In [None]:
# %%
# Cell 5: Method Phrase Extraction & Standardization (Iterative LLM)

def extract_candidate_terms(df, text_col='processed_text', max_features=20000):
    vectorizer = CountVectorizer(
        ngram_range=(1, 4), max_df=0.95, min_df=2, max_features=max_features, token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    return [term for term, freq in sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)]

def get_method_phrases_robust(
    corpus_terms,
    client,
    model_type,
    credit_tracker,
    n_runs=3,
    temp=0,
    top_p=1.0,
    show_progress=True
):
    """
    Robustly extract method/technique phrases from candidate terms using an LLM.
    The prompt is self-contained in the function for maximum reproducibility.
    Handles common LLM output issues (markdown, formatting) safely.
    Returns a sorted method list and their frequency counts across runs.
    """
    import collections
    import re
    import ast

    def parse_llm_python_list(output_text):
        """
        Parse a Python list from any LLM string output, robust to markdown, stray text, or malformed formatting.
        """
        content = re.sub(r"```python", "", output_text, flags=re.DOTALL)	
        list_match = re.search(r'$$.*?$$', content, re.DOTALL)
        if list_match:
            try:
                return ast.literal_eval(list_match.group(0))
            except Exception:
                temp = list_match.group(0).strip("[]")
                return [item.strip().strip("'\"") for item in temp.split(",") if item.strip()]
        content = content.replace("\n", " ").replace(";", ",")
        return [item.strip().strip("'\"") for item in content.split(",") if item.strip()]

    # Construct prompt inside function
    sample_terms = ', '.join(corpus_terms[:100])
     
    

    prompt = (
        "Here are the most frequent terms from a corpus of scientific papers:\n"
        f"{sample_terms}\n"
        "From the full list: " + ", ".join(corpus_terms) + "\n"
        "Extract ONLY the terms that represent specific methodologies, techniques, or named approaches. "
        "Focus on computational, statistical, engineering, and reliability methods.\n"
        "DO include: e.g. 'monte carlo simulation', 'unit commitment', 'load flow analysis', 'genetic algorithm', "
        "'neural network', 'stochastic optimization', 'reinforcement learning', 'fault tree analysis'.\n"
        "DO NOT include generic terms like 'framework', 'analysis', 'system', 'method', 'procedure', 'approach', 'application', 'performance', 'review', 'assesment', "
        "by themselves or in combination with only other generic terms.\n"
        "Return as a single-line Python list; comma separated, no extra formatting."
    )

    all_phrases_sets = []
    for i in range(n_runs):
        response = client.chat.completions.create(
            model=model_type,
            messages=[{"role": "user", "content": prompt}],
            temperature=temp,
            top_p=top_p
        )
        content = response.choices[0].message.content
        phrases = parse_llm_python_list(content)
        phrases = [p.lower() for p in phrases if p.strip()]
        all_phrases_sets.append(set(phrases))
        credit_tracker.update(len(content))
        if show_progress:
            print(f"Run {i+1}: found {len(phrases)} method phrases.")
    # Count occurrences, flatten all phrases
    all_flat = [p for s in all_phrases_sets for p in s]
    counts = collections.Counter(all_flat)
    sorted_methods = sorted(counts, key=lambda x: (-counts[x], x))
    print(f"\nTotal unique phrases: {len(counts)}. Most stable top 10: {sorted_methods[:10]}")
    return sorted_methods, counts


def filter_methods_with_llm(
    method_list, client, model_type, credit_tracker,
    n_batched=25, verbose=True, fallback_to_input=True, min_methods=20, temp=0.0, top_p=1.0
):
    from tqdm import tqdm
    filtered = []
    all_rejected = False

    for i in tqdm(range(0, len(method_list), n_batched)):
        batch = method_list[i:i + n_batched]
        
        prompt = (
            "You are a scientific methods editor.\n"
            "Here is a list of candidate phrases:\n"
            f"{batch}\n\n"
            "For each phrase, KEEP if it is:"
            "\n- a specifically named algorithm, computational/statistical technique"
            "\n- a well-known engineering analysis or optimization procedure that would be used in a scientific context"
            "\n- a specific mathematical/statistical model or method"
            "\nREMOVE if it is a general term that describes a generic group of methods (e.g. statistical methods) research area, system/process label, or property."
            "\nIf unsure, KEEP the phrase."
            "\nReturn ONLY a Python list (no markdown, code block, or explanation)."
        )
        try:
            response = client.chat.completions.create(
                model=model_type,
                messages=[{"role": "user", "content": prompt}],
                temperature=temp,
                top_p=top_p
            )
            content = response.choices[0].message.content
            batch_filtered = parse_llm_python_list(content)
            if verbose and not batch_filtered:
                print(f"LLM WARNING: All phrases removed from batch (starting at {i}). Batch was: {batch}")
            filtered.extend([p for p in batch_filtered if isinstance(p, str) and p.strip()])
            credit_tracker.update(len(content))
        except Exception as ex:
            print(f"LLM FILTER ERROR (batch {i}): {ex}\nBatch: {batch}")
            continue

    if len(filtered) < min_methods:
        all_rejected = True
        if verbose:
            print(f"\n[DEBUG] LLM filter returned only {len(filtered)} methods, falling back to unfiltered list of top {min_methods} from input.")
        if fallback_to_input:
            filtered = list(method_list)[:min_methods]
    if verbose:
        print(f"[DEBUG] Number of method phrases after LLM filtering: {len(filtered)}")
        print(f"[DEBUG] Sample output: {filtered[:5]}")

    return filtered, all_rejected


def get_method_abbreviation_dict(method_phrases, client, model_type, credit_tracker, batch_size=100):
    import ast
    results = {}
    for i in range(0, len(method_phrases), batch_size):
        batch = method_phrases[i:i+batch_size]
        prompt = f"""For each of the following phrases, extract ALL common scientific abbreviations, synonyms, and aliases for methods/techniques.
Methods:\n{chr(10).join(batch)}
Return as Python dict: {{'canonical method': [aliases, ...]}}"""
        response = client.chat.completions.create(
            model=model_type,
            messages=[{"role": "system", "content": "You are a scientific abbreviation expert."},
                      {"role": "user", "content": prompt}]
        )
        content = response.choices[0].message.content.strip()
        start, end = content.find('{'), content.rfind('}')+1
        method_dict = {}
        if start >= 0 and end > start:
            try:
                method_dict = ast.literal_eval(content[start:end])
            except Exception as e:
                logger.warning(f"Failed to parse dictionary from LLM batch: {e}")
        results.update(method_dict)
    logger.info(f"LLM mapped {len(results)} methods to abbreviations/variants.")
    return results

def build_abbr_to_canonical_map(method_dict):
    abbr_map = {}
    for canonical, variants in method_dict.items():
        abbr_map[canonical.lower()] = canonical
        for v in variants:
            abbr_map[v.lower()] = canonical
    return abbr_map

import re
import ast

def parse_llm_python_list(output_text):
    """
    Parse a Python list from any LLM string output, robust to markdown, stray text, or malformed formatting.
    """
    # Remove common start/end code block markers and markdown fences
    content = re.sub(r"```python", "", output_text, flags=re.DOTALL)
    # Find the first list in the string
    list_match = re.search(r'$$.*?$$', content, re.DOTALL)
    if list_match:
        try:
            return ast.literal_eval(list_match.group(0))
        except Exception:
            # fallback: comma-split and strip (will overparse, but better than nothing)
            temp = list_match.group(0).strip("[]")
            return [item.strip().strip("'\"") for item in temp.split(",") if item.strip()]
    # Fallback: comma-split the whole thing
    content = content.replace("\n", " ").replace(";", ",")
    return [item.strip().strip("'\"") for item in content.split(",") if item.strip()]


def standardize_methods_in_text(text, abbr_to_canonical):
    import re
    sorted_vars = sorted(abbr_to_canonical, key=lambda x: -len(x))
    for var in sorted_vars:
        pattern = r'\b' + re.escape(var) + r'\b'
        text = re.sub(pattern, abbr_to_canonical[var], text, flags=re.IGNORECASE)
    return text



In [None]:
# %%
# Cell 6: Method Scoring Functions

def compute_tfidf_scores(processed_texts, method_phrases, ngram_range=(1, 4), min_df=1, max_df=0.95, norm='l2'):
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=method_phrases, ngram_range=ngram_range,
        min_df=min_df, max_df=max_df, norm=norm
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)
    scores = tfidf_matrix.toarray()
    feature_names = tfidf_vectorizer.get_feature_names_out()
    return scores, feature_names

def compute_lda_scores(processed_texts, method_phrases, ngram_range=(1, 3), n_topics=100, max_iter=20):
    vectorizer = CountVectorizer(
        vocabulary=method_phrases, ngram_range=ngram_range, token_pattern=r'\b[\w-]+\b'
    )
    doc_term_matrix = vectorizer.fit_transform(processed_texts)
    feature_names = vectorizer.get_feature_names_out()
    if n_topics >= 2:
        lda = LatentDirichletAllocation(n_components=n_topics, learning_method='batch',
                                       random_state=42, max_iter=max_iter)
        lda_matrix = lda.fit_transform(doc_term_matrix)
    else:
        lda_matrix = np.zeros((doc_term_matrix.shape[0], len(method_phrases)))
    return lda_matrix, feature_names

def compute_compound_scores(df, method_phrases, processed_col='standardized_text', window=150, min_word_len=4):
    n_docs = len(df)
    n_methods = len(method_phrases)
    scores = np.zeros((n_docs, n_methods), dtype=np.float32)
    docs = df[processed_col].fillna('').str.lower().tolist()
    for j, phrase in enumerate(method_phrases):
        phrase_l = phrase.lower()
        words = [w for w in phrase_l.split() if len(w) >= min_word_len]
        for i, text in enumerate(docs):
            # Only count if full phrase or all tokens are present
            if phrase_l in text:
                scores[i, j] = 1.0
            elif len(words) > 1 and all(w in text for w in words):
                scores[i, j] = 0.8  # partial but very strong
            else:
                scores[i, j] = 0.0
    return scores


def combine_method_scores(tfidf_scores, lda_scores, compound_scores, weights=(0.4, 0.3, 0.3)):
    return weights[0]*tfidf_scores + weights[1]*lda_scores + weights[2]*compound_scores

def assign_top_methods_by_total_score(
    df, total_scores, method_names,
    tfidf_scores, lda_scores, compound_scores,
    top_n=3, min_score=0.03
):
    """
    Assigns top_n methods based on total_scores, with confidence defined by individual method scores.
    - 'super_confident': all method scores above min_score and at least 2 of 3 methods agree on assignment at this rank.
    - 'confident': all method scores above min_score for assignment at this rank.
    - 'low_confidence': assigned even if some scores are below threshold.
    - '' (empty): No method assigned if all scores are equal for all methods in a row.
    """
    for rank in range(top_n):
        top_method = []
        top_score = []
        top_confidence = []

        for i, row in enumerate(total_scores):
            # If all methods are exactly equal, assign nothing.
            if np.allclose(row, row[0]):
                top_method.append("")
                top_score.append(0.0)
                top_confidence.append("")
                continue

            idxs = np.argsort(row)[::-1]
            nth_idx = idxs[rank] if rank < len(idxs) else None

            if nth_idx is None:
                top_method.append("")
                top_score.append(0.0)
                top_confidence.append("")
                continue

            assigned_method = method_names[nth_idx]
            score = row[nth_idx]

            tfidf = tfidf_scores[i][nth_idx]
            lda = lda_scores[i][nth_idx]
            comp = compound_scores[i][nth_idx]

            above_thresh = [(tfidf >= min_score), (lda >= min_score), (comp >= min_score)]
            agree2 = (
                (assigned_method == method_names[np.argmax(tfidf_scores[i])]) +
                (assigned_method == method_names[np.argmax(lda_scores[i])]) +
                (assigned_method == method_names[np.argmax(compound_scores[i])])
            )

            if all(above_thresh):
                if agree2 >= 2:
                    confidence = "super_confident"
                else:
                    confidence = "confident"
            else:
                confidence = "low_confidence"

            # If all three individual method scores are zero (row is all zeros), treat as no assignment
            if tfidf == lda == comp == 0:
                top_method.append("")
                top_score.append(0.0)
                top_confidence.append("")
            else:
                top_method.append(assigned_method)
                top_score.append(score)
                top_confidence.append(confidence)

        df[f'Top_{rank+1}_Method'] = top_method
        df[f'Top_{rank+1}_Score'] = top_score
        df[f'Top_{rank+1}_Confidence'] = top_confidence

    # Set primary assignment columns
    df['Primary_Method'] = df['Top_1_Method']
    df['Primary_Method_Score'] = df['Top_1_Score']
    df['Method_Confidence'] = df['Top_1_Confidence']

    return df


In [None]:
# %%
# Cell 7: Topic Modeling + Naming (Iterative/Consensus LLM) + Author Functions

def run_lda_topic_modeling(df, num_topics=10, num_words=25):
    tokenized_texts = df['processed_text'].apply(lambda x: x.split()).tolist()
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    phrased = []
    for doc in tokenized_texts:
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased.append(' '.join(combined))
    vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b[\w_-]+\b', max_df=0.95, min_df=2, max_features=10000)
    doc_term_matrix = vectorizer.fit_transform(phrased)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        topic_keywords[topic_idx] = {'top_words': top_words}
    return lda_model, vectorizer, topic_distributions, topic_keywords

def assign_papers_to_topics(topic_distributions):
    paper_classifications = []
    for idx, dist in enumerate(topic_distributions):
        top_2_topics = np.argsort(dist)[-2:][::-1]
        primary_score = dist[top_2_topics[0]]
        other_topics_sum = sum(dist) - primary_score
        dominance_ratio = primary_score / (other_topics_sum + 1e-10)
        paper_classifications.append({
            'paper_idx': idx,
            'primary_topic': top_2_topics[0],
            'secondary_topic': top_2_topics[1],
            'primary_score': primary_score,
            'dominance_ratio': dominance_ratio
        })
    return paper_classifications

def string_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

def topic_name_llm_robust(
    lda_keywords, tfidf_ngrams, top_titles,
    client, model_type, credit_tracker,
    initial_iterations=3, max_iterations=10, similarity_threshold=0.7,
    temp=0, top_p=1.0
):
    """
    Iteratively calls LLM for topic naming, detects dominant result by string similarity.
    """
    prompt = (
        "Based on the following keywords and n-grams from LDA and TF-IDF, plus top paper titles, provide a concise topic name "
        "(bigram or trigram, single word if very specific):\n"
        f"LDA: {', '.join(lda_keywords)}\n"
        f"TFIDF: {', '.join(tfidf_ngrams)}\n"
        f"TITLES: {', '.join(top_titles)}\n"
        "Return ONLY the topic name."
    )
    iterations = initial_iterations
    from collections import Counter
    while iterations <= max_iterations:
        generated_names = []
        for _ in range(iterations):
            response = client.chat.completions.create(
                model=model_type,
                messages=[
                    {"role": "system", "content": "You are a science topic-naming assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=temp,
                top_p=top_p
            )
            content = response.choices[0].message.content.strip()
            if content:
                generated_names.append(content)
        # Majority/consensus with string similarity threshold
        for i, name in enumerate(generated_names):
            matches = [other for j, other in enumerate(generated_names)
                       if i != j and string_similarity(name, other) >= similarity_threshold]
            if len(matches) >= len(generated_names)//2:  # Majority found
                print(f"Topic name stabilized after {iterations} iterations: {name}")
                return name
        iterations += 2
        print(f"No majority topic name found, increasing iterations to {iterations}.")
    most_common = Counter(generated_names).most_common(1)[0]
    print(f"Returning most common topic name after {max_iterations} iterations: {most_common}")
    return most_common

def get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10):
    dominant_papers = [p for p in paper_classifications if p['primary_topic'] == topic_idx]
    paper_infos = [
        (df.iloc[p['paper_idx']]['citationCount'] if 'citationCount' in df.columns else 0, df.iloc[p['paper_idx']]['title'])
        for p in dominant_papers if not pd.isna(df.iloc[p['paper_idx']]['title'])
    ]
    top_titles = [title for _, title in sorted(paper_infos, key=lambda x: -x[0])[:n_titles]]
    return top_titles
def get_top_tfidf_ngrams_per_topic(df, tfidf_matrix, feature_names, topic_col='Primary_Topic_Index', top_k=10):
    tfidf_ngrams = {}
    for topic_idx in df[topic_col].dropna().unique():
        topic_idx = int(topic_idx)
        doc_indices = df[df[topic_col] == topic_idx].index
        if len(doc_indices) == 0:
            continue
        topic_tfidf = np.asarray(tfidf_matrix[doc_indices].mean(axis=0)).ravel()
        top_indices = topic_tfidf.argsort()[-top_k:][::-1]
        top_terms = [(feature_names[i], topic_tfidf[i]) for i in top_indices if topic_tfidf[i] > 0]
        tfidf_ngrams[topic_idx] = top_terms
    return tfidf_ngrams
def get_author_stats(paper_classifications, df_field, n_top=5):
    top_papers = {}
    author_topic_stats = {}
    for topic in set(p['primary_topic'] for p in paper_classifications):
        topic_papers = [p for p in paper_classifications if p['primary_topic'] == topic]
        topic_papers.sort(key=lambda x: x['dominance_ratio'], reverse=True)
        top_papers[topic] = []
        for p in topic_papers[:n_top]:
            paper_idx = p['paper_idx']
            try:
                authors = df_field.iloc[paper_idx]['authors']
                if isinstance(authors, str):
                    try: authors = ast.literal_eval(authors)
                    except (ValueError, SyntaxError): authors = []
                if isinstance(authors, list):
                    author_list = []
                    for author in authors:
                        if isinstance(author, dict):
                            author_list.append({'name': author.get('name', 'Unknown'), 'id': author.get('authorId', 'Unknown')})
                else: author_list = []
                top_papers[topic].append({
                    'paperId': df_field.iloc[paper_idx].get('paperId',''),
                    'title': df_field.iloc[paper_idx].get('title',''),
                    'authors': author_list,
                    'score': float(p['primary_score']),
                    'dominance_ratio': float(p['dominance_ratio'])
                })
            except Exception as e: continue
    return top_papers, author_topic_stats
# %%
# Cell X: Save term frequency (keywords, bigrams, trigrams) for visualization

def save_term_frequencies(df, suffix_string, save_dir=SAVE_DIR, max_keywords=5000):
    """Save .json containing keywords, bigrams, trigrams with their counts for later visualization."""
    freq_data = {}
    processed_text = df['processed_text'].fillna('').astype(str)
    for n in range(1, 4):
        vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english', max_features=max_keywords)
        matrix = vectorizer.fit_transform(processed_text)
        terms = vectorizer.get_feature_names_out()
        freqs = matrix.sum(axis=0).A1
        freq_dict = {term: int(freq) for term, freq in sorted(zip(terms, freqs), key=lambda x: -x[1])}
        if n == 1: freq_data['keywords'] = freq_dict
        elif n == 2: freq_data['bigrams'] = freq_dict
        elif n == 3: freq_data['trigrams'] = freq_dict
    out_fn = os.path.join(save_dir, f'term_frequencies_{suffix_string}.json')
    with open(out_fn, 'w', encoding='utf-8') as f:
        json.dump(freq_data, f, indent=2)
    print(f"✓ Saved term frequency summary to {out_fn}")
    return out_fn

# %%
# Cell Y: Save author and venue frequencies for barplot visualization

def save_author_and_venue_frequencies(df, suffix_string, save_dir=SAVE_DIR):
    """Save author and venue frequencies for visualization."""
    # Authors
    if 'authors' in df.columns:
        authors_all = []
        for item in df['authors']:
            # Handle dict, list, or string
            if isinstance(item, str) and item.strip():
                try:
                    obj = eval(item) if (item.strip().startswith("[") or item.strip().startswith("{")) else item.strip()
                except Exception:
                    obj = item.strip()
            else:
                obj = item
            if isinstance(obj, list):
                for author in obj:
                    if isinstance(author, dict) and 'name' in author:
                        authors_all.append(author['name'])
                    elif isinstance(author, str):
                        authors_all.append(author)
            elif isinstance(obj, dict) and 'name' in obj:
                authors_all.append(obj['name'])
            elif isinstance(obj, str):
                authors_all.append(obj)
        author_counts = pd.Series(authors_all).value_counts().reset_index()
        author_counts.columns = ['Author', 'Frequency']
        author_fn = os.path.join(save_dir, f"semantic_scholar_{suffix_string}_author_analysis.csv")
        author_counts.to_csv(author_fn, sep=';', encoding='utf-8', index=False)
        print(f"✓ Saved author frequencies: {author_fn}")
    else:
        print("No 'authors' column found in DF: skipping author frequencies.")
    # Venues
    if 'venue' in df.columns:
        venue_counts = df['venue'].value_counts().reset_index()
        venue_counts.columns = ['Venue', 'Frequency']
        venue_fn = os.path.join(save_dir, f"semantic_scholar_{suffix_string}_venue_frequencies.csv")
        venue_counts.to_csv(venue_fn, sep=';', encoding='utf-8', index=False)
        print(f"✓ Saved venue frequencies: {venue_fn}")
    else:
        print("No 'venue' column found in DF: skipping venue frequencies.")



def save_topic_analysis_outputs(
    df, lda_model, lda_vectorizer, topic_distributions, topic_keywords, topic_names, topic_ngrams,
    author_stats, top_papers, tfidf_ngrams, suffix_string
):
    topic_metadata = {
        "topics": {int(k): v for k,v in topic_keywords.items()},
        "topic_names": {int(k): v for k,v in topic_names.items()},
        "topic_ngrams": {int(k): v for k,v in topic_ngrams.items()},
    }
    with open(os.path.join(SAVE_DIR, f"topics_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump(topic_metadata, f, indent=2)
    with open(os.path.join(SAVE_DIR, f"topic_names_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k):v for k,v in topic_names.items()}, f, indent=2)
    np.save(os.path.join(SAVE_DIR, f"topic_distributions_{suffix_string}.npy"), topic_distributions)
    import joblib
    joblib.dump(lda_model, os.path.join(SAVE_DIR, f"lda_model_{suffix_string}.joblib"))
    joblib.dump(lda_vectorizer, os.path.join(SAVE_DIR, f"lda_vectorizer_{suffix_string}.joblib"))
    with open(os.path.join(SAVE_DIR, f"top_papers_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k): v for k, v in top_papers.items()}, f, ensure_ascii=False, indent=2, default=str)
    pd.DataFrame.from_dict(author_stats, orient='index').to_csv(
        os.path.join(SAVE_DIR, f"author_stats_{suffix_string}.csv"))
    with open(os.path.join(SAVE_DIR, f"topic_specific_tfidf_ngrams_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k):[(term,float(score)) for term,score in v] for k,v in topic_ngrams.items()}, f, indent=2)

In [None]:
# %%
# Cell 8: Diagnostics Function

def diagnostics_with_scores(
    df, tfidf_scores, lda_scores, compound_scores, combined_scores, method_names
):
    n_docs, n_methods = tfidf_scores.shape
    print("=== DIAGNOSTICS ===")
    print(f"Total documents: {n_docs}")
    print(f"Methods: {n_methods}")
    print(f"TF-IDF coverage: {(tfidf_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(tfidf_scores>0).any(axis=1).mean():.1f}%)")
    print(f"LDA coverage: {(lda_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(lda_scores>0).any(axis=1).mean():.1f}%)")
    print(f"Compound coverage: {(compound_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(compound_scores>0).any(axis=1).mean():.1f}%)")
    print(f"Combined coverage: {(combined_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(combined_scores>0).any(axis=1).mean():.1f}%)")
    if 'Primary_Method' in df.columns:
        print("\nMethod label distribution (top 10):")
        print(df['Primary_Method'].value_counts().head(10))
    if 'Method_Confidence' in df.columns:
        print("\nMethod confidence distribution:")
        print(df['Method_Confidence'].value_counts())
    print("\nMethod vocabulary sample:", ', '.join(method_names[:10]))
    print(f"\nTFIDF stats: mean={tfidf_scores.mean():.3f}, std={tfidf_scores.std():.3f}")
    print(f"LDA stats: mean={lda_scores.mean():.3f}, std={lda_scores.std():.3f}")
    print(f"Compound stats: mean={compound_scores.mean():.3f}, std={compound_scores.std():.3f}")
    print(f"Combined stats: mean={combined_scores.mean():.3f}, std={combined_scores.std():.3f}")


In [None]:
# Cell 9: Topic Analysis Workflow (with robust LLM topic naming)

# -- Parameters for topic workflow --
NUM_TOPICS = 12
NUM_TOPIC_WORDS = 15
TOPIC_LLM_ITER_INIT = 3      # Initial LLM naming iterations for topic consensus
TOPIC_LLM_ITER_MAX = 9       # Max LLM naming iterations
TOPIC_LLM_SIM_THRESH = 0.72  # Majority string similarity threshold
TOPIC_LLM_TEMP = 0           # LLM temperature for topic naming, higher = more creative, lower = more deterministic, min= 0, max=1.0
TOPIC_LLM_TOP_P = 1.0        # LLM top_p for topic naming, Indicates what portion of the probability mass to consider, 1.0 = all, 0.9 = top 90%, etc.


current_date = datetime.now().strftime("%Y_%m_%d")
keyword_str = '_'.join(extract_keywords_from_filename(filename)) if 'filename' in locals() else ""
suffix_string = f"{current_date}{keyword_str}"
save_term_frequencies(df, suffix_string)
# Usage example after suffix_string was defined as above:
save_author_and_venue_frequencies(df, suffix_string)


logger.info("Starting topic modeling workflow...")  
lda_model, lda_vectorizer, topic_distributions, topic_keywords = run_lda_topic_modeling(
    df, num_topics=NUM_TOPICS, num_words=NUM_TOPIC_WORDS)
logger.info("✓ LDA topic modeling completed.")
logger.info(f"Identified {len(topic_keywords)} topics with {NUM_TOPIC_WORDS} top words each.")
logger.info("Assigning papers to topics based on LDA distributions...")
paper_classifications = assign_papers_to_topics(topic_distributions)
df['Primary_Topic_Index'] = [int(p['primary_topic']) for p in paper_classifications]
df['Primary_Score'] = [p['primary_score'] for p in paper_classifications]
df['Dominance_Ratio'] = [p['dominance_ratio'] for p in paper_classifications]

logger.info("✓ Papers assigned to topics based on LDA distributions.")
logger.info(f"Total papers assigned to topics: {df['Primary_Topic_Index'].nunique()}")

logger.info("Startting Topic n-gram extraction and naming...")
# Per-topic TF-IDF n-grams for naming/interpretation
topic_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3), min_df=2, max_df=0.95, token_pattern=r'\b[\w_-]+\b'
)
topic_tfidf_matrix = topic_tfidf_vectorizer.fit_transform(df['processed_text'])
topic_tfidf_feature_names = topic_tfidf_vectorizer.get_feature_names_out()

topic_ngrams = get_top_tfidf_ngrams_per_topic(
    df, topic_tfidf_matrix, topic_tfidf_feature_names, topic_col='Primary_Topic_Index', top_k=10)

logger.info("✓ Extracted topic-specific TF-IDF n-grams for naming.")
logger.info("Starting iterative LLM topic naming...")
# Iterative LLM topic naming
topic_names = {}
for topic_idx, keywords in topic_keywords.items():
    lda_ngrams = keywords['top_words'][:NUM_TOPIC_WORDS]
    tfidf_ng = [ngram for ngram, _ in topic_ngrams.get(topic_idx, [])][:NUM_TOPIC_WORDS]
    top_titles = get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10)
    topic_name = topic_name_llm_robust(
        lda_ngrams, tfidf_ng, top_titles,
        client, model_type, credit_tracker,
        initial_iterations=TOPIC_LLM_ITER_INIT,
        max_iterations=TOPIC_LLM_ITER_MAX,
        similarity_threshold=TOPIC_LLM_SIM_THRESH,
        temp=TOPIC_LLM_TEMP, top_p=TOPIC_LLM_TOP_P
    )
    topic_names[topic_idx] = topic_name
    logger.info(f"Topic {topic_idx}: {topic_name if topic_name else 'Unnamed'}")
df['Primary_Topic'] = df['Primary_Topic_Index'].map(lambda x: topic_names.get(x, f"Topic_{x}"))
logger.info("✓ Topic naming and assignment completed.")
logger.info(f"Total unique topics named: {len(topic_names)}")


# Author analysis and top papers per topic
logger.info("Starting author statistics and top papers extraction...")
top_papers, author_stats = get_author_stats(paper_classifications, df, n_top=5)

# Save topic/author analysis results
logger.info("Saving topic analysis outputs...")
current_date = datetime.now().strftime("%Y_%m_%d")
keyword_str = '_'.join(search_keywords) if search_keywords else ""
suffix_string = f"{current_date}{keyword_str}"



save_topic_analysis_outputs(df, lda_model, lda_vectorizer, topic_distributions, topic_keywords, topic_names, topic_ngrams, author_stats, top_papers, topic_ngrams, suffix_string)
print("\nSample topics and names:")
print({k: topic_names[k] for k in list(topic_names)[:5]})
print("\nTop authors and top papers by topic (first 2):")
print(dict(list(top_papers.items())[:2]))

✓ Saved term frequency summary to Saved_files_new\term_frequencies_2025_08_23reliability_resilience_power_systems.json


2025-08-23 16:41:14,937 - INFO - Starting topic modeling workflow...


✓ Saved author frequencies: Saved_files_new\semantic_scholar_2025_08_23reliability_resilience_power_systems_author_analysis.csv
✓ Saved venue frequencies: Saved_files_new\semantic_scholar_2025_08_23reliability_resilience_power_systems_venue_frequencies.csv


2025-08-23 16:41:15,212 - INFO - collecting all words and their counts
2025-08-23 16:41:15,212 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-08-23 16:41:16,694 - INFO - PROGRESS: at sentence #10000, processed 1524957 words and 863761 word types
2025-08-23 16:41:18,154 - INFO - PROGRESS: at sentence #20000, processed 3004878 words and 1467564 word types
2025-08-23 16:41:19,369 - INFO - collected 1902495 token types (unigram + bigrams) from a corpus of 4290297 words and 28934 sentences
2025-08-23 16:41:19,370 - INFO - merged Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-08-23 16:41:19,372 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 4.16s', 'datetime': '2025-08-23T16:41:19.371104', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 2025, 12:41:34) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-0

BadRequestError: Error code: 400 - {'error': {'message': "Unsupported value: 'temperature' does not support 0 with this model. Only the default (1) value is supported.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': 'unsupported_value'}}

In [None]:
""" Old workflow, # %%
# Cell 10: Method Extraction and Assignment Workflow (Iterative LLM, With Parameterization)

# ----------- Parameters for Method Extraction Workflow -----------
MAX_FEATURES = 10000         # Max n-grams for candidate extraction
NGRAM_RANGE = (1, 4)         # For TF-IDF/LDA
WINDOW_COMPOUND = 150        # Window for proximity scoring
MIN_WORD_LEN = 4             # Minimum token length
TFIDF_WEIGHT = 0.5           # Weight for TF-IDF
LDA_WEIGHT = 0.3             # Weight for LDA
COMPOUND_WEIGHT = 0.2        # Weight for compound
TOP_METHODS_PER_PAPER = 4    # Number of methods assigned per paper
MIN_ASSIGN_SCORE = 0.02      # Minimum combined score
BATCH_SIZE_LLM = 100         # LLM batch for abbreviations

# LLM parameters for method phrase extraction
METHOD_LLM_N_RUNS = 3
METHOD_LLM_TEMP = 0.05
METHOD_LLM_TOP_P = 0.95

logger.info("Starting pipeline for method detection and assignment...")

# 1. Extract broad candidate n-grams from the corpus
candidate_terms = extract_candidate_terms(
    df, text_col='processed_text', max_features=MAX_FEATURES)
logger.info(f"Step 1: Extracted {len(candidate_terms)} candidate terms from the corpus.")

# 2. Use robust/iterative LLM to filter for method/technique phrases
method_phrases_all, method_phrase_counts = get_method_phrases_robust(
    candidate_terms,
    client,
    model_type,
    credit_tracker,
    n_runs=METHOD_LLM_N_RUNS,
    temp=METHOD_LLM_TEMP,
    top_p=METHOD_LLM_TOP_P
)
logger.info(f"Step 2: Extracted {len(method_phrases_all)} method phrases from the corpus.")
method_phrases = filter_methods_with_llm(
    method_phrases_all,
    client,
    model_type,
    credit_tracker,
    n_batched=100
)
logger.info(f"Filtered out {len(method_phrases_all) - len(method_phrases)} generic phrases, leaving {len(method_phrases)} specific method phrases.")

# 3. Use LLM to build abbreviation/synonym dictionary
logger.info("Building method abbreviation dictionary with LLM...")
method_dict = get_method_abbreviation_dict(
    method_phrases, client, model_type, credit_tracker, batch_size=BATCH_SIZE_LLM)
logger. info(f"Get_method_abbreviation_dict found {len(method_dict)} methods with abbreviations/synonyms.")
abbr_to_canonical_map = build_abbr_to_canonical_map(method_dict)
logger.info(f"Built abbreviation map for {len(abbr_to_canonical_map)} methods.")
logger
df['standardized_text'] = df['processed_text'].apply(
    lambda t: standardize_methods_in_text(t, abbr_to_canonical_map))
logger.info("Standardized method phrases in the corpus text.")
method_vocabulary = sorted(method_dict.keys())
logger.info(f"Step 3: Built abbreviation map for {len(method_vocabulary)} methods.")

# 4. Compute all method score matrices
logger.info("Computing method score matrices...")
tfidf_scores, method_names = compute_tfidf_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE)
logger.info(f"Step 4a: Computed TF-IDF scores for {len(method_names)} methods.")
lda_n_topics = len(method_vocabulary)
logger.info(f"Step 4b: Using {lda_n_topics} topics for LDA scoring.")
lda_scores, lda_names = compute_lda_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE, n_topics=lda_n_topics)
assert list(method_names) == list(lda_names)

logger.info(f"Step 4c: Computing compound scores for {len(method_names)} methods.")
compound_scores = compute_compound_scores(
    df, method_names, processed_col='standardized_text',
    window=WINDOW_COMPOUND, min_word_len=MIN_WORD_LEN)
logger.info(f"Step 4d: Combined scores from TF-IDF, LDA, and compound methods.")
combined_scores = combine_method_scores(
    tfidf_scores, lda_scores, compound_scores,
    weights=(TFIDF_WEIGHT, LDA_WEIGHT, COMPOUND_WEIGHT))
logger.info(f"Step 4: Computed complete method score matrices with {len(method_names)} methods.")

# 5. Assign methods to papers with confidence
df = assign_top_methods_by_total_score(
    df, combined_scores, method_names,
    top_n=TOP_METHODS_PER_PAPER, min_score=MIN_ASSIGN_SCORE)
logger.info(f"Step 5: Assigned top {TOP_METHODS_PER_PAPER} methods to {len(df)} papers with confidence levels.")

# 6. Save all matrix DataFrames for visualization
for scores, label in zip([tfidf_scores, lda_scores, compound_scores, combined_scores],
                         ["tfidf", "lda", "compound", "combined"]):
    pd.DataFrame(scores, columns=method_names, index=df.index).to_csv(
        os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_method_{label}_scores.csv")
    )
logger.info(f"Step 6: Saved method score matrices to {SAVE_DIR}.")

# 7. Run diagnostics
diagnostics_with_scores(df, tfidf_scores, lda_scores, compound_scores, combined_scores, method_names)
logger.info(f"Step 7: Diagnostics completed.")

# 8. Output short preview
print(df[['Primary_Method', 'Primary_Method_Score', 'Method_Confidence', 'Top_1_Method', 'Top_1_Score']].head())
logger.info(f"Step 8: Output preview of method assignments.")

# 9. Final saving of the DataFrame with method assignments
df.to_csv(os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_all_results.csv"))
logger.info(f"Step 9: Saved final DataFrame with method assignments to {SAVE_DIR}.")
"""

' Old workflow, # %%\n# Cell 10: Method Extraction and Assignment Workflow (Iterative LLM, With Parameterization)\n\n# ----------- Parameters for Method Extraction Workflow -----------\nMAX_FEATURES = 10000         # Max n-grams for candidate extraction\nNGRAM_RANGE = (1, 4)         # For TF-IDF/LDA\nWINDOW_COMPOUND = 150        # Window for proximity scoring\nMIN_WORD_LEN = 4             # Minimum token length\nTFIDF_WEIGHT = 0.5           # Weight for TF-IDF\nLDA_WEIGHT = 0.3             # Weight for LDA\nCOMPOUND_WEIGHT = 0.2        # Weight for compound\nTOP_METHODS_PER_PAPER = 4    # Number of methods assigned per paper\nMIN_ASSIGN_SCORE = 0.02      # Minimum combined score\nBATCH_SIZE_LLM = 100         # LLM batch for abbreviations\n\n# LLM parameters for method phrase extraction\nMETHOD_LLM_N_RUNS = 3\nMETHOD_LLM_TEMP = 0.05\nMETHOD_LLM_TOP_P = 0.95\n\nlogger.info("Starting pipeline for method detection and assignment...")\n\n# 1. Extract broad candidate n-grams from the corpus\

In [None]:
"""# Old workflow %%
# Cell 10: Method Extraction and Assignment Workflow (Iterative LLM, With Parameterization, Robust Filtering)

# ----------- Parameters for Method Extraction Workflow -----------
MAX_FEATURES = 10000         # Max n-grams for candidate extraction
NGRAM_RANGE = (1, 4)         # For TF-IDF/LDA
WINDOW_COMPOUND = 150        # Window for proximity scoring
MIN_WORD_LEN = 4             # Minimum token length
TFIDF_WEIGHT = 0.5           # Weight for TF-IDF
LDA_WEIGHT = 0.3             # Weight for LDA
COMPOUND_WEIGHT = 0.2        # Weight for compound
TOP_METHODS_PER_PAPER = 4    # Number of methods assigned per paper
MIN_ASSIGN_SCORE = 0.2      # Minimum combined score
BATCH_SIZE_LLM = 100         # LLM batch for abbreviations

# LLM parameters for method phrase extraction
METHOD_LLM_N_RUNS = 3
METHOD_LLM_TEMP = 0.05
METHOD_LLM_TOP_P = 0.95



# --- Begin workflow ---

logger.info("Starting pipeline for method detection and assignment...")

# 1. Extract candidate n-grams
candidate_terms = extract_candidate_terms(
    df, text_col='processed_text', max_features=MAX_FEATURES)
logger.info(f"Step 1: Extracted {len(candidate_terms)} candidate terms from the corpus.")

# 2. LLM method extraction
method_phrases_all, method_phrase_counts = get_method_phrases_robust(
    candidate_terms,
    client,
    model_type,
    credit_tracker,
    n_runs=METHOD_LLM_N_RUNS,
    temp=METHOD_LLM_TEMP,
    top_p=METHOD_LLM_TOP_P
)
logger.info(f"Step 2: Extracted {len(method_phrases_all)} method phrases from the corpus.")
print("DEBUG: Raw LLM method extraction sample:", method_phrases_all[:10])

# 3. LLM filtering (robust, with logging/fallback)
method_phrases, filter_all = filter_methods_with_llm(
    method_phrases_all,
    client,
    model_type,
    credit_tracker,
    n_batched=100,
    verbose=True,
    fallback_to_input=True,
    min_methods=20
)

logger.info(f"Filtered out {len(method_phrases_all) - len(method_phrases)} generic phrases, leaving {len(method_phrases)} specific method phrases.")

if filter_all:
    logger.warning("All method phrases filtered out by LLM—fallback or relaxation engaged.")

# --- STOP if vocabulary is empty ---
if not method_phrases:
    logger.error("No method phrases after filtering. Stopping workflow! Adjust LLM extraction, candidate set, or prompt.")
    raise RuntimeError("Critical: Method phrase vocabulary is empty after LLM filtering. Workflow terminated.")

# 4. Use LLM to build abbreviation/synonym dictionary
logger.info("Building method abbreviation dictionary with LLM...")
method_dict = get_method_abbreviation_dict(
    method_phrases, client, model_type, credit_tracker, batch_size=BATCH_SIZE_LLM)
logger.info(f"Get_method_abbreviation_dict found {len(method_dict)} methods with abbreviations/synonyms.")
abbr_to_canonical_map = build_abbr_to_canonical_map(method_dict)
logger.info(f"Built abbreviation map for {len(abbr_to_canonical_map)} methods.")

# 5. Standardize and check vocabulary
df['standardized_text'] = df['processed_text'].apply(
    lambda t: standardize_methods_in_text(t, abbr_to_canonical_map))
method_vocabulary = sorted(method_dict.keys())
print("DEBUG: Final method vocabulary (sample):", method_vocabulary[:10])
if not method_vocabulary:
    logger.error("No methods left after LLM filtering and abbreviation mapping. Stopping workflow!")
    raise RuntimeError("Critical: Method vocabulary is empty after all processing.")

logger.info(f"Step 3: Ready with {len(method_vocabulary)} final methods.")

# 6. Compute all method score matrices
logger.info("Computing method score matrices...")
tfidf_scores, method_names = compute_tfidf_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE)
logger.info(f"Step 4a: Computed TF-IDF scores for {len(method_names)} methods.")
lda_n_topics = len(method_vocabulary)
logger.info(f"Step 4b: Using {lda_n_topics} topics for LDA scoring.")
lda_scores, lda_names = compute_lda_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE, n_topics=lda_n_topics)
assert list(method_names) == list(lda_names)

logger.info(f"Step 4c: Computing compound scores for {len(method_names)} methods.")
compound_scores = compute_compound_scores(
    df, method_names, processed_col='standardized_text',
    window=WINDOW_COMPOUND, min_word_len=MIN_WORD_LEN)
logger.info(f"Step 4d: Combined scores from TF-IDF, LDA, and compound methods.")
combined_scores = combine_method_scores(
    tfidf_scores, lda_scores, compound_scores,
    weights=(TFIDF_WEIGHT, LDA_WEIGHT, COMPOUND_WEIGHT)
)
logger.info(f"Step 4: Computed complete method score matrices with {len(method_names)} methods.")

# 7. Assign methods to papers with confidence
df = assign_top_methods_by_total_score(
    df, combined_scores, method_names,
    top_n=TOP_METHODS_PER_PAPER, min_score=MIN_ASSIGN_SCORE)
logger.info(f"Step 5: Assigned top {TOP_METHODS_PER_PAPER} methods to {len(df)} papers with confidence levels.")

# 8. Save all matrix DataFrames for visualization
for scores, label in zip([tfidf_scores, lda_scores, compound_scores, combined_scores],
                         ["tfidf", "lda", "compound", "combined"]):
    pd.DataFrame(scores, columns=method_names, index=df.index).to_csv(
        os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_method_{label}_scores.csv")
    )
logger.info(f"Step 6: Saved method score matrices to {SAVE_DIR}.")

# 9. Run diagnostics
diagnostics_with_scores(df, tfidf_scores, lda_scores, compound_scores, combined_scores, method_names)
logger.info(f"Step 7: Diagnostics completed.")

# 10. Output short preview
print(df[['Primary_Method', 'Primary_Method_Score', 'Method_Confidence', 'Top_1_Method', 'Top_1_Score']].head())
logger.info(f"Step 8: Output preview of method assignments.")

# 11. Final saving of the DataFrame with method assignments
df.to_csv(os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_all_results.csv"))
logger.info(f"Step 9: Saved final DataFrame with method assignments to {SAVE_DIR}.")
"""



In [None]:
# New workflow 
# ===============================================
# ========== Method Extraction Workflow =========
# ========== (Robust LLM/Pipeline) =============
# ===============================================

# ----------- Parameters for Method Extraction Workflow -----------
MAX_FEATURES = 10000         # Max n-grams for candidate extraction
NGRAM_RANGE = (1, 4)         # For TF-IDF/LDA
WINDOW_COMPOUND = 150        # Window for proximity scoring
MIN_WORD_LEN = 4             # Minimum token length
TFIDF_WEIGHT = 0.5           # Weight for TF-IDF
LDA_WEIGHT = 0.3             # Weight for LDA
COMPOUND_WEIGHT = 0.2        # Weight for compound
TOP_METHODS_PER_PAPER = 5    # Number of methods assigned per paper
MIN_ASSIGN_SCORE = 0.2       # Minimum combined score
BATCH_SIZE_LLM = 100         # LLM batch for abbreviations

# LLM parameters for method phrase extraction
METHOD_LLM_N_RUNS = 3
METHOD_LLM_TEMP = 0.05
METHOD_LLM_TOP_P = 0.95

logger.info("Starting pipeline for method detection and assignment...")

# 1. Extract candidate n-grams
candidate_terms = extract_candidate_terms(
    df, text_col='processed_text', max_features=MAX_FEATURES
)
logger.info(f"Step 1: Extracted {len(candidate_terms)} candidate terms from the corpus.")

# 2. LLM method extraction
method_phrases_all, method_phrase_counts = get_method_phrases_robust(
    candidate_terms,
    client,
    model_type,
    credit_tracker,
    n_runs=METHOD_LLM_N_RUNS,
    temp=METHOD_LLM_TEMP,
    top_p=METHOD_LLM_TOP_P
)
logger.info(f"Step 2: Extracted {len(method_phrases_all)} method phrases from the corpus.")
print("DEBUG: Raw LLM method extraction sample:", method_phrases_all[:10])

# 3. LLM filtering (robust, with logging/fallback)
method_phrases, filter_all = filter_methods_with_llm(
    method_phrases_all,
    client,
    model_type,
    credit_tracker,
    n_batched=BATCH_SIZE_LLM,
    verbose=True,
    fallback_to_input=True,
    min_methods=20,
    temp=METHOD_LLM_TEMP,
    top_p=METHOD_LLM_TOP_P
)

logger.info(f"Filtered out {len(method_phrases_all) - len(method_phrases)} generic phrases, leaving {len(method_phrases)} specific method phrases.")

if filter_all:
    logger.warning("All method phrases filtered out by LLM—fallback or relaxation engaged.")

# --- STOP if vocabulary is empty ---
if not method_phrases:
    logger.error("No method phrases after filtering. Stopping workflow! Adjust LLM extraction, candidate set, or prompt.")
    raise RuntimeError("Critical: Method phrase vocabulary is empty after LLM filtering. Workflow terminated.")

# 4. Use LLM to build abbreviation/synonym dictionary
logger.info("Building method abbreviation dictionary with LLM...")
method_dict = get_method_abbreviation_dict(
    method_phrases, client, model_type, credit_tracker, batch_size=BATCH_SIZE_LLM)
logger.info(f"Get_method_abbreviation_dict found {len(method_dict)} methods with abbreviations/synonyms.")
abbr_to_canonical_map = build_abbr_to_canonical_map(method_dict)
logger.info(f"Built abbreviation map for {len(abbr_to_canonical_map)} methods.")

# 5. Standardize and check vocabulary
df['standardized_text'] = df['processed_text'].apply(
    lambda t: standardize_methods_in_text(t, abbr_to_canonical_map))
method_vocabulary = sorted(method_dict.keys())
print("DEBUG: Final method vocabulary (sample):", method_vocabulary[:10])
if not method_vocabulary:
    logger.error("No methods left after LLM filtering and abbreviation mapping. Stopping workflow!")
    raise RuntimeError("Critical: Method vocabulary is empty after all processing.")

logger.info(f"Step 3: Ready with {len(method_vocabulary)} final methods.")

# 6. Compute all method score matrices
logger.info("Computing method score matrices...")
tfidf_scores, method_names = compute_tfidf_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE
)
logger.info(f"Step 4a: Computed TF-IDF scores for {len(method_names)} methods.")
lda_n_topics = len(method_vocabulary)
logger.info(f"Step 4b: Using {lda_n_topics} topics for LDA scoring.")
lda_scores, lda_names = compute_lda_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE, n_topics=lda_n_topics
)
assert list(method_names) == list(lda_names)

logger.info(f"Step 4c: Computing compound scores for {len(method_names)} methods.")
compound_scores = compute_compound_scores(
    df, method_names, processed_col='standardized_text',
    window=WINDOW_COMPOUND, min_word_len=MIN_WORD_LEN
)
logger.info(f"Step 4d: Combined scores from TF-IDF, LDA, and compound methods.")
combined_scores = combine_method_scores(
    tfidf_scores, lda_scores, compound_scores,
    weights=(TFIDF_WEIGHT, LDA_WEIGHT, COMPOUND_WEIGHT)
)
logger.info(f"Step 4: Computed complete method score matrices with {len(method_names)} methods.")

# 7. Assign methods to papers with confidence
df = assign_top_methods_by_total_score(
    df, combined_scores, method_names,
    top_n=TOP_METHODS_PER_PAPER, min_score=MIN_ASSIGN_SCORE
)
logger.info(f"Step 5: Assigned top {TOP_METHODS_PER_PAPER} methods to {len(df)} papers with confidence levels.")

# 8. Save all matrix DataFrames for visualization
for scores, label in zip([tfidf_scores, lda_scores, compound_scores, combined_scores],
                         ["tfidf", "lda", "compound", "combined"]):
    pd.DataFrame(scores, columns=method_names, index=df.index).to_csv(
        os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_method_{label}_scores.csv")
    )
logger.info(f"Step 6: Saved method score matrices to {SAVE_DIR}.")

# 9. Run diagnostics
diagnostics_with_scores(df, tfidf_scores, lda_scores, compound_scores, combined_scores, method_names)
logger.info(f"Step 7: Diagnostics completed.")

# 10. Output short preview
print(df[['Primary_Method', 'Primary_Method_Score', 'Method_Confidence', 'Top_1_Method', 'Top_1_Score']].head())
logger.info(f"Step 8: Output preview of method assignments.")

# 11. Final saving of the DataFrame with method assignments
df.to_csv(os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_all_results.csv"))
logger.info(f"Step 9: Saved final DataFrame with method assignments to {SAVE_DIR}.")


NameError: name 'logger' is not defined