In [1]:
# Cell 1: Imports and Setup
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
import openai

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


True

In [2]:
# Cell 2: OpenAI Setup
class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))

client, model_type = initialize_openai()
credit_tracker = CreditTracker()


In [3]:
# Cell 3: Utility Functions

def extract_keywords_from_filename(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    return [part for i, part in enumerate(parts) if i > 2 and part != 'results' and not part.isdigit()]

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite',
        'fig', 'figure', 'figures', 'table', 'tables', 'chart', 'charts',
        'published', 'journal', 'conference', 'proceedings', 'vol', 'volume', 'pp', 'page', 'pages', 'doi'}
    return stop_words.union(scientific_terms)

def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=True):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [t for t in tokens if len(t) >= min_word_length and t not in stop_words and len(t) > 1 and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except:
        pass
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    return df[df[processed_col].str.strip() != '']

def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
        'Medicine','Business','Environmental Science','Chemistry','Materials Science',
        'Geography','Biology','Geology','Political Science','Psychology','Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        return [f if f in valid_fields else "Unknown" for f in fields] or ["Unknown"]
    return ["Unknown"]


In [4]:
# Cell 4: Data Loading & Cleaning
filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)
logger.info(f"Loaded and preprocessed {len(df)} papers")


2025-08-20 22:36:57,205 - INFO - Loaded and preprocessed 28934 papers


In [5]:
# Cell 5: Method Phrase Extraction & Standardization

def extract_candidate_terms(df, text_col='processed_text', max_features=20000):
    vectorizer = CountVectorizer(
        ngram_range=(1, 4), max_df=0.95, min_df=2, max_features=max_features, token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    return [term for term, freq in sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)]


import ast
import collections

def get_method_phrases(
    corpus_terms,
    client,
    model_type,
    credit_tracker,
    consensus_runs=3,
    consensus_threshold=0.7,
    temp=0
):
    """
    Calls LLM multiple times with temperature=0 and top_p=1, collecting method sets,
    and retains only those terms that appear in at least consensus_threshold proportion of runs.
    """
    sample_terms = ', '.join(corpus_terms[:100])
    prompt = (
        "Here are the most frequent terms from a corpus of scientific papers:\n"
        f"{sample_terms}\n"
        "From the full list: " + ", ".join(corpus_terms) + "\n"
        "Extract ONLY the terms that represent specific methodologies, techniques, or named approaches. "
        "Focus on computational, statistical, engineering, and reliability methods.\n"
        "DO include: e.g. 'monte carlo simulation', 'unit commitment', 'load flow analysis', 'genetic algorithm', "
        "'neural network', 'stochastic optimization', 'reinforcement learning', 'fault tree analysis'.\n"
        "DO NOT include: 'framework', 'analysis', 'system', 'method', 'procedure', 'approach', 'application', 'performance', 'review'.\n"
        "Return as a single-line Python list; comma separated, no extra formatting."
    )

    all_results = []
    for run_idx in range(consensus_runs):
        response = client.chat.completions.create(
            model=model_type,
            messages=[{"role": "user", "content": prompt}],
            temperature=temp,
            top_p=1.0
        )
        content = response.choices[0].message.content
        try:
            results = ast.literal_eval(content)
        except Exception:
            content = content.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
            results = [term.strip() for term in content.split(',') if len(term.strip()) > 3]
        # Lowercase and strip for stable matching
        all_results.append(set(t.lower() for t in results if t.strip()))
        credit_tracker.update(len(content))
        print(f"Run {run_idx+1}: Found {len(results)} method phrases.")

    # Consensus: keep terms found in >= threshold * runs
    counts = collections.Counter(term for result in all_results for term in result)
    min_hits = max(1, int(consensus_runs * consensus_threshold))
    stable_phrases = [term for term, cnt in counts.items() if cnt >= min_hits]

    print(f"\n{len(stable_phrases)} consensus method phrases found in >= {min_hits}/{consensus_runs} runs.")
    if not stable_phrases:
        print("Warning: No stable method phrases found. Consider lowering consensus threshold or increasing runs.")
    return sorted(stable_phrases)


def get_method_abbreviation_dict(method_phrases, client, model_type, credit_tracker, batch_size=100):
    import ast
    results = {}
    for i in range(0, len(method_phrases), batch_size):
        batch = method_phrases[i:i+batch_size]
        prompt = f"""For each of the following phrases, extract ALL common scientific abbreviations, synonyms, and aliases for methods/techniques.
Methods:\n{chr(10).join(batch)}
Return as Python dict: {{'canonical method': [aliases, ...]}}"""
        response = client.chat.completions.create(
            model=model_type,
            messages=[{"role": "system", "content": "You are a scientific abbreviation expert."},
                      {"role": "user", "content": prompt}]
        )
        content = response.choices[0].message.content.strip()
        start, end = content.find('{'), content.rfind('}')+1
        method_dict = {}
        if start >= 0 and end > start:
            try:
                method_dict = ast.literal_eval(content[start:end])
            except Exception as e:
                logger.warning(f"Failed to parse dictionary from LLM batch: {e}")
        results.update(method_dict)
    logger.info(f"LLM mapped {len(results)} methods to abbreviations/variants.")
    return results

def build_abbr_to_canonical_map(method_dict):
    abbr_map = {}
    for canonical, variants in method_dict.items():
        abbr_map[canonical.lower()] = canonical
        for v in variants:
            abbr_map[v.lower()] = canonical
    return abbr_map

def standardize_methods_in_text(text, abbr_to_canonical):
    import re
    sorted_vars = sorted(abbr_to_canonical, key=lambda x: -len(x))
    for var in sorted_vars:
        pattern = r'\b' + re.escape(var) + r'\b'
        text = re.sub(pattern, abbr_to_canonical[var], text, flags=re.IGNORECASE)
    return text


In [6]:
# Cell 6: Method Scoring Functions

def compute_tfidf_scores(processed_texts, method_phrases, ngram_range=(1, 4), min_df=1, max_df=0.95, norm='l2'):
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=method_phrases, ngram_range=ngram_range,
        min_df=min_df, max_df=max_df, norm=norm
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)
    scores = tfidf_matrix.toarray()
    feature_names = tfidf_vectorizer.get_feature_names_out()
    return scores, feature_names

def compute_lda_scores(processed_texts, method_phrases, ngram_range=(1, 3), n_topics=100, max_iter=20):
    vectorizer = CountVectorizer(
        vocabulary=method_phrases, ngram_range=ngram_range, token_pattern=r'\b[\w-]+\b'
    )
    doc_term_matrix = vectorizer.fit_transform(processed_texts)
    feature_names = vectorizer.get_feature_names_out()
    if n_topics >= 2:
        lda = LatentDirichletAllocation(n_components=n_topics, learning_method='batch',
                                       random_state=42, max_iter=max_iter)
        lda_matrix = lda.fit_transform(doc_term_matrix)
    else:
        lda_matrix = np.zeros((doc_term_matrix.shape[0], len(method_phrases)))
    return lda_matrix, feature_names

def compute_compound_scores(df, method_phrases, processed_col='standardized_text', window=150, min_word_len=4):
    n_docs = len(df)
    n_methods = len(method_phrases)
    scores = np.zeros((n_docs, n_methods), dtype=np.float32)
    docs = df[processed_col].fillna('').str.lower().tolist()
    for j, phrase in enumerate(method_phrases):
        phrase_l = phrase.lower()
        words = [w for w in phrase_l.split() if len(w) >= min_word_len]
        for i, text in enumerate(docs):
            if phrase_l in text:
                scores[i, j] = 1.0
            elif len(words) > 1:
                matches = sum(1 for w in words if w in text)
                scores[i, j] = matches / len(words)
    return scores

def combine_method_scores(tfidf_scores, lda_scores, compound_scores, weights=(0.4, 0.3, 0.3)):
    return weights[0]*tfidf_scores + weights[1]*lda_scores + weights[2]*compound_scores

def assign_top_methods_by_total_score(df, total_scores, method_names, top_n=3, min_score=0.03):
    for rank in range(top_n):
        top_method = []
        top_score = []
        for row in total_scores:
            idxs = np.argsort(row)[::-1]
            nth_idx = idxs[rank] if rank < len(idxs) else None
            if nth_idx is not None and row[nth_idx] >= min_score:
                top_method.append(method_names[nth_idx])
                top_score.append(row[nth_idx])
            else:
                top_method.append("LowConfidence")
                top_score.append(row[nth_idx] if nth_idx is not None else 0.0)
        df[f'Top_{rank+1}_Method'] = top_method
        df[f'Top_{rank+1}_Score'] = top_score
    df['Primary_Method'] = df['Top_1_Method']
    df['Primary_Method_Score'] = df['Top_1_Score']
    conf = []
    for m1, s1, m2, s2, m3, s3 in zip(
        df['Top_1_Method'], df['Top_1_Score'], df['Top_2_Method'], df['Top_2_Score'], df['Top_3_Method'], df['Top_3_Score']
    ):
        if m1 != "LowConfidence" and s1 > 2 * max(0.05, s2):
            conf.append("super_confident")
        elif m1 != "LowConfidence":
            conf.append("confident")
        else:
            conf.append("low_confidence")
    df['Method_Confidence'] = conf
    return df


In [7]:
# Cell 7: Topic Modeling + Naming + Author Functions

def run_lda_topic_modeling(df, num_topics=10, num_words=25):
    tokenized_texts = df['processed_text'].apply(lambda x: x.split()).tolist()
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    phrased = []
    for doc in tokenized_texts:
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased.append(' '.join(combined))
    vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b[\w_-]+\b', max_df=0.95, min_df=2, max_features=10000)
    doc_term_matrix = vectorizer.fit_transform(phrased)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        topic_keywords[topic_idx] = {'top_words': top_words}
    return lda_model, vectorizer, topic_distributions, topic_keywords

def assign_papers_to_topics(topic_distributions):
    paper_classifications = []
    for idx, dist in enumerate(topic_distributions):
        top_2_topics = np.argsort(dist)[-2:][::-1]
        primary_score = dist[top_2_topics[0]]
        other_topics_sum = sum(dist) - primary_score
        dominance_ratio = primary_score / (other_topics_sum + 1e-10)
        paper_classifications.append({
            'paper_idx': idx,
            'primary_topic': top_2_topics[0],
            'secondary_topic': top_2_topics[1],
            'primary_score': primary_score,
            'dominance_ratio': dominance_ratio
        })
    return paper_classifications

def topic_name_llm(lda_keywords, tfidf_ngrams, top_titles, client, model_type, credit_tracker):
    prompt = f"""Based on the following keywords and n-grams from LDA and TF-IDF, plus top titles, provide a concise topic name (bigram or trigram, single word if fitting):
LDA: {', '.join(lda_keywords)}
TFIDF: {', '.join(tfidf_ngrams)}
TITLES: {', '.join(top_titles)}
Return ONLY the topic name."""
    tokens = num_tokens_from_string(prompt, model_type)
    credit_tracker.update(tokens)
    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "system", "content": "You are a science topic-naming assistant."}, {"role": "user", "content": prompt}]
    )
    content = response.choices[0].message.content.strip()
    credit_tracker.update(num_tokens_from_string(content, model_type))
    return content

def get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10):
    dominant_papers = [p for p in paper_classifications if p['primary_topic'] == topic_idx]
    paper_infos = [
        (df.iloc[p['paper_idx']]['citationCount'] if 'citationCount' in df.columns else 0, df.iloc[p['paper_idx']]['title'])
        for p in dominant_papers if not pd.isna(df.iloc[p['paper_idx']]['title'])
    ]
    top_titles = [title for _, title in sorted(paper_infos, key=lambda x: -x[0])[:n_titles]]
    return top_titles

def get_author_stats(paper_classifications, df_field, n_top=5):
    top_papers = {}
    author_topic_stats = {}
    for topic in set(p['primary_topic'] for p in paper_classifications):
        topic_papers = [p for p in paper_classifications if p['primary_topic'] == topic]
        topic_papers.sort(key=lambda x: x['dominance_ratio'], reverse=True)
        top_papers[topic] = []
        for p in topic_papers[:n_top]:
            paper_idx = p['paper_idx']
            try:
                authors = df_field.iloc[paper_idx]['authors']
                if isinstance(authors, str):
                    try: authors = ast.literal_eval(authors)
                    except (ValueError, SyntaxError): authors = []
                if isinstance(authors, list):
                    author_list = []
                    for author in authors:
                        if isinstance(author, dict):
                            author_list.append({'name': author.get('name', 'Unknown'), 'id': author.get('authorId', 'Unknown')})
                else: author_list = []
                top_papers[topic].append({
                    'paperId': df_field.iloc[paper_idx].get('paperId',''),
                    'title': df_field.iloc[paper_idx].get('title',''),
                    'authors': author_list,
                    'score': float(p['primary_score']),
                    'dominance_ratio': float(p['dominance_ratio'])
                })
            except Exception as e: continue
    return top_papers, author_topic_stats

# Topic Modeling, Topic Naming, Author Analysis, and Topic-Specific TF-IDF Extraction

def get_top_tfidf_ngrams_per_topic(df, tfidf_matrix, feature_names, topic_col='Primary_Topic_Index', top_k=10):
    tfidf_ngrams = {}
    for topic_idx in df[topic_col].dropna().unique():
        topic_idx = int(topic_idx)
        doc_indices = df[df[topic_col] == topic_idx].index
        if len(doc_indices) == 0:
            continue
        topic_tfidf = np.asarray(tfidf_matrix[doc_indices].mean(axis=0)).ravel()
        top_indices = topic_tfidf.argsort()[-top_k:][::-1]
        top_terms = [(feature_names[i], topic_tfidf[i]) for i in top_indices if topic_tfidf[i] > 0]
        tfidf_ngrams[topic_idx] = top_terms
    return tfidf_ngrams

def save_topic_analysis_outputs(
    df, lda_model, lda_vectorizer, topic_distributions, topic_keywords, topic_names, topic_ngrams,
    author_stats, top_papers, tfidf_ngrams, suffix_string
):
    # Save topic modeling components and naming results
    topic_metadata = {
        "topics": {int(k): v for k,v in topic_keywords.items()},
        "topic_names": {int(k): v for k,v in topic_names.items()},
        "topic_ngrams": {int(k): v for k,v in topic_ngrams.items()},
    }
    with open(os.path.join(SAVE_DIR, f"topics_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump(topic_metadata, f, indent=2)
    with open(os.path.join(SAVE_DIR, f"topic_names_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k):v for k,v in topic_names.items()}, f, indent=2)
    np.save(os.path.join(SAVE_DIR, f"topic_distributions_{suffix_string}.npy"), topic_distributions)
    import joblib
    joblib.dump(lda_model, os.path.join(SAVE_DIR, f"lda_model_{suffix_string}.joblib"))
    joblib.dump(lda_vectorizer, os.path.join(SAVE_DIR, f"lda_vectorizer_{suffix_string}.joblib"))
    # Save author and paper/topic info
    with open(os.path.join(SAVE_DIR, f"top_papers_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k): v for k, v in top_papers.items()}, f, ensure_ascii=False, indent=2, default=str)
    pd.DataFrame.from_dict(author_stats, orient='index').to_csv(
        os.path.join(SAVE_DIR, f"author_stats_{suffix_string}.csv"))
    # Topic-specific tfidf ngrams
    with open(os.path.join(SAVE_DIR, f"topic_specific_tfidf_ngrams_{suffix_string}.json"), "w", encoding="utf-8") as f:
        json.dump({int(k):[(term,float(score)) for term,score in v] for k,v in tfidf_ngrams.items()}, f, indent=2)


In [8]:
# Cell 8: Diagnostics Function

def diagnostics_with_scores(
    df, tfidf_scores, lda_scores, compound_scores, combined_scores, method_names
):
    n_docs, n_methods = tfidf_scores.shape
    print("=== DIAGNOSTICS ===")
    print(f"Total documents: {n_docs}")
    print(f"Methods: {n_methods}")
    print(f"TF-IDF coverage: {(tfidf_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(tfidf_scores>0).any(axis=1).mean():.1f}%)")
    print(f"LDA coverage: {(lda_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(lda_scores>0).any(axis=1).mean():.1f}%)")
    print(f"Compound coverage: {(compound_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(compound_scores>0).any(axis=1).mean():.1f}%)")
    print(f"Combined coverage: {(combined_scores > 0).any(axis=1).sum()}/{n_docs} ({100*(combined_scores>0).any(axis=1).mean():.1f}%)")
    if 'Primary_Method' in df.columns:
        print("\nMethod label distribution (top 10):")
        print(df['Primary_Method'].value_counts().head(10))
    if 'Method_Confidence' in df.columns:
        print("\nMethod confidence distribution:")
        print(df['Method_Confidence'].value_counts())
    print("\nMethod vocabulary sample:", ', '.join(method_names[:10]))
    print(f"\nTFIDF stats: mean={tfidf_scores.mean():.3f}, std={tfidf_scores.std():.3f}")
    print(f"LDA stats: mean={lda_scores.mean():.3f}, std={lda_scores.std():.3f}")
    print(f"Compound stats: mean={compound_scores.mean():.3f}, std={compound_scores.std():.3f}")
    print(f"Combined stats: mean={combined_scores.mean():.3f}, std={combined_scores.std():.3f}")


In [9]:
# Cell 9: Execute Topic Analysis Workflow (LDA, Topic Naming, Author Analysis, Topic N-grams)

NUM_TOPICS = 12  # adjust as desired
NUM_TOPIC_WORDS = 15

# LDA topic modeling
lda_model, lda_vectorizer, topic_distributions, topic_keywords = run_lda_topic_modeling(
    df, num_topics=NUM_TOPICS, num_words=NUM_TOPIC_WORDS)

# Paper-to-topic assignment
paper_classifications = assign_papers_to_topics(topic_distributions)
df['Primary_Topic_Index'] = [int(p['primary_topic'][0]) if isinstance(p['primary_topic'], (np.ndarray, list)) else int(p['primary_topic']) for p in paper_classifications]
df['Primary_Score'] = [p['primary_score'] for p in paper_classifications]
df['Dominance_Ratio'] = [p['dominance_ratio'] for p in paper_classifications]

# Per-topic TF-IDF n-grams for naming/interpretation
topic_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3), min_df=2, max_df=0.95, token_pattern=r'\b[\w_-]+\b'
)
topic_tfidf_matrix = topic_tfidf_vectorizer.fit_transform(df['processed_text'])
topic_tfidf_feature_names = topic_tfidf_vectorizer.get_feature_names_out()
topic_ngrams = get_top_tfidf_ngrams_per_topic(
    df, topic_tfidf_matrix, topic_tfidf_feature_names, topic_col='Primary_Topic_Index', top_k=10)

# Generate topic names using LLM
topic_names = {}
for topic_idx, keywords in topic_keywords.items():
    lda_ngrams = keywords['top_words'][:NUM_TOPIC_WORDS]
    tfidf_ng = [ngram for ngram, _ in topic_ngrams.get(topic_idx, [])][:NUM_TOPIC_WORDS]
    top_titles = get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=10)
    topic_name = topic_name_llm(
        lda_ngrams, tfidf_ng, top_titles, client, model_type, credit_tracker
    )
    topic_names[topic_idx] = topic_name
    logger.info(f"Topic {topic_idx}: {topic_name if topic_name else 'Unnamed'}")
df['Primary_Topic'] = df['Primary_Topic_Index'].map(lambda x: topic_names.get(x, f"Topic_{x}"))
logger.info("✓ Topic naming and assignment completed.")

# Author analysis and top papers per topic
top_papers, author_stats = get_author_stats(paper_classifications, df, n_top=5)

# Save all topic/author analysis results
current_date = datetime.now().strftime("%Y_%m_%d")
keyword_str = '_'.join(search_keywords) if search_keywords else ""
suffix_string = f"{current_date}{keyword_str}"
save_topic_analysis_outputs(df, lda_model, lda_vectorizer, topic_distributions, topic_keywords, topic_names, topic_ngrams, author_stats, top_papers, topic_ngrams, suffix_string)

# Show preview
print("\nSample topics and names:")
print({k: topic_names[k] for k in list(topic_names)[:5]})
print("\nTop authors and top papers by topic (first 2):")
print(dict(list(top_papers.items())[:2]))


2025-08-20 22:36:57,612 - INFO - collecting all words and their counts
2025-08-20 22:36:57,612 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-08-20 22:36:59,000 - INFO - PROGRESS: at sentence #10000, processed 1524957 words and 863761 word types
2025-08-20 22:37:00,554 - INFO - PROGRESS: at sentence #20000, processed 3004878 words and 1467564 word types
2025-08-20 22:37:01,856 - INFO - collected 1902495 token types (unigram + bigrams) from a corpus of 4290297 words and 28934 sentences
2025-08-20 22:37:01,857 - INFO - merged Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-08-20 22:37:01,858 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 4.25s', 'datetime': '2025-08-20T22:37:01.858831', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 2025, 12:41:34) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-0


Sample topics and names:
{0: 'Solar Power Systems', 1: 'Wireless Communication Systems', 2: 'Cloud Resource Management', 3: 'Smart Grid Monitoring', 4: 'Power Control Systems'}

Top authors and top papers by topic (first 2):
{0: [{'paperId': '92dc35c9518014301149ff05aec8c715bab009b9', 'title': 'The Hybrid Cycle: Integration of a Fuel Cell With a Gas Turbine', 'authors': [{'name': 'J. Leeper', 'id': '98489278'}], 'score': 0.9947914327096009, 'dominance_ratio': 190.9913721272474}, {'paperId': '633d495adac1c6cfe8e7e20b2c052bb9c669640c', 'title': 'Study of Indonesia low rank coal utilization on modified fixed bed gasification for combined cycle power plant', 'authors': [{'name': 'T. Hardianto', 'id': '67067417'}, {'name': 'A. R. Amalia', 'id': '2128252791'}, {'name': 'A. Suwono', 'id': '72383274'}, {'name': 'P. Riauwindu', 'id': '101468093'}], 'score': 0.9947617556671915, 'dominance_ratio': 189.9036534906196}, {'paperId': '944f9353bd490f6f65bc2b540371b38a5b1b3564', 'title': 'Thermodynamic

In [10]:
# Cell 10: Main Pipeline (run in order) with Parameterization

# ===== Parameter Choices: SET HERE =====
# --- Extraction/scoring params
MAX_FEATURES = 30000          # Number of n-grams for candidate extraction
NGRAM_RANGE = (1, 4)          # For TF-IDF and LDA n-gram extraction
WINDOW_COMPOUND = 150         # Window for compound/proximity scoring
MIN_WORD_LEN = 4              # Minimum word length in compound scoring

# --- Matrix and score params
TFIDF_WEIGHT = 0.5            # Weight for TF-IDF scores in final matrix
LDA_WEIGHT = 0.3              # Weight for LDA scores in final matrix
COMPOUND_WEIGHT = 0.2         # Weight for compound scores in final matrix

# --- Assignment/confidence params
TOP_METHODS_PER_PAPER = 4     # Number of methods to assign per paper
MIN_ASSIGN_SCORE = 0.02       # Min combined score to assign a method
BATCH_SIZE_LLM = 100          # LLM batch size for abbreviations

# --- File/output
#suffix_string = f"{datetime.now().strftime('%Y_%m_%d')}_your_keywords" # Adjust as needed

logger.info("Starting pipeline for method detection and assignment...")

# 1. Extract broad candidate n-grams from the corpus
candidate_terms = extract_candidate_terms(
    df, text_col='processed_text', max_features=MAX_FEATURES)
logger.info(f"Step 1: Extracted {len(candidate_terms)} candidate terms from the corpus.")

# 2. Use LLM to filter for method/technique phrases only
method_phrases = get_method_phrases(
    candidate_terms,
    client,
    model_type,
    credit_tracker,
    consensus_runs=3,           # Or 5, for very-high repeatability
    consensus_threshold=0.7,    # Set to 1.0 for total agreement, 0.7 for majority
    temp=0                      # Always set temperature to 0 for deterministic results
)

logger.info(f"Step 2: Extracted {len(method_phrases)} method phrases from the corpus.")

# 3. Use LLM to build abbreviation/synonym dictionary
method_dict = get_method_abbreviation_dict(
    method_phrases, client, model_type, credit_tracker, batch_size=BATCH_SIZE_LLM)
abbr_to_canonical_map = build_abbr_to_canonical_map(method_dict)
df['standardized_text'] = df['processed_text'].apply(
    lambda t: standardize_methods_in_text(t, abbr_to_canonical_map))
method_vocabulary = sorted(method_dict.keys())
logger.info(f"Step 3: Built abbreviation map for {len(method_vocabulary)} methods.")

# 4. Compute all method score matrices
tfidf_scores, method_names = compute_tfidf_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE)
lda_n_topics = min(len(method_vocabulary), 100)
lda_scores, lda_names = compute_lda_scores(
    df['standardized_text'], method_vocabulary, ngram_range=NGRAM_RANGE, n_topics=lda_n_topics)
assert list(method_names) == list(lda_names)
compound_scores = compute_compound_scores(
    df, method_names, processed_col='standardized_text',
    window=WINDOW_COMPOUND, min_word_len=MIN_WORD_LEN)
combined_scores = combine_method_scores(
    tfidf_scores, lda_scores, compound_scores,
    weights=(TFIDF_WEIGHT, LDA_WEIGHT, COMPOUND_WEIGHT))
logger.info(f"Step 4: Computed method score matrices with {len(method_names)} methods.")

# 5. Assign methods to papers with confidence
df = assign_top_methods_by_total_score(
    df, combined_scores, method_names,
    top_n=TOP_METHODS_PER_PAPER, min_score=MIN_ASSIGN_SCORE)
logger.info(f"Step 5: Assigned top {TOP_METHODS_PER_PAPER} methods to {len(df)} papers with confidence levels.")

# 6. Save all matrix DataFrames for visualization
for scores, label in zip([tfidf_scores, lda_scores, compound_scores, combined_scores],
                         ["tfidf", "lda", "compound", "combined"]):
    pd.DataFrame(scores, columns=method_names, index=df.index).to_csv(
        os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_method_{label}_scores.csv")
    )
logger.info(f"Step 6: Saved method score matrices to {SAVE_DIR}.")

# 7. Run diagnostics
diagnostics_with_scores(df, tfidf_scores, lda_scores, compound_scores, combined_scores, method_names)
logger.info(f"Step 7: Diagnostics completed.")

# 8. Output short preview
print(df[['Primary_Method', 'Primary_Method_Score', 'Method_Confidence', 'Top_1_Method', 'Top_1_Score']].head())
logger.info(f"Step 8: Output preview of method assignments.")

# 9. Final saving of the DataFrame with method assignments
df.to_csv(os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_all_results.csv"))
logger.info(f"Step 9: Saved final DataFrame with method assignments to {SAVE_DIR}.")


2025-08-20 22:39:43,018 - INFO - Starting pipeline for method detection and assignment...
2025-08-20 22:40:26,780 - INFO - Step 1: Extracted 30000 candidate terms from the corpus.
2025-08-20 22:45:22,379 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Run 1: Found 68 method phrases.


2025-08-20 22:51:18,721 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Run 2: Found 112 method phrases.


2025-08-20 22:51:32,150 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 22:51:32,153 - INFO - Step 2: Extracted 81 method phrases from the corpus.


Run 3: Found 99 method phrases.

81 consensus method phrases found in >= 2/3 runs.


2025-08-20 22:51:53,048 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 22:51:53,062 - INFO - LLM mapped 81 methods to abbreviations/variants.
2025-08-20 22:53:00,078 - INFO - Step 3: Built abbreviation map for 81 methods.
2025-08-20 22:53:46,571 - INFO - Step 4: Computed method score matrices with 81 methods.
2025-08-20 22:53:46,959 - INFO - Step 5: Assigned top 4 methods to 28934 papers with confidence levels.
2025-08-20 22:53:54,441 - INFO - Step 6: Saved method score matrices to Saved_files_new.
2025-08-20 22:53:54,498 - INFO - Step 7: Diagnostics completed.
2025-08-20 22:53:54,504 - INFO - Step 8: Output preview of method assignments.


=== DIAGNOSTICS ===
Total documents: 28934
Methods: 81
TF-IDF coverage: 6057/28934 (20.9%)
LDA coverage: 28934/28934 (100.0%)
Compound coverage: 28662/28934 (99.1%)
Combined coverage: 28934/28934 (100.0%)

Method label distribution (top 10):
Primary_Method
reliability analysis          3587
multi-agent system            2806
adaptive control              2674
dynamic line rating           1307
model predictive control       993
load flow analysis             736
resource allocation            688
wavelet transform              630
data mining                    626
voltage stability analysis     614
Name: count, dtype: int64

Method confidence distribution:
Method_Confidence
confident          23772
super_confident     4890
low_confidence       272
Name: count, dtype: int64

Method vocabulary sample: adaptive control, adaptive neuro-fuzzy inference system, autoregressive integrated moving average, bayesian optimization, bootstrap, co-simulation, convolutional neural network, cooperativ

2025-08-20 22:53:58,494 - INFO - Step 9: Saved final DataFrame with method assignments to Saved_files_new.
