In [1]:
# %%
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases

import openai

# Output directory used by this notebook; created if it does not exist yet.
SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)

# Configure the root logger at INFO with a timestamped format, then grab a
# module-level logger that the functions below use for progress messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Seed Python's and NumPy's global random number generators.
import random
random.seed(42)
np.random.seed(42)

In [2]:
class CreditTracker:
    """Running tally of consumed tokens and the estimated spend they imply."""

    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        # Price applied per 1,000 tokens when converting usage into cost.
        self.cost_per_1k_tokens = 0.00015

    def update(self, tokens):
        """Add ``tokens`` to the running total and accrue the matching cost."""
        self.total_tokens += tokens
        thousands = tokens / 1000
        self.total_cost += thousands * self.cost_per_1k_tokens

    def get_stats(self):
        """Return the token total and the cost rounded to four decimals."""
        stats = {}
        stats["total_tokens"] = self.total_tokens
        stats["total_cost"] = round(self.total_cost, 4)
        return stats

def initialize_openai():
    """Create the OpenAI client described by ``config_LLM.txt``.

    The file is parsed with ``configparser`` and must contain an ``[LLM]``
    section; ``OPENAI_API_KEY`` and ``MODEL_TYPE`` are read from it.

    Returns:
        tuple: ``(client, model_type)`` where ``client`` is an
        ``openai.OpenAI`` instance and ``model_type`` is the configured
        model name (``None`` when the option is absent).

    Raises:
        KeyError: if the ``[LLM]`` section is missing (which is also what
            happens when the file cannot be read).
    """
    parser = configparser.ConfigParser()
    parser.read('config_LLM.txt')
    llm_section = parser['LLM']
    api_key = llm_section.get('OPENAI_API_KEY')
    model_type = llm_section.get('MODEL_TYPE')
    return openai.OpenAI(api_key=api_key), model_type

# Module-level objects shared by the cells below: the API client, the model
# name read from the config file, and a fresh usage tracker.
client, model_type = initialize_openai()
credit_tracker = CreditTracker()

def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count the tokens ``string`` occupies under ``model_name``'s tokenizer.

    Args:
        string: Text to tokenize.
        model_name: Model identifier passed to ``tiktoken.encoding_for_model``.

    Returns:
        Number of tokens produced by the selected encoding.

    ``tiktoken.encoding_for_model`` raises ``KeyError`` for model names it has
    no mapping for (e.g. a model released after the installed tiktoken
    version). Instead of crashing the notebook, fall back to the
    ``cl100k_base`` encoding; the count is then an approximation.
    """
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Unknown model name: use a general-purpose encoding as an estimate.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(string))


In [3]:
# %%
def extract_keywords_from_filename(filename):
    """Recover the search keywords embedded in a results file name.

    The extension and any directory part are dropped and the stem is split on
    ``'_'``. The first three pieces are always skipped; of the rest, the
    literal ``'results'`` and purely numeric pieces are discarded.

    Returns:
        list[str]: remaining pieces, in file-name order.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    keywords = []
    for position, piece in enumerate(stem.split('_')):
        if position <= 2:
            continue
        if piece == 'results' or piece.isdigit():
            continue
        keywords.append(piece)
    return keywords

def keywords_to_filename_part(keywords):
    """Build a file-name fragment from keywords.

    Each keyword is lower-cased, its spaces become underscores, and the
    results are joined with ``'_'``.
    """
    slugs = map(lambda keyword: keyword.lower().replace(' ', '_'), keywords)
    return '_'.join(slugs)

def get_custom_stop_words(search_keywords=None):
    """Assemble the stop-word set used during preprocessing.

    Starts from NLTK's English stop words, removes every search keyword (the
    lower-cased keyword itself and each whitespace-separated word in it) so
    query terms survive filtering, then adds a fixed set of citation and
    layout terms. The fixed terms are added last, so they are always present
    even when one of them is also a search keyword.

    Args:
        search_keywords: Optional iterable of keyword strings to protect.

    Returns:
        set[str]: the stop words to drop.
    """
    english = set(stopwords.words('english'))

    protected = set()
    for keyword in (search_keywords or ()):
        lowered = keyword.lower()
        protected.add(lowered)
        protected.update(lowered.split())

    citation_noise = {
        'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite',
        'fig', 'figure', 'figures', 'table', 'tables', 'chart', 'charts',
        'published', 'journal', 'conference', 'proceedings', 'vol', 'volume',
        'pp', 'page', 'pages', 'doi',
    }
    return (english - protected) | citation_noise

def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=False):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps: lower-case, strip URLs and e-mail addresses, optionally strip
    digits, drop punctuation (hyphens are kept, runs of hyphens become a
    space), tokenize, filter tokens, lemmatise.

    Args:
        text: Input value. Anything that is not ``str``/``int``/``float``,
            and a float NaN, yields ``''``.
        search_keywords: Keywords that must not be treated as stop words
            (forwarded to ``get_custom_stop_words``).
        min_word_length: Minimum token length; tokens shorter than two
            characters are always dropped regardless of this value.
        remove_numbers: When true, remove all digit runs before tokenizing.

    Returns:
        str: the processed tokens joined by single spaces.
    """
    if not isinstance(text, (str, int, float)):
        return ''
    # A float NaN (pandas' missing-value marker) would otherwise be turned
    # into the literal token "nan" by str() and pollute the corpus.
    if isinstance(text, float) and text != text:
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    # Single-character tokens are never kept, even if min_word_length < 2.
    shortest = max(min_word_length, 2)
    tokens = [t for t in tokens if len(t) >= shortest and t not in stop_words and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except Exception as exc:
        # Lemmatisation stays best-effort (tokens are kept as they are), but
        # the failure is reported once instead of being silently swallowed.
        if not getattr(preprocess_text, '_lemmatize_warned', False):
            logger.warning("Lemmatization failed; keeping unlemmatized tokens: %s", exc)
            preprocess_text._lemmatize_warned = True
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    """Preprocess ``text_col`` and keep only rows with non-empty processed text.

    Args:
        df: Input frame. ``text_col`` is normalised to ``str`` (missing values
            become ``''``) and ``processed_col`` is added on this frame in place.
        text_col: Name of the column holding the raw text.
        search_keywords: Keywords forwarded to ``preprocess_text``.
        processed_col: Name of the column that receives the processed text.

    Returns:
        pandas.DataFrame: an independent copy of the rows whose processed text
        is not blank; the original index labels are preserved.
    """
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    non_empty = df[processed_col].str.strip() != ''
    # Return an explicit copy: callers add columns to the result, and doing
    # that on a filtered slice triggers pandas' SettingWithCopyWarning.
    return df.loc[non_empty].copy()


In [4]:
# %%
# Results file to analyse; the search keywords are encoded in its name.
filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
# NOTE(review): the input is read from "Saved_files" while SAVE_DIR is
# "Saved_files_new" — confirm the two directories are meant to differ.
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
# Title and abstract are concatenated into one text field; missing values become ''.
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
# Adds the 'processed_text' column and drops rows whose processed text is empty.
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)

def clean_fields_of_study(s):
    """Normalise a ``fieldsOfStudy`` cell into a list of recognised field names.

    Accepts either the stringified list stored in the CSV (e.g.
    ``"['Engineering', 'Physics']"``) or an already-parsed list/tuple, so the
    function can be applied again to its own output (re-running the cell)
    without collapsing every row to ``["Unknown"]`` or raising on
    ``pd.isna(list)``.

    Args:
        s: Raw cell value (string, list/tuple, NaN/None, or anything else).

    Returns:
        list[str]: one entry per listed field; unrecognised names are replaced
        by ``"Unknown"``. Missing, empty or unsupported input gives
        ``["Unknown"]``.
    """
    # NOTE(review): 'Com' looks like a truncated label; it is kept because the
    # original whitelist contains it — confirm against the data.
    valid_fields = {
        'Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
        'Medicine', 'Business', 'Environmental Science', 'Chemistry',
        'Materials Science', 'Geography', 'Biology', 'Geology',
        'Political Science', 'Psychology', 'Com',
    }
    if isinstance(s, (list, tuple)):
        # Already parsed (e.g. the function was applied before).
        fields = [str(field).strip() for field in s]
    elif isinstance(s, str):
        if s == '[]':
            return ["Unknown"]
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
    else:
        # NaN / None / any other type carries no usable field information.
        return ["Unknown"]
    return [f if f in valid_fields else "Unknown" for f in fields] or ["Unknown"]
# Replace each row's raw fieldsOfStudy value with its cleaned list form.
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)


In [5]:
# %%
def model_topics(df, num_topics=10, num_words=25):
    """Fit an LDA topic model on ``df['processed_text']`` enriched with phrases.

    Args:
        df: Frame with a ``processed_text`` column of space-separated tokens.
        num_topics: Number of LDA components.
        num_words: Number of top-weighted terms reported per topic.

    Returns:
        tuple: ``(lda_model, vectorizer, topic_distributions, df, topic_keywords)``.
        ``df`` is the input frame, returned unchanged. ``topic_keywords`` maps
        each topic index to ``{'top_words': [...], 'word_weights': [(term, weight), ...]}``
        ordered from highest to lowest component weight.
    """
    tokenized_texts = df['processed_text'].apply(lambda x: x.split()).tolist()
    # Phrase detectors: bigrams on the raw tokens, then a second pass on the
    # bigram-transformed corpus (so it can join a bigram with another token).
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    phrased = []
    for doc in tokenized_texts:
        # Only joined tokens (those containing the '_' delimiter) are collected.
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        # Phrases are appended to the original tokens rather than replacing
        # them, so the unigrams stay in the document.
        # NOTE(review): a bigram left unchanged by the second pass contains '_'
        # too and would therefore be appended by both lists — confirm intended.
        combined = doc + bigrams_ + trigrams_
        phrased.append(' '.join(combined))
    # The token pattern keeps '_' and '-' inside tokens so joined phrases survive.
    vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b[\w_-]+\b', max_df=0.95, min_df=2, max_features=10000)
    doc_term_matrix = vectorizer.fit_transform(phrased)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        # Reverse slice of argsort: indices of the num_words largest weights, descending.
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        word_weights = [(feature_names[i], topic[i]) for i in top_indices]
        topic_keywords[topic_idx] = {'top_words': top_words, 'word_weights': word_weights}
    return lda_model, vectorizer, topic_distributions, df, topic_keywords

# Fit the topic model on the preprocessed corpus: 10 topics, 25 top words per topic.
lda_model, vectorizer, topic_distributions, df_topic, topic_keywords = model_topics(df, num_topics=10, num_words=25)




2025-08-19 11:34:01,842 - INFO - collecting all words and their counts
2025-08-19 11:34:01,845 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-08-19 11:34:03,737 - INFO - PROGRESS: at sentence #10000, processed 1528760 words and 873903 word types
2025-08-19 11:34:05,685 - INFO - PROGRESS: at sentence #20000, processed 3012255 words and 1486274 word types
2025-08-19 11:34:07,222 - INFO - collected 1928697 token types (unigram + bigrams) from a corpus of 4300658 words and 28934 sentences
2025-08-19 11:34:07,223 - INFO - merged Phrases<1928697 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-08-19 11:34:07,225 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1928697 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 5.38s', 'datetime': '2025-08-19T11:34:07.225234', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 2025, 12:41:34) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-0

In [6]:
def extract_candidate_terms(df, text_col='processed_text', max_features=1000):
    """List the corpus' 1- to 3-gram terms ordered by total frequency.

    Args:
        df: Frame holding the text column.
        text_col: Column with space-separated processed text; missing values
            are treated as empty strings.
        max_features: Vocabulary size cap passed to ``CountVectorizer``.

    Returns:
        list[str]: terms sorted by summed count, most frequent first; terms
        with equal counts keep the vectorizer's feature order.
    """
    counter = CountVectorizer(
        ngram_range=(1, 3),
        max_df=0.95,
        min_df=2,
        max_features=max_features,
        token_pattern=r'\b[\w-]+\b',
    )
    counts = counter.fit_transform(df[text_col].fillna(''))
    vocabulary = counter.get_feature_names_out()
    totals = counts.sum(axis=0).A1
    # A stable sort on the negated totals gives descending frequency while
    # leaving ties in their original (feature) order.
    order = np.argsort(-totals, kind='stable')
    return [vocabulary[position] for position in order]

def get_method_phrases(corpus_terms, client, model_type, credit_tracker):
    """Ask the chat model which corpus terms name concrete methods.

    Args:
        corpus_terms: Candidate terms, most frequent first. The first 50 are
            shown as a sample and the full list is embedded in the prompt.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        model_type: Model name forwarded to the API call.
        credit_tracker: Object with ``update(tokens)``; it receives
            ``response.usage.total_tokens`` when the response reports usage.
            May be ``None``.

    Returns:
        list[str]: the phrases returned by the model. The reply is parsed as a
        Python literal first (an optional Markdown code fence is removed
        beforehand); if that fails, the text is split on commas and pieces of
        more than three characters are kept.
    """
    sample_terms = ', '.join(corpus_terms[:50])
    prompt = f"""Here are the most frequent terms from a corpus of scientific papers:
{sample_terms}
From the full list: {', '.join(corpus_terms)}
Extract ONLY the terms that represent specific methodologies, techniques, or named approaches that would actually appear in this type of engineering research. Focus on:
- Power system analysis methods
- Reliability analysis techniques  
- Engineering design approaches
- Computational methods used in power/electrical engineering
- Statistical methods for engineering

Do NOT include: generic words like "analysis", "method", "approach", "design", "system" by themselves.
DO include: specific named methods like "monte carlo simulation", "load flow analysis", "reliability assessment", loss of load probability, probabilitstic methods, etc.

Return as a simple Python list of strings, no code blocks or formatting."""
    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "user", "content": prompt}]
    )

    # Record usage: the tracker was passed in but previously never updated.
    usage = getattr(response, 'usage', None)
    total_tokens = getattr(usage, 'total_tokens', None)
    if credit_tracker is not None and total_tokens is not None:
        credit_tracker.update(total_tokens)

    # `choices` is a list; the reply text lives on its first element.
    content = response.choices[0].message.content or ''
    # Models often wrap the list in a code fence despite the instruction.
    content = re.sub(r'^```[\w-]*\s*|\s*```$', '', content.strip())

    try:
        parsed = ast.literal_eval(content)
    except Exception:
        # Not a valid Python literal; use the comma-split fallback below.
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [term for term in parsed if isinstance(term, str)]

    content = content.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
    return [term.strip() for term in content.split(',') if len(term.strip()) > 3]

def clean_method_phrases_fixed(method_phrases):
    """Clean, lower-case and de-duplicate phrases returned by the LLM.

    Backticks (Markdown code-fence residue), square brackets and quote
    characters are removed and all whitespace runs, including newlines, are
    collapsed to single spaces. Results of two characters or fewer and
    non-string items are dropped.

    Args:
        method_phrases: Iterable of raw phrases.

    Returns:
        list[str]: unique cleaned phrases in sorted order, so the result does
        not depend on set iteration order.
    """
    cleaned_phrases = set()
    for phrase in method_phrases:
        if not isinstance(phrase, str):
            continue
        cleaned = phrase
        # Strip every backtick: a code fence is three backticks, which the
        # previous six-backtick literal never matched.
        for junk in ('`', '[', ']', '"', "'"):
            cleaned = cleaned.replace(junk, '')
        cleaned = ' '.join(cleaned.split())
        if len(cleaned) > 2:
            cleaned_phrases.add(cleaned.lower())
    return sorted(cleaned_phrases)

# Normalize and augment method vocabulary
def stable_normalize_augment_vocab(method_phrases):
    """Expand method phrases into a sorted vocabulary of spelling variants.

    Every non-blank string is lower-cased and stripped, then contributes:
    itself, a copy with hyphens turned into spaces, a copy with underscores
    turned into spaces and, when it contains a space, a copy with spaces
    turned into underscores. A few well-known phrases also pull in fixed
    aliases; the aliases themselves are not expanded further.

    Returns:
        list[str]: unique variants in ascending order.
    """
    # Hand-maintained aliases for selected canonical phrases (extend as needed).
    aliases = {
        'optimal power flow': ['opf', 'security-constrained opf', 'scopf'],
        'monte carlo simulation': ['mc simulation'],
        'load flow analysis': ['power flow', 'ac power flow', 'dc power flow'],
        'state estimation': ['wls state estimation', 'kalman filter', 'extended kalman', 'unscented kalman'],
        'contingency analysis': ['n-1 security', 'n-1 contingency'],
        'unit commitment': ['economic dispatch', 'security-constrained unit commitment'],
    }

    expanded = set()
    for raw in method_phrases:
        if not isinstance(raw, str) or not raw.strip():
            continue
        phrase = raw.lower().strip()
        expanded |= {phrase, phrase.replace('-', ' '), phrase.replace('_', ' ')}
        if ' ' in phrase:
            expanded.add(phrase.replace(' ', '_'))
        expanded.update(aliases.get(phrase, ()))

    return sorted(expanded)




In [7]:
# Run through method extraction: rank candidate n-grams, ask the LLM which of
# them are methods (one chat-completion request), then clean and augment the
# returned phrases into a stable vocabulary.
candidate_terms = extract_candidate_terms(df, text_col='processed_text', max_features=15000)
print("Number of candidate terms:", len(candidate_terms))
method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
method_phrases = clean_method_phrases_fixed(method_phrases)
method_phrases_aug = stable_normalize_augment_vocab(method_phrases)
print("Augmented (stable) vocab size:", len(method_phrases_aug))
# method_phrases_aug is sorted, so this slice shows the alphabetically first 15 entries.
print("Top method phrases:", method_phrases_aug[:15])

Number of candidate terms: 15000


2025-08-19 11:37:30,351 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Augmented (stable) vocab size: 93
Top method phrases: ['ac power flow', 'backward/forward compatibility', 'backward/forward_compatibility', 'constraint optimization', 'constraint_optimization', 'cost benefit analysis', 'cost-benefit analysis', 'cost-benefit_analysis', 'data driven approach', 'data-driven approach', 'data-driven_approach', 'dc power flow', 'design for reliability', 'design of experiments', 'design_for_reliability']


In [None]:
#Methods for method assignment with TF-IDF and LDA

# --- TF-IDF Method Assignment ---
def tfidf_method_assignment(df, method_phrases, processed_col='processed_text', min_score=0.005):
    """Label each document with its highest-scoring method phrase by TF-IDF.

    The vectorizer is restricted to ``method_phrases`` as a fixed vocabulary
    and fitted on ``df[processed_col]``.

    Args:
        df: Frame with the processed text; modified in place and returned.
        method_phrases: Vocabulary of method phrases (used as given).
        processed_col: Column holding the processed text.
        min_score: Minimum top TF-IDF score for a confident assignment.

    Returns:
        The same ``df`` with these columns added: ``Primary_Method_TFIDF``
        (phrase or ``'LowConfidence'``), ``Method_TFIDF_Score`` and
        ``Top_1_TFIDF_Methods`` .. ``Top_3_TFIDF_Methods`` (lists of the k
        best-scoring phrases).
    """
    logger.info(f"Assigning primary methods using TF-IDF with {len(method_phrases)} phrases.")
    # Use augmented vocab
    vocab = method_phrases
    # NOTE(review): scikit-learn documents min_df/max_df as ignored when a
    # fixed vocabulary is supplied — confirm these two arguments have no effect.
    vectorizer = TfidfVectorizer(
        vocabulary=vocab,
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.999,               # allow very common terms if they’re in vocabulary
        norm='l2',
        token_pattern=r'\b[\w_-]+\b'
    )
    tfidf_matrix = vectorizer.fit_transform(df[processed_col])
    # Per-document best score and the vocabulary index where it occurs.
    method_scores = tfidf_matrix.max(axis=1).toarray().ravel()
    argmax_indices = tfidf_matrix.argmax(axis=1).A1
    feature_names = vectorizer.get_feature_names_out()

    assigned = []
    for i, score in zip(argmax_indices, method_scores):
        if score >= min_score:
            assigned.append(feature_names[i])
        else:
            assigned.append('LowConfidence')

    df['Primary_Method_TFIDF'] = assigned
    df['Method_TFIDF_Score'] = method_scores

    # Optional: store top-k labels
    topk = 3
    mat = tfidf_matrix.toarray()
    # argsort is ascending, so the last `topk` columns are the best ones;
    # each row is reversed below to list the best phrase first.
    # NOTE(review): no score filter is applied here, so a document with no
    # matching phrase still receives phrase names whose score is 0.
    top_idx = np.argsort(mat, axis=1)[:, -topk:]
    for k in range(1, topk+1):
        col_name = f'Top_{k}_TFIDF_Methods'
        df[col_name] = [[feature_names[j] for j in idxs[::-1][:k]] for idxs in top_idx]

    # Diagnostics
    # NOTE(review): len(df) == 0 would raise ZeroDivisionError in the message below.
    nonzero_docs = int((method_scores > 0).sum())
    logger.info(f"TF-IDF nonzero docs: {nonzero_docs}/{len(df)} ({100*nonzero_docs/len(df):.1f}%)")
    return df

def compute_tfidf_matrix(df, vocab, processed_col='processed_text'):
    """Compute TF-IDF scores of the documents against a fixed vocabulary.

    Args:
        df: Frame holding the processed text.
        vocab: Vocabulary handed to ``TfidfVectorizer``.
        processed_col: Column with the text to vectorize.

    Returns:
        tuple: ``(X, X_norm, feature_names)`` where ``X`` is the sparse TF-IDF
        matrix, ``X_norm`` is ``X`` with each row divided by that row's
        maximum (all-zero rows are divided by 1 and stay zero), and
        ``feature_names`` is the vectorizer's feature list.
    """
    tfidf = TfidfVectorizer(
        vocabulary=vocab,
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.999,
        norm='l2',
        token_pattern=r'\b[\w_-]+\b',
    )
    weights = tfidf.fit_transform(df[processed_col])
    names = tfidf.get_feature_names_out().tolist()

    row_peak = weights.max(axis=1).toarray().ravel()
    # Rows without any hit would divide by zero; use 1.0 as their divisor.
    divisor = np.where(row_peak == 0, 1.0, row_peak)
    scaled = weights.multiply(1.0 / divisor.reshape(-1, 1))
    return weights, scaled, names


# --- LDA-based Method Assignment ---
def lda_method_assignment(
    df, method_phrases, processed_col='processed_text',
    max_method_topics=20,          # keep topics moderate
    min_papers_per_topic=2,        # allow rarer topics
    extra_gate=0.00                # was +0.03, now relaxed
):
    """Label documents with a method via LDA over a fixed method vocabulary.

    Documents are counted against ``method_phrases`` only, an LDA model with
    ``min(max_method_topics, len(method_phrases))`` topics is fitted, and each
    topic is labelled with its highest-weight vocabulary term. A document gets
    its dominant topic's label when that topic's share exceeds
    ``1 / n_components + extra_gate`` and the topic is dominant for at least
    ``min_papers_per_topic`` documents; otherwise ``'LowConfidence'``.

    Args:
        df: Frame with the processed text; modified in place and returned.
        method_phrases: Vocabulary of method phrases.
        processed_col: Column holding the processed text.
        max_method_topics: Upper bound on the number of LDA topics.
        min_papers_per_topic: Topics that are dominant for fewer documents
            than this are treated as rare and never assigned.
        extra_gate: Margin added to the uniform share ``1 / n_components``.

    Returns:
        The same ``df`` with ``Primary_Method_LDA``, ``Method_LDA_Score``,
        ``Top_3_Methods_LDA`` and ``Top_3_Methods_LDA_Scores`` added. With
        fewer than two phrases only the first two columns are set
        (``'No_Method_Found'`` / ``0.0``).
    """
    vocab = method_phrases
    vectorizer = CountVectorizer(
        vocabulary=vocab,
        ngram_range=(1, 3),
        token_pattern=r'\b[\w_-]+\b'
    )
    # NOTE(review): the matrix is built before the size check below, so the
    # vectorizer still runs when the vocabulary is too small.
    doc_term_matrix = vectorizer.fit_transform(df[processed_col])
    n_method_topics = len(vocab)
    if n_method_topics < 2:
        logger.warning("Not enough method phrases for LDA method assignment. Skipping.")
        df['Primary_Method_LDA'] = 'No_Method_Found'
        df['Method_LDA_Score'] = 0.0
        return df

    n_components = min(max_method_topics, n_method_topics)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_method='batch',
        random_state=42,
        max_iter=30
    )
    doc_topic_dist = lda.fit_transform(doc_term_matrix)
    # Map topics to vocabulary indices via component highest-weight terms (approximate label)
    # NOTE(review): vocab_list is never read after this assignment.
    vocab_list = list(vectorizer.vocabulary_.keys())
    # Label each LDA topic by its top token in vocab_list
    # Several topics can share the same top term and therefore the same label.
    topic_labels = []
    for t in range(n_components):
        comp = lda.components_[t]
        top_idx = comp.argmax()
        # We need the term string at index top_idx in the vectorizer feature order
        # NOTE(review): loop-invariant call; it could be hoisted above the loop.
        feat_names = vectorizer.get_feature_names_out()
        topic_labels.append(feat_names[top_idx])

    # Dominant topic per document and that topic's share.
    best_topic_idx = doc_topic_dist.argmax(axis=1)
    best_topic_val = doc_topic_dist[np.arange(len(df)), best_topic_idx]
    # The gate is the share every topic would get under a uniform split, plus a margin.
    gate = (1.0 / n_components) + extra_gate

    # Count topic assignments
    topic_assignment_counts = pd.Series(best_topic_idx).value_counts()
    rare_topics = set(topic_assignment_counts[topic_assignment_counts < min_papers_per_topic].index.tolist())

    assigned_methods = []
    for j, i in enumerate(best_topic_idx):
        label = topic_labels[i]
        if best_topic_val[j] > gate and (i not in rare_topics):
            assigned_methods.append(label)
        else:
            assigned_methods.append('LowConfidence')

    df['Primary_Method_LDA'] = assigned_methods
    df['Method_LDA_Score'] = best_topic_val

    # Store top-3 topic labels and scores per doc (optional)
    topk = 3
    # Ascending argsort: the last `topk` columns are the strongest topics;
    # each row is reversed below so the strongest comes first.
    top_idx = np.argsort(doc_topic_dist, axis=1)[:, -topk:]
    df['Top_3_Methods_LDA'] = [[topic_labels[i] for i in idxs[::-1]] for idxs in top_idx]
    df['Top_3_Methods_LDA_Scores'] = [
        [float(doc_topic_dist[row_i, i]) for i in idxs[::-1]] for row_i, idxs in enumerate(top_idx)
    ]

    # Diagnostics
    non_low = (df['Primary_Method_LDA'] != 'LowConfidence').sum()
    logger.info(f"LDA confident assignments: {non_low}/{len(df)} ({100*non_low/len(df):.1f}%) (gate={gate:.4f}, rare<{min_papers_per_topic})")
    return df


def compute_compound_matrix(df, vocab, processed_col='processed_text', window=300, ratio_thresh=0.5):
    """Score every (document, phrase) pair with substring-based heuristics.

    For each phrase and lower-cased document text:
      * the phrase occurs verbatim -> 1.0;
      * otherwise, for multi-word phrases, ``coverage`` is the fraction of the
        phrase's words longer than three characters that occur in the text
        (as substrings). If ``coverage >= ratio_thresh``, or some pair of
        consecutive phrase words has the second word inside the ``window``
        characters starting at the first occurrence of the first word, the
        score is ``min(1.0, 0.6 + 0.4 * coverage)``;
      * everything else stays 0. A single-word phrase can only score through
        the verbatim check, so no separate single-word branch is needed.

    Returns:
        tuple: ``(scores, scores_norm)``, both ``float32`` arrays of shape
        ``(len(df), len(vocab))``; ``scores_norm`` divides each row by its
        maximum (all-zero rows are left at zero).
    """
    texts = df[processed_col].fillna('').str.lower().tolist()
    scores = np.zeros((len(df), len(vocab)), dtype=np.float32)

    for col, phrase in enumerate(vocab):
        needle = phrase.lower()
        words = needle.split()
        is_compound = len(words) > 1
        long_words = [w for w in words if len(w) > 3]
        neighbours = list(zip(words, words[1:]))

        for row, text in enumerate(texts):
            if needle in text:
                scores[row, col] = 1.0
                continue
            if not is_compound:
                continue

            if long_words:
                coverage = sum(1 for w in long_words if w in text) / len(long_words)
            else:
                coverage = 0.0

            # Proximity: only the first occurrence of the leading word is inspected.
            close_pair = False
            for first, second in neighbours:
                start = text.find(first)
                if start >= 0 and second in text[start:start + window]:
                    close_pair = True
                    break

            if coverage >= ratio_thresh or close_pair:
                scores[row, col] = min(1.0, 0.6 + 0.4 * coverage)  # lands in 0.6..1.0

    # Row-wise scaling to [0, 1]; rows without any hit keep a divisor of 1.
    row_peak = scores.max(axis=1, keepdims=True)
    row_peak[row_peak == 0] = 1.0
    return scores, scores / row_peak




In [9]:
# methods for compound phrases
def assign_methods_compound(df, method_phrases, processed_col='processed_text', min_confidence=0.05):
    """Detect methods with phrase validation, proximity matching and TF-IDF.

    Pipeline:
      1. Keep only the phrases that pass ``strict_phrase_validation`` against
         the concatenated corpus text.
      2. Select the documents in which ``targeted_contains_method`` finds a
         validated phrase (verbatim or by word proximity).
      3. Score the selected documents with a TF-IDF vectorizer restricted to
         the validated phrases; the best-scoring phrase becomes the method.

    Args:
        df: Frame with the processed text; modified in place and returned.
        method_phrases: Candidate method phrases.
        processed_col: Column holding the processed text.
        min_confidence: Non-zero scores below this are flagged ``"Low"``,
            scores at or above it ``"High"``.

    Returns:
        The same ``df`` with ``Method_Detected_Compound``,
        ``Method_Compound_Score`` and ``Method_Compound_Confidence`` set for
        every row. Rows that were not selected get ``'No_Method_Found'``,
        ``0.0`` and ``'NotClassified'``.
    """
    # Whole corpus as one lower-cased string; validation uses substring tests on it.
    all_text = ' '.join(df[processed_col]).lower()
    validated_phrases = []

    def strict_phrase_validation(phrase, all_text):
        """Return ``(is_valid, kind)`` with kind in exact/compound/technical/none."""
        phrase_lower = phrase.lower()
        if phrase_lower in all_text:
            return True, 'exact'
        words = phrase_lower.split()
        # Lower threshold from 0.6 to 0.5
        if len(words) > 1:
            # Only words longer than three characters count as significant.
            sig_words = [w for w in words if len(w) > 3]
            if not sig_words:
                return False, 'none'
            word_matches = sum(1 for w in sig_words if w in all_text)
            # int() truncates, so e.g. 3 significant words need only 1 match.
            if word_matches >= max(1, int(len(sig_words) * 0.5)):
                return True, 'compound'
        # NOTE(review): the verbatim test at the top already returned for any
        # phrase present in all_text, so this branch cannot return True.
        if len(words) == 1 and len(phrase_lower) > 6:
            generic_terms = {'analysis','method','approach','technique','system','design','study','evaluation','assessment','processing'}
            if phrase_lower not in generic_terms and phrase_lower in all_text:
                return True, 'technical'
        return False, 'none'

    for phrase in method_phrases:
        is_valid, _ = strict_phrase_validation(phrase, all_text)
        if is_valid:
            validated_phrases.append(phrase)
    logger.info(f"Validated {len(validated_phrases)} method phrases (compound).")

    def targeted_contains_method(text, validated_phrases):
        """Return ``(found, phrases)``: all verbatim matches, else the first proximity match."""
        text_lower = text.lower()
        exact_matches = [phrase for phrase in validated_phrases if phrase.lower() in text_lower]
        if exact_matches:
            return True, exact_matches
        # Wider proximity window: 300 chars
        for phrase in validated_phrases:
            words = phrase.lower().split()
            if len(words) > 1:
                for i, word in enumerate(words[:-1]):
                    if word in text_lower:
                        # Only the first occurrence of `word` anchors the window.
                        word_pos = text_lower.find(word)
                        next_word = words[i+1]
                        nearby_text = text_lower[word_pos:word_pos+300]
                        if next_word in nearby_text:
                            return True, [phrase]
        return False, []

    classification_data = []
    for idx, row in df.iterrows():
        has_method, found_phrases = targeted_contains_method(row[processed_col], validated_phrases)
        classification_data.append({'index': idx, 'has_method': has_method, 'found_phrases': found_phrases})
    docs_to_classify_indices = [d['index'] for d in classification_data if d['has_method']]
    docs_to_classify = df.loc[docs_to_classify_indices].copy()
    logger.info(f"Compound: docs to classify: {len(docs_to_classify)}")

    if len(docs_to_classify) > 0 and len(validated_phrases) > 0:
        method_tfidf_vectorizer = TfidfVectorizer(
            vocabulary=validated_phrases,
            ngram_range=(1, 3),
            min_df=1,
            max_df=0.999,
            sublinear_tf=True,
            norm='l2',
            token_pattern=r'\b[\w_-]+\b'
        )
        method_tfidf_matrix = method_tfidf_vectorizer.fit_transform(docs_to_classify[processed_col])
        method_scores_matrix = method_tfidf_matrix.toarray()
        feature_names = method_tfidf_vectorizer.get_feature_names_out()
        method_scores_max = method_scores_matrix.max(axis=1)
        # For an all-zero row argmax is 0, so the first vocabulary phrase is
        # recorded as the method; such rows are flagged "NoEvidence" below.
        argmax_indices = method_scores_matrix.argmax(axis=1)
        primary_methods = [feature_names[i] for i in argmax_indices]

        final_methods = []
        confidence_flags = []
        for method, score in zip(primary_methods, method_scores_max):
            final_methods.append(method)
            if score == 0.0:
                confidence_flags.append("NoEvidence")
            elif score < min_confidence:
                confidence_flags.append("Low")
            else:
                confidence_flags.append("High")

        df.loc[docs_to_classify.index, 'Method_Detected_Compound'] = final_methods
        df.loc[docs_to_classify.index, 'Method_Compound_Score'] = method_scores_max
        df.loc[docs_to_classify.index, 'Method_Compound_Confidence'] = confidence_flags
    else:
        logger.warning("No documents or validated phrases for compound classification.")

    # Every row that was not selected for classification gets the defaults.
    unclassified_idx = df.index.difference(docs_to_classify.index)
    df.loc[unclassified_idx, 'Method_Detected_Compound'] = 'No_Method_Found'
    df.loc[unclassified_idx, 'Method_Compound_Score'] = 0.0
    df.loc[unclassified_idx, 'Method_Compound_Confidence'] = 'NotClassified'
    return df






In [None]:
# %%
def combined_method_assignment_with_compound(df):
    """Merge the compound, TF-IDF and LDA assignments into one final label.

    Priority per row (first match wins):
      1. compound confidence ``"High"``  -> compound method, ``'compound_high'``
      2. TF-IDF label is confident       -> TF-IDF method, ``'tfidf_confident'``
      3. LDA label is confident          -> LDA method, ``'lda_confident'``
      4. compound confidence ``"Low"`` and the TF-IDF score is missing or zero
                                         -> compound method, ``'compound_low'``
      5. otherwise                       -> ``'LowConfidence'``, ``'low'``

    Missing columns fall back to their "nothing found" defaults.

    Returns:
        The same ``df`` with ``Final_Method_Label`` and
        ``Final_Method_Confidence`` added.
    """
    def decide(row):
        compound_conf = row.get('Method_Compound_Confidence', 'NotClassified')
        compound_method = row.get('Method_Detected_Compound', 'No_Method_Found')
        tfidf_method = row.get('Primary_Method_TFIDF', 'LowConfidence')
        tfidf_score = row.get('Method_TFIDF_Score', 0.0)
        lda_method = row.get('Primary_Method_LDA', 'LowConfidence')

        if compound_conf == "High":
            return compound_method, 'compound_high'
        if tfidf_method != 'LowConfidence':
            return tfidf_method, 'tfidf_confident'
        if lda_method != 'LowConfidence':
            return lda_method, 'lda_confident'
        # A low-confidence compound hit is used only when TF-IDF saw nothing.
        if compound_conf == "Low" and (pd.isna(tfidf_score) or float(tfidf_score) == 0.0):
            return compound_method, 'compound_low'
        return 'LowConfidence', 'low'

    decisions = [decide(row) for _, row in df.iterrows()]
    df['Final_Method_Label'] = [label for label, _ in decisions]
    df['Final_Method_Confidence'] = [tier for _, tier in decisions]
    return df

def combine_scores_to_confidence(tfidf_norm, compound_norm, feature_names,
                                 w_tfidf=0.6, w_comp=0.4,
                                 th_super=0.85, th_high=0.6, th_low=0.2):
    """Blend the two normalised score matrices and bucket them into tiers.

    Args:
        tfidf_norm: Sparse matrix of normalised TF-IDF scores.
        compound_norm: Dense array of normalised compound scores, same shape.
        feature_names: Accepted for signature symmetry; not used here.
        w_tfidf, w_comp: Weights of the two score sources.
        th_super, th_high, th_low: Lower bounds of the ``'super_high'``,
            ``'high'`` and ``'low'`` tiers; anything below ``th_low`` is
            ``'not_detected'``.

    Returns:
        tuple: ``(combined, labels)`` — the weighted-sum array and an object
        array of tier names with the same shape.
    """
    weighted_tfidf = tfidf_norm.multiply(w_tfidf).toarray()
    weighted_compound = compound_norm * w_comp
    combined = weighted_tfidf + weighted_compound

    # The highest tier wins: each outer test takes precedence over the inner ones.
    tiers = np.where(
        combined >= th_super, 'super_high',
        np.where(
            combined >= th_high, 'high',
            np.where(combined >= th_low, 'low', 'not_detected'),
        ),
    ).astype(object)
    return combined, tiers

def attach_multilabel_to_df(df, feature_names, combined_scores, confidence_labels, topk=40):
    """Store each document's ``topk`` best methods with their tiers and scores.

    Args:
        df: Frame to annotate (modified in place); rows must line up with the
            rows of ``combined_scores``.
        feature_names: Method name for every score column.
        combined_scores: 2-D array of combined scores (documents x methods).
        confidence_labels: 2-D array of tier labels, same shape.
        topk: Number of best-scoring methods kept per document.

    Returns:
        The same ``df`` with ``ML_TopMethods``, ``ML_TopConf`` and
        ``ML_TopScores`` added; each cell is a list ordered best-first.
    """
    per_doc = []
    for doc_idx in range(combined_scores.shape[0]):
        doc_scores = combined_scores[doc_idx]
        # Ascending argsort: take the tail, then flip it to get best-first.
        best = np.argsort(doc_scores)[-topk:][::-1]
        per_doc.append((
            [feature_names[j] for j in best],
            [confidence_labels[doc_idx, j] for j in best],
            [float(doc_scores[j]) for j in best],
        ))

    df['ML_TopMethods'] = [methods for methods, _, _ in per_doc]
    df['ML_TopConf'] = [tiers for _, tiers, _ in per_doc]
    df['ML_TopScores'] = [values for _, _, values in per_doc]
    return df

def stable_single_label_from_multilabel(df, feature_names, combined_scores, confidence_labels):
    """
    Reduce the multi-label matrices to one deterministic label per document.

    - combined_scores: 2-D array (n_docs x n_terms), higher is better
    - confidence_labels: 2-D array (n_docs x n_terms) with values in
      {super_high, high, low, not_detected}

    Candidates are ordered by score (descending), then confidence tier
    (descending), then method name (ascending); the first one wins. If the
    winner's tier is 'not_detected' the document is labelled 'LowConfidence'
    with confidence 'low'.

    Writes 'Final_Method_Label' and 'Final_Method_Confidence' on df and
    returns df. Raises ValueError when the inputs are not 2-D, differ in
    shape, or do not match len(feature_names).
    """
    # asanyarray converts non-arrays and passes existing ndarrays (including
    # subclasses) through untouched.
    combined_scores = np.asanyarray(combined_scores)
    confidence_labels = np.asanyarray(confidence_labels)

    if combined_scores.ndim != 2:
        raise ValueError(f"combined_scores must be 2D, got shape {combined_scores.shape}")
    if confidence_labels.ndim != 2:
        raise ValueError(f"confidence_labels must be 2D, got shape {confidence_labels.shape}")
    if combined_scores.shape != confidence_labels.shape:
        raise ValueError(f"Shape mismatch: combined_scores {combined_scores.shape} vs confidence_labels {confidence_labels.shape}")

    n_docs, n_terms = combined_scores.shape
    if n_terms != len(feature_names):
        raise ValueError(f"Feature names length {len(feature_names)} != number of terms {n_terms}")

    # Larger rank = stronger tier; unknown labels rank like 'not_detected'.
    tier_rank = {'super_high': 3, 'high': 2, 'low': 1, 'not_detected': 0}

    def ordering(candidate):
        # The whole tuple is the last key; it starts with the method name,
        # which yields the ascending-name tie-break.
        _name, score, tier = candidate
        return (-score, -tier_rank.get(tier, 0), candidate)

    chosen_labels = []
    chosen_tiers = []
    for doc_idx in range(n_docs):
        doc_scores = combined_scores[doc_idx]
        doc_tiers = confidence_labels[doc_idx]
        candidates = [
            (feature_names[col], float(doc_scores[col]), doc_tiers[col])
            for col in range(n_terms)
        ]
        winner_name, _winner_score, winner_tier = sorted(candidates, key=ordering)[0]

        if winner_tier == 'not_detected':
            chosen_labels.append('LowConfidence')
            chosen_tiers.append('low')
        else:
            chosen_labels.append(winner_name)
            chosen_tiers.append(winner_tier)

    df['Final_Method_Label'] = chosen_labels
    df['Final_Method_Confidence'] = chosen_tiers
    return df





In [None]:
# Workflow: assign methods with TF-IDF, LDA and the compound matcher, then
# merge the three results into one final label per document.

# Step 1 - TF-IDF and LDA assignment (each call adds its own columns to df).
df = tfidf_method_assignment(df, method_phrases_aug, processed_col='processed_text', min_score=0.01)
df = lda_method_assignment(df, method_phrases_aug, processed_col='processed_text', max_method_topics=50, min_papers_per_topic=3)

# Step 2 - compound (validated phrase + proximity) assignment.
df = assign_methods_compound(df, method_phrases_aug, processed_col='processed_text', min_confidence=0.005)
# Step 3 - merge; reads the columns written by steps 1 and 2, so it must run last.
df = combined_method_assignment_with_compound(df)

2025-08-19 11:37:30,490 - INFO - Assigning primary methods using TF-IDF with 93 phrases.
2025-08-19 11:37:36,782 - INFO - TF-IDF nonzero docs: 4361/28934 (15.1%)
2025-08-19 11:38:10,899 - INFO - LDA confident assignments: 4360/28934 (15.1%) (gate=0.0200, rare<3)
2025-08-19 11:38:12,910 - INFO - Validated 55 method phrases (compound).
2025-08-19 11:38:17,724 - INFO - Compound: docs to classify: 22062


In [12]:
# Deterministic seeds (re-seeds the global Python and NumPy generators).
import random
random.seed(42)
np.random.seed(42)

# Build matrices: TF-IDF and compound scores against the same vocabulary,
# each returned in raw and row-normalised form.
X_tfidf, X_tfidf_norm, feat = compute_tfidf_matrix(df, method_phrases_aug, processed_col='processed_text')
X_compound, X_compound_norm = compute_compound_matrix(df, method_phrases_aug, processed_col='processed_text', window=300, ratio_thresh=0.5)

# Combine and label: weighted sum of the normalised scores, bucketed into tiers.
X_combined, conf_labels = combine_scores_to_confidence(
    X_tfidf_norm, X_compound_norm, feat,
    w_tfidf=0.6, w_comp=0.4,
    th_super=0.85, th_high=0.6, th_low=0.2
)

# Attach multi-label summaries and deterministic single-label
# NOTE(review): stable_single_label_from_multilabel writes 'Final_Method_Label'
# and 'Final_Method_Confidence', the same columns that
# combined_method_assignment_with_compound writes — running this cell replaces
# those earlier values; confirm that is intended.
df = attach_multilabel_to_df(df, feat, X_combined, conf_labels, topk=5)
df = stable_single_label_from_multilabel(df, feat, X_combined, conf_labels)


In [13]:
def _count_rows_with_positive_max(matrix):
    """Return how many rows of a 2-D dense or scipy-sparse matrix have a maximum > 0."""
    if 0 in matrix.shape:
        # No rows or no columns: nothing can be positive, and max() over an empty axis fails.
        return 0
    row_max = matrix.max(axis=1)
    if hasattr(row_max, "toarray"):
        # scipy sparse reductions return a sparse result; densify before comparing.
        row_max = row_max.toarray()
    return int((np.asarray(row_max).ravel() > 0).sum())


def stability_diagnostics(df, feat, X_tfidf, X_compound, X_combined, conf_labels, top_n=20):
    """Print coverage and score diagnostics for the method-scoring matrices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'Final_Method_Confidence' column (if present) is summarised.
    feat : sequence
        Method names; ``feat[j]`` labels column ``j`` of ``X_combined``.
    X_tfidf, X_compound : 2-D dense array or scipy sparse matrix
        Score matrices; a row counts as "nonzero" when its maximum is > 0.
    X_combined : 2-D dense array or scipy sparse matrix
        Combined scores; its row count is used as the document total.
    conf_labels : array-like
        Confidence labels, counted over all entries (flattened).
    top_n : int, default 20
        How many methods to list, ranked by mean combined score.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    total = X_combined.shape[0]

    def _pct(count):
        # Avoid ZeroDivisionError when there are no documents.
        return 100 * count / total if total else 0.0

    tfidf_nonzero = _count_rows_with_positive_max(X_tfidf)
    print(f"TF-IDF nonzero docs: {tfidf_nonzero}/{total} ({_pct(tfidf_nonzero):.1f}%)")
    cmp_nonzero = _count_rows_with_positive_max(X_compound)
    print(f"Compound nonzero docs: {cmp_nonzero}/{total} ({_pct(cmp_nonzero):.1f}%)")

    # Confidence distribution (flattened)
    unique, counts = np.unique(conf_labels, return_counts=True)
    # Convert NumPy scalars to builtins so the dict prints as {'high': 578, ...}
    # regardless of the installed NumPy version's scalar repr.
    dist = {str(label): int(count) for label, count in zip(unique, counts)}
    print("Confidence label counts (matrix-level):", dist)

    if 'Final_Method_Confidence' in df.columns:
        print("Final confidence distribution (per doc):")
        print(df['Final_Method_Confidence'].value_counts())

    # Top-N methods by average combined score
    print("Top methods by avg combined score:")
    if total == 0:
        # A mean over zero rows is undefined; nothing to rank.
        return
    # np.asarray(...).ravel() flattens the 2-D np.matrix that sparse .mean(axis=0) returns.
    avg_scores = np.asarray(X_combined.mean(axis=0)).ravel()
    top_idx = np.argsort(avg_scores)[::-1][:max(top_n, 0)]
    for j in top_idx:
        print(f"  {feat[j]}: {float(avg_scores[j]):.3f}")

# Print the diagnostics for the matrices and confidence labels built in the previous cell.
stability_diagnostics(df, feat, X_tfidf, X_compound, X_combined, conf_labels)


TF-IDF nonzero docs: 4361/28934 (15.1%)
Compound nonzero docs: 28349/28934 (98.0%)
Confidence label counts (matrix-level): {'high': 578, 'low': 303355, 'not_detected': 2382173, 'super_high': 4756}
Final confidence distribution (per doc):
Final_Method_Confidence
low           24573
super_high     4361
Name: count, dtype: int64
Top methods by avg combined score:
  power flow: 0.344
  dc power flow: 0.323
  ac power flow: 0.323
  design for reliability: 0.255
  reliability optimization: 0.232
  reliability assessment: 0.220
  safety analysis: 0.107
  proportional-integral control: 0.101
  load forecasting: 0.100
  design of experiments: 0.099
  sensitivity analysis: 0.099
  mc simulation: 0.093
  real-time simulation: 0.092
  regression analysis: 0.091
  cost-benefit analysis: 0.090
  optimal power flow: 0.088
  data-driven approach: 0.076
  detection algorithm: 0.075
  constraint optimization: 0.073
  real time simulation: 0.070


In [14]:
# Diagnostics & coverage report
def _coverage_pct(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return 100 * count / total if total else 0.0


def method_diagnostics_report(df):
    """Print a coverage report for the method-assignment columns of ``df``.

    Each section is printed only when the column(s) it needs are present, so the
    report can be run on a partially processed frame. An empty frame reports 0.0%
    instead of raising ZeroDivisionError.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain 'Method_TFIDF_Score', 'Primary_Method_LDA',
        'Method_Compound_Confidence', 'Final_Method_Confidence',
        'Final_Method_Label' and the other assignment columns shown in the
        spot-check sample.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    total = len(df)
    tfidf_col = 'Method_TFIDF_Score'
    cmp_col = 'Method_Compound_Confidence'
    print("=== Diagnostics ===")

    # TF-IDF: documents whose best TF-IDF score is above zero.
    tfidf_nonzero = int((df[tfidf_col] > 0).sum()) if tfidf_col in df.columns else 0
    print(f"TF-IDF: nonzero-score docs: {tfidf_nonzero}/{total} ({_coverage_pct(tfidf_nonzero, total):.1f}%)")

    # LDA: documents whose LDA label is anything other than the 'LowConfidence' sentinel.
    lda_confident = int((df['Primary_Method_LDA'] != 'LowConfidence').sum()) if 'Primary_Method_LDA' in df.columns else 0
    print(f"LDA: confident assignments: {lda_confident}/{total} ({_coverage_pct(lda_confident, total):.1f}%)")

    # Compound
    if cmp_col in df.columns:
        cmp_counts = df[cmp_col].value_counts()
        print("Compound confidence counts:")
        print(cmp_counts)

    # Final
    if 'Final_Method_Confidence' in df.columns:
        print("\nFinal confidence distribution:")
        print(df['Final_Method_Confidence'].value_counts())
    if 'Final_Method_Label' in df.columns:
        print("\nTop final methods:")
        print(df['Final_Method_Label'].value_counts().head(20))

    # Sample spot checks where TF-IDF score == 0 but compound found something
    if tfidf_col in df.columns and cmp_col in df.columns:
        mask = (df[tfidf_col] == 0) & (df[cmp_col].isin(['High','Low']))
        print(f"\nDocs with TF-IDF score==0 but Compound detected: {mask.sum()}")
        spot_check_cols = ['Final_Method_Label','Final_Method_Confidence','Primary_Method_TFIDF','Primary_Method_LDA','Method_Detected_Compound','Method_Compound_Confidence']
        # Show only the columns that exist, so a partially processed frame
        # does not raise KeyError here while every other section is guarded.
        present_cols = [col for col in spot_check_cols if col in df.columns]
        print(df.loc[mask, present_cols].head(5))

# Print the coverage report for the current state of `df`.
method_diagnostics_report(df)


=== Diagnostics ===
TF-IDF: nonzero-score docs: 4361/28934 (15.1%)
LDA: confident assignments: 4360/28934 (15.1%)
Compound confidence counts:
Method_Compound_Confidence
NoEvidence       17701
NotClassified     6872
High              4361
Name: count, dtype: int64

Final confidence distribution:
Final_Method_Confidence
low           24573
super_high     4361
Name: count, dtype: int64

Top final methods:
Final_Method_Label
ac power flow                     7367
design for reliability            4435
mc simulation                     3508
cost benefit analysis             1262
energy management optimization    1001
power flow                         842
data driven approach               763
optimal power flow                 732
reliability assessment             697
real time simulation               663
machine learning                   641
constraint optimization            628
LowConfidence                      585
reliability optimization           441
particle swarm optimization  

In [15]:
# %%
# Persist the labelled frame as a date-stamped, semicolon-separated CSV under SAVE_DIR.
current_date = datetime.now().strftime("%Y_%m_%d")
output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_final_combined_methods.csv")
# QUOTE_NONNUMERIC quotes every non-numeric field; the backslash escape character is
# used for quote characters that occur inside field values.
df.to_csv(output_filename, sep=';', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')
print(f"Results saved to {output_filename}")

# Summary of the final assignment.
print("Final method label distribution:")
print(df['Final_Method_Label'].value_counts())
print("Confidence breakdown:\n", df['Final_Method_Confidence'].value_counts())
print("First 3 sample assigned methods:")
# head(3) so the sample size matches the heading above (a bare head() prints 5 rows).
print(df[['Final_Method_Label', 'Final_Method_Confidence', 'Primary_Method_TFIDF', 'Primary_Method_LDA', 'Method_Detected_Compound']].head(3))

print("API token usage and cost:", credit_tracker.get_stats())


Results saved to Saved_files_new\semantic_scholar_2025_08_19_final_combined_methods.csv
Final method label distribution:
Final_Method_Label
ac power flow                       7367
design for reliability              4435
mc simulation                       3508
cost benefit analysis               1262
energy management optimization      1001
power flow                           842
data driven approach                 763
optimal power flow                   732
reliability assessment               697
real time simulation                 663
machine learning                     641
constraint optimization              628
LowConfidence                        585
reliability optimization             441
particle swarm optimization          437
statistical process control          370
load flow analysis                   313
monte carlo simulation               296
detection algorithm                  282
time series analysis                 280
sensitivity analysis                 271