In [1]:
# Imports and Setup
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
import openai
import random

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


In [2]:
# OpenAI Setup and Credit Tracking
class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))

client, model_type = initialize_openai()
credit_tracker = CreditTracker()


In [3]:
#Text Processing Functions
# %%
def extract_keywords_from_filename(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    return [part for i, part in enumerate(parts) if i > 2 and part != 'results' and not part.isdigit()]

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al','ref','reference','references','cited','cite',
        'fig','figure','figures','table','tables','chart','charts',
        'published','journal','conference','proceedings','vol','volume','pp','page','pages','doi'}
    return stop_words.union(scientific_terms)

def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=False):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [t for t in tokens if len(t) >= min_word_length and t not in stop_words and len(t) > 1 and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except:
        pass
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    return df[df[processed_col].str.strip() != '']


In [4]:

# Data Loading and Preprocessing

filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)

def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
                    'Medicine','Business','Environmental Science','Chemistry','Materials Science',
                    'Geography','Biology','Geology','Political Science','Psychology','Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        return [f if f in valid_fields else "Unknown" for f in fields] or ["Unknown"]
    return ["Unknown"]

df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)


In [5]:
#Topic Modeling
def model_topics(df, num_topics=10, num_words=25):
    tokenized_texts = df['processed_text'].apply(lambda x: x.split()).tolist()
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    phrased = []
    for doc in tokenized_texts:
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased.append(' '.join(combined))
    vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b[\w_-]+\b', max_df=0.95, min_df=2, max_features=10000)
    doc_term_matrix = vectorizer.fit_transform(phrased)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        word_weights = [(feature_names[i], topic[i]) for i in top_indices]
        topic_keywords[topic_idx] = {'top_words': top_words, 'word_weights': word_weights}
    return lda_model, vectorizer, topic_distributions, df, topic_keywords

lda_model, vectorizer, topic_distributions, df_topic, topic_keywords = model_topics(df, num_topics=10, num_words=25)


2025-08-19 12:25:17,725 - INFO - collecting all words and their counts
2025-08-19 12:25:17,726 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-08-19 12:25:20,515 - INFO - PROGRESS: at sentence #10000, processed 1528760 words and 873903 word types
2025-08-19 12:25:23,535 - INFO - PROGRESS: at sentence #20000, processed 3012255 words and 1486274 word types
2025-08-19 12:25:26,017 - INFO - collected 1928697 token types (unigram + bigrams) from a corpus of 4300658 words and 28934 sentences
2025-08-19 12:25:26,019 - INFO - merged Phrases<1928697 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-08-19 12:25:26,020 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1928697 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 8.29s', 'datetime': '2025-08-19T12:25:26.020938', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 2025, 12:41:34) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-0

In [6]:
#Method Phrase Extraction

# %%
def extract_candidate_terms(df, text_col='processed_text', max_features=15000):
    vectorizer = CountVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, 
                                max_features=max_features, token_pattern=r'\b[\w-]+\b')
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    return [term for term, freq in sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)]

def get_method_phrases(corpus_terms, client, model_type, credit_tracker):
    sample_terms = ', '.join(corpus_terms[:50])
    prompt = f"""Here are the most frequent terms from a corpus of scientific papers:
{sample_terms}
From the full list: {', '.join(corpus_terms)}
Extract ONLY the terms that represent specific methodologies, techniques, or named approaches that would actually appear in this type of engineering research. Focus on:
- Power system analysis methods
- Reliability analysis techniques  
- Engineering design approaches
- Computational methods used in power/electrical engineering
- Statistical methods for engineering

Do NOT include: generic words like "analysis", "method", "approach", "design", "system" by themselves.
DO include: specific named methods like "monte carlo simulation", "load flow analysis", "reliability assessment", loss of load probability, probabilitstic methods, etc.

Return as a simple Python list of strings, no code blocks or formatting."""
    
    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "user", "content": prompt}]
    )
    try:
        return ast.literal_eval(response.choices[0].message.content)
    except:
        content = response.choices.message.content
        content = content.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
        return [term.strip() for term in content.split(',') if len(term.strip()) > 3]

def clean_method_phrases_fixed(method_phrases):
    cleaned_phrases = []
    for phrase in method_phrases:
        cleaned = phrase.strip().replace('``````','').replace('[', '').replace(']', '').replace('"', '').replace("'", '').replace('\n', ' ')
        cleaned = ' '.join(cleaned.split())
        if len(cleaned) > 2:
            cleaned_phrases.append(cleaned.lower())
    return list(set(cleaned_phrases))

def stable_normalize_augment_vocab(method_phrases):
    base = [p.lower().strip() for p in method_phrases if isinstance(p, str) and p.strip()]
    variants = set()
    for p in base:
        variants.add(p)
        variants.add(p.replace('-', ' '))
        variants.add(p.replace('_', ' '))
        if ' ' in p:
            variants.add(p.replace(' ', '_'))
        # Add domain-specific variants
        if p == 'optimal power flow':
            variants.update(['opf','security-constrained opf','scopf'])
        if p == 'monte carlo simulation':
            variants.update(['mc simulation'])
        if p == 'load flow analysis':
            variants.update(['power flow','ac power flow','dc power flow'])
        if p == 'state estimation':
            variants.update(['wls state estimation','kalman filter','extended kalman','unscented kalman'])
        if p == 'contingency analysis':
            variants.update(['n-1 security','n-1 contingency'])
        if p == 'unit commitment':
            variants.update(['economic dispatch','security-constrained unit commitment'])
    return sorted(set(variants))

candidate_terms = extract_candidate_terms(df, text_col='processed_text')
method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
method_phrases = clean_method_phrases_fixed(method_phrases)
method_phrases_aug = stable_normalize_augment_vocab(method_phrases)
logger.info(f"Augmented vocab size: {len(method_phrases_aug)}")


2025-08-19 12:31:50,882 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-19 12:31:50,896 - INFO - Augmented vocab size: 133


In [7]:
#Multi-Label Method Scoring

# %%
def compute_method_scores(df, vocab, processed_col='processed_text', 
                         w_tfidf=0.6, w_compound=0.4, top_k=5):
    """
    Compute combined scores for all methods and return per-method columns plus top-k scores.
    """
    n_docs = len(df)
    n_methods = len(vocab)
    
    # TF-IDF scores
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=vocab, ngram_range=(1, 3), min_df=1, max_df=0.999,
        norm='l2', token_pattern=r'\b[\w_-]+\b'
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[processed_col])
    tfidf_scores = tfidf_matrix.toarray()  # (n_docs, n_methods)
    
    # Compound/proximity scores
    compound_scores = compute_compound_scores(df, vocab, processed_col)
    
    # Combined scores
    combined_scores = w_tfidf * tfidf_scores + w_compound * compound_scores
    
    # Create per-method columns with combined scores
    method_columns = {}
    for j, method in enumerate(vocab):
        safe_name = f"method_{method.replace(' ', '_').replace('-', '_')}"
        method_columns[safe_name] = combined_scores[:, j]
    
    # Add method score columns to dataframe
    for col_name, scores in method_columns.items():
        df[col_name] = scores
    
    # Top-k TF-IDF and LDA scores
    tfidf_topk_idx = np.argsort(tfidf_scores, axis=1)[:, -top_k:][:, ::-1]
    compound_topk_idx = np.argsort(compound_scores, axis=1)[:, -top_k:][:, ::-1]
    combined_topk_idx = np.argsort(combined_scores, axis=1)[:, -top_k:][:, ::-1]
    
    # Create top-k columns
    for k in range(top_k):
        # TF-IDF top-k
        df[f'tfidf_top_{k+1}_method'] = [vocab[idx[k]] for idx in tfidf_topk_idx]
        df[f'tfidf_top_{k+1}_score'] = [tfidf_scores[i, idx[k]] for i, idx in enumerate(tfidf_topk_idx)]
        
        # Compound top-k
        df[f'compound_top_{k+1}_method'] = [vocab[idx[k]] for idx in compound_topk_idx]
        df[f'compound_top_{k+1}_score'] = [compound_scores[i, idx[k]] for i, idx in enumerate(compound_topk_idx)]
        
        # Combined top-k
        df[f'combined_top_{k+1}_method'] = [vocab[idx[k]] for idx in combined_topk_idx]
        df[f'combined_top_{k+1}_score'] = [combined_scores[i, idx[k]] for i, idx in enumerate(combined_topk_idx)]
    
    return df, combined_scores, tfidf_scores, compound_scores, vocab

def compute_compound_scores(df, vocab, processed_col='processed_text', window=300, ratio_thresh=0.5):
    """Compute compound/proximity scores for all methods."""
    n_docs = len(df)
    n_terms = len(vocab)
    scores = np.zeros((n_docs, n_terms), dtype=np.float32)
    
    docs = df[processed_col].fillna('').str.lower().tolist()
    
    for j, phrase in enumerate(vocab):
        phrase_l = phrase.lower()
        phrase_words = [w for w in phrase_l.split() if len(w) > 0]
        sig_words = [w for w in phrase_words if len(w) > 3]
        
        for i, text in enumerate(docs):
            if phrase_l in text:
                scores[i, j] = 1.0
                continue
                
            if len(phrase_words) > 1:
                # Coverage score
                present = sum(1 for w in sig_words if w in text) if sig_words else 0
                coverage = present / len(sig_words) if sig_words else 0.0
                
                # Proximity check
                prox_hit = False
                for k in range(len(phrase_words)-1):
                    w1, w2 = phrase_words[k], phrase_words[k+1]
                    pos = text.find(w1)
                    if pos >= 0:
                        nearby = text[pos:pos+window]
                        if w2 in nearby:
                            prox_hit = True
                            break
                
                if coverage >= ratio_thresh or prox_hit:
                    scores[i, j] = max(scores[i, j], min(1.0, 0.6 + 0.4*coverage))
            else:
                # Single technical term
                if len(phrase_l) > 6 and phrase_l in text:
                    scores[i, j] = max(scores[i, j], 0.7)
    
    return scores

def assign_primary_method_and_confidence(df, combined_scores, vocab, 
                                       th_super=0.85, th_high=0.6, th_low=0.2):
    """Assign primary method and confidence level to each document."""
    n_docs = len(df)
    primary_methods = []
    confidences = []
    
    for i in range(n_docs):
        scores = combined_scores[i]
        max_idx = np.argmax(scores)
        max_score = scores[max_idx]
        best_method = vocab[max_idx]
        
        # Determine confidence
        if max_score >= th_super:
            confidence = 'super_high'
        elif max_score >= th_high:
            confidence = 'high'
        elif max_score >= th_low:
            confidence = 'low'
        else:
            confidence = 'not_detected'
            best_method = 'LowConfidence'
        
        primary_methods.append(best_method)
        confidences.append(confidence)
    
    df['Primary_Method'] = primary_methods
    df['Method_Confidence'] = confidences
    return df


In [8]:
# Execute Method Scoring
logger.info("Computing multi-label method scores...")
df, combined_scores, tfidf_scores, compound_scores, vocab = compute_method_scores(
    df, method_phrases_aug, processed_col='processed_text', 
    w_tfidf=0.6, w_compound=0.4, top_k=5
)

# Assign primary method and confidence
df = assign_primary_method_and_confidence(
    df, combined_scores, vocab, 
    th_super=0.85, th_high=0.6, th_low=0.2
)


2025-08-19 12:31:50,943 - INFO - Computing multi-label method scores...
  df['Primary_Method'] = primary_methods
  df['Method_Confidence'] = confidences


In [9]:
#Diagnostics and Results

# %%
def comprehensive_diagnostics(df, combined_scores, tfidf_scores, compound_scores, vocab):
    n_docs = len(df)
    
    print("=== METHOD ASSIGNMENT DIAGNOSTICS ===")
    print(f"Total documents: {n_docs}")
    print(f"Total method phrases: {len(vocab)}")
    
    # Coverage statistics
    tfidf_nonzero = (tfidf_scores > 0).any(axis=1).sum()
    compound_nonzero = (compound_scores > 0).any(axis=1).sum()
    combined_nonzero = (combined_scores > 0).any(axis=1).sum()
    
    print(f"\nCoverage:")
    print(f"  TF-IDF coverage: {tfidf_nonzero}/{n_docs} ({100*tfidf_nonzero/n_docs:.1f}%)")
    print(f"  Compound coverage: {compound_nonzero}/{n_docs} ({100*compound_nonzero/n_docs:.1f}%)")
    print(f"  Combined coverage: {combined_nonzero}/{n_docs} ({100*combined_nonzero/n_docs:.1f}%)")
    
    # Confidence distribution
    if 'Method_Confidence' in df.columns:
        conf_dist = df['Method_Confidence'].value_counts()
        print(f"\nConfidence distribution:")
        for conf, count in conf_dist.items():
            print(f"  {conf}: {count} ({100*count/n_docs:.1f}%)")
    
    # Top methods
    if 'Primary_Method' in df.columns:
        method_dist = df['Primary_Method'].value_counts().head(15)
        print(f"\nTop 15 assigned methods:")
        for method, count in method_dist.items():
            print(f"  {method}: {count}")
    
    # Score statistics
    print(f"\nScore statistics:")
    print(f"  Combined scores - Mean: {combined_scores.mean():.4f}, Std: {combined_scores.std():.4f}")
    print(f"  TF-IDF scores - Mean: {tfidf_scores.mean():.4f}, Std: {tfidf_scores.std():.4f}")
    print(f"  Compound scores - Mean: {compound_scores.mean():.4f}, Std: {compound_scores.std():.4f}")

comprehensive_diagnostics(df, combined_scores, tfidf_scores, compound_scores, vocab)


=== METHOD ASSIGNMENT DIAGNOSTICS ===
Total documents: 28934
Total method phrases: 133

Coverage:
  TF-IDF coverage: 3775/28934 (13.0%)
  Compound coverage: 28279/28934 (97.7%)
  Combined coverage: 28279/28934 (97.7%)

Confidence distribution:
  low: 24504 (84.7%)
  super_high: 3563 (12.3%)
  not_detected: 655 (2.3%)
  high: 212 (0.7%)

Top 15 assigned methods:
  ac power flow: 6177
  mc simulation: 4424
  performance based design: 2745
  distribution system reliability assessment: 1392
  dynamic line rating: 1125
  power flow: 974
  reliability distribution analysis: 827
  reliability assessment: 726
  dynamic reactive power control: 660
  LowConfidence: 655
  model predictive control: 521
  genetic algorithm: 492
  multi objective optimization: 486
  hybrid optimization: 471
  techno economic analysis: 441

Score statistics:
  Combined scores - Mean: 0.0278, Std: 0.0958
  TF-IDF scores - Mean: 0.0011, Std: 0.0313
  Compound scores - Mean: 0.0678, Std: 0.2284


In [10]:
# Save Results
current_date = datetime.now().strftime("%Y_%m_%d")
output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_multilabel_methods.csv")
df.to_csv(output_filename, sep=';', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')

print(f"Results saved to {output_filename}")
print(f"API token usage: {credit_tracker.get_stats()}")

# Display sample results
print("\nSample results:")
display_cols = ['Primary_Method', 'Method_Confidence', 'combined_top_1_method', 'combined_top_1_score', 
                'tfidf_top_1_method', 'tfidf_top_1_score', 'compound_top_1_method', 'compound_top_1_score']
available_cols = [col for col in display_cols if col in df.columns]
print(df[available_cols].head())


Results saved to Saved_files_new\semantic_scholar_2025_08_19_multilabel_methods.csv
API token usage: {'total_tokens': 0, 'total_cost': 0}

Sample results:
                               Primary_Method Method_Confidence  \
0                               ac power flow               low   
1            reliability centered maintenance               low   
2  distribution system reliability assessment               low   
3                              control theory               low   
4                               ac power flow               low   

                        combined_top_1_method  combined_top_1_score  \
0                               ac power flow              0.320000   
1            reliability centered maintenance              0.346667   
2  distribution system reliability assessment              0.360000   
3                              control theory              0.400000   
4                               ac power flow              0.400000   

     tfidf_top_