In [12]:
# --- Imports & Initialization ---
import os
import re
import json
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from openai import OpenAI
import configparser
import tiktoken
import logging
import nltk
import csv
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


In [2]:
# --- Utility Functions: Stopwords, Keyword Extraction, and API ---

def extract_keywords_from_filename(filename):
    # Extract search keywords from filename
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    keywords = [part for i, part in enumerate(parts) if i > 2 and part != 'results' and not part.isdigit()]
    return keywords

def keywords_to_filename_part(keywords):
    return '_'.join([kw.lower().replace(' ', '_') for kw in keywords])

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite', 'fig', 'figure', 'figures', 'table', 'tables',
                        'chart', 'charts', 'published', 'journal', 'conference', 'proceedings', 'vol', 'volume', 'pp', 'page', 'pages', 'doi'}
    stop_words = stop_words.union(scientific_terms)
    return stop_words

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    return OpenAI(api_key=api_key), model_type

class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015

    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens

    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))


In [3]:
# --- Preprocess and Lemmatize ---
def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=True):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [t for t in tokens if len(t) >= min_word_length and t not in stop_words and len(t) > 1 and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except:
        pass
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('')
    df[text_col] = df[text_col].astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    df = df[df[processed_col].str.strip() != '']
    return df


In [4]:
# --- Load, Clean, and Preprocess Input ---
filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)

df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
print(f"Extracted search keywords: {search_keywords}")
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)

# Remove or impute 'fieldsOfStudy'
def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
                    'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science',
                    'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        cleaned_fields = [f if f in valid_fields else "Unknown" for f in fields]
        return cleaned_fields if cleaned_fields else ["Unknown"]
    return ["Unknown"]

df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)


Extracted search keywords: ['reliability', 'resilience', 'power', 'systems']


In [5]:
# --- LDA Topic Modeling (papers grouped by scientific themes) ---
def model_topics(df, num_topics=10, num_words=100):
    """
    Standard thematic topic modeling.
    """
    tokenized_texts = df['processed_text'].apply(lambda x: x.split()).tolist()
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    phrased = []
    for doc in tokenized_texts:
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased.append(' '.join(combined))
    vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b[\w_-]+\b', max_df=0.95, min_df=2, max_features=10000)
    doc_term_matrix = vectorizer.fit_transform(phrased)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    # Store top words for topics
    def extract_topic_keywords(lda_model, feature_names, num_words=10):
        topic_keywords = {}
        for topic_idx, topic in enumerate(lda_model.components_):
            top_indices = topic.argsort()[:-num_words-1:-1]
            top_words = [feature_names[i] for i in top_indices]
            word_weights = [(feature_names[i], topic[i]) for i in top_indices]
            topic_keywords[topic_idx] = {'top_words': top_words, 'word_weights': word_weights}
        return topic_keywords
    topic_keywords = extract_topic_keywords(lda_model, feature_names, num_words)
    return lda_model, vectorizer, topic_distributions, df, topic_keywords

lda_model, vectorizer, topic_distributions, df_topic, topic_keywords = model_topics(df, num_topics=10, num_words=25)


2025-08-16 08:33:43,301 - INFO - collecting all words and their counts
2025-08-16 08:33:43,302 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-08-16 08:33:45,361 - INFO - PROGRESS: at sentence #10000, processed 1524957 words and 863761 word types
2025-08-16 08:33:47,387 - INFO - PROGRESS: at sentence #20000, processed 3004878 words and 1467564 word types
2025-08-16 08:33:49,218 - INFO - collected 1902495 token types (unigram + bigrams) from a corpus of 4290297 words and 28934 sentences
2025-08-16 08:33:49,219 - INFO - merged Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-08-16 08:33:49,220 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 5.92s', 'datetime': '2025-08-16T08:33:49.220518', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 2025, 12:41:34) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-0

In [6]:
# --- Assign Names to Each Topic Using LLM ---
def generate_topic_name(top_words, client, model_type, credit_tracker):
    prompt = f"From this list of keywords: {', '.join(top_words)}, generate a concise, specific research topic name (preferably a bigram or trigram):"
    response = client.chat.completions.create(
        model=model_type,
        messages=[
            {"role": "system", "content": "You are an expert scientific assistant generating concise topic names."},
            {"role": "user", "content": prompt}
        ]
    )
    credit_tracker.update(num_tokens_from_string(prompt, model_type))
    # Assume plain output, but adjust this as needed
    return response.choices[0].message.content.strip()

client, model_type = initialize_openai()
credit_tracker = CreditTracker()

topic_names = {}
for topic_idx, keywords in topic_keywords.items():
    name = generate_topic_name(keywords['top_words'], client, model_type, credit_tracker)
    topic_names[topic_idx] = name

print("Topic Names:", topic_names)


2025-08-16 08:36:37,810 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:38,595 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:39,448 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:40,061 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:40,677 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:41,139 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:41,626 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:42,313 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-16 08:36:42,827 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "

Topic Names: {0: '"Thermal Efficiency of Solar Power Plants"', 1: '"Adaptive Channel Allocation in Wireless Networks"', 2: '"Cloud-Based IoT Performance Optimization"', 3: '"Smart Grid Data Management"', 4: '"Fault-Tolerant Control for DC Grid Systems"', 5: '"Optimizing Wind Energy Distribution Systems"', 6: '"Renewable Energy Storage Optimization"', 7: '"Reliability Assessment Models"', 8: '"High-Performance Battery Materials"', 9: '"Renewable Energy Generation Costs"'}


In [14]:
# --- Extract Method Phrases Using LLM ---
def extract_candidate_terms(df, text_col='processed_text', max_features=300):
    vectorizer = CountVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=max_features, token_pattern=r'\b[\w-]+\b')
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    sorted_terms = sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)
    return [term for term, freq in sorted_terms]


def extract_candidate_terms(df, text_col='processed_text', max_features=100):
    """Extract candidate keywords and n-grams from processed text for LLM prompt testing"""
    vectorizer = CountVectorizer(
        ngram_range=(1, 3),
        max_df=0.95,
        min_df=2,
        max_features=max_features,
        token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    # Sort terms by frequency descending
    sorted_terms = sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)
    # Return just the terms
    return [term for term, freq in sorted_terms]

def get_method_phrases(corpus_terms, client, model_type, credit_tracker):
    """Improved prompt that considers the actual domain of your corpus"""
    
    # Analyze the candidate terms to understand the domain
    sample_terms = ', '.join(corpus_terms[:50])
    
    prompt = f"""Here are the most frequent terms from a corpus of scientific papers:
{sample_terms}

Based on these terms, this appears to be a corpus focused on power systems, electrical engineering, and reliability analysis.

From the full list of terms: {', '.join(corpus_terms)}

Extract ONLY the terms that represent specific methodologies, techniques, or named approaches that would actually appear in this type of engineering research. Focus on:
- Power system analysis methods
- Reliability analysis techniques  
- Engineering design approaches
- Computational methods used in power/electrical engineering
- Statistical methods for engineering

Do NOT include: generic words like "analysis", "method", "approach", "design", "system" by themselves, nor general expressions like distributed generation, renewable resources that dont specifically describe a method or technique.
DO include: specific named methods like "monte carlo simulation", "load flow analysis", "reliability assessment", loss of load probability, probabilitstic methods, etc.

Return as a simple Python list of strings, no code blocks or formatting."""

    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "user", "content": prompt}]
    )
    
    try:
        return ast.literal_eval(response.choices[0].message.content)
    except:
        # Fallback parsing
        content = response.choices[0].message.content
        content = content.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
        return [term.strip() for term in content.split(',') if len(term.strip()) > 3]


import re

def clean_method_phrases_fixed(method_phrases):
    """Clean and validate method phrases from LLM output"""
    cleaned_phrases = []
    
    for phrase in method_phrases:
        # Remove code block markers, quotes, and extra whitespace
        cleaned = phrase.strip()
        cleaned = cleaned.replace('```python', '').replace('```','')
        cleaned = cleaned.replace('[', '').replace(']', '')
        cleaned = cleaned.replace('"', '').replace("'", '')
        cleaned = cleaned.replace('\n', ' ')
        cleaned = ' '.join(cleaned.split())  # Remove extra whitespace
        
        # Skip empty or very short phrases
        if len(cleaned) > 2:
            cleaned_phrases.append(cleaned.lower())
    
    return list(set(cleaned_phrases))


def validate_method_phrases_improved_fixed(df, method_phrases):
    """Improved validation that handles multi-word phrases"""
    all_text = ' '.join(df['processed_text']).lower()
    matched_phrases = []
    
    for phrase in method_phrases:
        phrase_clean = phrase.lower().strip()
        
        # Check for exact phrase match
        if phrase_clean in all_text:
            matched_phrases.append(phrase)
        # Check for partial word matches (for compound terms)
        elif any(word in all_text for word in phrase_clean.split() if len(word) > 3):
            matched_phrases.append(phrase)
    
    return matched_phrases
candidate_terms = extract_candidate_terms(df, text_col='processed_text', max_features=1000)
method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
print("Top method phrases:", method_phrases[:15])


2025-08-18 12:15:41,985 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Top method phrases: ['load flow analysis', 'reliability assessment', 'monte carlo simulation', 'loss of load probability', 'probabilistic methods', 'power flow', 'energy management', 'optimization', 'neural network', 'fuzzy', 'dynamic modeling', 'state estimation', 'dispatch', 'signal processing', 'computer simulation']


In [10]:
# --- LDA-based Method Detection ---
def lda_method_assignment(
    df, method_phrases, processed_col='processed_text', max_method_topics=30, min_papers_per_topic=15
):
    # Reduction if many method phrases
    if len(method_phrases) > max_method_topics:
        logger.info(f"Reducing method phrases ({len(method_phrases)}) to top {max_method_topics}.")
        tfidf_vectorizer = TfidfVectorizer(vocabulary=method_phrases, ngram_range=(1, 3), min_df=1, max_df=0.95, norm='l2')
        tfidf_matrix = tfidf_vectorizer.fit_transform(df[processed_col])
        total_method_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
        phrase_ranking = np.argsort(total_method_scores)[-max_method_topics:][::-1]
        best_phrases = [tfidf_vectorizer.get_feature_names_out()[i] for i in phrase_ranking]
    else:
        best_phrases = method_phrases

    # LDA with [number of] method topics
    vectorizer = CountVectorizer(vocabulary=best_phrases, ngram_range=(1, 3), token_pattern=r'\b[\w-]+\b')
    doc_term_matrix = vectorizer.fit_transform(df[processed_col])
    n_method_topics = len(best_phrases)
    if n_method_topics < 2:
        logger.warning("Not enough method phrases for LDA method assignment. Skipping.")
        df['Primary_Method_LDA'] = 'No_Method_Found'
        df['Method_LDA_Score'] = 0.0
        return df

    lda = LatentDirichletAllocation(n_components=n_method_topics, learning_method='batch', random_state=42, max_iter=20)
    doc_topic_dist = lda.fit_transform(doc_term_matrix)
    topic_labels = best_phrases

    best_topic_idx = doc_topic_dist.argmax(axis=1)
    best_topic_val = doc_topic_dist[np.arange(len(df)), best_topic_idx]
    assigned_methods = [topic_labels[i] if best_topic_val[j] > 1/n_method_topics+0.05 else 'LowConfidence'
                        for j, i in enumerate(best_topic_idx)]

    # Mask rare method-topics
    topic_assignment_counts = pd.Series(best_topic_idx).value_counts()
    rare_topics = topic_assignment_counts[topic_assignment_counts < min_papers_per_topic].index.tolist()
    assigned_methods = [
        'LowConfidence' if idx in rare_topics or label == 'LowConfidence' else topic_labels[idx]
        for (idx, label) in zip(best_topic_idx, assigned_methods)
    ]

    df['Primary_Method_LDA'] = assigned_methods
    df['Method_LDA_Score'] = best_topic_val
    # Save the full method-topic distribution for each doc (optional)
    df['Top_3_Methods_LDA'] = [
        [topic_labels[i] for i in doc_topic_dist[j].argsort()[-3:][::-1]]
        for j in range(doc_topic_dist.shape[0])
    ]
    df['Top_3_Methods_LDA_Scores'] = [
        [doc_topic_dist[j, i] for i in doc_topic_dist[j].argsort()[-3:][::-1]]
        for j in range(doc_topic_dist.shape[0])
    ]
    return df

df_methods = lda_method_assignment(df, method_phrases, processed_col='processed_text', max_method_topics=25, min_papers_per_topic=8)


2025-08-18 12:11:34,554 - INFO - Reducing method phrases (40) to top 25.


In [13]:
# --- Save and Summarize ---

current_date = datetime.now().strftime("%Y_%m_%d")
output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_final_lda_method.csv")
df_methods.to_csv(output_filename, sep=';', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')
print(f"Results saved to {output_filename}")

print("LDA-based method label distribution:")
print(df_methods['Primary_Method_LDA'].value_counts())
print("Top 3 methods for a sample document:", df_methods['Top_3_Methods_LDA'].iloc, df_methods['Top_3_Methods_LDA_Scores'].iloc)
print("API token usage and cost:", credit_tracker.get_stats())


Results saved to Saved_files_new\semantic_scholar_2025_08_18_final_lda_method.csv
LDA-based method label distribution:
Primary_Method_LDA
LowConfidence                  16816
distributed generation          3277
fault analysis                  1861
dynamic modeling                1348
state estimation                1146
energy management                711
evaluation                       578
detection                        497
simulation result                481
load forecasting                 461
dispatch strategy                263
reliability assessment           250
strategic planning               235
optimization                     191
fuzzy logic                      172
sensitivity analysis             168
performance evaluation           166
energy storage optimization      134
monte carlo simulation            53
load flow analysis                41
risk assessment                   36
machine learning                  30
control theory                    19
Name: count