In [None]:
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases

import openai  # <-- Import OpenAI

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)



In [None]:

class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

client, model_type = initialize_openai()
credit_tracker = CreditTracker()

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))


In [19]:
def extract_keywords_from_filename(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    keywords = [part for i, part in enumerate(parts) if i > 2 and part != 'results' and not part.isdigit()]
    return keywords

def keywords_to_filename_part(keywords):
    return '_'.join([kw.lower().replace(' ', '_') for kw in keywords])

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al','ref','reference','references','cited','cite',
        'fig','figure','figures','table','tables','chart','charts',
        'published','journal','conference','proceedings','vol','volume','pp','page','pages','doi'}
    stop_words = stop_words.union(scientific_terms)
    return stop_words

def initialize_openai():
    import openai
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    return openai.OpenAI(api_key=api_key), model_type

class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))

client, model_type = initialize_openai()
credit_tracker = CreditTracker()


In [20]:
def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=True):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [t for t in tokens if len(t) >= min_word_length and t not in stop_words and len(t) > 1 and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except:
        pass
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    return df[df[processed_col].str.strip() != '']


In [21]:
filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)

def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics','Medicine','Business','Environmental Science','Chemistry','Materials Science','Geography','Biology','Geology','Political Science','Psychology','Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        return [f if f in valid_fields else "Unknown" for f in fields] or ["Unknown"]
    return ["Unknown"]
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)


In [22]:
def model_topics(df, num_topics=10, num_words=25):
    tokenized_texts = df['processed_text'].apply(lambda x: x.split()).tolist()
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')
    phrased = []
    for doc in tokenized_texts:
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased.append(' '.join(combined))
    vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b[\w_-]+\b', max_df=0.95, min_df=2, max_features=10000)
    doc_term_matrix = vectorizer.fit_transform(phrased)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        word_weights = [(feature_names[i], topic[i]) for i in top_indices]
        topic_keywords[topic_idx] = {'top_words': top_words, 'word_weights': word_weights}
    return lda_model, vectorizer, topic_distributions, df, topic_keywords

lda_model, vectorizer, topic_distributions, df_topic, topic_keywords = model_topics(df, num_topics=10, num_words=25)


2025-08-18 12:45:03,675 - INFO - collecting all words and their counts
2025-08-18 12:45:03,677 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-08-18 12:45:05,780 - INFO - PROGRESS: at sentence #10000, processed 1524957 words and 863761 word types
2025-08-18 12:45:07,973 - INFO - PROGRESS: at sentence #20000, processed 3004878 words and 1467564 word types
2025-08-18 12:45:09,967 - INFO - collected 1902495 token types (unigram + bigrams) from a corpus of 4290297 words and 28934 sentences
2025-08-18 12:45:09,970 - INFO - merged Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-08-18 12:45:09,971 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1902495 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 6.30s', 'datetime': '2025-08-18T12:45:09.971485', 'gensim': '4.3.2', 'python': '3.11.13 (main, Jun 12 2025, 12:41:34) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-0

In [23]:
def extract_candidate_terms(df, text_col='processed_text', max_features=1000):
    vectorizer = CountVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=max_features, token_pattern=r'\b[\w-]+\b')
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    return [term for term, freq in sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)]

def get_method_phrases(corpus_terms, client, model_type, credit_tracker):
    sample_terms = ', '.join(corpus_terms[:150])
    prompt = f"""Here are the most frequent terms from a corpus of scientific papers:
{sample_terms}
From the full list: {', '.join(corpus_terms)}
Extract ONLY the terms that represent specific methodologies, techniques, or named approaches that would actually appear in this type of engineering research. Focus on:
- Power system analysis methods
- Reliability analysis techniques  
- Engineering design approaches
- Computational methods used in power/electrical engineering
- Statistical methods for engineering

Do NOT include: generic words like "analysis", "method", "approach", "design", "system" by themselves, nor general expressions such as "distributed generation", "renewable resources" that dont specifically describe a method.
DO include: specific named methods like "monte carlo simulation", "load flow analysis", "reliability assessment", loss of load probability, probabilitstic methods, etc.

Return as a simple Python list of strings, no code blocks or formatting."""
    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "user", "content": prompt}]
    )
    try:
        return ast.literal_eval(response.choices[0].message.content)
    except:
        content = response.choices.message.content
        content = content.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
        return [term.strip() for term in content.split(',') if len(term.strip()) > 3]


def clean_method_phrases_fixed(method_phrases):
    cleaned_phrases = []
    for phrase in method_phrases:
        cleaned = phrase.strip().replace('``````','').replace('[', '').replace(']', '').replace('"', '').replace("'", '').replace('\n', ' ')
        cleaned = ' '.join(cleaned.split())
        if len(cleaned) > 2:
            cleaned_phrases.append(cleaned.lower())
    return list(set(cleaned_phrases))

candidate_terms = extract_candidate_terms(df, text_col='processed_text')
method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
method_phrases = clean_method_phrases_fixed(method_phrases)
print("Top method phrases:", method_phrases[:15])


2025-08-18 12:48:15,589 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Top method phrases: ['monte carlo simulation', 'linear programming', 'signal processing techniques', 'load flow analysis', 'dynamic simulation', 'deep learning', 'fault tree analysis', 'power system state estimation', 'control theory applications', 'probabilistic methods', 'differential evolution', 'nonlinear programming', 'reliability assessment', 'contingency analysis', 'empirical modeling']


In [24]:
# --- TF-IDF Method Assignment ---
def tfidf_method_assignment(df, method_phrases, processed_col='processed_text', min_score=0.05):
    vectorizer = TfidfVectorizer(vocabulary=method_phrases, ngram_range=(1, 3), min_df=1, max_df=0.95, norm='l2')
    tfidf_matrix = vectorizer.fit_transform(df[processed_col])
    method_scores = tfidf_matrix.max(axis=1).toarray().ravel()
    argmax_indices = tfidf_matrix.argmax(axis=1).A1
    feature_names = vectorizer.get_feature_names_out()
    assigned = [feature_names[i] if score >= min_score else 'LowConfidence' for i, score in zip(argmax_indices, method_scores)]
    df['Primary_Method_TFIDF'] = assigned
    df['Method_TFIDF_Score'] = method_scores
    # Store top 3 for optional use
    topk = 3
    for k in range(1, topk+1):
        indices = np.argsort(tfidf_matrix.toarray(), axis=1)[:, -k:]
        col_name = f'Top_{k}_TFIDF_Methods'
        df[col_name] = [[feature_names[j] for j in idxs[::-1]] for idxs in indices]
    return df

df = tfidf_method_assignment(df, method_phrases, processed_col='processed_text', min_score=0.03)

# --- LDA-based Method Assignment ---
def lda_method_assignment(
    df, method_phrases, processed_col='processed_text', max_method_topics=25, min_papers_per_topic=8
):
    # Reduce if many
    if len(method_phrases) > max_method_topics:
        logger.info(f"Reducing method phrases ({len(method_phrases)}) to top {max_method_topics}.")
        tfidf_vectorizer = TfidfVectorizer(vocabulary=method_phrases, ngram_range=(1, 3), min_df=1, max_df=0.95, norm='l2')
        tfidf_matrix = tfidf_vectorizer.fit_transform(df[processed_col])
        total_method_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
        phrase_ranking = np.argsort(total_method_scores)[-max_method_topics:][::-1]
        best_phrases = [tfidf_vectorizer.get_feature_names_out()[i] for i in phrase_ranking]
    else:
        best_phrases = method_phrases
    vectorizer = CountVectorizer(vocabulary=best_phrases, ngram_range=(1, 3), token_pattern=r'\b[\w-]+\b')
    doc_term_matrix = vectorizer.fit_transform(df[processed_col])
    n_method_topics = len(best_phrases)
    if n_method_topics < 2:
        logger.warning("Not enough method phrases for LDA method assignment. Skipping.")
        df['Primary_Method_LDA'] = 'No_Method_Found'
        df['Method_LDA_Score'] = 0.0
        return df
    lda = LatentDirichletAllocation(n_components=n_method_topics, learning_method='batch', random_state=42, max_iter=20)
    doc_topic_dist = lda.fit_transform(doc_term_matrix)
    topic_labels = best_phrases
    best_topic_idx = doc_topic_dist.argmax(axis=1)
    best_topic_val = doc_topic_dist[np.arange(len(df)), best_topic_idx]
    assigned_methods = [topic_labels[i] if best_topic_val[j] > 1/n_method_topics+0.03 else 'LowConfidence' for j, i in enumerate(best_topic_idx)]
    topic_assignment_counts = pd.Series(best_topic_idx).value_counts()
    rare_topics = topic_assignment_counts[topic_assignment_counts < min_papers_per_topic].index.tolist()
    assigned_methods = ['LowConfidence' if idx in rare_topics or label == 'LowConfidence' else topic_labels[idx] for (idx, label) in zip(best_topic_idx, assigned_methods)]
    df['Primary_Method_LDA'] = assigned_methods
    df['Method_LDA_Score'] = best_topic_val
    df['Top_3_Methods_LDA'] = [
        [topic_labels[i] for i in doc_topic_dist[j].argsort()[-3:][::-1]]
        for j in range(doc_topic_dist.shape[0])
    ]
    df['Top_3_Methods_LDA_Scores'] = [
        [doc_topic_dist[j, i] for i in doc_topic_dist[j].argsort()[-3:][::-1]]
        for j in range(doc_topic_dist.shape[0])
    ]
    return df


df = lda_method_assignment(df, method_phrases, processed_col='processed_text', max_method_topics=40, min_papers_per_topic=3)


2025-08-18 12:48:22,092 - INFO - Reducing method phrases (44) to top 30.


In [25]:
def combined_method_assignment(df):
    # Add a new column for confidence label and unified prediction
    super_confident = []
    confident = []
    low_confidence = []
    final_method = []

    for i, row in df.iterrows():
        tfidf_method = row['Primary_Method_TFIDF']
        tfidf_score = row['Method_TFIDF_Score']
        lda_method = row['Primary_Method_LDA']
        lda_score = row['Method_LDA_Score']

        # Both methods strong and agree
        if tfidf_method != 'LowConfidence' and lda_method != 'LowConfidence' and tfidf_method == lda_method:
            super_confident.append(True)
            confident.append(False)
            low_confidence.append(False)
            final_method.append(tfidf_method)
        # Prefer TF-IDF if it's strong
        elif tfidf_method != 'LowConfidence':
            super_confident.append(False)
            confident.append(True)
            low_confidence.append(False)
            final_method.append(tfidf_method)
        # Use LDA if TF-IDF is low or unclear, but LDA is strong
        elif lda_method != 'LowConfidence':
            super_confident.append(False)
            confident.append(True)
            low_confidence.append(False)
            final_method.append(lda_method)
        # Both weak: pick the best non-null candidate or fallback
        else:
            super_confident.append(False)
            confident.append(False)
            low_confidence.append(True)
            # Try top-3 from either method; if unavailable, just mark as low confidence
            candidates = set(row['Top_3_Methods_LDA']) | set(row['Top_1_TFIDF_Methods']) if 'Top_1_TFIDF_Methods' in row else set(row['Top_3_Methods_LDA'])
            candidates = [w for w in candidates if w != 'LowConfidence']
            final_method.append(candidates[0] if candidates else 'LowConfidence')

    df['Method_Label'] = final_method
    df['Method_Confidence'] = [
        'super_confident' if s else 'confident' if c else 'low_confidence'
        for s, c, l in zip(super_confident, confident, low_confidence)
    ]
    return df

df = combined_method_assignment(df)


In [26]:
current_date = datetime.now().strftime("%Y_%m_%d")
output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_final_combined_methods.csv")
df.to_csv(output_filename, sep=';', encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')
print(f"Results saved to {output_filename}")

print("Combined method label distribution:")
print(df['Method_Label'].value_counts())
print("Confidence breakdown:\n", df['Method_Confidence'].value_counts())
print("First 3 sample assigned methods:")
print(df[['Method_Label', 'Method_Confidence', 'Primary_Method_TFIDF', 'Primary_Method_LDA']].head())

print("API token usage and cost:", credit_tracker.get_stats())


Results saved to Saved_files_new\semantic_scholar_2025_08_18_final_combined_methods.csv
Combined method label distribution:
Method_Label
particle swarm optimization    25785
genetic algorithm                420
monte carlo simulation           408
deep learning                    332
reliability assessment           295
linear programming               295
sensitivity analysis             239
optimal power flow               185
risk assessment                  128
load forecasting                 117
dynamic simulation                86
state estimation                  78
dynamic programming               75
wavelet transform                 66
nonlinear programming             57
finite element method             53
load flow analysis                47
differential evolution            45
contingency analysis              35
network optimization              30
fuzzy logic control               28
fault tree analysis               27
system identification             24
time series 