In [1]:
# Cell 1 - Imports and Setup
import os
import re
import json
import csv
import ast
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import configparser
import tiktoken
import logging
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
import openai

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


True

In [2]:
# cell 2 OpenAI Setup
# %%
class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015
    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens
    def get_stats(self):
        return {"total_tokens": self.total_tokens, "total_cost": round(self.total_cost, 4)}

def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    client = openai.OpenAI(api_key=api_key)
    return client, model_type

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))

client, model_type = initialize_openai()
credit_tracker = CreditTracker()


In [3]:
# Cell 3 Utility Functions & Preprocessing
def extract_keywords_from_filename(filename):
    base = os.path.splitext(os.path.basename(filename))[0]
    parts = base.split('_')
    keywords = [part for i, part in enumerate(parts) if i > 2 and part != 'results' and not part.isdigit()]
    return keywords

def get_custom_stop_words(search_keywords=None):
    stop_words = set(stopwords.words('english'))
    words_to_keep = set()
    if search_keywords:
        for keyword in search_keywords:
            keyword = keyword.lower()
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    stop_words = stop_words - words_to_keep
    scientific_terms = {'et', 'al','ref','reference','references','cited','cite',
        'fig','figure','figures','table','tables','chart','charts',
        'published','journal','conference','proceedings','vol','volume','pp','page','pages','doi'}
    stop_words = stop_words.union(scientific_terms)
    return stop_words

def preprocess_text(text, search_keywords=None, min_word_length=2, remove_numbers=True):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)
    tokens = word_tokenize(text)
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [t for t in tokens if len(t) >= min_word_length and t not in stop_words and len(t) > 1 and not t.isdigit()]
    lemmatizer = WordNetLemmatizer()
    try:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    except:
        pass
    return ' '.join(tokens)

def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    df[text_col] = df[text_col].fillna('').astype(str)
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    return df[df[processed_col].str.strip() != '']


In [4]:
# Cell 4 Data Loading & Cleaning

filename = "semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv"
filepath = os.path.join("Saved_files", filename)
df = pd.read_csv(filepath, sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)

def clean_fields_of_study(s):
    valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics',
        'Medicine','Business','Environmental Science','Chemistry','Materials Science',
        'Geography','Biology','Geology','Political Science','Psychology','Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        return [f if f in valid_fields else "Unknown" for f in fields] or ["Unknown"]
    return ["Unknown"]

df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)


In [None]:
# Cell 5 Candidate Extraction & LLM Filtering
def extract_candidate_terms(df, text_col='processed_text', max_features=20000):
    vectorizer = CountVectorizer(
        ngram_range=(1, 4),
        max_df=0.95,
        min_df=2,
        max_features=max_features,
        token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    return [term for term, freq in sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)]

def get_method_phrases(corpus_terms, client, model_type, credit_tracker):
    import ast
    sample_terms = ', '.join(corpus_terms[:100])
    prompt = f"""Here are the most frequent terms from a corpus of scientific papers:
{sample_terms}

From the full list: {', '.join(corpus_terms)}
Extract ONLY the terms that represent specific methodologies, techniques, or named approaches. Focus on specific computational, statistical, engineering, and reliability analysis methods. Do NOT include generic system names or vague phrases such as "analysis", "Grid Planning", "Risk Assesment" by themselves ; do include things like "monte carlo simulation", "optimal power flow", "genetic algorithm", "fault tree analysis", etc.

Return as a Python list with no code blocks or formatting."""
    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "user", "content": prompt}]
    )
    content = response.choices[0].message.content
    try:
        return ast.literal_eval(content)
    except:
        content = content.replace('[', '').replace(']', '').replace('"', '').replace("'",'')
        return [term.strip() for term in content.split(',') if len(term.strip()) > 3]

def clean_method_phrases_fixed(method_phrases):
    cleaned_phrases = []
    for phrase in method_phrases:
        cleaned = phrase.strip().replace('``````','').replace('[', '').replace(']', '').replace('"', '').replace("'", '').replace('\\n', ' ')
        cleaned = ' '.join(cleaned.split())
        if len(cleaned) > 2:
            cleaned_phrases.append(cleaned.lower())
    return list(set(cleaned_phrases))

candidate_terms = extract_candidate_terms(df, text_col='processed_text')
method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
method_phrases = clean_method_phrases_fixed(method_phrases)
print(f"Number of methods found: {len(method_phrases)}, Top method phrases:{ method_phrases[:15]}")


2025-08-20 12:46:05,484 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Number of methods found: 44, Top method phrases:['confidence interval estimation', 'particle swarm optimization', 'analytical hierarchy process', 'countermeasure analysis', 'pso algorithm', 'control strategy', 'fuzzy logic', 'neural network', 'genetic algorithm', 'contextual bandits', 'model predictive control', 'deep learning', 'signal processing', 'computer simulation', 'multi-agent systems']


In [6]:
# Cell 6 Synonym/Abbreviation Mapping and Standardization
def get_method_abbreviation_dict(method_phrases, client, model_type, credit_tracker, batch_size=100):
    import ast
    results = {}
    for i in range(0, len(method_phrases), batch_size):
        batch = method_phrases[i:i+batch_size]
        prompt = f"""For each of the following phrases, extract ALL common scientific abbreviations, synonyms, and aliases for methods/techniques. Format the response strictly as a Python dictionary.

Methods:
{chr(10).join(batch)}
Only return methods as keys and aliases as list values. No intros, explanations, code blocks, or categories."""
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": "You are a scientific abbreviation expert."},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content.strip()
        start, end = content.find('{'), content.rfind('}')+1
        method_dict = {}
        if start >= 0 and end > start:
            try:
                method_dict = ast.literal_eval(content[start:end])
            except Exception as e:
                logger.warning(f"Failed to parse dictionary from LLM batch: {e}")
        results.update(method_dict)
    logger.info(f"LLM mapped {len(results)} methods to abbreviations/variants.")
    return results

def build_abbr_to_canonical_map(method_dict):
    abbr_map = {}
    for canonical, variants in method_dict.items():
        abbr_map[canonical.lower()] = canonical
        for v in variants:
            abbr_map[v.lower()] = canonical
    return abbr_map

import re
def standardize_methods_in_text(text, abbr_to_canonical):
    sorted_vars = sorted(abbr_to_canonical, key=lambda x: -len(x))
    for var in sorted_vars:
        pattern = r'\b' + re.escape(var) + r'\b'
        text = re.sub(pattern, abbr_to_canonical[var], text, flags=re.IGNORECASE)
    return text

method_dict = get_method_abbreviation_dict(method_phrases, client, model_type, credit_tracker, batch_size=100)
abbr_to_canonical_map = build_abbr_to_canonical_map(method_dict)
df['standardized_text'] = df['processed_text'].apply(lambda t: standardize_methods_in_text(t, abbr_to_canonical_map))
method_vocabulary = sorted(method_dict.keys())


2025-08-20 12:46:11,012 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-20 12:46:11,016 - INFO - LLM mapped 44 methods to abbreviations/variants.


In [7]:
# Cell 7
# A: Build TFIDF Score Matrix

def compute_tfidf_scores(processed_texts, method_phrases, ngram_range=(1, 4), min_df=1, max_df=0.95, norm='l2'):
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=method_phrases, ngram_range=ngram_range,
        min_df=min_df, max_df=max_df, norm=norm
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)
    scores = tfidf_matrix.toarray()
    feature_names = tfidf_vectorizer.get_feature_names_out()
    return scores, feature_names
# 
# B: Build LDA Score Matrix

def compute_lda_scores(processed_texts, method_phrases, ngram_range=(1, 3), n_topics=100, max_iter=20):
    vectorizer = CountVectorizer(
        vocabulary=method_phrases, ngram_range=ngram_range, token_pattern=r'\b[\w-]+\b'
    )
    doc_term_matrix = vectorizer.fit_transform(processed_texts)
    feature_names = vectorizer.get_feature_names_out()
    if n_topics >= 2:
        lda = LatentDirichletAllocation(n_components=n_topics, learning_method='batch', 
                                       random_state=42, max_iter=max_iter)
        lda_matrix = lda.fit_transform(doc_term_matrix)
    else:
        lda_matrix = np.zeros((doc_term_matrix.shape[0], len(method_phrases)))
    return lda_matrix, feature_names
# 
# Compound (partial phrase/proximity) Score Matrix

def compute_compound_scores(df, method_phrases, processed_col='standardized_text', window=80, min_word_len=4):
    n_docs = len(df)
    n_methods = len(method_phrases)
    scores = np.zeros((n_docs, n_methods), dtype=np.float32)
    docs = df[processed_col].fillna('').str.lower().tolist()
    for j, phrase in enumerate(method_phrases):
        phrase_l = phrase.lower()
        words = [w for w in phrase_l.split() if len(w) >= min_word_len]
        for i, text in enumerate(docs):
            if phrase_l in text:
                scores[i, j] = 1.0
            elif len(words) > 1:
                matches = sum(1 for w in words if w in text)
                scores[i, j] = matches / len(words)
    return scores

In [11]:
# Cell 8: Combine Scores

def combine_method_scores(tfidf_scores, lda_scores, compound_scores, weights=(0.4, 0.3, 0.3)):
    # weights = (tfidf, lda, compound)
    return weights[0]*tfidf_scores + weights[1]*lda_scores + weights[2]*compound_scores


In [13]:
# Cell 9: Assign Top-N Methods by Total Score

def assign_top_methods_by_total_score(df, total_scores, method_names, top_n=3, min_score=0.03):
    for rank in range(top_n):
        top_method = []
        top_score = []
        for row in total_scores:
            idxs = np.argsort(row)[::-1]
            nth_idx = idxs[rank] if rank < len(idxs) else None
            if nth_idx is not None and row[nth_idx] >= min_score:
                top_method.append(method_names[nth_idx])
                top_score.append(row[nth_idx])
            else:
                top_method.append("LowConfidence")
                top_score.append(row[nth_idx] if nth_idx is not None else 0.0)
        df[f'Top_{rank+1}_Method'] = top_method
        df[f'Top_{rank+1}_Score'] = top_score
    df['Primary_Method'] = df['Top_1_Method']
    df['Primary_Method_Score'] = df['Top_1_Score']
    # Assign confidence
    conf = []
    for m1, s1, m2, s2, m3, s3 in zip(
        df['Top_1_Method'], df['Top_1_Score'],
        df['Top_2_Method'], df['Top_2_Score'],
        df['Top_3_Method'], df['Top_3_Score']
    ):
        if m1 != "LowConfidence" and s1 > 2 * max(0.05, s2):
            conf.append("super_confident")
        elif m1 != "LowConfidence":
            conf.append("confident")
        else:
            conf.append("low_confidence")
    df['Method_Confidence'] = conf
    return df


In [None]:
# Cell 10: Main Pipeline - Call the Functions

# 1. Compute the three score matrices
tfidf_scores, tfidf_method_names = compute_tfidf_scores(df['standardized_text'], method_phrases, ngram_range=(1, 4))
lda_n_topics = min(len(method_phrases), 100)
lda_scores, lda_method_names = compute_lda_scores(df['standardized_text'], method_phrases, ngram_range=(1, 3), n_topics=lda_n_topics)
# Make sure order matches
assert list(tfidf_method_names) == list(lda_method_names), "Method name order mismatch"

compound_scores = compute_compound_scores(df, tfidf_method_names, processed_col='standardized_text', window=150, min_word_len=4)

# 2. Combine all scores
total_scores = combine_method_scores(tfidf_scores, lda_scores, compound_scores, weights=(0.4, 0.3, 0.3))

# 3. Assign top-N by combined score
df = assign_top_methods_by_total_score(df, total_scores, tfidf_method_names, top_n=3, min_score=0.03)

# 4. Save full score matrix for review if desired
combined_score_df = pd.DataFrame(total_scores, columns=tfidf_method_names, index=df.index)
score_save_path = os.path.join(
    SAVE_DIR, f"semantic_scholar_{datetime.now().strftime('%Y_%m_%d')}_method_all_combined_scores.csv"
)
combined_score_df.to_csv(score_save_path)
print(f"Full combined score matrix for all methods saved to: {score_save_path}")

# 5. Output sample assignments
print(df[['Primary_Method', 'Primary_Method_Score', 'Method_Confidence',
          'Top_1_Method', 'Top_1_Score', 'Top_2_Method', 'Top_2_Score', 'Top_3_Method', 'Top_3_Score']].head())


Full combined score matrix for all methods saved to: Saved_files_new\semantic_scholar_2025_08_20_method_all_combined_scores.csv
                Primary_Method  Primary_Method_Score Method_Confidence  \
0     reliability-based design              0.156818         confident   
1          computer simulation              0.156818         confident   
2     variable frequency drive              0.206818         confident   
3  statistical process control              0.306818         confident   
4  statistical process control              0.206818         confident   

                  Top_1_Method  Top_1_Score                 Top_2_Method  \
0     reliability-based design     0.156818  statistical process control   
1          computer simulation     0.156818          dynamic programming   
2     variable frequency drive     0.206818             load forecasting   
3  statistical process control     0.306818       support vector machine   
4  statistical process control     0.206818    

In [14]:
# Cell 11 Diagnostics Function 
def comprehensive_diagnostics_granular(
    df,
    tfidf_scores,
    method_phrases,
    method_dict=None
):
    n_docs = len(df)
    n_methods = len(method_phrases)
    print("=== DIAGNOSTICS ===")
    print(f"Total docs: {n_docs}, Total unique methods: {n_methods}")
    if method_dict is not None:
        n_variants = sum(1 + len(variants) for variants in method_dict.values())
        print(f"Total method variants (including abbreviations): {n_variants}")

    # Coverage
    tfidf_nonzero = (tfidf_scores > 0).any(axis=1).sum()
    print(f"\nTF-IDF coverage: {tfidf_nonzero}/{n_docs} ({100*tfidf_nonzero/n_docs:.1f}%)")

    # Confidence distribution
    if 'Method_Confidence' in df.columns:
        conf_dist = df['Method_Confidence']
        if isinstance(conf_dist, pd.DataFrame):
            conf_dist = conf_dist.iloc[:, 0]
        print("Confidence stats:")
        print(conf_dist.value_counts())

    if 'Method_Label' in df.columns:
        lbl_dist = df['Method_Label']
        if isinstance(lbl_dist, pd.DataFrame):
            lbl_dist = lbl_dist.iloc[:, 0]
        print("Assigned method stats:")
        print(lbl_dist.value_counts().head(10))

    print("\nSample methods (from canonical vocabulary):")
    print(method_phrases[:10])
    if method_dict is not None:
        print("\nAbbreviation mapping examples:")
        for i, (canonical, variants) in enumerate(list(method_dict.items())[:5]):
            print(f"  {canonical}: {', '.join(variants[:5])}")

# Run diagnostics
comprehensive_diagnostics_granular(df, df['Method_TFIDF_Score'].to_numpy().reshape(-1, 1), method_phrases, method_dict)


KeyError: 'Method_TFIDF_Score'