In [None]:

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.lda_model
from gensim.models import Phrases
import numpy as np
from openai import OpenAI
import configparser 
import tiktoken
from difflib import SequenceMatcher
from datetime import datetime
import json
import ast
import csv
import os
import pickle
import joblib
from bertopic import BERTopic

#from sklearn.model_selection import train_test_split
#from gensim.models import CoherenceModel
#import gensim.corpora as corpora

SAVE_DIR = "Saved_files_new"
os.makedirs(SAVE_DIR, exist_ok=True)

In [24]:
# Download necessary NLTK data and handle stop words
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#Code to extract keywords from a filename and convert them to a specific format
def extract_keywords_from_filename(filename):
    # Remove directory and extension
    base = os.path.splitext(os.path.basename(filename))[0]
    # Split by underscores and remove known prefixes/suffixes
    parts = base.split('_')
    # Assuming your format: semantic_scholar_DATE_KEYWORDS_results
    # Find the keywords part (between date and 'results')
    # This example assumes date is always 3 parts (YYYY_MM_DD)
       
    keywords = []
    for i, part in enumerate(parts):
        if i > 2 and part != 'results':
            keywords.append(part)
    # removing strings that are numbers from the keywords (check that the string is not a digit)
    keywords = [kw for kw in keywords if not kw.isdigit()]
    return keywords

#functin to convert keywords to a filename part
def keywords_to_filename_part(keywords):
    # Lowercase, remove/replace spaces and join with underscores
    return '_'.join([kw.lower().replace(' ', '_') for kw in keywords])

def get_custom_stop_words(search_keywords=None):
    
    # Get standard stopwords
    stop_words = set(stopwords.words('english'))
    
    # Words to keep (search keywords)
    words_to_keep = set()
    if search_keywords:
        # Convert keywords to lowercase and split multi-word terms
        for keyword in search_keywords:
            keyword = keyword.lower()
            # Add both the full phrase and individual words
            words_to_keep.add(keyword)
            for word in keyword.split():
                words_to_keep.add(word)
    
    # Remove search keywords from stopwords
    stop_words = stop_words - words_to_keep
    
    # Scientific paper terms to exclude (add to stopwords)
    scientific_terms = {
        # Citation terms
        'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite',
        # Figure and table references
        'fig', 'figure', 'figures', 'table', 'tables', 'chart', 'charts',
        # Publication terms
        'published', 'journal', 'conference', 'proceedings',
        # Measurement units and numbers
        'vol', 'volume', 'pp', 'page', 'pages', 'doi'
    }
    
    # Add scientific terms to stopwords
    stop_words = stop_words.union(scientific_terms)
    
    return stop_words


[nltk_data] Downloading package punkt to C:\Users\STSI/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STSI/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\STSI/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Some code for AI assistance
# Initialize the LLM API
def initialize_openai():
    config = configparser.ConfigParser()
    config.read('config_LLM.txt')
    api_key = config['LLM'].get('OPENAI_API_KEY')
    model_type = config['LLM'].get('MODEL_TYPE')
    return OpenAI(api_key=api_key), model_type

# Credit tracker to keep track of the cost when using paid API
class CreditTracker:
    def __init__(self):
        self.total_tokens = 0
        self.total_cost = 0
        self.cost_per_1k_tokens = 0.00015

    def update(self, tokens):
        self.total_tokens += tokens
        self.total_cost += (tokens / 1000) * self.cost_per_1k_tokens

    def get_stats(self):
        return {
            "total_tokens": self.total_tokens,
            "total_cost": round(self.total_cost, 4)
        }

credit_tracker = CreditTracker()

def num_tokens_from_string(string: str, model_name: str) -> int:
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [26]:
# Preprocess function
def preprocess_text(text, search_keywords, min_word_length=2, remove_numbers=True):
    """Enhanced text preprocessing with additional options"""
    
    # Input validation
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs and email addresses
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove numbers if specified
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    
    # Remove punctuation but keep hyphens within words
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'--+', ' ', text)  # Replace multiple hyphens with space
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove tokens that are too short
    tokens = [token for token in tokens if len(token) >= min_word_length]
    
    # Remove stop words
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [token for token in tokens if token not in stop_words]
    
    # Remove single characters and purely numeric tokens
    tokens = [token for token in tokens if len(token) > 1 and not token.isdigit()]
    
    # Lemmatization with POS tagging for better accuracy
    lemmatizer = WordNetLemmatizer()
    
    # Simple POS mapping for better lemmatization
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                   "N": nltk.corpus.wordnet.NOUN,
                   "V": nltk.corpus.wordnet.VERB,
                   "R": nltk.corpus.wordnet.ADV}
        return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)
    
    # Lemmatize with POS information
    try:
        tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    except:
        # Fallback to simple lemmatization if POS tagging fails
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove any remaining empty tokens
    tokens = [token for token in tokens if token.strip()]
    
    return ' '.join(tokens)



def detect_phrases(df, text_col='processed_text'):
    """Augment text with detected phrases while keeping original tokens"""
    tokenized_texts = df[text_col].apply(lambda x: x.split()).tolist()

    # Detect bigrams and trigrams
    bigram = Phrases(tokenized_texts, min_count=10, threshold=50, delimiter='_')
    trigram = Phrases(bigram[tokenized_texts], threshold=50, delimiter='_')

    # Combine original tokens with detected phrases
    phrased_texts = []
    for doc in tokenized_texts:
        # Get phrases at different levels
        bigrams_ = [w for w in bigram[doc] if '_' in w]
        trigrams_ = [w for w in trigram[bigram[doc]] if '_' in w]
        combined = doc + bigrams_ + trigrams_
        phrased_texts.append(' '.join(combined))
    
    return phrased_texts



def preprocess_dataframe(df, text_col, search_keywords, processed_col='processed_text'):
    """
    Clean NaN, concatenate, and preprocess text for NLP analysis.
    - df: DataFrame
    - text_col: column with raw text (e.g., 'text' or 'abstract')
    - search_keywords: list of keywords to keep
    - processed_col: name for processed text column
    Returns: DataFrame with new column of processed text.
    """
    # 1. Fill NaN with empty string
    df[text_col] = df[text_col].fillna('')
    # 2. Convert everything to string (handles numbers, etc.)
    df[text_col] = df[text_col].astype(str)
    # 3. Apply your advanced preprocessing function
    df[processed_col] = df[text_col].apply(lambda x: preprocess_text(x, search_keywords))
    # 4. Remove rows where processed text is empty after cleaning
    df = df[df[processed_col].str.strip() != '']
    return df


"""
def preprocess_text(text, search_keywords):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s-]', '', text)
    tokens = word_tokenize(text) # Split text into individual words/tokens using NLTK's tokenizer
    stop_words = get_custom_stop_words(search_keywords)
    tokens = [token for token in tokens if token not in stop_words] # Remove all stop words from our tokens using list comprehension
    lemmatizer = WordNetLemmatizer() # Initialize WordNet lemmatizer to reduce words to their base/dictionary form
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # Convert each token to its lemma (e.g., "systems" → "system", "running" → "run")
    return ' '.join(tokens) # Combine all processed tokens back into a single string with spaces between words
"""

# Function to convert string representation of list to actual list, replacing long strings with "Unknown"
def string_to_list(s):
    if isinstance(s, str):
        # Remove brackets and split by comma
        fields = [field.strip().strip("'") for field in s.strip('[]').split(',')]
        # Replace fields wit more than 100 characters with "Unknown"
        return ["Unknown" if len(field) > 100 else field for field in fields]
    return ["Unknown"]  # Return ["Unknown"] for empty or non-string entries

# Clean fields of study
def clean_fields_of_study(s):
    valid_fields= ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics', 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        cleaned_fields = []
        for field in fields:
            if field in valid_fields:
                cleaned_fields.append(field)
            else:
                cleaned_fields.append("Unknown")
        return cleaned_fields if cleaned_fields else ["Unknown"]
    return ["Unknown"]

  """


## Simple analysis to extract keyword and n-gram frequency

In [27]:
def get_term_frequencies(vectorizer, texts):
    matrix = vectorizer.fit_transform(texts)
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    return dict(sorted(zip(terms, freqs.tolist()), key=lambda x: x[1], reverse=True))

def extract_keywords_and_ngrams(df, max_features=1000):
    # Create vectorizers
    keyword_vectorizer = CountVectorizer(
        max_df=0.95, 
        min_df=2,
        #stop_words='english',# can be removed as stopwords are already removed from processed_text
        max_features=max_features,
        token_pattern=r'(?u)\b[A-Za-z][A-Za-z-]+[A-Za-z]\b'
    )
    
    bigram_vectorizer = CountVectorizer(
        ngram_range=(2,2),
        max_df=0.95,
        min_df=2,
        #stop_words='english',# can be removed as stopwords are already removed from processed_text
        max_features=max_features
    )
    
    trigram_vectorizer = CountVectorizer(
        ngram_range=(3,3),
        max_df=0.95,
        min_df=2,
        max_features=max_features,
        #stop_words='english' # can be removed as stopwords are already removed from processed_text
    )
    
    # Extract frequencies
    keyword_freq = get_term_frequencies(keyword_vectorizer, df['processed_text'])
    bigram_freq = get_term_frequencies(bigram_vectorizer, df['processed_text'])
    trigram_freq = get_term_frequencies(trigram_vectorizer, df['processed_text'])
    
    # Save results
    current_date = datetime.now().strftime("%Y_%m_%d")
    filename=os.path.join(SAVE_DIR,f'term_frequencies_{current_date}.json')
    results = {
        'keywords': keyword_freq,
        'bigrams': bigram_freq,
        'trigrams': trigram_freq
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Results saved to {filename}")

## More comprehensive analyzis with LDA and topic naming

In [None]:

def extract_topic_keywords(lda_model, feature_names, num_words=10):
    # feature_names is already an array, do NOT call .get_feature_names_out()
    topic_keywords = {}
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-num_words-1:-1]
        top_words = [feature_names[i] for i in top_indices]
        word_weights = [(feature_names[i], topic[i]) for i in top_indices]
        topic_keywords[topic_idx] = {
            'top_words': top_words,
            'word_weights': word_weights
        }
    return topic_keywords



# Model topics function
def model_topics_by_field(df, field, num_topics=10, num_words=5):
    df_field = df[df['fieldsOfStudy'].apply(lambda x: field in x)] # filtering so that only the documents within the field is keept.
    print (f"Analyzing {len(df_field)} papers")
    if df_field.empty:
            print(f"No papers found for field: {field}")            #warning if no papers found
            return None, None, None, None, None
    text_data = df_field['abstract'].fillna('')                     # filtering to avoid errors due to missing fields
    
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english') # vectorizing with max_df = 0.95 as default (terms that appear in more than 95% of documents) and min_df =2 (terms that appear in less than 2 of the documents). Values below 1 indicates percentile, and values above 1 indicates number of docs.
    doc_term_matrix = vectorizer.fit_transform(text_data) #creating the term matrix:  vectorizer : sklearn.feature_extraction.text.(CountVectorizer, TfIdfVectorizer).vectorizer used to convert raw documents to document-term matrix (`dtm`)
    
    #Using the sklearn decomposition LatentDirchletAllocation (see https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html). 
    #Fit the LDA model
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42) 
    #Randomstate 42 is passed to maintain reproducibility in results (0 and 42 are commonly used values. Default "None" will result in using globa radnom instande (numpy.radom) and re-running may produce different results
    
    topic_distribution = lda_model.fit_transform(doc_term_matrix)
    topic_keywords=extract_topic_keywords(lda_model, vectorizer)
    
    #feature_names = vectorizer.get_feature_names_out()
   
    for topic_idx, keywords in topic_keywords.items():
        print(f"\nTopic {topic_idx + 1}:")
        # Format each word with its weight in parentheses
        formatted_words = [f"{word} ({weight:.2f})" 
                         for word, weight in keywords['word_weights']]
        print(", ".join(formatted_words)) 
    return lda_model, vectorizer, topic_distribution, df_field, topic_keywords

def tune_topic_number(df, min_topics=2, max_topics=20, step=1, use_coherence=True):
    text_data = df['abstract'].fillna('')
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(text_data)
    
    # Split data for perplexity calculation
    train_data, test_data = train_test_split(doc_term_matrix, test_size=0.3, random_state=42)
    
    best_score = float('inf')
    best_num_topics = min_topics
    scores = []
    
    for num_topics in range(min_topics, max_topics + 1, step):
        lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        lda_model.fit(train_data)
        
        perplexity = lda_model.perplexity(test_data)
        
        if use_coherence:
            # Prepare data for coherence calculation
            id2word = {i: word for i, word in enumerate(vectorizer.get_feature_names_out())}
            corpus = [id2word.doc2bow(doc.split()) for doc in text_data]
            
            coherence_model = CoherenceModel(model=lda_model, texts=text_data, dictionary=id2word, coherence='c_v')
            coherence = coherence_model.get_coherence()
            
            # Combine perplexity and coherence
            score = perplexity / coherence
        else:
            score = perplexity
        
        scores.append((num_topics, score))
        
        if score < best_score:
            best_score = score
            best_num_topics = num_topics
    
    return best_num_topics, scores 

def get_top_tfidf_ngrams_per_topic(df, tfidf_matrix, feature_names, topic_col='Primary_Topic_Index', top_k=10):
    tfidf_ngrams = {}
    for topic_idx in df[topic_col].dropna().unique():
        topic_idx = int(topic_idx)
        doc_indices = df[df[topic_col] == topic_idx].index
        if len(doc_indices) == 0:
            continue
        topic_tfidf = np.asarray(tfidf_matrix[doc_indices].mean(axis=0)).ravel()
        top_indices = topic_tfidf.argsort()[-top_k:][::-1]
        top_terms = [(feature_names[i], topic_tfidf[i]) for i in top_indices if topic_tfidf[i] > 0]
        tfidf_ngrams[topic_idx] = top_terms
    return tfidf_ngrams

def model_topics(df, num_topics=10, num_words=100):
    """LDA analysis with unigrams + phrases"""
    phrased_texts = detect_phrases(df)
    
    vectorizer = CountVectorizer(
        ngram_range=(1, 1),  # We handle n-grams via phrase detection
        token_pattern=r'\b[\w_-]+\b',  # Allow underscores
        max_df=0.95,
        min_df=2,
        max_features=10000
    )
    
    doc_term_matrix = vectorizer.fit_transform(phrased_texts)
    
    logger.info(f"Fitting LDA model with {num_topics} topics")
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distributions = lda_model.fit_transform(doc_term_matrix)
    logger.info(f"LDA model with {num_topics} topics fitted")
    # Extract keywords (now includes phrases)
    logger.info(f"Extracting topic keywords with {num_words} words per topic")
    feature_names = vectorizer.get_feature_names_out()
    topic_keywords = extract_topic_keywords(lda_model, feature_names, num_words)
    logger.info(f"Topic keywords extracted")
    return lda_model, vectorizer, topic_distributions, df, topic_keywords

from bertopic import BERTopic
# did not work as well as well as hoped, so using a simpler method instead
"""def model_topics_guided(df, num_topics=10, seed_topics=None):
    #Guided topic modeling with BERTopic using seed words/phrases.
    # 1. Prepare seed topics format for BERTopic
    if seed_topics is None:
        seed_topics = [
            ["deep_learning", "neural_network"],  # Topic 0 seeds
            ["power_system", "reliability"]       # Topic 1 seeds
        ]
    
    # 2. Initialize BERTopic with seed topics
    topic_model = BERTopic(
        seed_topic_list=seed_topics,
        n_gram_range=(1, 3),      # Auto-detect uni/bi/trigrams
        min_topic_size=15,        # Adjust based on your dataset
        verbose=True
    )
    
    # 3. Get processed texts (assuming 'processed_text' exists in df)
    processed_texts = df['processed_text'].tolist()
    
    # 4. Fit model and get predictions
    topics, probs = topic_model.fit_transform(processed_texts)
    
    # 5. Get topic keywords (BERTopic's default representation)
    topic_info = topic_model.get_topic_info()
    
    # 6. Format output to match your existing structure
    topic_keywords = {}
    for topic_id in topic_info['Topic'].unique():
        if topic_id != -1:  # Skip outlier topic
            words_weights = topic_model.get_topic(topic_id)
            topic_keywords[topic_id] = {
                'top_words': [word for word, _ in words_weights],
                'word_weights': words_weights
            }

    return topic_model, topics, probs, df, topic_keywords


def model_topics_guided_bertopic(df, num_topics=10, seed_topics=None):
    #Guided topic modeling with BERTopic using seed words/phrases.
    logger.info("🔄 Starting BERTopic guided topic modeling...")
    if seed_topics is None:
        seed_topics = [
            ["deep_learning", "neural_network"],
            ["power_system", "reliability"]
        ]
    processed_texts = df['processed_text'].tolist()
    topic_model = BERTopic(
        seed_topic_list=seed_topics,
        n_gram_range=(1, 3),
        min_topic_size=15,
        verbose=True
    )
    topics, probs = topic_model.fit_transform(processed_texts)
    logger.info("✓ BERTopic guided topic modeling completed")
    topic_keywords = {}
    for topic_id in set(topics):
        if topic_id == -1:
            continue
        words_weights = topic_model.get_topic(topic_id)
        topic_keywords[topic_id] = {
            'top_words': [word for word, _ in words_weights],
            'word_weights': words_weights
        }
    return topic_model, topics, probs, df, topic_keywords

from bertopic import BERTopic
"""

def classify_methods_tfidv2(df, method_phrases):
    """Improved TF-IDF method classification with better scoring"""
    vectorizer = TfidfVectorizer(
        vocabulary=method_phrases,
        token_pattern=r'\b[\w-]+\b',
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.7,
        sublinear_tf=True,
        use_idf=True,  # Ensure IDF is used
        smooth_idf=True  # Smooth IDF weights
    )
    
    tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
    df = df.copy()
    
    # Get method scores
    df['Method_Score'] = tfidf_matrix.max(axis=1).toarray().ravel()
    
    # Handle argmax correctly
    if tfidf_matrix.shape[1] == 1:
        df['Primary_Method'] = vectorizer.get_feature_names_out()[0]
    else:
        argmax_indices = tfidf_matrix.argmax(axis=1).A1
        feature_names = vectorizer.get_feature_names_out()
        df['Primary_Method'] = [feature_names[i] for i in argmax_indices]
    
    # Add threshold-based classification
    threshold = df['Method_Score'].quantile(0.25)  # Use 25th percentile as threshold
    df['Method_Detected'] = df['Primary_Method'].where(
        df['Method_Score'] > threshold, 'Low_Confidence'
    )
    
    return df



def hybrid_method_scoring_fixed(df, lda_model, tfidf_matrix, method_phrases):
    """Fixed hybrid scoring with error handling"""
    n_docs = len(df)
    if lda_model is None:
        topic_dist = np.random.dirichlet([1] * 5, n_docs)
    else:
        try:
            topic_dist = lda_model.transform(tfidf_matrix)
        except:
            topic_dist = np.random.dirichlet([1] * 5, n_docs)
    
    df = df.copy()
    df['Hybrid_Score'] = 0
    
    for idx, row in df.iterrows():
        method_score = row.get('Method_Score', 0)
        topic_weight = topic_dist[idx].max() if idx < len(topic_dist) else 0.5
        df.at[idx, 'Hybrid_Score'] = method_score * (1 + 0.5 * topic_weight)
    
    threshold = df['Hybrid_Score'].quantile(0.6)
    df['Method_Detected'] = df['Primary_Method'].where(df['Hybrid_Score'] > threshold, 'Other')
    
    return df

def classify_methods_tfidf(df, method_phrases, text_col='processed_text', threshold=0.01):
    """Classify methods using TF-IDF scores only."""
    # Create TF-IDF vectorizer with method phrases as vocabulary
    vectorizer = TfidfVectorizer(
        vocabulary=method_phrases,
        lowercase=True,
        token_pattern=r'\b[\w-]+\b'
    )
    
    # Transform documents
    tfidf_matrix = vectorizer.fit_transform(df[text_col])
    feature_names = vectorizer.get_feature_names_out()
    
    # Get scores and assign methods
    df = df.copy()
    df['Primary_Method'] = np.nan
    df['Method_Score'] = 0.0
    
    for i in range(len(df)):
        scores = tfidf_matrix[i].toarray().flatten()
        if scores.max() > threshold:
            top_idx = np.argmax(scores)
            df.at[i, 'Primary_Method'] = feature_names[top_idx]
            df.at[i, 'Method_Score'] = scores[top_idx]
    
    logger.info(f"Assigned methods to {df['Primary_Method'].notna().sum()} documents")
    return df


def classify_methods_tfidf_lda(df, method_phrases, lda_vectorizer, lda_model=None, 
                              text_col='processed_text', topic_weight=0.3, 
                              method_threshold=0.001, top_n=1):
    """
    Classify methods using existing LDA vectorizer and optional LDA weighting.
    """
    # Clean phrases and find those in LDA vocabulary
    method_phrases = clean_method_phrases(method_phrases)
    vocab = lda_vectorizer.vocabulary_
    present_methods = [m for m in method_phrases if m in vocab]
    
    if not present_methods:
        logger.warning("No method phrases found in LDA vocabulary.")
        df['Primary_Method'] = np.nan
        return df
    
    # Transform documents using LDA vectorizer
    tfidf_matrix = lda_vectorizer.transform(df[text_col])
    feature_names = lda_vectorizer.get_feature_names_out()
    
    # Get TF-IDF scores for method phrases
    indices = [vocab[m] for m in present_methods]
    tfidf_scores = tfidf_matrix[:, indices].toarray()
    
    # Optional LDA weighting
    if lda_model:
        lda_scores = lda_model.transform(tfidf_matrix)
    else:
        lda_scores = np.zeros((len(df), 1))
    
    # Combine scores
    combined_scores = tfidf_scores * (1 + topic_weight * lda_scores.max(axis=1, keepdims=True))
    
    # Assign methods
    df = df.copy()
    df['Primary_Method'] = np.nan
    for i in range(len(df)):
        scores = combined_scores[i]
        if scores.max() < method_threshold:
            continue
        top_idx = np.argmax(scores)
        df.at[i, 'Primary_Method'] = present_methods[top_idx]
    
    return df


def classify_methods_tfidf_lda_OLD(df, method_phrases, lda_model=None, vectorizer=None, 
                              text_col='processed_text', topic_weight=0.3, 
                              method_threshold=0.01, top_n=1, max_ngram=5):
    # Lowercase and clean method phrases
    method_phrases = [phrase.strip('"').strip("'").lower() for phrase in method_phrases]
    method_phrases = list(set(method_phrases))
    
    # Build TF-IDF vectorizer with n-grams
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, max_ngram),
        min_df=1,
        max_df=1.0,
        token_pattern=r'\b[\w-]+\b'
    )
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_col])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    print("TF-IDF vocabulary sample:", list(tfidf_vectorizer.vocabulary_.keys())[:30])

    # Find which method phrases are present as n-grams
    #present_methods = [m for m in method_phrases if m in feature_names]
    present_methods = [m for m in method_phrases if m in tfidf_vectorizer.vocabulary_]
    print(f"Method phrases present in TF-IDF vocabulary: {present_methods}")
    print(f"Total present: {len(present_methods)} / {len(method_phrases)}")

    if not present_methods:
        print("No method phrases found in text after n-gram extraction.")
        df = df.copy()
        df['Primary_Method'] = np.nan
        df['Method_Score'] = np.nan
        df['Top_Methods'] = [[] for _ in range(len(df))]
        return df
    
    # Restrict to method phrases present in vocabulary
    indices = [list(feature_names).index(m) for m in present_methods]
    tfidf_matrix_methods = tfidf_matrix[:, indices]
    
    # LDA part (optional, as before)
    if lda_model and vectorizer:
        lda_matrix = lda_model.transform(vectorizer.transform(df[text_col]))
    else:
        lda_matrix = np.zeros((len(df), 1))
    
    # Combine scores
    combined_scores = []
    for i in range(len(df)):
        doc_tfidf = tfidf_matrix_methods[i].toarray().flatten()
        doc_lda = lda_matrix[i]
        weighted = doc_tfidf * (1 + topic_weight * doc_lda.max())
        combined_scores.append(weighted)
    
    df = df.copy()
    df['Primary_Method'] = np.nan
    df['Method_Score'] = np.nan
    df['Top_Methods'] = [[] for _ in range(len(df))]
    for i, scores in enumerate(combined_scores):
        if len(scores) == 0:
            continue
        top_indices = np.argsort(scores)[-top_n:][::-1]
        valid_indices = [idx for idx in top_indices if scores[idx] > method_threshold]
        if valid_indices:
            methods = [present_methods[idx] for idx in valid_indices]
            df.at[i, 'Primary_Method'] = methods[0]
            df.at[i, 'Method_Score'] = scores[valid_indices[0]]
            df.at[i, 'Top_Methods'] = methods
    return df


def run_guided_lda(df, method_phrases, num_topics=15):
    """Guided topic modeling with BERTopic using seed phrases."""
    # Prepare seed topics (one seed list per topic)
    seed_topic_list = [[phrase] for phrase in method_phrases[:num_topics]]
    
    # Initialize BERTopic with seed topics
    topic_model = BERTopic(
        seed_topic_list=seed_topic_list,
        verbose=True
    )
    
    # Fit on documents (use the 'processed_text' column)
    topics, probs = topic_model.fit_transform(df['processed_text'].tolist())
    
    return topic_model, topics, probs


def run_guided_lda_(df, method_phrases, num_topics=15):
    """Seeded LDA focusing on method-related phrases"""
    from guidedlda import GuidedLDA

    phrased_texts = detect_phrases(df)
    vectorizer = CountVectorizer(
        token_pattern=r'\b[\w_-]+\b',
        max_features=10000
    )
    doc_term_matrix = vectorizer.fit_transform(phrased_texts)

    # Convert method phrases to word IDs
    seed_word_ids = {}
    valid_phrases = [p.replace(' ', '_') for p in method_phrases if p.replace(' ', '_') in vectorizer.vocabulary_]
    
    for idx, phrase in enumerate(valid_phrases[:num_topics]):
        seed_word_ids[idx] = [vectorizer.vocabulary_[phrase]]
    
    topic_model = BERTopic(seed_topic_list=seed_topic_list)
    topics, probs = topic_model.fit_transform(docs)  # docs = list of text
    model = GuidedLDA(n_topics=num_topics, n_iter=100)
    model.fit(doc_term_matrix, seed_word_ids=seed_word_ids)
    
    return model, vectorizer


# Generate topic name function
def generate_topic_name(text, keywords, client, model_type, credit_tracker):
    try:
        tokens = num_tokens_from_string(text + " " + " ".join(keywords), model_type)
        credit_tracker.update(tokens)
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": "You are a helpful scientific assistant that generates concise topic expressions based on academic paper titles and keywords for a collection of papers."},
                {"role": "user", "content": f"""Based on the following titles and keywords obtained by LDA-analysis, provide a concise single word, bigram, or trigram that best describes the main topic these abstracts have in common. The expression should be specific and descriptive and feasible for categorization.

Title:
{text}

Keywords:
{', '.join(keywords)}

Concise topic expression:"""}
            ]
        )
        content = response.choices[0].message.content.strip()
        credit_tracker.update(num_tokens_from_string(content, model_type))
        return content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def generate_topic_name_advanced(lda_keywords, tfidf_ngrams, top_titles, client, model_type, credit_tracker, search_keywords=None):
    #if search_keywords is None:
    #    search_keywords='scientific-articles'
    prompt = f"""Based on the following keywords and n-grams from LDA and TF-IDF analysis and the titles of the most cited papers with this topic as dominant, provide a concise, specific, and descriptive topic name (preferably a bigram or trigram, but a single word is allowed if most fitting):

LDA keywords and n-grams:
{', '.join(lda_keywords)}

TF-IDF n-grams:
{', '.join(tfidf_ngrams)}

Most cited paper titles:
{'; '.join(top_titles)}


Concise topic name:"""
    try:
        tokens = num_tokens_from_string(prompt, model_type)
        credit_tracker.update(tokens)
        response = client.chat.completions.create(
            model=model_type,
            messages=[
                {"role": "system", "content": "You are a helpful scientific assistant that generates concise topic names based on keywords and paper titles."},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content.strip()
        credit_tracker.update(num_tokens_from_string(content, model_type))
        return content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def extract_candidate_terms(df, text_col='processed_text', max_features=100):
    """Extract candidate keywords and n-grams from processed text for LLM prompt testing"""
    vectorizer = CountVectorizer(
        ngram_range=(1, 3),
        max_df=0.95,
        min_df=2,
        max_features=max_features,
        token_pattern=r'\b[\w-]+\b'
    )
    matrix = vectorizer.fit_transform(df[text_col].fillna(''))
    terms = vectorizer.get_feature_names_out()
    freqs = matrix.sum(axis=0).A1
    # Sort terms by frequency descending
    sorted_terms = sorted(zip(terms, freqs), key=lambda x: x[1], reverse=True)
    # Return just the terms
    return [term for term, freq in sorted_terms]

def get_method_phrases(corpus_terms, client, model_type, credit_tracker):
    """Improved prompt that considers the actual domain of your corpus"""
    
    # Analyze the candidate terms to understand the domain
    sample_terms = ', '.join(corpus_terms[:50])
    
    prompt = f"""Here are the most frequent terms from a corpus of scientific papers:
{sample_terms}

Based on these terms, this appears to be a corpus focused on power systems, electrical engineering, and reliability analysis.

From the full list of terms: {', '.join(corpus_terms)}

Extract ONLY the terms that represent specific methodologies, techniques, or named approaches that would actually appear in this type of engineering research. Focus on:
- Power system analysis methods
- Reliability analysis techniques  
- Engineering design approaches
- Computational methods used in power/electrical engineering
- Statistical methods for engineering

Do NOT include: generic words like "analysis", "method", "approach", "design", "system" by themselves.
DO include: specific named methods like "monte carlo simulation", "load flow analysis", "reliability assessment", loss of load probability, probabilitstic methods, etc.

Return as a simple Python list of strings, no code blocks or formatting."""

    response = client.chat.completions.create(
        model=model_type,
        messages=[{"role": "user", "content": prompt}]
    )
    
    try:
        return ast.literal_eval(response.choices[0].message.content)
    except:
        # Fallback parsing
        content = response.choices[0].message.content
        content = content.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
        return [term.strip() for term in content.split(',') if len(term.strip()) > 3]


import re

def clean_method_phrases_fixed(method_phrases):
    """Clean and validate method phrases from LLM output"""
    cleaned_phrases = []
    
    for phrase in method_phrases:
        # Remove code block markers, quotes, and extra whitespace
        cleaned = phrase.strip()
        cleaned = cleaned.replace('```python', '').replace('```','')
        cleaned = cleaned.replace('[', '').replace(']', '')
        cleaned = cleaned.replace('"', '').replace("'", '')
        cleaned = cleaned.replace('\n', ' ')
        cleaned = ' '.join(cleaned.split())  # Remove extra whitespace
        
        # Skip empty or very short phrases
        if len(cleaned) > 2:
            cleaned_phrases.append(cleaned.lower())
    
    return list(set(cleaned_phrases))


def validate_method_phrases_improved_fixed(df, method_phrases):
    """Improved validation that handles multi-word phrases"""
    all_text = ' '.join(df['processed_text']).lower()
    matched_phrases = []
    
    for phrase in method_phrases:
        phrase_clean = phrase.lower().strip()
        
        # Check for exact phrase match
        if phrase_clean in all_text:
            matched_phrases.append(phrase)
        # Check for partial word matches (for compound terms)
        elif any(word in all_text for word in phrase_clean.split() if len(word) > 3):
            matched_phrases.append(phrase)
    
    return matched_phrases

def targeted_method_classification(df, sample_size=None):
    """More targeted method classification with specific phrase matching"""
    
    if sample_size:
        df_test = df.head(sample_size).copy()
    else:
        df_test = df.copy()
    
    logger.info(f"Processing {len(df_test)} documents with targeted criteria")
    
    # Get method phrases (your existing LLM approach)
    candidate_terms = extract_candidate_terms(df_test, max_features=2000)
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
    method_phrases = clean_method_phrases_fixed(method_phrases)
    
    logger.info(f"Original method phrases: {method_phrases}")
    

    # Validate with stricter criteria
    all_text = ' '.join(df_test['processed_text']).lower()
    validated_phrases = []
    phrase_types = []
    
    for phrase in method_phrases:
        is_valid, match_type = strict_phrase_validation(phrase, all_text)
        if is_valid:
            validated_phrases.append(phrase)
            phrase_types.append(match_type)
            logger.info(f"  ✓ {match_type.upper()}: '{phrase}'")
        else:
            logger.info(f"  ✗ REJECTED: '{phrase}'")
    
    logger.info(f"Validated {len(validated_phrases)} phrases with strict matching")
    
    
    
    # Apply targeted selection
    classification_data = []
    for idx, row in df_test.iterrows():
        has_method, found_phrases = targeted_contains_method(row['processed_text'])
        classification_data.append({
            'index': idx,
            'has_method': has_method,
            'found_phrases': found_phrases,
            'phrase_count': len(found_phrases)
        })
    
    # Select documents for classification
    docs_to_classify_indices = [d['index'] for d in classification_data if d['has_method']]
    docs_to_classify = df_test.loc[docs_to_classify_indices].copy()
    
    logger.info(f"Documents to classify: {len(docs_to_classify)} ({len(docs_to_classify)/len(df_test)*100:.1f}%)")
    
    # Show phrase distribution
    phrase_counts = {}
    for d in classification_data:
        for phrase in d['found_phrases']:
            phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
    
    logger.info("Phrase distribution in selected documents:")
    for phrase, count in sorted(phrase_counts.items(), key=lambda x: x[1], reverse=True):
        logger.info(f"  {phrase}: {count} documents")
    
    if len(docs_to_classify) > 0 and len(validated_phrases) > 0:
        # PRECISE TF-IDF CLASSIFICATION
        tfidf_vectorizer = TfidfVectorizer(
            vocabulary=validated_phrases,
            ngram_range=(1, 3),     # Focused n-gram range
            min_df=1,
            max_df=0.95,            # Slightly stricter than before
            sublinear_tf=True,
            norm='l2'
        )
        
        tfidf_matrix = tfidf_vectorizer.fit_transform(docs_to_classify['processed_text'])
        method_scores = tfidf_matrix.max(axis=1).toarray().ravel()
        
        # Handle classification
        if tfidf_matrix.shape[1] == 1:
            primary_methods = [tfidf_vectorizer.get_feature_names_out()[0]] * len(docs_to_classify)
        else:
            argmax_indices = tfidf_matrix.argmax(axis=1).A1
            feature_names = tfidf_vectorizer.get_feature_names_out()
            primary_methods = [feature_names[i] for i in argmax_indices]
        
        # Apply minimum score threshold to avoid weak assignments
        min_threshold = 0.05  # Require meaningful TF-IDF score
        final_methods = []
        for method, score in zip(primary_methods, method_scores):
            if score >= min_threshold:
                final_methods.append(method)
            else:
                final_methods.append('Low_Confidence_Method')
        
        # Assign results
        docs_to_classify['Primary_Method'] = primary_methods
        docs_to_classify['Method_Score'] = method_scores
        docs_to_classify['Method_Detected'] = final_methods
        
        # Merge back to full dataset
        df_result = df_test.copy()
        df_result['Method_Detected'] = 'No_Method_Found'
        df_result['Method_Score'] = 0.0
        
        df_result.loc[docs_to_classify.index, 'Primary_Method'] = primary_methods
        df_result.loc[docs_to_classify.index, 'Method_Score'] = method_scores
        df_result.loc[docs_to_classify.index, 'Method_Detected'] = final_methods
        
        # Results
        final_counts = df_result['Method_Detected'].value_counts()
        logger.info("=== TARGETED CLASSIFICATION RESULTS ===")
        logger.info(f"Final classification distribution:\n{final_counts}")
        
        # Show score statistics for classified documents
        classified_scores = df_result[df_result['Method_Detected'] != 'No_Method_Found']['Method_Score']
        if len(classified_scores) > 0:
            logger.info(f"Score statistics for classified documents:\n{classified_scores.describe()}")
        
        return df_result
    
    else:
        logger.warning("No documents selected for classification")
        df_test['Method_Detected'] = 'No_Methods_Available'
        return df_test



# STRICTER VALIDATION: Require more specific matching
def strict_phrase_validation(phrase, all_text):
    phrase_lower = phrase.lower()
    
    # Exact phrase match (highest confidence)
    if phrase_lower in all_text:
        return True, 'exact'
        
    # For compound phrases, require most words to be present
    words = phrase_lower.split()
    if len(words) > 1:
        word_matches = sum(1 for word in words if len(word) > 3 and word in all_text)
        # Require at least 60% of significant words
        if word_matches >= len([w for w in words if len(w) > 3]) * 0.6:
            return True, 'compound'
    
    # For single technical terms, be more selective
    if len(words) == 1 and len(phrase_lower) > 6:
        # Only include if it's a specific technical term, not generic
        generic_terms = ['analysis', 'method', 'approach', 'technique', 'system', 
                        'design', 'study', 'evaluation', 'assessment', 'processing']
        if phrase_lower not in generic_terms and phrase_lower in all_text:
            return True, 'technical'
    
    return False, 'none'

    
# TARGETED DOCUMENT SELECTION
def targeted_contains_method(text):
    text_lower = text.lower()
    
    # Exact phrase matches only
    exact_matches = [phrase for phrase in validated_phrases if phrase.lower() in text_lower]
    
    if exact_matches:
        return True, exact_matches
    
    # For compound phrases, check if key technical words appear together
    for phrase in validated_phrases:
        words = phrase.split()
        if len(words) > 1:
            # Check if words appear within reasonable proximity (within 100 characters)
            for i, word in enumerate(words[:-1]):
                if word in text_lower:
                    word_pos = text_lower.find(word)
                    next_word = words[i+1]
                    nearby_text = text_lower[word_pos:word_pos+150]
                    if next_word in nearby_text:
                        return True, [phrase]
    
    return False, []

def string_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

def generate_topic_name_multiple(text, keywords, client, model_type, credit_tracker, initial_iterations=3, max_iterations=10, similarity_threshold=0.7):
    iterations = initial_iterations
    while iterations <= max_iterations:
        generated_names = []
        for _ in range(iterations):
            # Reuse the generate_topic_name function instead of duplicating code
            name = generate_topic_name(text, keywords, client, model_type, credit_tracker)
            if name:
                generated_names.append(name)
        
        # Check for dominant topic
        for i, name in enumerate(generated_names):
            similar_names = [other_name for j, other_name in enumerate(generated_names) 
                             if i != j and string_similarity(name, other_name) >= similarity_threshold]
            if len(similar_names) >= len(generated_names) // 2:
                return name  # Return the dominant topic name

        # If no dominant topic found, increase iterations
        iterations += 2
        print(f"No clear common topic name found. Increasing iterations to {iterations}.")

    # If max iterations reached without finding a dominant topic, return the most common name
    from collections import Counter
    return Counter(generated_names).most_common(1)[0][0]

def generate_topic_name_multiple_advanced(lda_keywords, tfidf_ngrams, top_titles, client, model_type, credit_tracker,search_keywords=None, initial_iterations=3, max_iterations=10, similarity_threshold=0.7):
    iterations = initial_iterations
    while iterations <= max_iterations:
        generated_names = []
        for _ in range(iterations):
            name = generate_topic_name_advanced(lda_keywords, tfidf_ngrams, top_titles, client, model_type, credit_tracker, search_keywords=search_keywords)
            if name:
                generated_names.append(name)
        # Majority vote logic as before...
        for i, name in enumerate(generated_names):
            similar_names = [other_name for j, other_name in enumerate(generated_names) 
                             if i != j and string_similarity(name, other_name) >= similarity_threshold]
            if len(similar_names) >= len(generated_names) // 2:
                return name
        iterations += 2
        print(f"No clear common topic name found. Increasing iterations to {iterations}.")
    from collections import Counter
    return Counter(generated_names).most_common(1)[0][0]


def classify_papers(topic_distributions, df_field):
    """Classify papers based on topic distributions"""
    paper_classifications = []
    
    for idx, dist in enumerate(topic_distributions): #looping throug the papers one-by-one (index by index) and topics distribution 
        # Get primary and secondary topics (only primary is used in dominance ratio)
        top_2_topics = np.argsort(dist)[-2:][::-1] # extracting the to last [-2:] the order is reversed to highest first with the [::-1]
        
        # Calculate dominance ratios by :
        primary_score = dist[top_2_topics[0]] # 1) extracting the probability of the primary topic
        other_topics_sum = sum(dist) - primary_score # 2) calc. the sum of all the other topics (all topic-primary)
        dominance_ratio = primary_score / (other_topics_sum + 1e-10) #3) dividing the primary score by the sum (adding very small number to avoid dividing by zero if only one topic)
        
        # Storing the paper classifications
        paper_classifications.append({
            'paper_idx': idx,                   #storing the paper index
            'primary_topic': top_2_topics[0],   # the most probable topic
            'secondary_topic': top_2_topics[1], # second most probable topic
            'primary_score': primary_score,     # the probability of the most probable topic 
            'dominance_ratio': dominance_ratio  # the domincane ratio (eg. is the paper only about this, or also covering other topics?)
        })
    
    return paper_classifications

# Get top papers per topic per field

def get_top_papers(paper_classifications, df_field, n_top=5):
    top_papers = {}
    author_topic_stats = {}
    
    # Debug print
    #print(f"Total papers to analyze: {len(paper_classifications)}")
    
    for topic in set(p['primary_topic'] for p in paper_classifications): # creating a set of the unique primary topics in the datasett, e.g. if there are only three primary topics and all papers "belong" to one of them, the set will be {Primary topic 1, Primary topic 2, Primary topic 3}
        print(f"\nProcessing topic {topic}")
        topic_papers = [p for p in paper_classifications if p['primary_topic'] == topic]  # Get all the papers for the current topic
        print(f"Papers for topic {topic}: {len(topic_papers)}")
        topic_papers.sort(key=lambda x: x['dominance_ratio'], reverse=True) # Sort by dominance ratio, reverse=True to get the highest ratios at the top.
        top_papers[topic] = []
        
        # Get top n papers where n is the numebr spesified as top_n when calling the functino
        for p in topic_papers[:n_top]: #loop through the top_n number of papers
            paper_idx = p['paper_idx']
            authors = df_field.iloc[paper_idx]['authors']
            
            # Debug print
            #print(f"\nPaper index: {paper_idx}")
            #print(f"Authors data type: {type(authors)}")
            #print(f"Authors content: {authors}")
            
            # Check if authors is string (might be stored as JSON string)
            if isinstance(authors, str):
                try:
                    authors = ast.literal_eval(authors)
                except (ValueError, SyntaxError):
                    print(f"Failed to parse authors string: {authors}")
                    authors = []
            
            if isinstance(authors, list):
                author_list = []
                for author in authors:
                    # Debug print
                    print(f"Processing author: {author}")
                    if isinstance(author, dict):
                        author_list.append({
                            'name': author.get('name', 'Unknown'),
                            'id': author.get('authorId', 'Unknown')
                        })
                    else:
                        print(f"Unexpected author format: {author}")
            else:
                print(f"Unexpected authors format: {authors}")
                author_list = []
            
            # Debug print
            #print(f"Processed author list: {author_list}")
            
            top_papers[topic].append({
                'paperId': df_field.iloc[paper_idx]['paperId'],
                'title': df_field.iloc[paper_idx]['title'],
                'abstract': df_field.iloc[paper_idx]['abstract'],
                'authors': author_list,
                'score': float(p['primary_score']),
                'dominance_ratio': float(p['dominance_ratio'])
            })
            
            for author in author_list:
                author_id = author['id']
                if author_id not in author_topic_stats:
                    author_topic_stats[author_id] = {
                        'name': author['name'],
                        'topics': {},
                        'total_papers': 0,
                        'top_papers': 0
                    }
                
                if topic not in author_topic_stats[author_id]['topics']:
                    author_topic_stats[author_id]['topics'][topic] = {
                        'paper_count': 0,
                        'avg_dominance': 0,
                        'top_papers': []
                    }
                
                author_stats = author_topic_stats[author_id]['topics'][topic]
                author_stats['paper_count'] += 1
                author_stats['avg_dominance'] = (
                    (author_stats['avg_dominance'] * (author_stats['paper_count'] - 1) + 
                     float(p['dominance_ratio'])) / author_stats['paper_count']
                )
                author_stats['top_papers'].append({
                    'title': df_field.iloc[paper_idx]['title'],
                    'dominance_ratio': float(p['dominance_ratio'])
                })
                
                author_topic_stats[author_id]['total_papers'] += 1
                author_topic_stats[author_id]['top_papers'] += 1
    
      # Convert topic numbers to regular integers for JSON serialization
    author_topic_stats_clean = {}
    for author_id, stats in author_topic_stats.items():
        author_topic_stats_clean[author_id] = stats.copy()
        author_topic_stats_clean[author_id]['topics'] = {
            int(topic): topic_stats 
            for topic, topic_stats in stats['topics'].items()
        }
    
    # Debug print with cleaned data
    #print(f"\nFinal author_topic_stats: {json.dumps(author_topic_stats_clean, indent=2)}")
    
    return top_papers, author_topic_stats

def get_top_titles_for_topic(df, paper_classifications, topic_idx, n_titles=20):
    # Find indices of papers where this topic is dominant
    dominant_papers = [p for p in paper_classifications if p['primary_topic'] == topic_idx]
    # Get their citation counts and indices
    paper_infos = [
        (df.iloc[p['paper_idx']]['citationCount'], df.iloc[p['paper_idx']]['title'])
        for p in dominant_papers
        if not pd.isna(df.iloc[p['paper_idx']]['title'])
    ]
    # Sort by citation count (descending) and take top n
    top_titles = [title for _, title in sorted(paper_infos, key=lambda x: -x[0])[:n_titles]]
    return top_titles


def print_author_analysis(author_topic_stats, min_papers=2):
    """Print detailed author analysis"""
    print("\nAuthor Analysis:")
    for author_id, stats in author_topic_stats.items():
        if stats['total_papers'] >= min_papers:
            print(f"\nAuthor: {stats['name']}")
            print(f"Total papers in top lists: {stats['total_papers']}")
            print("Topics:")
            for topic, topic_stats in stats['topics'].items():
                print(f"\nTopic {topic}:")
                print(f"  Paper count: {topic_stats['paper_count']}")
                print(f"  Average dominance ratio: {topic_stats['avg_dominance']:.4f}")
                print("  Top papers:")
                for paper in topic_stats['top_papers']:
                    print(f"    - {paper['title']} (dominance: {paper['dominance_ratio']:.4f})")


### Model saving 

In [169]:
def save_topic_terms(topic_terms, suffix_string, method='lda'):
    """
    Save topic terms (LDA or TF-IDF) to JSON file.
    topic_terms: dict of {topic_idx: {'top_words': [...], 'word_weights': [(word, weight), ...]}}
    method: 'lda' or 'tfidf' (used in filename)
    """
    topic_terms_serializable = {}
    for topic_idx, keywords in topic_terms.items():
        topic_terms_serializable[int(topic_idx)] = {
            'top_words': keywords['top_words'],
            'word_weights': [(word, float(weight)) for word, weight in keywords['word_weights']]
        }
    filename = os.path.join(SAVE_DIR, f"{method}_topic_terms_{suffix_string}.json")
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(topic_terms_serializable, f, ensure_ascii=False, indent=2)
    print(f"{method.upper()} topic terms saved to {filename}")

def build_tfidf_topic_terms(tfidf_ngrams, top_k=30):
    tfidf_topic_terms = {}
    for topic_idx, ngrams in tfidf_ngrams.items():
        top_words = [ngram for ngram, _ in ngrams[:top_k]]
        word_weights = [(ngram, float(score)) for ngram, score in ngrams[:top_k]]
        tfidf_topic_terms[topic_idx] = {
            'top_words': top_words,
            'word_weights': word_weights
        }
    return tfidf_topic_terms

def save_lda_components(lda_model, vectorizer, topic_distributions, suffix_string):
    """Save LDA model and related components"""
    # Save LDA model using joblib (recommended for sklearn models)
    lda_filename = os.path.join(SAVE_DIR, f"lda_model_{suffix_string}.joblib")
    joblib.dump(lda_model, lda_filename)
    
    # Save vectorizer
    vectorizer_filename = os.path.join(SAVE_DIR, f"vectorizer_{suffix_string}.joblib")
    joblib.dump(vectorizer, vectorizer_filename)
    
    # Save topic distributions
    distributions_filename = os.path.join(SAVE_DIR, f"topic_distributions_{suffix_string}.npy")
    np.save(distributions_filename, topic_distributions)
    
    print(f"LDA model saved to {lda_filename}")
    print(f"Vectorizer saved to {vectorizer_filename}")
    print(f"Topic distributions saved to {distributions_filename}")

In [8]:
""" needs fixing"""

def analyze_specific_topic(df, topic, num_subtopics=5, n_top=5):
    # Filter the dataframe for the specific topic
    df_topic = df[df['Primary_Topic'] == topic].copy()
    
    print(f"Analyzing topic: {topic}")
    print(f"Number of papers: {len(df_topic)}")
    
    if len(df_topic) < 10:  # Adjust this threshold as needed
        print("Not enough papers for meaningful subtopic analysis.")
        return None, None, None
    
    # Ensure index is unique
    df_topic = df_topic.reset_index(drop=True)
    
    # Prepare the text data
    text_data = df_topic['abstract'].fillna('')
    
    # Create document-term matrix
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(text_data)
    
    # Create and fit LDA model for subtopics
    lda_model = LatentDirichletAllocation(n_components=num_subtopics, random_state=42)
    subtopic_distributions = lda_model.fit_transform(doc_term_matrix)
    
    # Get top words for each subtopic
    feature_names = vectorizer.get_feature_names_out()
    for idx, subtopic in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in subtopic.argsort()[:-10 - 1:-1]]
        print(f"Subtopic {idx + 1}: {', '.join(top_words)}")
    
    # Get top papers for each subtopic
    top_papers = {}
    for subtopic in range(num_subtopics):
        subtopic_scores = subtopic_distributions[:, subtopic]
        other_subtopics_sum = subtopic_distributions.sum(axis=1) - subtopic_scores
        subtopic_dominance = subtopic_scores / (other_subtopics_sum + 1e-10)
        top_indices = np.argsort(subtopic_dominance)[-n_top:][::-1]
        
        top_papers[subtopic] = [
            {
                'title': df_topic.iloc[i]['title'],
                'abstract': df_topic.iloc[i]['abstract'],
                'score': float(subtopic_scores[i]),
                'dominance_ratio': float(subtopic_dominance[i])
            }
            for i in top_indices
        ]
    # Generate subtopic names using the same method as before - Skipped until copyright breach issue is resolved
    """
    subtopic_names = {}
    for subtopic, papers in top_papers.items():
        abstracts = "\n\n".join([paper['abstract'] for paper in papers])
        keywords = [word for paper in papers for word in paper['abstract'].split()[:10]]
        subtopic_name = generate_topic_name(abstracts, keywords)
        subtopic_names[subtopic] = subtopic_name
    """
    #Add subtopic scores and names to the dataframe
    for i in range(num_subtopics):
        df_topic[f'Subtopic_{i+1}_Score'] = subtopic_distributions[:, i]
        df_topic['Primary_Subtopic'] = df_topic[[f'Subtopic_{i+1}_Score' for i in range(num_subtopics)]].idxmax(axis=1)
   # df_topic['Primary_Subtopic_Name'] = df_topic['Primary_Subtopic'].map(lambda x: subtopic_names[int(x.split('_')[1]) - 1])
    
    # Update the main dataframe with the new subtopic information
        df_update = df.copy()
        df_update.loc[df_topic.index, [f'Subtopic_{i+1}_Score' for i in range(num_subtopics)]] = df_topic[[f'Subtopic_{i+1}_Score' for i in range(num_subtopics)]]
        df_update.loc[df_topic.index, 'Primary_Subtopic'] = df_topic['Primary_Subtopic']
  # df_update.loc[df_topic.index, 'Primary_Subtopic_Name'] = df_topic['Primary_Subtopic_Name']
    
    return df_update, top_papers#, subtopic_names

#### Some older versions of analyse papers


In [99]:
"""
def analyze_papers_with_topic_names(df, fields_to_analyze,n_papers=5, output_suffix="_analyzed_results"):
    # Initialize OpenAI client
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    
    # Create a copy of the original dataframe to store all results
    df_analyzed = df.copy()
    all_author_stats = {}
    all_topic_names = {}
    
    for field in fields_to_analyze:
        print(f"\nAnalyzing field: {field}")
        
        # Step 1: Model topics
        lda_model, vectorizer, topic_distributions, df_field, topic_keywords = model_topics(
            df, field, num_topics=10)
        
        if lda_model is None:
            continue
            
        # Step 2: Classify papers and add to main dataframe
        paper_classifications = classify_papers(topic_distributions, df_field)
        
        # Add classifications to the analyzed dataframe
        for p in paper_classifications:
            idx = df_field.index[p['paper_idx']]
            df_analyzed.loc[idx, f'{field}_Primary_Topic'] = p['primary_topic']
            df_analyzed.loc[idx, f'{field}_Secondary_Topic'] = p['secondary_topic']
            df_analyzed.loc[idx, f'{field}_Primary_Score'] = p['primary_score']
            df_analyzed.loc[idx, f'{field}_Dominance_Ratio'] = p['dominance_ratio']
        
        # Step 3: Get top papers and author stats
        top_papers, author_stats = get_top_papers(paper_classifications, df_field, n_top=n_papers)
        
        # Step 4: Generate topic names
        field_topic_names = {}
        for topic_idx, papers in top_papers.items():
            # Combine abstracts and extract keywords
            input_text = "\n\n".join([str(paper.get('title', '')) for paper in papers])
            keywords = topic_keywords[topic_idx]['top_words']
            
            # Generate topic name
            topic_name = generate_topic_name_multiple(input_text, keywords, client, model_type, credit_tracker)
            
            if topic_name:
                field_topic_names[topic_idx] = topic_name
                # Add topic name to dataframe
                topic_column = f'{field}_Topic_{topic_idx}_Name'
                df_analyzed[topic_column] = topic_name
                
                # Update primary topic name for papers with this primary topic
                primary_topic_mask = df_analyzed[f'{field}_Primary_Topic'] == topic_idx
                df_analyzed.loc[primary_topic_mask, f'{field}_Primary_Topic_Name'] = topic_name
            
            # Print topic name and top papers
            print(f"\nTopic {topic_idx + 1}: {topic_name if topic_name else 'Unnamed'}")
            print(f"Keywords: {', '.join(topic_keywords[topic_idx]['top_words'])}")
            for paper in papers:
                print(f"- {paper['title']} (dominance: {paper['dominance_ratio']:.4f})")
        
        # Store results
        all_author_stats[field] = author_stats
        all_topic_names[field] = field_topic_names
        
        # Add topic keywords to dataframe metadata
        df_analyzed.attrs[f'{field}_topic_keywords'] = topic_keywords
        df_analyzed.attrs[f'{field}_topic_names'] = field_topic_names
    
    # Save results
    current_date = datetime.now().strftime("%Y_%m_%d")
    output_filename = os.path.join(SAVE_DIR,f"semantic_scholar_{current_date}{output_suffix}.csv")
    
    # Save main results
    df_analyzed.to_csv(output_filename, sep=';', encoding='utf-8', 
                      quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')
    
    # Save author statistics with topic frequencies
    author_filename = os.path.join(SAVE_DIR,f"semantic_scholar_{current_date}_author_analysis.csv")
    save_author_analysis(all_author_stats, author_filename)
    
    # Save topic names
    converted_topic_names = {}
    for field, topics in all_topic_names.items():
        converted_topic_names[field] = {
        int(topic_idx): name 
        for topic_idx, name in topics.items()
        }

    topic_names_filename = os.path.join(SAVE_DIR,f"semantic_scholar_{current_date}_topic_names.json")
    with open(topic_names_filename, 'w', encoding='utf-8') as f:
        json.dump(all_topic_names, f, ensure_ascii=False, indent=2)
    
    # Print credit usage
    print("\nAPI Usage Statistics:")
    stats = credit_tracker.get_stats()
    print(f"Total tokens: {stats['total_tokens']}")
    print(f"Estimated cost: ${stats['total_cost']}")
    
    return df_analyzed, all_author_stats, all_topic_names
"""

def analyze_papers_with_topic_names(df, fields_to_analyze=None, n_papers=5, output_suffix="_analyzed_results"):
    """
    Analyze papers with topic naming
    
    Parameters:
    df - DataFrame containing papers
    fields_to_analyze - List of fields to filter papers by (if None, use all papers)
    n_papers - Number of top papers to extract per topic
    output_suffix - Suffix for output files
    """
    # Initialize OpenAI client
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    
    # Create a copy of the original dataframe to store all results
    df_analyzed = df.copy()
    all_author_stats = {}
    all_topic_names = {}
    
    # Filter dataset by fields if specified
    if fields_to_analyze:
        df_filtered = df[df['fieldsOfStudy'].apply(lambda x: any(field in x for field in fields_to_analyze))]
        print(f"Filtered to {len(df_filtered)} papers from fields: {', '.join(fields_to_analyze)}")
    else:
        df_filtered = df
        print(f"Using all {len(df_filtered)} papers for analysis")
    
    if df_filtered.empty:
        print("No papers found after filtering")
        return None, None, None
    
    # Step 1: Model topics on the filtered dataset
    lda_model, vectorizer, topic_distributions, df_field, topic_keywords = model_topics(
        df_filtered)
    
    if lda_model is None:
        return None, None, None
        
    # Step 2: Classify papers and add to main dataframe
    paper_classifications = classify_papers(topic_distributions, df_field)
    
    # Step 3: Generate topic names first, so we can use them in column names
    top_papers, author_stats = get_top_papers(paper_classifications, df_field, n_top=n_papers)
    
    # Generate topic names
    topic_names = {}
    for topic_idx, papers in top_papers.items():
        # Combine titles and extract keywords
        input_text = "\n\n".join([str(paper.get('title', '')) for paper in papers])
        keywords = topic_keywords[topic_idx]['top_words']
        
        # Generate topic name
        topic_name = generate_topic_name_multiple(input_text, keywords, client, model_type, credit_tracker)
        
        if topic_name:
            topic_names[topic_idx] = topic_name
            
        # Print topic name and top papers
        print(f"\nTopic {topic_idx + 1}: {topic_name if topic_name else 'Unnamed'}")
        print(f"Keywords: {', '.join(topic_keywords[topic_idx]['top_words'])}")
        for paper in papers:
            print(f"- {paper['title']} (dominance: {paper['dominance_ratio']:.4f})")
    
    # Step 4: Add classifications to the analyzed dataframe using topic names
    for p in paper_classifications:
        idx = df_field.index[p['paper_idx']]
        primary_topic_idx = p['primary_topic']
        secondary_topic_idx = p['secondary_topic']
        
        # Store topic indices for reference
        df_analyzed.loc[idx, 'Primary_Topic_Index'] = primary_topic_idx
        df_analyzed.loc[idx, 'Secondary_Topic_Index'] = secondary_topic_idx
        
        # Store topic names as primary columns
        primary_name = topic_names.get(primary_topic_idx, f"Topic_{primary_topic_idx}")
        secondary_name = topic_names.get(secondary_topic_idx, f"Topic_{secondary_topic_idx}")
        
        df_analyzed.loc[idx, 'Primary_Topic'] = primary_name
        df_analyzed.loc[idx, 'Secondary_Topic'] = secondary_name
        df_analyzed.loc[idx, 'Primary_Score'] = p['primary_score']
        df_analyzed.loc[idx, 'Dominance_Ratio'] = p['dominance_ratio']
    
    # Store results
    all_author_stats['all'] = author_stats
    all_topic_names['all'] = topic_names
    
    # Add topic keywords to dataframe metadata
    df_analyzed.attrs['topic_keywords'] = topic_keywords
    df_analyzed.attrs['topic_names'] = topic_names
    
    # Save results
    current_date = datetime.now().strftime("%Y_%m_%d")
    output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}{output_suffix}.csv")
    
    # Save main results
    df_analyzed.to_csv(output_filename, sep=';', encoding='utf-8', 
                      quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')
    
    # Save author statistics with topic frequencies
    author_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_author_analysis.csv")
    save_author_analysis(all_author_stats, author_filename)
    
    # Save topic names
    converted_topic_names = {}
    for field, topics in all_topic_names.items():
        converted_topic_names[field] = {
            int(topic_idx): name 
            for topic_idx, name in topics.items()
        }

    topic_names_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_topic_names.json")
    with open(topic_names_filename, 'w', encoding='utf-8') as f:
        json.dump(converted_topic_names, f, ensure_ascii=False, indent=2)
    
    # Print credit usage
    print("\nAPI Usage Statistics:")
    stats = credit_tracker.get_stats()
    print(f"Total tokens: {stats['total_tokens']}")
    print(f"Estimated cost: ${stats['total_cost']}")
    
    return df




In [100]:
def analyze_papers_with_topic_names_old(df, fields_to_analyze=None, n_papers=5, output_suffix="_analyzed_results"):
    """
    Analyze papers with topic naming, save all components, and detailed logging
    """
    start_time = time.time()
    logger.info("Starting topic analysis")
    
    # Initialize OpenAI client
    step_start = time.time()
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    logger.info(f"✓ Initialized OpenAI client ({time.time() - step_start:.2f}s)")
    
    # Create a copy of the original dataframe to store all results
    step_start = time.time()
    df_analyzed = df.copy()
    all_author_stats = {}
    all_topic_names = {}
    logger.info(f"✓ Created dataframe copy ({time.time() - step_start:.2f}s)")
    
    # Filter dataset by fields if specified
    step_start = time.time()
    if fields_to_analyze:
        df_filtered = df[df['fieldsOfStudy'].apply(lambda x: any(field in x for field in fields_to_analyze))]
        logger.info(f"✓ Filtered to {len(df_filtered)} papers from fields: {', '.join(fields_to_analyze)} ({time.time() - step_start:.2f}s)")
    else:
        df_filtered = df
        logger.info(f"✓ Using all {len(df_filtered)} papers for analysis ({time.time() - step_start:.2f}s)")
    
    if df_filtered.empty:
        logger.error("No papers found after filtering")
        return None, None, None
    
    # Step 1: Model topics on the filtered dataset
    step_start = time.time()
    logger.info("🔄 Starting LDA topic modeling...")
    lda_model, vectorizer, topic_distributions, df_field, topic_keywords = model_topics(df_filtered)
    logger.info(f"✓ LDA topic modeling completed ({time.time() - step_start:.2f}s)")
    
    if lda_model is None:
        logger.error("LDA model creation failed")
        return None, None, None
        
    # Step 2: Classify papers and add to main dataframe
    step_start = time.time()
    logger.info("🔄 Classifying papers...")
    paper_classifications = classify_papers(topic_distributions, df_field)
    logger.info(f"✓ Paper classification completed ({time.time() - step_start:.2f}s)")
    
    # Step 3: Generate topic names first, so we can use them in column names
    step_start = time.time()
    logger.info("🔄 Getting top papers and author stats...")
    top_papers, author_stats = get_top_papers(paper_classifications, df_field, n_top=n_papers)
    logger.info(f"✓ Top papers and author stats completed ({time.time() - step_start:.2f}s)")
    
    # Generate topic names
    step_start = time.time()
    logger.info("🔄 Generating topic names with LLM...")
    topic_names = {}
    for topic_idx, papers in top_papers.items():
        topic_start = time.time()
        # Combine titles and extract keywords
        input_text = "\n\n".join([str(paper.get('title', '')) for paper in papers])
        keywords = topic_keywords[topic_idx]['top_words']
        
        # Generate topic name
        topic_name = generate_topic_name_multiple(input_text, keywords, client, model_type, credit_tracker)
        
        if topic_name:
            topic_names[topic_idx] = topic_name
            
        logger.info(f"  ✓ Topic {topic_idx + 1}: {topic_name if topic_name else 'Unnamed'} ({time.time() - topic_start:.2f}s)")
        
        # Print topic name and top papers
        print(f"\nTopic {topic_idx + 1}: {topic_name if topic_name else 'Unnamed'}")
        print(f"Keywords: {', '.join(topic_keywords[topic_idx]['top_words'])}")
        for paper in papers:
            print(f"- {paper['title']} (dominance: {paper['dominance_ratio']:.4f})")
    
    logger.info(f"✓ Topic name generation completed ({time.time() - step_start:.2f}s)")
    
    # Step 4: Add classifications to the analyzed dataframe using topic names
    step_start = time.time()
    logger.info("🔄 Adding classifications to dataframe...")
    for p in paper_classifications:
        idx = df_field.index[p['paper_idx']]
        primary_topic_idx = int(p['primary_topic'])        # Ensure integer
        secondary_topic_idx = int(p['secondary_topic'])    # Ensure integer

        # Store topic indices for reference
        df_analyzed.loc[idx, 'Primary_Topic_Index'] = primary_topic_idx
        df_analyzed.loc[idx, 'Secondary_Topic_Index'] = secondary_topic_idx

        # Store topic names as primary columns
        primary_name = topic_names.get(primary_topic_idx, f"Topic_{primary_topic_idx}")
        secondary_name = topic_names.get(secondary_topic_idx, f"Topic_{secondary_topic_idx}")

        df_analyzed.loc[idx, 'Primary_Topic'] = primary_name
        df_analyzed.loc[idx, 'Secondary_Topic'] = secondary_name
        df_analyzed.loc[idx, 'Primary_Score'] = p['primary_score']
        df_analyzed.loc[idx, 'Dominance_Ratio'] = p['dominance_ratio']
    
    logger.info(f"✓ Dataframe classification completed ({time.time() - step_start:.2f}s)")
    
    # Store results
    all_author_stats['all'] = author_stats
    all_topic_names['all'] = topic_names
    
    # Add topic keywords to dataframe metadata
    df_analyzed.attrs['topic_keywords'] = topic_keywords
    df_analyzed.attrs['topic_names'] = topic_names
    
    # Save results
    step_start = time.time()
    logger.info("🔄 Saving all results...")
    current_date = datetime.now().strftime("%Y_%m_%d")
    
    # Save main results
    output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}{output_suffix}.csv")
    df_analyzed.to_csv(output_filename, sep=';', encoding='utf-8', 
                      quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')
    
    # Save author statistics
    author_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_author_analysis.csv")
    save_author_analysis(all_author_stats, author_filename)
    
    # Save topic names
    converted_topic_names = {}
    for field, topics in all_topic_names.items():
        converted_topic_names[field] = {
            int(topic_idx): name 
            for topic_idx, name in topics.items()
        }

    topic_names_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_topic_names.json")
    with open(topic_names_filename, 'w', encoding='utf-8') as f:
        json.dump(converted_topic_names, f, ensure_ascii=False, indent=2)
    
    # Save topic keywords
    save_topic_keywords(topic_keywords, current_date)
    
    # Save LDA model and components
    save_lda_components(lda_model, vectorizer, topic_distributions, current_date)
    
    # Save top papers for reference
    top_papers_filename = os.path.join(SAVE_DIR, f"top_papers_{current_date}.json")
    with open(top_papers_filename, 'w', encoding='utf-8') as f:
        # Convert numpy.int64 keys to regular Python int
        top_papers_serializable = {
            int(topic_idx): papers 
            for topic_idx, papers in top_papers.items()
        }
        json.dump(top_papers_serializable, f, ensure_ascii=False, indent=2, default=str)
    
    logger.info(f"✓ All files saved ({time.time() - step_start:.2f}s)")
    
    # Print credit usage
    print("\nAPI Usage Statistics:")
    stats = credit_tracker.get_stats()
    print(f"Total tokens: {stats['total_tokens']}")
    print(f"Estimated cost: ${stats['total_cost']}")
    
    total_time = time.time() - start_time
    logger.info(f"🎉 Analysis completed successfully! Total time: {total_time:.2f}s ({total_time/60:.1f} minutes)")
    
    return df_analyzed, all_author_stats, all_topic_names

### Current version of workflow

In [184]:
def analyze_papers_with_topic_names(
    df, 
    fields_to_analyze=None, 
    n_papers=5, 
    num_keywords=10, 
    output_suffix="_analyzed_results", 
    keywords=None
):
    """
    Analyze papers with topic naming and targeted method classification.
    Combines LDA, TF-IDF, and LLM-based topic naming with precise method assignment.
    """
    import time
    from datetime import datetime
    import os
    import csv

    start_time = time.time()
    logger.info("Starting topic analysis")

    # Initialize OpenAI client
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    logger.info("✓ Initialized OpenAI client")

    # Copy DataFrame for all analysis steps
    df_analyzed = df.copy().reset_index(drop=True)
    all_author_stats = {}
    all_topic_names = {}

    # Step 1: Filtering (if needed)
    if fields_to_analyze:
        df_filtered = df_analyzed[df_analyzed['fieldsOfStudy'].apply(
            lambda x: any(field in x for field in fields_to_analyze)
        )].copy()
        logger.info(f"✓ Filtered to {len(df_filtered)} papers from fields: {', '.join(fields_to_analyze)}")
    else:
        df_filtered = df_analyzed.copy()
        logger.info(f"✓ Using all {len(df_filtered)} papers for analysis")

    if df_filtered.empty:
        logger.error("No papers found after filtering")
        return None, None, None

    # Steps 2-6: LDA Topic Modeling and Classification (unchanged)
    logger.info("🔄 Starting LDA topic modeling...")
    lda_model, vectorizer, topic_distributions, df_field, topic_keywords = model_topics(
        df_filtered, num_words=num_keywords
    )
    df_field = df_field.reset_index(drop=True)
    logger.info("✓ LDA topic modeling completed")

    if lda_model is None:
        logger.error("LDA model creation failed")
        return None, None, None

    logger.info("🔄 Classifying papers by topic...")
    paper_classifications = classify_papers(topic_distributions, df_field)
    logger.info("✓ Paper classification completed")

    df_field['Primary_Topic_Index'] = [int(p['primary_topic']) for p in paper_classifications]

    logger.info("🔄 Extracting TF-IDF n-grams for topics...")
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95,
        token_pattern=r'\b[\w_-]+\b'
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(df_field['processed_text'])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_ngrams = get_top_tfidf_ngrams_per_topic(
        df_field, tfidf_matrix, feature_names,
        topic_col='Primary_Topic_Index', top_k=num_keywords
    )
    logger.info("✓ TF-IDF n-gram extraction completed")

    logger.info("🔄 Generating topic names with LLM (advanced)...")
    top_papers, author_stats = get_top_papers(paper_classifications, df_field, n_top=n_papers)
    topic_names = {}
    for topic_idx, papers in top_papers.items():
        lda_ngrams = topic_keywords[topic_idx]['top_words'][:100]
        tfidf_ng = [ngram for ngram, _ in tfidf_ngrams.get(topic_idx, [])][:100]
        top_titles = get_top_titles_for_topic(df_field, paper_classifications, topic_idx, n_titles=20)
        topic_name = generate_topic_name_multiple_advanced(
            lda_ngrams, tfidf_ng, top_titles, client, model_type, credit_tracker, search_keywords=keywords
        )
        if topic_name:
            topic_names[topic_idx] = topic_name
        logger.info(f"  ✓ Topic {topic_idx + 1}: {topic_name if topic_name else 'Unnamed'}")

    # Map topic classifications back to df_analyzed
    filtered_indices = df_filtered.index
    for i, p in enumerate(paper_classifications):
        idx = filtered_indices[i]
        primary_topic_idx = int(p['primary_topic'])
        secondary_topic_idx = int(p['secondary_topic'])
        df_analyzed.loc[idx, 'Primary_Topic_Index'] = primary_topic_idx
        df_analyzed.loc[idx, 'Secondary_Topic_Index'] = secondary_topic_idx
        primary_name = topic_names.get(primary_topic_idx, f"Topic_{primary_topic_idx}")
        secondary_name = topic_names.get(secondary_topic_idx, f"Topic_{secondary_topic_idx}")
        df_analyzed.loc[idx, 'Primary_Topic'] = primary_name
        df_analyzed.loc[idx, 'Secondary_Topic'] = secondary_name
        df_analyzed.loc[idx, 'Primary_Score'] = p['primary_score']
        df_analyzed.loc[idx, 'Dominance_Ratio'] = p['dominance_ratio']

    # Step 7: TARGETED METHOD CLASSIFICATION
    logger.info("🔄 Performing targeted method classification...")
    
    # Extract candidate terms from full dataset
    candidate_terms = extract_candidate_terms(df_analyzed, max_features=5000)
    method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
    method_phrases = clean_method_phrases_fixed(method_phrases)
    
    logger.info(f"Got {len(method_phrases)} method phrases from LLM")

    # Validate with stricter criteria
    all_text = ' '.join(df_analyzed['processed_text']).lower()
    validated_phrases = []
    
    for phrase in method_phrases:
        is_valid, match_type = strict_phrase_validation(phrase, all_text)
        if is_valid:
            validated_phrases.append(phrase)
            logger.info(f"  ✓ {match_type.upper()}: '{phrase}'")
    
    logger.info(f"Validated {len(validated_phrases)} phrases with strict matching")
    
    # Apply targeted selection
    classification_data = []
    for idx, row in df_analyzed.iterrows():
        has_method, found_phrases = targeted_contains_method(row['processed_text'])
        classification_data.append({
            'index': idx,
            'has_method': has_method,
            'found_phrases': found_phrases
        })
    
    # Select documents for classification
    docs_to_classify_indices = [d['index'] for d in classification_data if d['has_method']]
    docs_to_classify = df_analyzed.loc[docs_to_classify_indices].copy()
    
    logger.info(f"Documents to classify: {len(docs_to_classify)} ({len(docs_to_classify)/len(df_analyzed)*100:.1f}%)")
    
    if len(docs_to_classify) > 0 and len(validated_phrases) > 0:
        # PRECISE TF-IDF CLASSIFICATION
        method_tfidf_vectorizer = TfidfVectorizer(
            vocabulary=validated_phrases,
            ngram_range=(1, 3),
            min_df=1,
            max_df=0.95,
            sublinear_tf=True,
            norm='l2'
        )
        
        method_tfidf_matrix = method_tfidf_vectorizer.fit_transform(docs_to_classify['processed_text'])
        method_scores = method_tfidf_matrix.max(axis=1).toarray().ravel()
        
        # Handle classification
        if method_tfidf_matrix.shape[1] == 1:
            primary_methods = [method_tfidf_vectorizer.get_feature_names_out()[0]] * len(docs_to_classify)
        else:
            argmax_indices = method_tfidf_matrix.argmax(axis=1).A1
            feature_names = method_tfidf_vectorizer.get_feature_names_out()
            primary_methods = [feature_names[i] for i in argmax_indices]
        
        # Apply minimum score threshold
        min_threshold = 0.1
        final_methods = []
        for method, score in zip(primary_methods, method_scores):
            if score >= min_threshold:
                final_methods.append(method)
            else:
                final_methods.append('Low_Confidence_Method')
        
        # Assign results to df_analyzed
        df_analyzed['Method_Detected'] = 'No_Method_Found'
        df_analyzed['Method_Score'] = 0.0
        df_analyzed['Primary_Method'] = ''
        
        df_analyzed.loc[docs_to_classify.index, 'Primary_Method'] = primary_methods
        df_analyzed.loc[docs_to_classify.index, 'Method_Score'] = method_scores
        df_analyzed.loc[docs_to_classify.index, 'Method_Detected'] = final_methods
        
        # Log results
        final_counts = df_analyzed['Method_Detected'].value_counts()
        logger.info(f"Method classification distribution:\n{final_counts}")
        
    else:
        logger.warning("No documents selected for method classification")
        df_analyzed['Method_Detected'] = 'No_Methods_Available'
        df_analyzed['Method_Score'] = 0.0
        df_analyzed['Primary_Method'] = ''

    logger.info("✓ Targeted method classification completed")

    # Save Results (unchanged)
    current_date = datetime.now().strftime("%Y_%m_%d")
    keyword_str = keywords_to_filename_part(keywords) if keywords else ""
    output_filename = os.path.join(
        SAVE_DIR, f"semantic_scholar_{current_date}{keyword_str}{output_suffix}.csv"
    )
    df_analyzed.to_csv(output_filename, sep=';', encoding='utf-8', 
                      quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')

    # Save Metadata
    all_author_stats['all'] = author_stats
    all_topic_names['all'] = topic_names
    df_analyzed.attrs.update({
        'topic_keywords': topic_keywords,
        'topic_names': topic_names,
        'tfidf_ngrams': tfidf_ngrams
    })

    # Save supporting files
    save_supporting_files(
        lda_model=lda_model,
        vectorizer=vectorizer,
        topic_distributions=topic_distributions,
        suffix_string=f"{current_date}{keyword_str}",
        author_stats=all_author_stats,
        top_papers=top_papers,
        topic_keywords=topic_keywords
    )

    logger.info("✓ All files saved")
    print("\nAPI Usage Statistics:")
    stats = credit_tracker.get_stats()
    print(f"Total tokens: {stats['total_tokens']}")
    print(f"Estimated cost: ${stats['total_cost']}")
    total_time = time.time() - start_time
    logger.info(f"🎉 Analysis completed successfully! Total time: {total_time:.2f}s ({total_time/60:.1f} minutes)")
    
    return df_analyzed, all_author_stats, all_topic_names



def save_supporting_files(lda_model, vectorizer, topic_distributions, suffix_string, author_stats, top_papers, topic_keywords):
    """Helper function to save all analysis components"""
    # Save author statistics
    author_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{suffix_string}_author_analysis.csv")
    save_author_analysis(author_stats, author_filename)
    
    # Save topic components
    save_topic_terms(topic_keywords, suffix_string, method='lda')
    save_lda_components(lda_model, vectorizer, topic_distributions, suffix_string)
    
    # Save top papers
    top_papers_filename = os.path.join(SAVE_DIR, f"top_papers_{suffix_string}.json")
    with open(top_papers_filename, 'w', encoding='utf-8') as f:
        json.dump({
            int(k): v for k, v in top_papers.items()
        }, f, ensure_ascii=False, indent=2, default=str)




#### Version with Bertopic

In [68]:
# new better version of the analyzis:

import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


#old version with bertopic and method phrases
def analyze_papers_with_topic_names_bert(
    df, fields_to_analyze=None, n_papers=5, num_keywords=100, output_suffix="_analyzed_results", keywords=None
):
    import time
    import numpy as np
    start_time = time.time()
    logger.info("Starting topic analysis")

    # Initialize OpenAI client
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    logger.info("✓ Initialized OpenAI client")

    # Filter dataset by fields if specified
    if fields_to_analyze:
        df_filtered = df[df['fieldsOfStudy'].apply(lambda x: any(field in x for field in fields_to_analyze))]
        logger.info(f"✓ Filtered to {len(df_filtered)} papers from fields: {', '.join(fields_to_analyze)}")
    else:
        df_filtered = df
        logger.info(f"✓ Using all {len(df_filtered)} papers for analysis")

    if df_filtered.empty:
        logger.error("No papers found after filtering")
        return None, None

    # Step 1: LDA Topic Modeling (with phrase detection)
    logger.info("🔄 Starting LDA topic modeling...")
    lda_model, vectorizer, topic_distributions, df_field, topic_keywords = model_topics(
        df_filtered, num_topics=10, num_words=num_keywords
    )
    df_field = df_field.reset_index(drop=True)
    logger.info("✓ LDA topic modeling completed")

    # Step 2: Classify papers by topic
    logger.info("🔄 Classifying papers by topic...")
    paper_classifications = classify_papers(topic_distributions, df_field)
    logger.info("✓ Paper classification completed")

    # Add topic index column for TF-IDF extraction
    topic_indices = [int(p['primary_topic']) for p in paper_classifications]
    df_field = df_field.copy()
    df_field['Primary_Topic_Index'] = topic_indices

    # Step 3: TF-IDF extraction (with n-grams)
    logger.info("🔄 Extracting TF-IDF n-grams for topics...")
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.95,
        token_pattern=r'\b[\w_-]+\b',
        max_features=10000
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(df_field['processed_text'])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    topic_keywords = extract_topic_keywords(lda_model, feature_names, num_keywords)

    tfidf_ngrams = get_top_tfidf_ngrams_per_topic(
        df_field, tfidf_matrix, feature_names,
        topic_col='Primary_Topic_Index', top_k=num_keywords
    )
    logger.info("✓ TF-IDF n-gram extraction completed")

    # Step 4: Extract candidate method/algorithm/technique phrases using LLM
    logger.info("🔄 Extracting candidate method phrases with LLM...")
    corpus_terms = list(feature_names)
    method_phrases = get_method_phrases(corpus_terms, client, model_type, credit_tracker)
    logger.info(f"✓ Extracted {len(method_phrases)} method phrases: {method_phrases[:10]}...")

    # Step 5: BERTopic Guided Topic Modeling using method phrases as seeds
    logger.info("🔄 Running BERTopic Guided Topic Modeling with method phrases as seeds...")
    max_guided_topics = min(15, len(method_phrases))
    seed_topics = [[phrase] for phrase in method_phrases[:max_guided_topics]]
    guided_topic_model, guided_topics, guided_probs, df_guided, guided_topic_keywords = model_topics_guided_bertopic(
        df_filtered, num_topics=max_guided_topics, seed_topics=seed_topics
    )
    logger.info("✓ BERTopic Guided Topic Modeling completed")

    # Step 6: Classify papers by dominant method (from guided topics)
    logger.info("🔄 Classifying papers by method topics...")
    df_guided = df_guided.copy()
    df_guided['Primary_Method_Index'] = guided_topics
    method_idx_to_phrase = {idx: phrase[0] for idx, phrase in enumerate(seed_topics)}
    df_guided['Primary_Method'] = df_guided['Primary_Method_Index'].map(method_idx_to_phrase)

    # Step 7: Generate topic names (with LLM, as before)
    logger.info("🔄 Generating topic names with LLM (advanced)...")
    top_papers, author_stats = get_top_papers(paper_classifications, df_field, n_top=n_papers)
    topic_names = {}
    for topic_idx, papers in top_papers.items():
        lda_ngrams = topic_keywords[topic_idx]['top_words']
        tfidf_ng = [ngram for ngram, _ in tfidf_ngrams.get(topic_idx, [])]
        top_titles = get_top_titles_for_topic(df_field, paper_classifications, topic_idx, n_titles=20)
        topic_name = generate_topic_name_multiple_advanced(
            lda_ngrams, tfidf_ng, top_titles, client, model_type, credit_tracker
        )
        if topic_name:
            topic_names[topic_idx] = topic_name
        logger.info(f"  ✓ Topic {topic_idx + 1}: {topic_name if topic_name else 'Unnamed'}")
        print(f"\nTopic {topic_idx + 1}: {topic_name if topic_name else 'Unnamed'}")
        print(f"LDA n-grams: {', '.join(lda_ngrams)}")
        print(f"TF-IDF n-grams: {', '.join(tfidf_ng)}")
        for paper in papers:
            print(f"- {paper['title']} (dominance: {paper['dominance_ratio']:.4f})")

    # Step 8: Add classifications to the analyzed dataframe using topic names
    df_analyzed = df.copy()
    for p in paper_classifications:
        idx = df_field.index[p['paper_idx']]
        primary_topic_idx = int(p['primary_topic'])
        secondary_topic_idx = int(p['secondary_topic'])
        df_analyzed.loc[idx, 'Primary_Topic_Index'] = primary_topic_idx
        df_analyzed.loc[idx, 'Secondary_Topic_Index'] = secondary_topic_idx
        primary_name = topic_names.get(primary_topic_idx, f"Topic_{primary_topic_idx}")
        secondary_name = topic_names.get(secondary_topic_idx, f"Topic_{secondary_topic_idx}")
        df_analyzed.loc[idx, 'Primary_Topic'] = primary_name
        df_analyzed.loc[idx, 'Secondary_Topic'] = secondary_name
        df_analyzed.loc[idx, 'Primary_Score'] = p['primary_score']
        df_analyzed.loc[idx, 'Dominance_Ratio'] = p['dominance_ratio']

    # Step 9: Add method topic classification to analyzed dataframe
    df_analyzed['Primary_Method_Index'] = np.nan
    df_analyzed['Primary_Method'] = np.nan
    for idx, row in df_guided.iterrows():
        df_analyzed.loc[idx, 'Primary_Method_Index'] = row.get('Primary_Method_Index', np.nan)
        df_analyzed.loc[idx, 'Primary_Method'] = row.get('Primary_Method', np.nan)

    # Save results
    all_author_stats = {'all': author_stats}
    all_topic_names = {'all': topic_names}
    current_date = datetime.now().strftime("%Y_%m_%d")
    if keywords:
        keyword_str = keywords_to_filename_part(keywords)
        output_suffix = f"{keyword_str}{output_suffix}"
    else:
        keyword_str = "keywords"
    output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}{output_suffix}.csv")
    df_analyzed.to_csv(output_filename, sep=';', encoding='utf-8', 
                      quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')

    # Save LDA and BERTopic results (keywords, models, etc.)
    suffix_string = f"{current_date}{keyword_str}"
    save_topic_terms(topic_keywords, suffix_string, method='lda')
    guided_terms = {idx: {'top_words': guided_topic_keywords[idx]['top_words'],
                          'word_weights': guided_topic_keywords[idx]['word_weights']}
                    for idx in guided_topic_keywords}
    save_topic_terms(guided_terms, suffix_string, method='bertopic')

    save_lda_components(lda_model, vectorizer, topic_distributions, suffix_string)
    # Optionally: Save BERTopic model if you wish (see BERTopic docs)

    # Save top papers for reference
    top_papers_filename = os.path.join(SAVE_DIR, f"top_papers_{suffix_string}.json")
    with open(top_papers_filename, 'w', encoding='utf-8') as f:
        top_papers_serializable = {
            int(topic_idx): papers 
            for topic_idx, papers in top_papers.items()
        }
        json.dump(top_papers_serializable, f, ensure_ascii=False, indent=2, default=str)

    logger.info("✓ All files saved")
    print("\nAPI Usage Statistics:")
    stats = credit_tracker.get_stats()
    print(f"Total tokens: {stats['total_tokens']}")
    print(f"Estimated cost: ${stats['total_cost']}")
    total_time = time.time() - start_time
    logger.info(f"🎉 Analysis completed successfully! Total time: {total_time:.2f}s ({total_time/60:.1f} minutes)")

    metadata = {
        'author_stats': all_author_stats,
        'topic_names': all_topic_names,
        'topic_keywords': topic_keywords,
        'tfidf_ngrams': tfidf_ngrams,
        'top_papers': top_papers,
        'lda_model': lda_model,
        'vectorizer': vectorizer,
        'topic_distributions': topic_distributions,
        'guided_topic_model': guided_topic_model,
        'guided_topics': guided_topics,
        'guided_probs': guided_probs,
        'guided_topic_keywords': guided_topic_keywords,
        'method_phrases': method_phrases,
        'output_filename': output_filename
    }
    return df_analyzed, metadata

def save_author_analysis(all_author_stats, filename):
    """Save detailed author analysis including topic frequencies"""
    author_rows = []
    
    for field, author_stats in all_author_stats.items():
        for author_id, stats in author_stats.items():
            for topic, topic_stats in stats['topics'].items():
                author_rows.append({
                    'Field': field,
                    'Author_ID': author_id,
                    'Author_Name': stats['name'],
                    'Topic': topic,
                    'Paper_Count': topic_stats['paper_count'],
                    'Avg_Dominance': topic_stats['avg_dominance'],
                    'Total_Papers': stats['total_papers']
                })
    
    author_df = pd.DataFrame(author_rows)
    author_df.to_csv(filename, sep=';', encoding='utf-8', 
                    quoting=csv.QUOTE_NONNUMERIC, index=False)


## Implementation

In [45]:
load_dir='Saved_files'
filename='semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv' # if the search key words are changed you need to update the filename here. should be automatic...
#filename='semantic_scholar_2025_04_24_failure_rate_power_system__fault_modelling_results.csv' # if the search key words are changed you need to update the filename here. should be automatic...
#filename='semantic_scholar_2025_02_27_graph_neural_network_power_system__load_flow_results.csv'
filepath=os.path.join(load_dir,filename)
df=pd.read_csv(filepath,sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')
search_keywords = extract_keywords_from_filename(filename)
print(f"Extracted search keywords: {search_keywords}")
keyword_str = keywords_to_filename_part(search_keywords)
# keyword_str = 'reliability_resilience_power_system_capacity_utilization'

# Now preprocess and clean
#search_keywords = ['reliability', 'resilience', 'power system', 'capacity utilization']
df = preprocess_dataframe(df, text_col='text', search_keywords=search_keywords)


Extracted search keywords: ['reliability', 'resilience', 'power', 'systems']


In [51]:
print(metadata['method_phrases'])


['```python\nmethods_algorithms_techniques = [\n    "abandon"', '"abc"', '"abstract"', '"abstract buy"', '"abstraction"', '"accelerate"', '"acceleration"', '"accelerator"', '"act"', '"action"', '"activate"', '"activation"', '"adapt"', '"adaptability"', '"adaptable"', '"adaptation"', '"adaptive"', '"adaptive control"', '"adaptive method"', '"add"', '"algorithm"', '"algorithm apply"', '"algorithm compare"', '"algorithm design"', '"algorithm determine"', '"algorithm developed"', '"algorithm implement"', '"algorithm improve"', '"algorithm paper"', '"algorithm present"', '"algorithm propose"', '"algorithm simulation"', '"algorithm solve"', '"algorithm use"', '"algorithm utilized"', '"allocation algorithm"', '"allocation method"', '"allocation scheme"', '"allocation strategy"', '"analysis"', '"analysis approach"', '"analysis method"', '"analysis model"', '"analysis perform"', '"analysis present"', '"analysis propose"', '"analytical method"', '"analytical model"', '"analytical result"', '"ana

In [None]:
# for testing the llm method phrases extraction and filtering
from sklearn.feature_extraction.text import CountVectorizer



candidate_terms = extract_candidate_terms(df_analyzed, max_features=5000)
print(candidate_terms[:20])  # Show first 20 terms for prompt testing
client, model_type = initialize_openai()
credit_tracker = CreditTracker()
test_phrases=get_method_phrases(candidate_terms, client, model_type, credit_tracker)
print(test_phrases)

['system', 'power', 'energy', 'use', 'capacity', 'reliability', 'model', 'network', 'control', 'propose', 'method', 'high', 'base', 'result', 'paper', 'design', 'performance', 'data', 'study', 'load']


2025-06-09 23:02:08,811 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


['```python\n[\n    "Monte Carlo simulation"', '"deep learning"', '"finite element method"', '"support vector machine"', '"particle swarm optimization"', '"multi-objective optimization"', '"reinforcement learning"', '"dynamic programming"', '"state estimation"', '"load flow analysis"', '"system reliability"', '"optimum power flow"', '"machine learning"', '"neural network"', '"time series analysis"', '"risk assessment"', '"power quality assessment"', '"signal processing"', '"global optimization"', '"genetic algorithm"', '"maximum power point tracking"', '"resource allocation"', '"non-orthogonal multiple access"', '"decision tree"', '"multi-state reliability"', '"adaptive control"', '"fuzzy logic"', '"distributed generation"', '"demand response"', '"failure mode and effects analysis"', '"quantum optimization"', '"acoustic monitoring"', '"swarm intelligence"', '"hydrodynamic modeling"', '"principal component analysis"', '"data-driven decision making"', '"load forecasting"', '"frequency re

In [57]:
print(candidate_terms)

['system', 'power', 'energy', 'use', 'capacity', 'reliability', 'model', 'network', 'control', 'propose', 'method', 'high', 'base', 'result', 'paper', 'design', 'performance', 'data', 'study', 'load', 'grid', 'wind', 'storage', 'increase', 'generation', 'power system', 'cost', 'improve', 'technology', 'operation', 'analysis', 'provide', 'battery', 'distribution', 'voltage', 'transmission', 'application', 'also', 'efficiency', 'present', 'algorithm', 'supply', 'approach', 'time', 'current', 'renewable', 'solar', 'reduce', 'source', 'device', 'resource', 'new', 'utilization', 'low', 'show', 'simulation', 'communication', 'electric', 'different', 'demand', 'optimization', 'research', 'consider', 'test', 'problem', 'plant', 'well', 'technique', 'utilize', 'management', 'include', 'energy storage', 'development', 'electricity', 'strategy', 'condition', 'renewable energy', 'optimal', 'one', 'process', 'pv', 'enhance', 'unit', 'level', 'rate', 'need', 'due', 'scheme', 'work', 'fault', 'soluti

In [46]:
# Processing to convert title and abstract to text field.
print("Preprocessing text data...")
#search_keywords=['reliability', 'resilience', 'power system', 'capacity utilization'] # should be automitically retrieved from search script, but for the time beeing its manually input here...
df['processed_text'] = df['text'].apply(lambda x:preprocess_text(x,search_keywords=search_keywords))
print("Text preprocessing completed.")

Preprocessing text data...
Text preprocessing completed.


In [47]:
# preparing for analysis (need to fill the empty fieldsOfStudy with ''):
valid_fields= ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics', 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)

In [33]:
# simple analysis before topic modelling
# Usage
extract_keywords_and_ngrams(df,max_features=3000)

Results saved to Saved_files_new\term_frequencies_2025_06_09.json


In [185]:
fields_to_analyze = ['Computer Science', 'Engineering', 'Physics', 'Mathematics', 'Business', 'Environmental Science']

# Run the main analysis function
df_analyzed, all_author_stats, all_topic_names = analyze_papers_with_topic_names(
    df,
    fields_to_analyze=fields_to_analyze,
    n_papers=20,
    num_keywords=500,
    output_suffix="_results",
    keywords=search_keywords
)


2025-06-11 09:52:47,136 - INFO - Starting topic analysis
2025-06-11 09:52:47,479 - INFO - ✓ Initialized OpenAI client
2025-06-11 09:52:47,507 - INFO - ✓ Filtered to 18103 papers from fields: Computer Science, Engineering, Physics, Mathematics, Business, Environmental Science
2025-06-11 09:52:47,518 - INFO - 🔄 Starting LDA topic modeling...
2025-06-11 09:52:47,718 - INFO - collecting all words and their counts
2025-06-11 09:52:47,728 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-06-11 09:52:49,225 - INFO - PROGRESS: at sentence #10000, processed 1488682 words and 772532 word types
2025-06-11 09:52:50,505 - INFO - collected 1204526 token types (unigram + bigrams) from a corpus of 2642416 words and 18103 sentences
2025-06-11 09:52:50,505 - INFO - merged Phrases<1204526 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-06-11 09:52:50,505 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1204526 vocab, min_count=10, threshold=50, max_vocab_s


Processing topic 0
Papers for topic 0: 1147
Processing author: {'authorId': '153103985', 'name': 'C. Chong'}
Processing author: {'authorId': '49493559', 'name': 'G. Fedde'}
Processing author: {'authorId': '30639984', 'name': 'S. Borowski'}
Processing author: {'authorId': '144536104', 'name': 'J. S. Clark'}
Processing author: {'authorId': '84290723', 'name': 'R. Sefcik'}
Processing author: {'authorId': '5716306', 'name': 'R. Corban'}
Processing author: {'authorId': '96981914', 'name': 'S. Alexander'}
Processing author: {'authorId': '90263542', 'name': 'Hufiadi Hufiadi'}
Processing author: {'authorId': '72199515', 'name': 'Sugeng Hari Wisudo'}
Processing author: {'authorId': '101319241', 'name': 'A. Hamdi'}
Processing author: {'authorId': '31696831', 'name': 'B. Edwards'}
Processing author: {'authorId': '102533676', 'name': 'H. E. Bennett'}
Processing author: {'authorId': '96736303', 'name': 'J. Downer'}
Processing author: {'authorId': '69366785', 'name': 'David Eisenhaure'}
Processing 

2025-06-11 09:54:50,758 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:54:51,395 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:54:51,839 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:54:51,839 - INFO -   ✓ Topic 1: "Advanced Power Systems"
2025-06-11 09:54:52,483 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:54:52,957 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:54:53,425 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:54:53,427 - INFO -   ✓ Topic 2: Wireless Sensor Networks
2025-06-11 09:54:54,209 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:54:54,782 - INFO - HTTP Request: POST https://api.ope

No clear common topic name found. Increasing iterations to 5.


2025-06-11 09:55:02,435 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:02,969 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:03,411 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:03,797 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:04,183 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


No clear common topic name found. Increasing iterations to 7.


2025-06-11 09:55:04,694 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:06,862 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:07,422 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:07,805 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:08,643 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:09,088 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:09,462 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:09,465 - INFO -   ✓ Topic 6: Power System Fault Diagnosis
2025-06-11 09:55:10,085 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-11 09:55:10,5

LDA topic terms saved to Saved_files_new\lda_topic_terms_2025_06_11reliability_resilience_power_systems.json
LDA model saved to Saved_files_new\lda_model_2025_06_11reliability_resilience_power_systems.joblib
Vectorizer saved to Saved_files_new\vectorizer_2025_06_11reliability_resilience_power_systems.joblib
Topic distributions saved to Saved_files_new\topic_distributions_2025_06_11reliability_resilience_power_systems.npy

API Usage Statistics:
Total tokens: 32668
Estimated cost: $0.0049


### Testing of method classification

In [151]:
def debug_tfidf_scoring(df_sample, method_phrases):
    """Debug TF-IDF scoring to understand low scores"""
    print(f"Debugging with {len(method_phrases)} method phrases:")
    print(f"Method phrases: {method_phrases}")
    
    # Create vectorizer
    vectorizer = TfidfVectorizer(vocabulary=method_phrases, ngram_range=(1,3))
    tfidf_matrix = vectorizer.fit_transform(df_sample['processed_text'])
    
    print(f"TF-IDF Matrix shape: {tfidf_matrix.shape}")
    print(f"Non-zero elements: {tfidf_matrix.nnz}")
    
    if tfidf_matrix.shape[1] > 0:
        density = tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])
        print(f"Density: {density:.4f}")
        
        # Check individual phrase scores
        feature_names = vectorizer.get_feature_names_out()
        for i, phrase in enumerate(feature_names):
            phrase_scores = tfidf_matrix[:, i].toarray().ravel()
            max_score = phrase_scores.max()
            count_nonzero = (phrase_scores > 0).sum()
            print(f"  {phrase}: max_score={max_score:.4f}, appears_in={count_nonzero} docs")
    
    return tfidf_matrix, vectorizer

def classify_only_documents_with_llm_methods(df, sample_size=None):
    """
    Use LLM-assisted method phrase extraction but only classify documents that contain those methods
    """
    
    if sample_size:
        df_test = df.head(sample_size).copy()
    else:
        df_test = df.copy()
    
    logger.info(f"Processing {len(df_test)} documents")
    
    # Step 1: Extract candidate terms using your original function
    logger.info("🔄 Extracting candidate terms...")
    candidate_terms = extract_candidate_terms(df_test, max_features=1000)
    logger.info(f"Extracted {len(candidate_terms)} candidate terms")
    
    # Step 2: Get method phrases using your LLM function with improved prompt
    logger.info("🔄 Getting method phrases from LLM...")
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
    method_phrases = clean_method_phrases_fixed(method_phrases)
    logger.info(f"Got {len(method_phrases)} method phrases: {method_phrases}")
    
    # Step 3: Validate which phrases actually exist in the corpus
    all_text = ' '.join(df_test['processed_text']).lower()
    validated_phrases = []
    for phrase in method_phrases:
        phrase_lower = phrase.lower()
        if phrase_lower in all_text:
            validated_phrases.append(phrase)
            logger.info(f"  ✓ Found: '{phrase}'")
    
    logger.info(f"Validated {len(validated_phrases)}/{len(method_phrases)} phrases")
    
    # Step 4: SELECTIVE CLASSIFICATION - Only classify documents containing method terms
    if validated_phrases:
        # Identify documents that contain at least one method phrase
        def contains_method_phrase(text):
            text_lower = text.lower()
            return any(phrase in text_lower for phrase in validated_phrases)
        
        df_test['contains_method'] = df_test['processed_text'].apply(contains_method_phrase)
        docs_with_methods = df_test[df_test['contains_method']].copy()
        docs_without_methods = df_test[~df_test['contains_method']].copy()
        
        logger.info(f"Documents WITH methods: {len(docs_with_methods)}")
        logger.info(f"Documents WITHOUT methods: {len(docs_without_methods)}")
        
        # Step 5: Classify only documents with methods
        if len(docs_with_methods) > 0:
            logger.info("🔄 Classifying documents with method terms...")
            
            # TF-IDF on documents with methods
            tfidf_vectorizer = TfidfVectorizer(
                vocabulary=validated_phrases,
                ngram_range=(1, 3),
                min_df=1,
                max_df=0.99,
                sublinear_tf=True
            )
            
            tfidf_matrix = tfidf_vectorizer.fit_transform(docs_with_methods['processed_text'])
            method_scores = tfidf_matrix.max(axis=1).toarray().ravel()
            
            # Handle argmax correctly
            if tfidf_matrix.shape[1] == 1:
                primary_methods = [tfidf_vectorizer.get_feature_names_out()[0]] * len(docs_with_methods)
            else:
                argmax_indices = tfidf_matrix.argmax(axis=1).A1
                feature_names = tfidf_vectorizer.get_feature_names_out()
                primary_methods = [feature_names[i] for i in argmax_indices]
            
            # Assign results to documents with methods
            docs_with_methods['Primary_Method'] = primary_methods
            docs_with_methods['Method_Score'] = method_scores
            docs_with_methods['Method_Detected'] = primary_methods  # All get their primary method
            
            # Debug: Show method distribution for documents with methods
            method_counts_with = docs_with_methods['Primary_Method'].value_counts()
            logger.info(f"Method distribution (docs with methods):\n{method_counts_with}")
            
            # Show actual scores
            score_stats = docs_with_methods['Method_Score'].describe()
            logger.info(f"Score statistics (docs with methods):\n{score_stats}")
        
        # Step 6: Assign 'No_Method_Found' to documents without methods
        docs_without_methods['Primary_Method'] = 'No_Method_Found'
        docs_without_methods['Method_Score'] = 0.0
        docs_without_methods['Method_Detected'] = 'No_Method_Found'
        
        # Step 7: Combine results
        df_result = pd.concat([docs_with_methods, docs_without_methods]).sort_index()
        
        # Final results
        final_method_counts = df_result['Method_Detected'].value_counts()
        logger.info("=== FINAL CLASSIFICATION RESULTS ===")
        logger.info(f"Final method distribution:\n{final_method_counts}")
        
        # Show examples of classified documents
        logger.info("\nSample classifications:")
        classified_docs = df_result[df_result['Method_Detected'] != 'No_Method_Found'].head(5)
        for idx, row in classified_docs.iterrows():
            logger.info(f"  Doc {idx}: {row['Method_Detected']} (score: {row['Method_Score']:.4f})")
        
        return df_result
    
    else:
        logger.error("No validated method phrases found!")
        df_test['Method_Detected'] = 'No_Methods_Available'
        df_test['Method_Score'] = 0.0
        return df_test

# Test the combined approach
result_combined = classify_only_documents_with_llm_methods(df_analyzed, sample_size=500)




def debug_text_content(df_sample, method_phrases):
    """Debug the actual text content and phrase matching"""
    
    print("=== TEXT CONTENT ANALYSIS ===")
    
    # Sample a few documents
    for i in range(min(3, len(df_sample))):
        text = df_sample.iloc[i]['processed_text']
        print(f"\nDocument {i+1} (first 200 chars):")
        print(f"'{text[:200]}...'")
        
        # Check which phrases appear in this document
        text_lower = text.lower()
        found_phrases = []
        for phrase in method_phrases:
            if phrase in text_lower:
                found_phrases.append(phrase)
        
        print(f"Found phrases: {found_phrases}")
        
        # Check word overlap
        text_words = set(text_lower.split())
        phrase_words = set()
        for phrase in method_phrases:
            phrase_words.update(phrase.split())
        
        overlap = text_words & phrase_words
        print(f"Word overlap: {list(overlap)[:10]}")

# Run text content debug
debug_text_content(df_test.head(5), matches)


2025-06-10 12:35:33,233 - INFO - Processing 500 documents
2025-06-10 12:35:33,234 - INFO - 🔄 Extracting candidate terms...
2025-06-10 12:35:33,746 - INFO - Extracted 1000 candidate terms
2025-06-10 12:35:33,748 - INFO - 🔄 Getting method phrases from LLM...
2025-06-10 12:35:45,382 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-10 12:35:45,386 - INFO - Got 24 method phrases: ['time series analysis', 'dynamic programming', 'genetic algorithm', 'probabilistic analysis', 'load flow analysis', 'optimization techniques', 'monte carlo simulation', 'decision analysis', 'markov process', 'failure mode and effects analysis', 'reliability assessment', 'linear programming', 'monte carlo methods', 'statistical process control', 'sensitivity analysis', 'power quality assessment', 'system identification', 'finite element analysis', 'numerical methods', 'computer simulation', 'signal processing techniques', 'capacity planning methods', 'data envelopment

=== TEXT CONTENT ANALYSIS ===

Document 1 (first 200 chars):
'new electric locomotive pennsylvania railroad simplicity reliability keynote design new single-phase a-c electric locomotive pennsylvania railroad late improvement material manufacturing process fully...'
Found phrases: []
Word overlap: []

Document 2 (first 200 chars):
'digital computer manchester university new universal high-speed digital compute machine work compute machine laboratory manchester university described high-speed storage capacity binary digit interme...'
Found phrases: []
Word overlap: []

Document 3 (first 200 chars):
'reliability research cod circuitry abstract study indicate amplitude signal noise gaussian distribution threshold effect discount always increase average information signal power divide equally among ...'
Found phrases: []
Word overlap: []


In [154]:
def relaxed_method_phrase_matching(df, validated_phrases):
    """More flexible method phrase matching"""
    
    def flexible_contains_method(text):
        text_lower = text.lower()
        
        # Exact phrase match (current approach)
        if any(phrase in text_lower for phrase in validated_phrases):
            return True
            
        # Partial word matching for compound terms
        for phrase in validated_phrases:
            phrase_words = phrase.split()
            if len(phrase_words) > 1:
                # If 70% of phrase words appear in text
                word_matches = sum(1 for word in phrase_words if word in text_lower)
                if word_matches >= len(phrase_words) * 0.7:
                    return True
        
        # Stem/root word matching
        method_roots = ['simulat', 'analys', 'optim', 'model', 'assess', 'evaluat']
        if any(root in text_lower for root in method_roots):
            return True
            
        return False
    
    df['contains_method'] = df['processed_text'].apply(flexible_contains_method)
    return df


def relaxed_tfidf_classification(docs_with_methods, validated_phrases):
    """More relaxed TF-IDF classification"""
    
    tfidf_vectorizer = TfidfVectorizer(
        vocabulary=validated_phrases,
        ngram_range=(1, 4),     # Include 4-grams for complex method names
        min_df=1,               # Allow rare terms
        max_df=0.95,            # More permissive for common terms
        sublinear_tf=True,      # Helps with score distribution
        smooth_idf=True,        # Better handling of unseen terms
        norm='l2'               # L2 normalization for better comparison
    )
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(docs_with_methods['processed_text'])
    method_scores = tfidf_matrix.max(axis=1).toarray().ravel()
    
    # Very low threshold - classify more documents
    threshold = max(0.001, np.percentile(method_scores[method_scores > 0], 10))
    
    return method_scores, threshold

def hybrid_confidence_classification(df_test, validated_phrases):
    """Combine multiple signals for more confident classification"""
    
    # Method 1: Exact phrase matching (high confidence)
    exact_matches = df_test['processed_text'].apply(
        lambda text: any(phrase in text.lower() for phrase in validated_phrases)
    )
    
    # Method 2: Partial word matching (medium confidence)  
    partial_matches = df_test['processed_text'].apply(
        lambda text: any(
            sum(word in text.lower() for word in phrase.split()) >= len(phrase.split()) * 0.6
            for phrase in validated_phrases
        )
    )
    
    # Method 3: Method indicator words (low confidence)
    method_indicators = ['method', 'approach', 'technique', 'algorithm', 'analysis', 'simulation']
    indicator_matches = df_test['processed_text'].apply(
        lambda text: any(indicator in text.lower() for indicator in method_indicators)
    )
    
    # Assign confidence levels
    df_test['method_confidence'] = 'none'
    df_test.loc[indicator_matches, 'method_confidence'] = 'low'
    df_test.loc[partial_matches, 'method_confidence'] = 'medium'  
    df_test.loc[exact_matches, 'method_confidence'] = 'high'
    
    # Classify based on confidence
    classify_mask = df_test['method_confidence'].isin(['high', 'medium', 'low'])
    
    return df_test[classify_mask].copy(), df_test['method_confidence']


def relaxed_method_classification(df, sample_size=None):
    """Comprehensive relaxed method classification"""
    
    if sample_size:
        df_test = df.head(sample_size).copy()
    else:
        df_test = df.copy()
    
    logger.info(f"Processing {len(df_test)} documents with relaxed criteria")
    
    # Get method phrases (your existing LLM approach)
    candidate_terms = extract_candidate_terms(df_test, max_features=1000)
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
    method_phrases = clean_method_phrases_fixed(method_phrases)
    
    # Validate with relaxed criteria
    all_text = ' '.join(df_test['processed_text']).lower()
    validated_phrases = []
    for phrase in method_phrases:
        if (phrase.lower() in all_text or 
            any(word in all_text for word in phrase.split() if len(word) > 3)):
            validated_phrases.append(phrase)
    
    logger.info(f"Validated {len(validated_phrases)} phrases with relaxed matching")
    
    # Hybrid confidence classification
    docs_to_classify, confidence_levels = hybrid_confidence_classification(df_test, validated_phrases)
    
    logger.info(f"Documents to classify: {len(docs_to_classify)} ({len(docs_to_classify)/len(df_test)*100:.1f}%)")
    logger.info(f"Confidence distribution:\n{confidence_levels.value_counts()}")
    
    if len(docs_to_classify) > 0:
        # Relaxed TF-IDF
        method_scores, threshold = relaxed_tfidf_classification(docs_to_classify, validated_phrases)
        
        # Classification with confidence weighting
        tfidf_vectorizer = TfidfVectorizer(vocabulary=validated_phrases, ngram_range=(1,4))
        tfidf_matrix = tfidf_vectorizer.fit_transform(docs_to_classify['processed_text'])
        
        argmax_indices = tfidf_matrix.argmax(axis=1).A1
        feature_names = tfidf_vectorizer.get_feature_names_out()
        primary_methods = [feature_names[i] for i in argmax_indices]
        
        # Assign results
        docs_to_classify['Primary_Method'] = primary_methods
        docs_to_classify['Method_Score'] = method_scores
        docs_to_classify['Method_Detected'] = primary_methods
        
        # Merge back
        df_result = df_test.copy()
        df_result.loc[docs_to_classify.index, 'Primary_Method'] = primary_methods
        df_result.loc[docs_to_classify.index, 'Method_Score'] = method_scores
        df_result.loc[docs_to_classify.index, 'Method_Detected'] = primary_methods
        df_result['Method_Detected'] = df_result['Method_Detected'].fillna('No_Method_Found')
        
        # Results
        final_counts = df_result['Method_Detected'].value_counts()
        logger.info(f"Final classification distribution:\n{final_counts}")
        
        classified_pct = (len(df_result) - final_counts.get('No_Method_Found', 0)) / len(df_result) * 100
        logger.info(f"Classification coverage: {classified_pct:.1f}%")
        
        return df_result
    
    return df_test

# Test relaxed approach
result_relaxed = relaxed_method_classification(df_analyzed, sample_size=500)


2025-06-10 12:46:31,250 - INFO - Processing 500 documents with relaxed criteria
2025-06-10 12:46:44,751 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-10 12:46:44,759 - INFO - Validated 21 phrases with relaxed matching
2025-06-10 12:46:45,473 - INFO - Documents to classify: 450 (90.0%)
2025-06-10 12:46:45,475 - INFO - Confidence distribution:
method_confidence
medium    363
high       71
none       50
low        16
Name: count, dtype: int64
2025-06-10 12:46:46,116 - INFO - Final classification distribution:
Method_Detected
system reliability        419
nan                        50
data processing            11
design study               10
monte carlo simulation      3
performance evaluation      2
statistical analysis        2
data analysis               1
reliability assessment      1
failure analysis            1
Name: count, dtype: int64
2025-06-10 12:46:46,117 - INFO - Classification coverage: 100.0%


In [156]:
def targeted_method_classification(df, sample_size=None):
    """More targeted method classification with specific phrase matching"""
    
    if sample_size:
        df_test = df.head(sample_size).copy()
    else:
        df_test = df.copy()
    
    logger.info(f"Processing {len(df_test)} documents with targeted criteria")
    
    # Get method phrases (your existing LLM approach)
    candidate_terms = extract_candidate_terms(df_test, max_features=1000)
    client, model_type = initialize_openai()
    credit_tracker = CreditTracker()
    method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
    method_phrases = clean_method_phrases_fixed(method_phrases)
    
    logger.info(f"Original method phrases: {method_phrases}")
    
    # STRICTER VALIDATION: Require more specific matching
    def strict_phrase_validation(phrase, all_text):
        phrase_lower = phrase.lower()
        
        # Exact phrase match (highest confidence)
        if phrase_lower in all_text:
            return True, 'exact'
            
        # For compound phrases, require most words to be present
        words = phrase_lower.split()
        if len(words) > 1:
            word_matches = sum(1 for word in words if len(word) > 3 and word in all_text)
            # Require at least 80% of significant words
            if word_matches >= len([w for w in words if len(w) > 3]) * 0.8:
                return True, 'compound'
        
        # For single technical terms, be more selective
        if len(words) == 1 and len(phrase_lower) > 6:
            # Only include if it's a specific technical term, not generic
            generic_terms = ['analysis', 'method', 'approach', 'technique', 'system', 
                           'design', 'study', 'evaluation', 'assessment', 'processing']
            if phrase_lower not in generic_terms and phrase_lower in all_text:
                return True, 'technical'
        
        return False, 'none'
    
    # Validate with stricter criteria
    all_text = ' '.join(df_test['processed_text']).lower()
    validated_phrases = []
    phrase_types = []
    
    for phrase in method_phrases:
        is_valid, match_type = strict_phrase_validation(phrase, all_text)
        if is_valid:
            validated_phrases.append(phrase)
            phrase_types.append(match_type)
            logger.info(f"  ✓ {match_type.upper()}: '{phrase}'")
        else:
            logger.info(f"  ✗ REJECTED: '{phrase}'")
    
    logger.info(f"Validated {len(validated_phrases)} phrases with strict matching")
    
    # TARGETED DOCUMENT SELECTION: Only classify documents with specific method indicators
    def targeted_contains_method(text):
        text_lower = text.lower()
        
        # Exact phrase matches only
        exact_matches = [phrase for phrase in validated_phrases if phrase.lower() in text_lower]
        
        if exact_matches:
            return True, exact_matches
        
        # For compound phrases, check if key technical words appear together
        for phrase in validated_phrases:
            words = phrase.split()
            if len(words) > 1:
                # Check if words appear within reasonable proximity (within 50 characters)
                for i, word in enumerate(words[:-1]):
                    if word in text_lower:
                        word_pos = text_lower.find(word)
                        next_word = words[i+1]
                        nearby_text = text_lower[word_pos:word_pos+50]
                        if next_word in nearby_text:
                            return True, [phrase]
        
        return False, []
    
    # Apply targeted selection
    classification_data = []
    for idx, row in df_test.iterrows():
        has_method, found_phrases = targeted_contains_method(row['processed_text'])
        classification_data.append({
            'index': idx,
            'has_method': has_method,
            'found_phrases': found_phrases,
            'phrase_count': len(found_phrases)
        })
    
    # Select documents for classification
    docs_to_classify_indices = [d['index'] for d in classification_data if d['has_method']]
    docs_to_classify = df_test.loc[docs_to_classify_indices].copy()
    
    logger.info(f"Documents to classify: {len(docs_to_classify)} ({len(docs_to_classify)/len(df_test)*100:.1f}%)")
    
    # Show phrase distribution
    phrase_counts = {}
    for d in classification_data:
        for phrase in d['found_phrases']:
            phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
    
    logger.info("Phrase distribution in selected documents:")
    for phrase, count in sorted(phrase_counts.items(), key=lambda x: x[1], reverse=True):
        logger.info(f"  {phrase}: {count} documents")
    
    if len(docs_to_classify) > 0 and len(validated_phrases) > 0:
        # PRECISE TF-IDF CLASSIFICATION
        tfidf_vectorizer = TfidfVectorizer(
            vocabulary=validated_phrases,
            ngram_range=(1, 3),     # Focused n-gram range
            min_df=1,
            max_df=0.85,            # Slightly stricter than before
            sublinear_tf=True,
            norm='l2'
        )
        
        tfidf_matrix = tfidf_vectorizer.fit_transform(docs_to_classify['processed_text'])
        method_scores = tfidf_matrix.max(axis=1).toarray().ravel()
        
        # Handle classification
        if tfidf_matrix.shape[1] == 1:
            primary_methods = [tfidf_vectorizer.get_feature_names_out()[0]] * len(docs_to_classify)
        else:
            argmax_indices = tfidf_matrix.argmax(axis=1).A1
            feature_names = tfidf_vectorizer.get_feature_names_out()
            primary_methods = [feature_names[i] for i in argmax_indices]
        
        # Apply minimum score threshold to avoid weak assignments
        min_threshold = 0.1  # Require meaningful TF-IDF score
        final_methods = []
        for method, score in zip(primary_methods, method_scores):
            if score >= min_threshold:
                final_methods.append(method)
            else:
                final_methods.append('Low_Confidence_Method')
        
        # Assign results
        docs_to_classify['Primary_Method'] = primary_methods
        docs_to_classify['Method_Score'] = method_scores
        docs_to_classify['Method_Detected'] = final_methods
        
        # Merge back to full dataset
        df_result = df_test.copy()
        df_result['Method_Detected'] = 'No_Method_Found'
        df_result['Method_Score'] = 0.0
        
        df_result.loc[docs_to_classify.index, 'Primary_Method'] = primary_methods
        df_result.loc[docs_to_classify.index, 'Method_Score'] = method_scores
        df_result.loc[docs_to_classify.index, 'Method_Detected'] = final_methods
        
        # Results
        final_counts = df_result['Method_Detected'].value_counts()
        logger.info("=== TARGETED CLASSIFICATION RESULTS ===")
        logger.info(f"Final classification distribution:\n{final_counts}")
        
        # Show score statistics for classified documents
        classified_scores = df_result[df_result['Method_Detected'] != 'No_Method_Found']['Method_Score']
        if len(classified_scores) > 0:
            logger.info(f"Score statistics for classified documents:\n{classified_scores.describe()}")
        
        return df_result
    
    else:
        logger.warning("No documents selected for classification")
        df_test['Method_Detected'] = 'No_Methods_Available'
        return df_test

# Test targeted approach
result_targeted = targeted_method_classification(df_analyzed, sample_size=500)


2025-06-10 13:17:33,867 - INFO - Processing 500 documents with targeted criteria
2025-06-10 13:17:53,069 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-10 13:17:53,074 - INFO - Original method phrases: ['probabilistic modeling', 'power flow study', 'time series analysis', 'dynamic programming', 'load forecasting', 'fault analysis', 'load flow analysis', 'design review', 'optimization technique', 'monte carlo simulation', 'contingency analysis', 'reliability assessment', 'risk assessment', 'statistical process control', 'power quality assessment', 'sensitivity analysis', 'dynamic simulation', 'life cycle cost analysis', 'failure mode effect analysis', 'finite element analysis', 'numerical methods', 'data-driven approach', 'state estimation']
2025-06-10 13:17:53,081 - INFO -   ✗ REJECTED: 'probabilistic modeling'
2025-06-10 13:17:53,082 - INFO -   ✓ COMPOUND: 'power flow study'
2025-06-10 13:17:53,083 - INFO -   ✓ COMPOUND: 'time series a

In [123]:
    # Step 6: Hybrid Method Classification
#candidate_terms = extract_candidate_terms(df_analyzed, max_features=5000)
#client, model_type = initialize_openai()
#credit_tracker = CreditTracker()
#method_phrases = get_method_phrases(candidate_terms, client, model_type, credit_tracker)
logger.info(f"Method phrases: {method_phrases[:5]}... (total {len(method_phrases)})")

# Ensure method phrases are formatted as in processed_text
#method_phrases = clean_method_phrases(method_phrases)
#logger.info(f"✓ Cleaned method phrases: {method_phrases[:5]}... (total {len(method_phrases)})")

print(df['processed_text'].sample(5).tolist())
print(method_phrases)

all_tokens = set(' '.join(df['processed_text']).split())
#print a sample of tokens
print(f"Number of tokens: {len(all_tokens)}")
matches = set(method_phrases) & all_tokens
print(len(matches))
# Load the model
#lda_model = joblib.load("Saved_files_new/lda_model_2025_06_10reliability_resilience_power_systems.joblib")
# Load the vectorizer (needed to transform new documents)
#vectorizer = joblib.load("Saved_files_new/vectorizer_2025_06_10reliability_resilience_power_systems.joblib")




# Classify
df_analyzed = classify_methods_tfidf(
    df_analyzed,
    method_phrases=method_phrases,  # Only use phrases actually present
    threshold=0.005  # Lower threshold for more matches
)


2025-06-10 10:34:36,560 - INFO - Method phrases: ['time series analysis', 'dynamic programming', 'genetic algorithm', 'support vector machine', 'harmonic analysis']... (total 24)


['study improve utilization efficiency distribution transformer seasonal special application situation investment coal electricity policy economic indicator distribution network significantly decrease power supply capacity local distribution equipment limited asset utilization efficiency high utilization efficiency distribution transformer decrease firstly article introduces evaluation system key indicator utilization efficiency distribution transformer analyzes reason low utilization efficiency distribution transformer secondly base seasonal special application scenario coal electricity well irrigation select typical region analyze key indicator related utilization efficiency distribution transformer study change different application scenario addition effective prompt path method utilization efficiency distribution transformer study base special application scenario finally advantage disadvantage different scheme analyze compare strategy improve utilization efficiency distribution tr

2025-06-10 10:34:40,479 - INFO - Assigned methods to 0 documents


In [118]:
#print 20 samples of the columns "processed_text", "primary_method_index" and "primary_method"
print("Primary Method Index and Primary Method samples:")
print(df_analyzed[['processed_text', 'Primary_Method_Index', 'Primary_Method']].sample(20, random_state=42))

print(df_analyzed['Primary_Method_Index'].unique())
print(df_analyzed['Primary_Method'].unique())

Primary Method Index and Primary Method samples:


KeyError: "['Primary_Method_Index'] not in index"

## Other shit

In [None]:

# Analyze papers with topic naming
fields_to_analyze = ['Computer Science', 'Engineering', 'Physics', 'Mathematics', 'Business', 'Environmental Science']


# Run analysis with complete saving
df_analyzed = analyze_papers_with_topic_names(df, fields_to_analyze=fields_to_analyze, n_papers=10, num_keywords=3000,keywords=search_keywords)




2025-06-09 10:43:50,712 - INFO - Starting topic analysis
2025-06-09 10:43:51,045 - INFO - ✓ Initialized OpenAI client
2025-06-09 10:43:51,083 - INFO - ✓ Filtered to 18103 papers from fields: Computer Science, Engineering, Physics, Mathematics, Business, Environmental Science
2025-06-09 10:43:51,083 - INFO - 🔄 Starting LDA topic modeling...
2025-06-09 10:43:51,256 - INFO - collecting all words and their counts
2025-06-09 10:43:51,256 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types


Analyzing 18103 papers across all fields


2025-06-09 10:43:52,649 - INFO - PROGRESS: at sentence #10000, processed 1488682 words and 772532 word types
2025-06-09 10:43:53,765 - INFO - collected 1204526 token types (unigram + bigrams) from a corpus of 2642416 words and 18103 sentences
2025-06-09 10:43:53,765 - INFO - merged Phrases<1204526 vocab, min_count=10, threshold=50, max_vocab_size=40000000>
2025-06-09 10:43:53,765 - INFO - Phrases lifecycle event {'msg': 'built Phrases<1204526 vocab, min_count=10, threshold=50, max_vocab_size=40000000> in 2.51s', 'datetime': '2025-06-09T10:43:53.765772', 'gensim': '4.3.2', 'python': '3.11.11 (main, Jan 14 2025, 22:46:06) [MSC v.1942 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'created'}
2025-06-09 10:43:53,765 - INFO - collecting all words and their counts
2025-06-09 10:43:53,765 - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2025-06-09 10:43:57,066 - INFO - PROGRESS: at sentence #10000, processed 1443341 words and 791856 word types
2025-06


Processing topic 0
Papers for topic 0: 4729
Processing author: {'authorId': '95851456', 'name': 'Bennie Hoisington'}
Processing author: {'authorId': '96033858', 'name': 'Steven Begay'}
Processing author: {'authorId': '2369479', 'name': 'D. Koval'}
Processing author: {'authorId': '145874144', 'name': 'J. Carrasco'}
Processing author: {'authorId': '1756639', 'name': 'L. Franquelo'}
Processing author: {'authorId': '2442938', 'name': 'J. Bialasiewicz'}
Processing author: {'authorId': '145142685', 'name': 'E. Galván'}
Processing author: {'authorId': '2240913553', 'name': 'Ramón Portillo'}
Processing author: {'authorId': '2248348665', 'name': 'Guisado'}
Processing author: {'authorId': '2248341665', 'name': 'Ma Ángeles'}
Processing author: {'authorId': '2248348347', 'name': 'Martín Prats'}
Processing author: {'authorId': '2248336099', 'name': 'Ignacio León'}
Processing author: {'authorId': '1431231418', 'name': 'Narciso Moreno-Alfonso'}
Processing author: {'authorId': '2077892653', 'name': '

2025-06-09 10:47:55,315 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:47:56,165 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:47:56,852 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:47:56,865 - INFO -   ✓ Topic 1: **Renewable Energy Integration**



Topic 1: **Renewable Energy Integration**
LDA n-grams: power, energy, grid, generation, capacity, storage, load, distribution, renewable, supply, power system, operation, increase, cost, electricity, use, reliability, source, renewable energy, electric, pv, demand, distribute, network, solar, paper, utilization, transmission, improve, electrical, power generation, power grid, reduce, base, high, resource, line, also, control, power supply, microgrid, provide, new, result, storage system, photovoltaic, study, distribution network, energy source, present, distribution system, dg, electric power, utility, due, quality, smart_grid, integration, hybrid, unit, management, voltage, market, impact, technology, economic, level, need, large, loss, plant, efficiency, well, battery, instal, one, benefit, problem, equipment, penetration, generate, low, however, exist, consider, integrate, distribute generation, station, main, require, generator, energy system, transformer, operating, power quality

2025-06-09 10:47:57,917 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:47:58,628 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:47:59,239 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:47:59,241 - INFO -   ✓ Topic 2: "Cloud IoT Systems"



Topic 2: "Cloud IoT Systems"
LDA n-grams: iot, memory, cloud, cloud_compute, dan, la, platform, en, el, blockchain, untuk, virtualization, pada, wsn, dengan, wireless_sensor_network, cyber-physical, gpu, de_la, wsns, compute, que, di, dari, per, un, gpus, sistem, noc, iot device, runtime, una, cyber-physical system, internet_thing_iot, dalam, io, del, hpc, autonomous_vehicle, ssd, virtual_machine, data, plts, e, daya, thread, cps, adalah, dapat, iot system, flash_memory, menggunakan, sebesar, analisis, nvm, yaitu, observer, en_el, cloud storage, exascale, energi, edge_compute, sensor node, datacenter, listrik, hadoop, scalability, ini, sebagai, en_la, akan, kapasitas, iot application, program, menjadi, wireless_sensor_network wsns, mem, cloud-based, virtualization technology, con, row, cloud service, nilai, java, pembangkit, fog_compute, tegangan, dilakukan, cloud_server, neuromorphic, compute node, ssds, reram, encryption, mlc, pln, secara, terhadap, como, de_los, kalimantan, cloud e

2025-06-09 10:47:59,816 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:00,350 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:00,867 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


No clear common topic name found. Increasing iterations to 5.


2025-06-09 10:48:01,350 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:01,818 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:02,236 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:02,716 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:03,050 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:03,050 - INFO -   ✓ Topic 3: "Battery Power Management"



Topic 3: "Battery Power Management"
LDA n-grams: power, battery, high, design, use, test, control, module, temperature, device, charge, vehicle, low, capacity, drive, supply, current, unit, cell, motor, reliability, output, material, efficiency, structure, component, operation, pump, reduce, circuit, condition, provide, cable, power supply, thermal, method, application, flow, two, electric, utilize, connect, type, field, machine, performance, require, range, space, electrical, speed, development, work, developed, cool, pressure, electric_vehicle, solar, mechanical, life, maximum, one, engine, cost, characteristic, source, time, phase, control system, increase, voltage, result, safety, energy, equipment, density, light, present, include, operating, discharge, main, improve, requirement, utilized, pulse, comprises, small, array, large, surface, size, input, laser, part, monitoring, high power, also, mission, effect, mode, electronic, measure, station, tank, switch, storage, hybrid, stre

2025-06-09 10:48:04,221 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:04,766 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:05,271 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:05,273 - INFO -   ✓ Topic 4: Wireless Resource Allocation



Topic 4: Wireless Resource Allocation
LDA n-grams: network, channel, communication, wireless, scheme, transmission, capacity, user, power, allocation, antenna, signal, resource, interference, transmit, rate, performance, communication system, relay, spectrum, mimo, link, power allocation, result, receiver, multiple, wireless communication, radio, optical, mobile, bandwidth, cellular, base, consider, simulation, throughput, receive, station, problem, resource allocation, show, packet, modulation, traffic, simulation result, coverage, gain, cell, data rate, access, cod, optimal, derive, transmit power, base station, investigate, transmitter, constraint, diversity, ofdm, power control, system capacity, wireless network, propose scheme, utilize, frequency, protocol, average, latency, service, two, selection, spectral_efficiency, high, channel capacity, beamforming, adaptive, uplink, technique, noma, downlink, number, achieve, improve, increase, information, fading, different, decode, pape

2025-06-09 10:48:06,002 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:06,555 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:06,978 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:06,978 - INFO -   ✓ Topic 5: Resilience in Infrastructure Management



Topic 5: Resilience in Infrastructure Management
LDA n-grams: research, development, technology, resource, study, management, water, project, process, infrastructure, industry, need, analysis, include, service, use, provide, information, framework, risk, potential, support, develop, future, program, engineering, market, well, production, safety, new, data, approach, challenge, area, tool, make, work, identify, also, human, focus, review, model, understand, community, policy, change, many, level, business, within, activity, would, quality, design, issue, may, create, one, assessment, security, technical, capacity, address, plan, state, complex, impact, product, year, environment, maintenance, environmental, could, factor, sustainable, facility, transportation, economic, strategy, part, sector, effort, opportunity, help, knowledge, report, planning, key, building, measure, operational, decision, company, concept, objective, role, urban, national, example, world, developed, manufacturing

2025-06-09 10:48:07,615 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:08,135 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:08,958 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:08,960 - INFO -   ✓ Topic 6: **Thermal Energy Systems**



Topic 6: **Thermal Energy Systems**
LDA n-grams: plant, energy, power plant, heat, fuel, thermal, cool, efficiency, gas, use, water, temperature, consumption, study, solar, production, turbine, hydrogen, energy storage, air, heating, nuclear, building, performance, steam, produce, coal, emission, unit, analysis, de, biomass, reactor, utilization, fuel_cell, kw, combine, capacity, generate, result, process, cycle, natural_gas, rate, gas_turbine, technology, geothermal, harvest, electricity, power generation, compare, energy consumption, waste, nuclear power, power, boiler, potential, mw, thermal energy, conversion, cost, engine, chp, co, storage, show, low, generation, combustion, total, solar energy, generator, waste_heat, design, heat power, energy system, source, operating, cogeneration, compressor, integrate, environmental, cold, heat_exchanger, recovery, reduce, oil, pressure, steam_turbine, heat_pump, heat_transfer, ratio, utilize, alternative, hour, condition, high, refrigeratio

2025-06-09 10:48:09,894 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:10,350 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:10,766 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:10,766 - INFO -   ✓ Topic 7: Wind Power Control Systems



Topic 7: Wind Power Control Systems
LDA n-grams: wind, power, control, voltage, wind power, converter, dc, current, fault, wind_turbine, output, frequency, speed, controller, reactive, strategy, wind_farm, wind energy, generator, reactive power, connect, control strategy, propose, ac, simulation, switch, conversion, wind speed, grid, active, topology, harmonic, compensation, method, circuit, transformer, active power, filter, scheme, hvdc, three-phase, high, mode, power converter, variable, transient, base, capacity, wind solar, characteristic, propose control, fault current, conventional, yang, stability, side, wind generation, protection, operation, generation system, variation, power output, line, power factor, conversion system, phase, coordinate, circuit_breaker, rectifier, power compensation, dynamic, connection, series, dfig, excitation, energy conversion, statcom, capacity credit, offshore_wind, unit, input, high voltage, source, voltage current, mppt, constant, power fluctuat

2025-06-09 10:48:11,665 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:12,100 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:12,553 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:12,553 - INFO -   ✓ Topic 8: "Smart Communication Networks"



Topic 8: "Smart Communication Networks"
LDA n-grams: use, data, performance, propose, power, design, application, reliability, high, network, base, control, paper, provide, technology, device, improve, algorithm, time, increase, technique, result, also, communication, capacity, low, present, show, however, approach, achieve, reduce, efficiency, utilize, information, architecture, consumption, solution, utilization, due, new, method, well, node, enhance, different, work, efficient, require, number, compare, analysis, requirement, sensor, large, challenge, monitoring, one, resilience, processing, dynamic, various, novel, level, two, signal, management, environment, make, process, order, service, compute, error, problem, implement, power consumption, demonstrate, real-time, issue, first, distribute, detection, operation, failure, exist, need, support, resource, address, limited, simulation, cost, feature, scheme, hardware, rate, capability, component, model, user, enable, mechanism, util

2025-06-09 10:48:13,183 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:13,603 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:14,398 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:14,415 - INFO -   ✓ Topic 9: "Parallel Inverter Control"



Topic 9: "Parallel Inverter Control"
LDA n-grams: inverter, control, parallel, voltage, switch, objective_function, ups, share, upfc, balance, frequency, power share, modulation, load, voltage source, loss, load balance, bus, fuzzy_logic, droop, method, voltage control, current control, output voltage, dc link, phase, pv inverter, droop_control, parallel operation, unbalance, simulation result, pmu, dc voltage, ups system, self-healing, solar_farm, boost, redundancy, method verify, thd, fault_diagnosis, synchronous, paper_proposes, svc, stable, source inverter, uninterruptible, loop, markov_chain, circuit, circulate_current, matlabsimulink, inverter system, frequency voltage, pwm, uninterruptible power, real, real power, parking_lot, motor_drive, inverter control, kva, multilevel_inverter, pv-statcom, dvr, total_harmonic_distortion, redundant, sub, propose, load share, simulation experiment, wide_area, controller design, wide-area, paper present, therefore propose, paper_proposes_nove

2025-06-09 10:48:15,100 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:15,655 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:16,428 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-09 10:48:16,432 - INFO -   ✓ Topic 10: **Power System Reliability Optimization**



Topic 10: **Power System Reliability Optimization**
LDA n-grams: model, reliability, method, power, propose, use, power system, capacity, base, result, optimal, optimization, energy storage, load, consider, network, paper, algorithm, approach, analysis, simulation, problem, distribution, evaluation, study, index, transmission, different, present, test, technique, constraint, uncertainty, obtain, evaluate, cost, determine, resilience, flow, show, strategy, planning, system reliability, dynamic, loss, solve, generation, parameter, propose method, assessment, state, utilized, apply, failure, schedule, developed, impact, calculate, value, two, methodology, line, probability, condition, framework, prediction, solution, result show, time, compare, effectiveness, level, effect, bus, performance, operation, unit, probabilistic, scenario, component, finally, calculation, include, estimate, estimation, stability, analyze, configuration, outage, simulation result, risk, improve, order, stochasti

2025-06-09 10:48:28,249 - INFO - ✓ All files saved
2025-06-09 10:48:28,250 - INFO - 🎉 Analysis completed successfully! Total time: 277.54s (4.6 minutes)


LDA model saved to Saved_files_new\lda_model_2025_06_09reliability_resilience_power_systems.joblib
Vectorizer saved to Saved_files_new\vectorizer_2025_06_09reliability_resilience_power_systems.joblib
Topic distributions saved to Saved_files_new\topic_distributions_2025_06_09reliability_resilience_power_systems.npy

API Usage Statistics:
Total tokens: 105559
Estimated cost: $0.0158


In [16]:
df_analyzed

(                                        paperId  \
 0      29547a16681084e33162cba2a0455d623f29c605   
 1      c6777c290adecf28c1a7ed29e263109226e27f16   
 2      3dad02dd79b50bfadb717f9ea16bd63c23e62ba6   
 3      a9518ff96ebb481e3dea263facd0fa60d3f7c875   
 4      918e064372add35cf4793329377870b66c5a2d63   
 ...                                         ...   
 28929  403b276995326706cba4bd93028c35d2ee3e3ce6   
 28930  1efcd82effbac857b429bd4dbf47af91da11838e   
 28931  4856677fa0eccb4bb9ba2b8770097bceaf6fc544   
 28932  7c4b5647b8ae7318f27075f71a39ff32a6dacd4c   
 28933  f6e597960e3ffa75b2e96a547001d945a4d1a9fc   
 
                                                    title  \
 0      A new electric locomotive for the Pennsylvania...   
 1             Digital computers at Manchester University   
 2                 RELIABILITY RESEARCH; CODING CIRCUITRY   
 3             SAFEGUARDING OUR MINERAL-DEPENDENT ECONOMY   
 4      Engineering Features of the Union Oil-Shale Re...   
 ...    

In [None]:
# Define valid fields and clean fields of study
valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics', 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)
# Filter out papers with excluded fields
exclude_fields = ['Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Medicine', 'Political Science', 'Psychology', 'Com']
df_filtered = df[df['fieldsOfStudy'].apply(lambda x: not set(x).issubset(set(exclude_fields)))]
# Get unique fields of study
unique_fields = set([field for fields in df_filtered['fieldsOfStudy'] for field in fields if field not in exclude_fields])
