In [1]:

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.lda_model
import numpy as np
from openai import OpenAI
import configparser
#import tiktoken
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from difflib import SequenceMatcher
from datetime import datetime
import json
import ast
import csv
import os

# Directory that the notebook reads its input CSV from and writes result files to.
# It is created up front so later cells can save without checking for it.
SAVE_DIR = "Saved_files"
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# Download the NLTK resources used below: tokenizer models, the English
# stop-word list and the WordNet data behind the lemmatizer.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads in recent NLTK
# releases; 'punkt' is kept so that older installations continue to work.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def get_custom_stop_words(search_keywords=None):
    """Build the stop-word set used when preprocessing paper text.

    The set is NLTK's English stop words plus a list of terms that are typical
    for scientific papers (citation, figure/table and publication vocabulary),
    minus every search keyword, so that the terms the corpus was searched for
    are never filtered out.

    Args:
        search_keywords: Optional iterable of keyword strings (a single string
            is treated as one keyword). Multi-word keywords protect both the
            full phrase and each individual word. Matching is case-insensitive.

    Returns:
        set[str]: Lower-case tokens to drop during preprocessing.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(search_keywords, str):
        search_keywords = [search_keywords]

    # Words to keep: each keyword as a whole phrase and split into single words.
    words_to_keep = set()
    for keyword in search_keywords or ():
        keyword = keyword.lower()
        words_to_keep.add(keyword)
        words_to_keep.update(keyword.split())

    # Scientific paper terms to exclude (added to the stop words)
    scientific_terms = {
        # Citation terms
        'et', 'al', 'ref', 'reference', 'references', 'cited', 'cite',
        # Figure and table references
        'fig', 'figure', 'figures', 'table', 'tables', 'chart', 'charts',
        # Publication terms
        'published', 'journal', 'conference', 'proceedings',
        # Volume, page and identifier abbreviations
        'vol', 'volume', 'pp', 'page', 'pages', 'doi'
    }

    stop_words = set(stopwords.words('english')) | scientific_terms

    # Remove the keywords last, so a keyword that is also a "scientific term"
    # (e.g. 'reference' or 'volume') is not re-added to the stop words.
    return stop_words - words_to_keep


[nltk_data] Downloading package punkt to C:\Users\STSI/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STSI/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\STSI/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Preprocess function
def preprocess_text(text, search_keywords=None):
    """Normalise one document into a space-separated string of lemmatised tokens.

    Steps: lower-case, strip every character that is not a letter, digit,
    whitespace or hyphen, tokenise with NLTK, drop stop words (see
    get_custom_stop_words) and lemmatise the remaining tokens with WordNet.

    Args:
        text: The raw document. Strings and numbers are accepted; any other
            type, and missing values (NaN), yield an empty string.
        search_keywords: Optional keywords that must never be treated as stop
            words; passed through to get_custom_stop_words.

    Returns:
        str: The processed tokens joined by single spaces ('' for missing input).
    """
    if not isinstance(text, (str, int, float)):
        return ''
    # Missing values arrive from pandas as float NaN; str(nan) would otherwise
    # inject the bogus token "nan" into the corpus.
    if not isinstance(text, str) and pd.isna(text):
        return ''

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s-]', '', text)

    # Split text into individual words/tokens using NLTK's tokenizer
    tokens = word_tokenize(text)

    stop_words = get_custom_stop_words(search_keywords)
    # WordNet lemmatizer reduces words to their base/dictionary form
    # (e.g. "systems" -> "system").
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(lemmas)


# Function to convert string representation of list to actual list, replacing long strings with "Unknown"
def string_to_list(s):
    """Parse a string such as "['A', 'B']" into a list of field names.

    The string is split on commas after removing the surrounding brackets;
    whitespace and single or double quotes around each field are stripped.
    Fields that are blank or longer than 100 characters are replaced with
    "Unknown", so empty input ('' or '[]') yields ["Unknown"].

    Args:
        s: The string to parse. Non-string values yield ["Unknown"].

    Returns:
        list[str]: One entry per comma-separated field.
    """
    if not isinstance(s, str):
        return ["Unknown"]

    cleaned = []
    for raw_field in s.strip().strip('[]').split(','):
        field = raw_field.strip().strip("'\"").strip()
        # Blank fields get the same placeholder as overlong ones.
        cleaned.append(field if field and len(field) <= 100 else "Unknown")
    return cleaned

# Clean fields of study
def clean_fields_of_study(s):
    """Convert a raw 'fieldsOfStudy' value into a list of recognised field names.

    Accepts either the string form read from the CSV (e.g. "['Physics', 'Biology']")
    or an already parsed list/tuple, so applying the function twice to the same
    column is harmless. Every entry that is not in the list of valid fields is
    replaced with "Unknown".

    Args:
        s: Raw cell value: a string, a list/tuple of field names, or a missing
            value (None/NaN).

    Returns:
        list[str]: One entry per input field; ["Unknown"] when the value is
        missing, empty or of an unsupported type.
    """
    # NOTE(review): 'Com' looks like a truncated field name; it is kept because
    # the notebook's other field lists contain it as well.
    valid_fields = {'Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics', 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com'}

    if isinstance(s, (list, tuple)):
        # Already parsed (e.g. the cleaning cell was run a second time).
        fields = list(s)
    elif isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
    else:
        # None, NaN and any other non-text value.
        return ["Unknown"]

    cleaned_fields = [
        field if isinstance(field, str) and field in valid_fields else "Unknown"
        for field in fields
    ]
    return cleaned_fields if cleaned_fields else ["Unknown"]

## Simple analysis to extract keyword and n-gram frequency

In [15]:
def get_term_frequencies(vectorizer, texts):
    """Fit ``vectorizer`` on ``texts`` and return total term counts, highest first.

    Args:
        vectorizer: Object providing ``fit_transform`` and
            ``get_feature_names_out`` (a CountVectorizer in this notebook).
        texts: The documents to count terms in.

    Returns:
        dict: Term -> summed count over all documents, ordered by descending
        count; terms with equal counts keep the vectorizer's feature order.
    """
    count_matrix = vectorizer.fit_transform(texts)
    vocabulary = vectorizer.get_feature_names_out()
    # Column sums give one total per term; .A1 flattens the 1 x n result.
    totals = count_matrix.sum(axis=0).A1.tolist()

    ranked = list(zip(vocabulary, totals))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def extract_keywords_and_ngrams(df, max_features=1000):
    """Count keywords, bigrams and trigrams in df['processed_text'] and save them as JSON.

    Three CountVectorizers are used, all ignoring English stop words and terms
    that occur in more than 97% of the documents or in fewer than two. The
    single-word vectorizer additionally restricts tokens to letters and inner
    hyphens. The result is written to a dated file in SAVE_DIR and the path is
    printed; nothing is returned.

    Args:
        df: DataFrame with a 'processed_text' column.
        max_features: Maximum vocabulary size kept by each vectorizer.
    """
    # Settings common to all three vectorizers
    shared_options = dict(
        max_df=0.97,
        min_df=2,
        stop_words='english',
        max_features=max_features,
    )
    vectorizers = {
        'keywords': CountVectorizer(
            token_pattern=r'(?u)\b[A-Za-z][A-Za-z-]+[A-Za-z]\b',
            **shared_options
        ),
        'bigrams': CountVectorizer(ngram_range=(2,2), **shared_options),
        'trigrams': CountVectorizer(ngram_range=(3,3), **shared_options),
    }

    # Extract frequencies (keywords first, then bigrams, then trigrams)
    documents = df['processed_text']
    results = {
        label: get_term_frequencies(vectorizer, documents)
        for label, vectorizer in vectorizers.items()
    }

    # Save results under a file name carrying today's date
    current_date = datetime.now().strftime("%Y_%m_%d")
    filename = os.path.join(SAVE_DIR, f'term_frequencies_{current_date}.json')
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Results saved to {filename}")

In [6]:

def extract_topic_keywords(lda_model, vectorizer, num_words=10):
    """Collect the highest-weighted terms of every topic of a fitted model.

    Args:
        lda_model: Fitted model exposing ``components_`` (one weight row per topic).
        vectorizer: Fitted vectorizer whose ``get_feature_names_out`` maps
            column indices to terms.
        num_words: Number of terms to keep per topic.

    Returns:
        dict: Topic index -> {'top_words': [term, ...],
        'word_weights': [(term, weight), ...]}, both ordered by descending weight.
    """
    vocabulary = vectorizer.get_feature_names_out()
    keywords_by_topic = {}

    for topic_number, weights in enumerate(lda_model.components_):
        # Indices of the num_words largest weights, largest first
        ranked_indices = weights.argsort()[:-num_words-1:-1]

        keywords_by_topic[topic_number] = {
            'top_words': [vocabulary[i] for i in ranked_indices],
            'word_weights': [(vocabulary[i], weights[i]) for i in ranked_indices],
        }

    return keywords_by_topic

# Model topics function
def model_topics(df, field, num_topics=10, num_words=5):
    """Fit an LDA topic model on the abstracts of all papers tagged with ``field``.

    Args:
        df: DataFrame with a 'fieldsOfStudy' column (a container of field names
            per row) and an 'abstract' column.
        field: Field of study to analyse; a paper is included when ``field`` is
            contained in its 'fieldsOfStudy' value.
        num_topics: Number of LDA topics.
        num_words: Kept for backward compatibility. NOTE(review): it is not
            forwarded, so the number of keywords per topic is
            extract_topic_keywords' own default; forwarding it would change the
            existing output from 10 to 5 words per topic.

    Returns:
        tuple: (lda_model, vectorizer, topic_distribution, df_field,
        topic_keywords). Five ``None`` values are returned when no paper
        matches the field or when no vocabulary can be built from the matching
        abstracts.
    """
    # Keep only the documents within the field.
    df_field = df[df['fieldsOfStudy'].apply(lambda x: field in x)]
    print (f"Analyzing {len(df_field)} papers")
    if df_field.empty:
        print(f"No papers found for field: {field}")
        return None, None, None, None, None

    # Missing abstracts become empty documents so vectorizing does not fail on NaN.
    text_data = df_field['abstract'].fillna('')

    # max_df=0.95 drops terms that appear in more than 95% of the documents,
    # min_df=2 drops terms that appear in fewer than 2 documents (values below 1
    # are proportions, values of 1 and above are document counts).
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    try:
        doc_term_matrix = vectorizer.fit_transform(text_data)
    except ValueError as exc:
        # Raised for very small or empty corpora (no term survives the pruning);
        # skip the field instead of aborting a loop over several fields.
        print(f"Skipping field {field}: could not build a vocabulary ({exc})")
        return None, None, None, None, None

    # Fit the LDA model, see
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
    # random_state is fixed so that re-running produces the same topics; the
    # default (None) uses numpy's global random state and may give different results.
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    topic_distribution = lda_model.fit_transform(doc_term_matrix)
    topic_keywords = extract_topic_keywords(lda_model, vectorizer)

    for topic_idx, keywords in topic_keywords.items():
        print(f"\nTopic {topic_idx + 1}:")
        # Format each word with its weight in parentheses
        formatted_words = [f"{word} ({weight:.2f})"
                           for word, weight in keywords['word_weights']]
        print(", ".join(formatted_words))
    return lda_model, vectorizer, topic_distribution, df_field, topic_keywords

def classify_papers(topic_distributions, df_field):
    """Classify papers based on their topic distributions.

    For every paper the most and second most probable topics are determined,
    together with a dominance ratio: the primary topic's probability divided by
    the summed probability of all other topics. A high ratio means the paper is
    almost exclusively about its primary topic.

    Args:
        topic_distributions: One row of topic probabilities per paper.
        df_field: Not used by the computation; kept so existing calls keep working.

    Returns:
        list[dict]: One dict per paper with the keys 'paper_idx' (row position),
        'primary_topic', 'secondary_topic' (None when the model has only one
        topic), 'primary_score' and 'dominance_ratio'. Topic ids are plain
        Python ints so the result can be serialised to JSON.
    """
    paper_classifications = []

    for idx, dist in enumerate(topic_distributions):
        dist = np.asarray(dist, dtype=float)
        # Topic indices ordered from most to least probable
        ranked_topics = np.argsort(dist)[::-1]

        primary_topic = int(ranked_topics[0])
        # A one-topic model has no runner-up.
        secondary_topic = int(ranked_topics[1]) if ranked_topics.size > 1 else None

        primary_score = float(dist[primary_topic])
        other_topics_sum = float(dist.sum()) - primary_score
        # The small constant avoids dividing by zero when the primary topic
        # holds all of the probability mass.
        dominance_ratio = primary_score / (other_topics_sum + 1e-10)

        paper_classifications.append({
            'paper_idx': idx,
            'primary_topic': primary_topic,
            'secondary_topic': secondary_topic,
            'primary_score': primary_score,
            'dominance_ratio': dominance_ratio
        })

    return paper_classifications

# Get top papers per topic per field

def _parse_author_list(authors, verbose=False):
    """Normalise a raw 'authors' cell into a list of {'name', 'id'} dicts."""
    # The CSV stores the author list as the string form of a Python list.
    if isinstance(authors, str):
        try:
            authors = ast.literal_eval(authors)
        except (ValueError, SyntaxError):
            print(f"Failed to parse authors string: {authors}")
            authors = []

    if not isinstance(authors, list):
        print(f"Unexpected authors format: {authors}")
        return []

    author_list = []
    for author in authors:
        if verbose:
            print(f"Processing author: {author}")
        if isinstance(author, dict):
            author_list.append({
                'name': author.get('name', 'Unknown'),
                'id': author.get('authorId', 'Unknown')
            })
        else:
            print(f"Unexpected author format: {author}")
    return author_list


def get_top_papers(paper_classifications, df_field, n_top=5, verbose=False):
    """Select the most topic-focused papers per topic and gather author statistics.

    Papers are grouped by their primary topic and ranked by dominance ratio
    (highest first); the ``n_top`` best papers of each topic are kept.

    Args:
        paper_classifications: Output of classify_papers; 'paper_idx' is the
            row position within ``df_field``.
        df_field: DataFrame with 'paperId', 'title', 'abstract' and 'authors'
            columns. 'authors' may be a list of dicts or its string form.
        n_top: Number of papers to keep per topic.
        verbose: When True, print every author while it is processed
            (debug output that used to be printed unconditionally).

    Returns:
        tuple: (top_papers, author_topic_stats).
        top_papers maps topic id -> list of paper dicts ('paperId', 'title',
        'abstract', 'authors', 'score', 'dominance_ratio').
        author_topic_stats maps author id -> {'name', 'topics', 'total_papers',
        'top_papers'}, where 'topics' maps topic id -> {'paper_count',
        'avg_dominance', 'top_papers'}. Only papers that made a top list are
        counted. Topic ids are plain Python ints in both dictionaries so they
        can be serialised to JSON.
    """
    top_papers = {}
    author_topic_stats = {}

    # Unique primary topics, converted from numpy integers to plain ints.
    topics = sorted({int(p['primary_topic']) for p in paper_classifications})

    for topic in topics:
        print(f"\nProcessing topic {topic}")
        topic_papers = [p for p in paper_classifications if p['primary_topic'] == topic]
        print(f"Papers for topic {topic}: {len(topic_papers)}")
        # Highest dominance ratio first
        topic_papers.sort(key=lambda x: x['dominance_ratio'], reverse=True)
        top_papers[topic] = []

        for p in topic_papers[:n_top]:
            row = df_field.iloc[p['paper_idx']]
            title = row['title']
            dominance_ratio = float(p['dominance_ratio'])
            author_list = _parse_author_list(row['authors'], verbose=verbose)

            top_papers[topic].append({
                'paperId': row['paperId'],
                'title': title,
                'abstract': row['abstract'],
                'authors': author_list,
                'score': float(p['primary_score']),
                'dominance_ratio': dominance_ratio
            })

            for author in author_list:
                author_entry = author_topic_stats.setdefault(author['id'], {
                    'name': author['name'],
                    'topics': {},
                    'total_papers': 0,
                    'top_papers': 0
                })
                topic_entry = author_entry['topics'].setdefault(topic, {
                    'paper_count': 0,
                    'avg_dominance': 0,
                    'top_papers': []
                })

                # Running mean of the dominance ratio over the author's papers
                # in this topic.
                topic_entry['paper_count'] += 1
                topic_entry['avg_dominance'] = (
                    (topic_entry['avg_dominance'] * (topic_entry['paper_count'] - 1) +
                     dominance_ratio) / topic_entry['paper_count']
                )
                topic_entry['top_papers'].append({
                    'title': title,
                    'dominance_ratio': dominance_ratio
                })

                author_entry['total_papers'] += 1
                author_entry['top_papers'] += 1

    return top_papers, author_topic_stats


def print_author_analysis(author_topic_stats, min_papers=2):
    """Print a per-author report of topics and top papers.

    Args:
        author_topic_stats: Mapping of author id -> statistics with the keys
            'name', 'total_papers' and 'topics' (topic -> {'paper_count',
            'avg_dominance', 'top_papers'}), as produced by get_top_papers.
        min_papers: Authors with fewer papers in the top lists are skipped.
    """
    print("\nAuthor Analysis:")
    for author_record in author_topic_stats.values():
        # Skip authors below the reporting threshold.
        if not (author_record['total_papers'] >= min_papers):
            continue

        print(f"\nAuthor: {author_record['name']}")
        print(f"Total papers in top lists: {author_record['total_papers']}")
        print("Topics:")
        for topic, per_topic in author_record['topics'].items():
            print(f"\nTopic {topic}:")
            print(f"  Paper count: {per_topic['paper_count']}")
            print(f"  Average dominance ratio: {per_topic['avg_dominance']:.4f}")
            print("  Top papers:")
            for entry in per_topic['top_papers']:
                print(f"    - {entry['title']} (dominance: {entry['dominance_ratio']:.4f})")


In [11]:
# Old code, kept for reference only: the triple-quoted strings below are never executed.
                    
""" 
def get_top_papers_per_topic_per_field(df, text_column, field_column, fields_to_analyze, num_topics=5, n_top=5):
    all_top_papers = {}
    for field in fields_to_analyze:
        print(f"\nProcessing field: {field}")
        df_field = df[df[field_column].apply(lambda x: field in x)]
        if df_field.empty:
            print(f"No papers found for field: {field}")
            continue
        text_data = df_field[text_column].fillna('')
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        doc_term_matrix = vectorizer.fit_transform(text_data)
        lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        topic_distributions = lda_model.fit_transform(doc_term_matrix) #kjører ny lda model fitting. 
        
        
        top_papers = {}
        for topic in range(num_topics):
            topic_scores = topic_distributions[:, topic]
            other_topics_sum = topic_distributions.sum(axis=1) - topic_scores
            topic_dominance = topic_scores / (other_topics_sum + 1e-10)  # Avoid division by zero
            top_indices = np.argsort(topic_dominance)[-n_top:][::-1]
            top_papers[topic] = [
                {
                    'paperId': df_field.iloc[i]['paperId'],  # Include paperId
                    'title': df_field.iloc[i]['title'],
                    'document': df_field.iloc[i][text_column],
                    'abstract': df_field.iloc[i]['abstract'],
                    'score': float(topic_scores[i]),
                    'dominance_ratio': float(topic_dominance[i])
                }
                for i in top_indices
            ]
        all_top_papers[field] = top_papers
        
        # Print top words for each topic
        feature_names = vectorizer.get_feature_names_out()
        for topic_idx, topic in enumerate(lda_model.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    
    return all_top_papers
"""
"""def classify_paper(row):
    # Get all topic columns
    topic_columns = [col for col in row.index if col.startswith("Topic_")]
    
    if not topic_columns:
        return 'N/A', 'N/A'
    
    topic_scores = row[topic_columns]
    
    # Convert topic scores to numeric, replacing non-numeric values with NaN
    topic_scores = pd.to_numeric(topic_scores, errors='coerce')
    
    # Drop any NaN values
    topic_scores = topic_scores.dropna()
    
    if topic_scores.empty:
        return 'N/A', 'N/A'
    
    # Get top 2 topics
    top_2_topics = topic_scores.nlargest(2)
    
    # Extract topic names without the "Topic_" prefix
    primary_topic = top_2_topics.index[0].split('_', 1)[1] if len(top_2_topics) > 0 else 'N/A'
    secondary_topic = top_2_topics.index[1].split('_', 1)[1] if len(top_2_topics) > 1 else 'N/A'
    
    return primary_topic, secondary_topic
"""

'def classify_paper(row):\n    # Get all topic columns\n    topic_columns = [col for col in row.index if col.startswith("Topic_")]\n    \n    if not topic_columns:\n        return \'N/A\', \'N/A\'\n    \n    topic_scores = row[topic_columns]\n    \n    # Convert topic scores to numeric, replacing non-numeric values with NaN\n    topic_scores = pd.to_numeric(topic_scores, errors=\'coerce\')\n    \n    # Drop any NaN values\n    topic_scores = topic_scores.dropna()\n    \n    if topic_scores.empty:\n        return \'N/A\', \'N/A\'\n    \n    # Get top 2 topics\n    top_2_topics = topic_scores.nlargest(2)\n    \n    # Extract topic names without the "Topic_" prefix\n    primary_topic = top_2_topics.index[0].split(\'_\', 1)[1] if len(top_2_topics) > 0 else \'N/A\'\n    secondary_topic = top_2_topics.index[1].split(\'_\', 1)[1] if len(top_2_topics) > 1 else \'N/A\'\n    \n    return primary_topic, secondary_topic\n'

In [25]:
def analyze_specific_topic(df, topic, num_subtopics=5, n_top=5):
    """Split the papers of one primary topic into subtopics with a second LDA model.

    Args:
        df: DataFrame with 'Primary_Topic', 'title' and 'abstract' columns.
        topic: Value of 'Primary_Topic' whose papers are analysed.
        num_subtopics: Number of LDA subtopics.
        n_top: Number of most dominant papers to report per subtopic.

    Returns:
        tuple: (df_update, top_papers).
        df_update is a copy of ``df`` in which the rows of the topic have the
        columns 'Subtopic_<k>_Score' (k = 1..num_subtopics) and
        'Primary_Subtopic' (name of the score column with the highest value)
        filled in; other rows are left untouched.
        top_papers maps subtopic index -> list of dicts with 'title',
        'abstract', 'score' and 'dominance_ratio'.
        (None, None) is returned when the topic has fewer than 10 papers.
    """
    # Boolean row mask of the topic's papers. It is reused for writing the
    # results back, so the scores land on the right rows of the original frame
    # whatever its index looks like.
    topic_mask = (df['Primary_Topic'] == topic).to_numpy()
    df_topic = df.loc[topic_mask].reset_index(drop=True)

    print(f"Analyzing topic: {topic}")
    print(f"Number of papers: {len(df_topic)}")

    if len(df_topic) < 10:  # Adjust this threshold as needed
        print("Not enough papers for meaningful subtopic analysis.")
        return None, None

    # Prepare the text data
    text_data = df_topic['abstract'].fillna('')

    # Create document-term matrix
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(text_data)

    # Create and fit LDA model for subtopics
    lda_model = LatentDirichletAllocation(n_components=num_subtopics, random_state=42)
    subtopic_distributions = lda_model.fit_transform(doc_term_matrix)

    # Print the top 10 words of each subtopic
    feature_names = vectorizer.get_feature_names_out()
    for idx, subtopic in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in subtopic.argsort()[:-10 - 1:-1]]
        print(f"Subtopic {idx + 1}: {', '.join(top_words)}")

    # Get top papers for each subtopic, ranked by dominance ratio
    # (subtopic score divided by the summed score of all other subtopics).
    row_totals = subtopic_distributions.sum(axis=1)
    top_papers = {}
    for subtopic in range(num_subtopics):
        subtopic_scores = subtopic_distributions[:, subtopic]
        other_subtopics_sum = row_totals - subtopic_scores
        subtopic_dominance = subtopic_scores / (other_subtopics_sum + 1e-10)
        top_indices = np.argsort(subtopic_dominance)[-n_top:][::-1]

        top_papers[subtopic] = [
            {
                'title': df_topic.iloc[i]['title'],
                'abstract': df_topic.iloc[i]['abstract'],
                'score': float(subtopic_scores[i]),
                'dominance_ratio': float(subtopic_dominance[i])
            }
            for i in top_indices
        ]

    # Generating subtopic names is skipped until the copyright issue is resolved.

    # Subtopic scores and the primary subtopic of every paper of the topic.
    # These are computed once, after all score columns exist.
    score_columns = [f'Subtopic_{i+1}_Score' for i in range(num_subtopics)]
    scores = pd.DataFrame(subtopic_distributions, columns=score_columns)
    primary_subtopic = scores.idxmax(axis=1)

    # Update a copy of the main dataframe with the new subtopic information
    df_update = df.copy()
    for column in score_columns:
        df_update.loc[topic_mask, column] = scores[column].to_numpy()
    df_update.loc[topic_mask, 'Primary_Subtopic'] = primary_subtopic.to_numpy()

    return df_update, top_papers

In [None]:

# Load the saved search results (semicolon-separated CSV) from SAVE_DIR.
# TODO: the file name must be updated by hand whenever the search keywords change;
# it should be derived automatically.
filename='semantic_scholar_2025_02_14_reliability_resilience_power_systems_results.csv'
filepath=os.path.join(SAVE_DIR,filename)
df=pd.read_csv(filepath,sep=";")
# Combine title and abstract into one text field; missing values become ''.
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')

In [None]:
# Build 'processed_text' by preprocessing the combined title + abstract in df['text'].
print("Preprocessing text data...")
# TODO: the search keywords should be retrieved automatically from the search script;
# for the time being they are entered manually here.
search_keywords=['reliability', 'resilience', 'power systems', 'capacity utilization']
df['processed_text'] = df['text'].apply(lambda x:preprocess_text(x,search_keywords=search_keywords))
print("Text preprocessing completed.")

Preprocessing text data...
Text preprocessing completed.


In [10]:

# Preparing for analysis: parse the raw 'fieldsOfStudy' values into lists of field names.
# Missing, empty or unrecognised entries become "Unknown".
# NOTE(review): clean_fields_of_study defines its own list of valid fields, so this
# module-level valid_fields is not used by it. Confirm whether another cell still needs it.
valid_fields= ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics', 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)

In [16]:
# Simple frequency analysis before topic modelling: counts keywords, bigrams and
# trigrams in df['processed_text'] and writes them to a dated JSON file in SAVE_DIR.
extract_keywords_and_ngrams(df)

Results saved to Saved_files\term_frequencies_2025_02_27.json




In [None]:
# Earlier, step-by-step way of running the analysis (see analyze_papers_by_field
# for the newer version that also saves the results).
fields_to_analyze= ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics']#, 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
# For each field
for field in fields_to_analyze:
    print(f"\nAnalyzing field: {field}")

    # Step 1: Model topics
    lda_model, vectorizer, topic_distributions, df_field, topic_keywords = model_topics(
        df, field, num_topics=10)

    # model_topics returns None values when the field cannot be analysed.
    if lda_model is None:
        continue

    # Step 2: Classify papers
    paper_classifications = classify_papers(topic_distributions, df_field)

    # Step 3: Get top papers
    top_papers, author_stats = get_top_papers(paper_classifications, df_field, n_top=5)

    # Print results for this field. This happens inside the loop so that every
    # field is reported with its own keywords, instead of only the last one
    # (which could also be a skipped field with stale or missing results).
    for topic_idx, papers in top_papers.items():
        print(f"\nTop papers for Topic {topic_idx + 1}")
        print(f"Keywords: {', '.join(topic_keywords[topic_idx]['top_words'])}")
        for paper in papers:
            print(f"\nTitle: {paper['title']}")
            print(f"Dominance ratio: {paper['dominance_ratio']:.4f}")

    print_author_analysis(author_stats, min_papers=3)

In [12]:
# New, better version of the analysis:
def analyze_papers_by_field(df, fields_to_analyze, output_suffix="_analyzed_results",
                            num_topics=10, n_top=5):
    """Run topic modelling per field of study and save the combined results.

    For every field the papers are modelled with model_topics, classified with
    classify_papers and summarised with get_top_papers. The per-paper
    classification is written into the columns '<field>_Primary_Topic',
    '<field>_Secondary_Topic', '<field>_Primary_Score' and
    '<field>_Dominance_Ratio' of a copy of ``df``; the topic keywords are stored
    in ``attrs`` under '<field>_topic_keywords'. Fields that model_topics cannot
    analyse are skipped.

    Two dated, semicolon-separated CSV files are written to SAVE_DIR: the
    analysed papers and the author statistics.

    Args:
        df: DataFrame with the columns required by model_topics and get_top_papers.
        fields_to_analyze: Iterable of field names.
        output_suffix: Suffix of the main result file name (before '.csv').
        num_topics: Number of LDA topics per field.
        n_top: Number of top papers per topic used for the author statistics.

    Returns:
        tuple: (df_analyzed, all_author_stats), where all_author_stats maps
        field -> author statistics as returned by get_top_papers.
    """
    # Work on a copy so the caller's dataframe is left unchanged.
    df_analyzed = df.copy()
    all_author_stats = {}

    # classification key -> suffix of the result column
    column_suffixes = {
        'primary_topic': 'Primary_Topic',
        'secondary_topic': 'Secondary_Topic',
        'primary_score': 'Primary_Score',
        'dominance_ratio': 'Dominance_Ratio',
    }

    for field in fields_to_analyze:
        print(f"\nAnalyzing field: {field}")

        # Step 1: Model topics
        lda_model, _vectorizer, topic_distributions, df_field, topic_keywords = model_topics(
            df, field, num_topics=num_topics)

        if lda_model is None:
            continue

        # Step 2: Classify papers and add the result to the main dataframe.
        paper_classifications = classify_papers(topic_distributions, df_field)

        # 'paper_idx' is a row position within df_field; translate it to index
        # labels and assign each result column in one step instead of cell by cell.
        # Assumes the index labels of df are unique - TODO confirm for frames
        # that were not loaded with the default index.
        classified = pd.DataFrame(paper_classifications)
        row_labels = df_field.index[classified['paper_idx'].to_numpy()]
        for key, suffix in column_suffixes.items():
            df_analyzed.loc[row_labels, f'{field}_{suffix}'] = classified[key].to_numpy()

        # Step 3: Get top papers and author stats
        _top_papers, author_stats = get_top_papers(paper_classifications, df_field, n_top=n_top)

        # Store author stats for this field
        all_author_stats[field] = author_stats

        # Add topic keywords to dataframe metadata
        df_analyzed.attrs[f'{field}_topic_keywords'] = topic_keywords

    # Save results
    current_date = datetime.now().strftime("%Y_%m_%d")
    output_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}{output_suffix}.csv")
    df_analyzed.to_csv(output_filename, sep=';', encoding='utf-8',
                       quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')

    # Save author statistics with topic frequencies next to the main results
    # (this file used to be written to the current working directory).
    author_filename = os.path.join(SAVE_DIR, f"semantic_scholar_{current_date}_author_analysis.csv")
    save_author_analysis(all_author_stats, author_filename)

    return df_analyzed, all_author_stats

def save_author_analysis(all_author_stats, filename):
    """Write the per-field author statistics to a semicolon-separated CSV file.

    One row is written per (field, author, topic) combination with the columns
    Field, Author_ID, Author_Name, Topic, Paper_Count, Avg_Dominance and
    Total_Papers.

    Args:
        all_author_stats: Mapping of field -> author id -> statistics with the
            keys 'name', 'total_papers' and 'topics' (topic -> {'paper_count',
            'avg_dominance', ...}).
        filename: Path of the CSV file to write.
    """
    records = [
        {
            'Field': field_name,
            'Author_ID': author_id,
            'Author_Name': author_record['name'],
            'Topic': topic,
            'Paper_Count': per_topic['paper_count'],
            'Avg_Dominance': per_topic['avg_dominance'],
            'Total_Papers': author_record['total_papers']
        }
        for field_name, field_authors in all_author_stats.items()
        for author_id, author_record in field_authors.items()
        for topic, per_topic in author_record['topics'].items()
    ]

    # Text values are quoted, numbers are written bare.
    pd.DataFrame(records).to_csv(
        filename,
        sep=';',
        encoding='utf-8',
        quoting=csv.QUOTE_NONNUMERIC,
        index=False,
    )

In [13]:
# Use of the new function:
# Define fields to analyze
fields_to_analyze = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics']

# Analyze papers and get results.
# Note: author_stats returned here is keyed by field name (one author
# dictionary per field).
df_analyzed, author_stats = analyze_papers_by_field(df, fields_to_analyze)

# Print summary: number of topics and number of authors per field
# (0 for fields that were skipped).
for field in fields_to_analyze:
    print(f"\nField: {field}")
    print(f"Topics found: {len(df_analyzed.attrs.get(f'{field}_topic_keywords', []))}")
    print(f"Authors analyzed: {len(author_stats.get(field, {}))}")



Analyzing field: Computer Science
Analyzing 9597 papers

Topic 1:
et (375.03), al (327.83), systems (315.81), 10 (228.37), research (223.54), engineering (188.10), information (164.13), use (155.11), structure (140.80), design (140.71)

Topic 2:
power (1550.56), memory (1422.04), performance (1142.10), high (1128.69), design (1046.79), systems (947.18), reliability (809.00), low (736.76), capacity (644.68), based (628.42)

Topic 3:
systems (2193.72), power (1994.09), grid (1583.05), smart (1379.90), resilience (1329.37), data (1136.93), reliability (1000.48), paper (884.64), based (817.18), control (808.96)

Topic 4:
computing (1656.09), data (1482.51), cloud (1119.52), performance (1067.99), systems (912.07), applications (884.59), reliability (766.06), based (760.05), software (727.65), time (679.25)

Topic 5:
power (2238.32), proposed (1493.42), algorithm (1472.31), capacity (1418.76), problem (1295.22), optimization (1202.67), optimal (1136.44), results (961.22), energy (895.85), 

In [15]:
# author_stats returned by analyze_papers_by_field is keyed by field, while
# print_author_analysis expects the author dictionary of a single field,
# so each field is reported separately.
for field, field_author_stats in author_stats.items():
    print(f"\nAuthor analysis for field: {field}")
    print_author_analysis(field_author_stats, min_papers=2)


Author Analysis:

Author: Y. Hayashi
Total papers in top lists: 3
Topics:

Topic 0:
  Paper count: 3
  Average dominance ratio: 179.7217
  Top papers:
    - Determination Method of Loss-Minimum Configuration with Mathematical Optimality in a Three Sectionalized and Three Connected Distribution Feeder Network (dominance: 193.4248)
    - Calculation of the Loss-minimum Configuration in IEEJ Local Power System Model (dominance: 175.6483)
    - Calculation of the loss‐minimum configuration in the IEEJ local power system model (dominance: 170.0920)

Author: J. Matsuki
Total papers in top lists: 3
Topics:

Topic 0:
  Paper count: 3
  Average dominance ratio: 179.7217
  Top papers:
    - Determination Method of Loss-Minimum Configuration with Mathematical Optimality in a Three Sectionalized and Three Connected Distribution Feeder Network (dominance: 193.4248)
    - Calculation of the Loss-minimum Configuration in IEEJ Local Power System Model (dominance: 175.6483)
    - Calculation of the lo

In [None]:
# Define valid fields and clean fields of study
valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics', 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
# 'fieldsOfStudy' has already been parsed into lists by an earlier cell. Only
# values that are still raw are cleaned here, so running this cell (again) does
# not feed already-parsed lists back into clean_fields_of_study.
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(
    lambda fields: fields if isinstance(fields, list) else clean_fields_of_study(fields)
)
# Filter out papers whose fields of study are all in the excluded list
exclude_fields = ['Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Medicine', 'Political Science', 'Psychology', 'Com']
excluded_field_set = set(exclude_fields)
df_filtered = df[df['fieldsOfStudy'].apply(lambda x: not set(x).issubset(excluded_field_set))]
# Get unique (non-excluded) fields of study among the remaining papers
unique_fields = {
    field
    for fields in df_filtered['fieldsOfStudy']
    for field in fields
    if field not in excluded_field_set
}
