In [10]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.lda_model
import numpy as np
from openai import OpenAI
import configparser
import tiktoken
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from difflib import SequenceMatcher

In [11]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\STSI/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STSI/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\STSI/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Preprocess function
def preprocess_text(text):
    if not isinstance(text, (str, int, float)):
        return ''
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)
# Function to convert string representation of list to actual list, replacing long strings with "Unknown"
def string_to_list(s):
    if isinstance(s, str):
        # Remove brackets and split by comma
        fields = [field.strip().strip("'") for field in s.strip('[]').split(',')]
        # Replace fields wit more than 100 characters with "Unknown"
        return ["Unknown" if len(field) > 100 else field for field in fields]
    return ["Unknown"]  # Return ["Unknown"] for empty or non-string entries

# Clean fields of study
def clean_fields_of_study(s):
    if pd.isna(s) or s == '[]':
        return ["Unknown"]
    if isinstance(s, str):
        fields = [field.strip().strip("'\"") for field in s.strip('[]').split(',')]
        cleaned_fields = []
        for field in fields:
            if field in valid_fields:
                cleaned_fields.append(field)
            else:
                cleaned_fields.append("Unknown")
        return cleaned_fields if cleaned_fields else ["Unknown"]
    return ["Unknown"]

In [None]:
# Model topics function
def model_topics(df, field, num_topics=10, num_words=5):
    df_field = df[df['fieldsOfStudy'].apply(lambda x: field in x)] # filtering so that only the documents within the field is keept.
    if df_field.empty:
            print(f"No papers found for field: {field}")            #warning if no papers found
    text_data = df_field['abstract'].fillna('')                     # filtering to avoid errors due to missing fields
    
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english') # vectorizing with max_df = 0.95 as default (terms that appear in more than 95% of documents) and min_df =2 (terms that appear in less than 2 of the documents). Values below 1 indicates percentile, and values above 1 indicates number of docs.
    doc_term_matrix = vectorizer.fit_transform(text_data) #creating the term matrix:  vectorizer : sklearn.feature_extraction.text.(CountVectorizer, TfIdfVectorizer).vectorizer used to convert raw documents to document-term matrix (`dtm`)
    
    #Using the sklearn decomposition LatentDirchletAllocation (see https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html). 
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42) 
    #Randomstate 42 is passed to maintain reproducibility in results (0 and 42 are commonly used values. Default "None" will result in using globa radnom instande (numpy.radom) and re-running may produce different results
    
    lda_output = lda_model.fit_transform(doc_term_matrix)
    feature_names = vectorizer.get_feature_names_out()
    
    topic_names = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]
        topic_name = f"Topic_{field}_{topic_idx+1}"
        print(f"{topic_name}: {', '.join(top_words)}")
        topic_names.append(topic_name)
    
    return lda_output, df_field.index, topic_names


# Get top papers per topic per field
def get_top_papers_per_topic_per_field(df, text_column, field_column, fields_to_analyze, num_topics=5, n_top=5):
    all_top_papers = {}
    for field in fields_to_analyze:
        print(f"\nProcessing field: {field}")
        df_field = df[df[field_column].apply(lambda x: field in x)]
        if df_field.empty:
            print(f"No papers found for field: {field}")
            continue
        text_data = df_field[text_column].fillna('')
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        doc_term_matrix = vectorizer.fit_transform(text_data)
        lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        topic_distributions = lda_model.fit_transform(doc_term_matrix) #kjører ny lda model fitting. 
        
        
        top_papers = {}
        for topic in range(num_topics):
            topic_scores = topic_distributions[:, topic]
            other_topics_sum = topic_distributions.sum(axis=1) - topic_scores
            topic_dominance = topic_scores / (other_topics_sum + 1e-10)  # Avoid division by zero
            top_indices = np.argsort(topic_dominance)[-n_top:][::-1]
            top_papers[topic] = [
                {
                    'paperId': df_field.iloc[i]['paperId'],  # Include paperId
                    'title': df_field.iloc[i]['title'],
                    'document': df_field.iloc[i][text_column],
                    'abstract': df_field.iloc[i]['abstract'],
                    'score': float(topic_scores[i]),
                    'dominance_ratio': float(topic_dominance[i])
                }
                for i in top_indices
            ]
        all_top_papers[field] = top_papers
        
        # Print top words for each topic
        feature_names = vectorizer.get_feature_names_out()
        for topic_idx, topic in enumerate(lda_model.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-10 - 1:-1]]
            print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    
    return all_top_papers

def classify_paper(row):
    # Get all topic columns
    topic_columns = [col for col in row.index if col.startswith("Topic_")]
    
    if not topic_columns:
        return 'N/A', 'N/A'
    
    topic_scores = row[topic_columns]
    
    # Convert topic scores to numeric, replacing non-numeric values with NaN
    topic_scores = pd.to_numeric(topic_scores, errors='coerce')
    
    # Drop any NaN values
    topic_scores = topic_scores.dropna()
    
    if topic_scores.empty:
        return 'N/A', 'N/A'
    
    # Get top 2 topics
    top_2_topics = topic_scores.nlargest(2)
    
    # Extract topic names without the "Topic_" prefix
    primary_topic = top_2_topics.index[0].split('_', 1)[1] if len(top_2_topics) > 0 else 'N/A'
    secondary_topic = top_2_topics.index[1].split('_', 1)[1] if len(top_2_topics) > 1 else 'N/A'
    
    return primary_topic, secondary_topic


In [None]:
""" needs fixing"""

def analyze_specific_topic(df, topic, num_subtopics=5, n_top=5):
    # Filter the dataframe for the specific topic
    df_topic = df[df['Primary_Topic'] == topic].copy()
    
    print(f"Analyzing topic: {topic}")
    print(f"Number of papers: {len(df_topic)}")
    
    if len(df_topic) < 10:  # Adjust this threshold as needed
        print("Not enough papers for meaningful subtopic analysis.")
        return None, None, None
    
    # Ensure index is unique
    df_topic = df_topic.reset_index(drop=True)
    
    # Prepare the text data
    text_data = df_topic['abstract'].fillna('')
    
    # Create document-term matrix
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(text_data)
    
    # Create and fit LDA model for subtopics
    lda_model = LatentDirichletAllocation(n_components=num_subtopics, random_state=42)
    subtopic_distributions = lda_model.fit_transform(doc_term_matrix)
    
    # Get top words for each subtopic
    feature_names = vectorizer.get_feature_names_out()
    for idx, subtopic in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in subtopic.argsort()[:-10 - 1:-1]]
        print(f"Subtopic {idx + 1}: {', '.join(top_words)}")
    
    # Get top papers for each subtopic
    top_papers = {}
    for subtopic in range(num_subtopics):
        subtopic_scores = subtopic_distributions[:, subtopic]
        other_subtopics_sum = subtopic_distributions.sum(axis=1) - subtopic_scores
        subtopic_dominance = subtopic_scores / (other_subtopics_sum + 1e-10)
        top_indices = np.argsort(subtopic_dominance)[-n_top:][::-1]
        
        top_papers[subtopic] = [
            {
                'title': df_topic.iloc[i]['title'],
                'abstract': df_topic.iloc[i]['abstract'],
                'score': float(subtopic_scores[i]),
                'dominance_ratio': float(subtopic_dominance[i])
            }
            for i in top_indices
        ]
    # Generate subtopic names using the same method as before - Skipped until copyright breach issue is resolved
    """
    subtopic_names = {}
    for subtopic, papers in top_papers.items():
        abstracts = "\n\n".join([paper['abstract'] for paper in papers])
        keywords = [word for paper in papers for word in paper['abstract'].split()[:10]]
        subtopic_name = generate_topic_name(abstracts, keywords)
        subtopic_names[subtopic] = subtopic_name
    """
    #Add subtopic scores and names to the dataframe
    for i in range(num_subtopics):
        df_topic[f'Subtopic_{i+1}_Score'] = subtopic_distributions[:, i]
        df_topic['Primary_Subtopic'] = df_topic[[f'Subtopic_{i+1}_Score' for i in range(num_subtopics)]].idxmax(axis=1)
   # df_topic['Primary_Subtopic_Name'] = df_topic['Primary_Subtopic'].map(lambda x: subtopic_names[int(x.split('_')[1]) - 1])
    
    # Update the main dataframe with the new subtopic information
        df_update = df.copy()
        df_update.loc[df_topic.index, [f'Subtopic_{i+1}_Score' for i in range(num_subtopics)]] = df_topic[[f'Subtopic_{i+1}_Score' for i in range(num_subtopics)]]
        df_update.loc[df_topic.index, 'Primary_Subtopic'] = df_topic['Primary_Subtopic']
  # df_update.loc[df_topic.index, 'Primary_Subtopic_Name'] = df_topic['Primary_Subtopic_Name']
    
    return df_update, top_papers#, subtopic_names

In [25]:
df=pd.read_csv('semantic_scholar_results_1st.csv',sep=";")
df['text'] = df['title'].fillna('') + ' ' + df['abstract'].fillna('')

In [26]:
print("Preprocessing text data...")
df['processed_text'] = df['text'].apply(preprocess_text)
print("Text preprocessing completed.")

Preprocessing text data...


KeyboardInterrupt: 

In [None]:
# Define valid fields and clean fields of study
valid_fields = ['Computer Science', 'Economics', 'Engineering', 'Physics', 'Mathematics', 'Medicine', 'Business', 'Environmental Science', 'Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Political Science', 'Psychology', 'Com']
df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(clean_fields_of_study)
# Filter out papers with excluded fields
exclude_fields = ['Chemistry', 'Materials Science', 'Geography', 'Biology', 'Geology', 'Medicine', 'Political Science', 'Psychology', 'Com']
df_filtered = df[df['fieldsOfStudy'].apply(lambda x: not set(x).issubset(set(exclude_fields)))]
# Get unique fields of study
unique_fields = set([field for fields in df_filtered['fieldsOfStudy'] for field in fields if field not in exclude_fields])
