In [1]:
# Importing pandas library for data manipulation
#%pip install torch pandas scikit-learn transformers

import pandas as pd

# Load the dataset
# Note: Make sure the file path is correct, adjust the filename if necessary
data = pd.read_csv("./WOS.csv")

# Display the first few rows of the dataset to inspect the data structure
# This helps us understand the contents and structure of each row
data.head()

# Display the column names to verify that all required columns are present
# We need to ensure columns like 'Title', 'Abstract', 'Keywords', etc. are included
print("Columns in the dataset:", data.columns.tolist())

Columns in the dataset: ['Publication Type', 'Authors', 'Book Authors', 'Book Editors', 'Book Group Authors', 'Author Full Names', 'Book Author Full Names', 'Group Authors', 'Article Title', 'Source Title', 'Book Series Title', 'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title', 'Conference Date', 'Conference Location', 'Conference Sponsor', 'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract', 'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses', 'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred', 'Funding Text', 'Cited References', 'Cited Reference Count', 'Times Cited, WoS Core', 'Times Cited, All Databases', '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher', 'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN', 'Journal Abbreviation', 'Journal ISO Abbreviation', 'Publication Date', 'Publication Year', 'Volume', 'Issue', 'Part Number', 'Supplement', 'Special Issue', 'Meeting Abstract', 'Start Pag

In [2]:
# Define a list of essential columns to retain
# This focuses on the columns most relevant for systematic review and data analysis
essential_columns = [
    "Article Title",  # Title of the paper
    "Abstract",  # Abstract text
    "Author Keywords",  # Keywords provided by the authors
    "Keywords Plus",  # Additional keywords provided by Web of Science
    "Authors",  # Names of authors
    "Source Title",  # Journal or source title
    "Publication Year",  # Year of publication
    "Document Type",  # Type of document (e.g., research article, review)
    "DOI",  # DOI for unique identification
    "Times Cited, WoS Core",  # Number of times cited
    "Research Areas",  # Areas of research (e.g., manufacturing, engineering)
]

# Create a new DataFrame with only the essential columns
filtered_data = data[essential_columns]

# Display the first few rows of the new DataFrame to verify selected columns
filtered_data.head()

# Check the column names to ensure only essential columns are included
print("Columns in the filtered dataset:", filtered_data.columns.tolist())

Columns in the filtered dataset: ['Article Title', 'Abstract', 'Author Keywords', 'Keywords Plus', 'Authors', 'Source Title', 'Publication Year', 'Document Type', 'DOI', 'Times Cited, WoS Core', 'Research Areas']


In [3]:
# Check if 'Author Keywords' and 'Keywords Plus' are in the DataFrame
author_keywords_exists = "Author Keywords" in filtered_data.columns
keywords_plus_exists = "Keywords Plus" in filtered_data.columns

# Combine 'Author Keywords' and 'Keywords Plus' into a single 'Keywords' column
if author_keywords_exists and keywords_plus_exists:
    # If both columns exist, combine them with a separator
    filtered_data.loc[:, "Keywords"] = filtered_data["Author Keywords"].fillna("") + "; " + filtered_data["Keywords Plus"].fillna("")
elif author_keywords_exists:
    # If only 'Author Keywords' exists
    filtered_data.loc[:, "Keywords"] = filtered_data["Author Keywords"]
elif keywords_plus_exists:
    # If only 'Keywords Plus' exists
    filtered_data.loc[:, "Keywords"] = filtered_data["Keywords Plus"]
else:
    # If neither column exists, create an empty 'Keywords' column
    filtered_data.loc[:, "Keywords"] = ""

# Drop the original 'Author Keywords' and 'Keywords Plus' columns if they exist
filtered_data.drop(columns=[col for col in ["Author Keywords", "Keywords Plus"] if col in filtered_data.columns], inplace=True)

# Display the first few rows to confirm the new 'Keywords' column
filtered_data[["Article Title", "Keywords"]].head()

# Check the final columns to ensure only the combined 'Keywords' column remains
print("Columns after combining keywords:", filtered_data.columns.tolist())

Columns after combining keywords: ['Article Title', 'Abstract', 'Authors', 'Source Title', 'Publication Year', 'Document Type', 'DOI', 'Times Cited, WoS Core', 'Research Areas', 'Keywords']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.loc[:, "Keywords"] = filtered_data["Author Keywords"].fillna("") + "; " + filtered_data["Keywords Plus"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.drop(columns=[col for col in ["Author Keywords", "Keywords Plus"] if col in filtered_data.columns], inplace=True)


In [4]:
# Define a list of exclusion keywords for fields outside manufacturing
exclusion_keywords = ["medicine", "education", "healthcare", "surgery", "nursing"]


# Function to check for exclusion keywords in both Abstract and Keywords columns
def is_relevant(text):
    # Convert to lowercase to ensure case-insensitive matching
    text = str(text).lower()
    # Return False if any exclusion keyword is found, meaning the article is irrelevant
    for word in exclusion_keywords:
        if word in text:
            return False
    return True


# Apply the function to both 'Abstract' and 'Keywords' columns
# We will keep rows where both columns pass the relevance check
filtered_data = filtered_data[filtered_data["Abstract"].apply(is_relevant) & filtered_data["Keywords"].apply(is_relevant)]

# Display the number of relevant articles after filtering
print(f"Number of relevant papers after rule-based filtering: {len(filtered_data)}")

# Display the first few rows to confirm filtering
filtered_data[["Article Title", "Abstract", "Keywords"]].head()

Number of relevant papers after rule-based filtering: 218


Unnamed: 0,Article Title,Abstract,Keywords
0,Determining Cognitive Workload Using Physiolog...,The adoption of Industry 4.0 technologies in m...,cognitive workload; task performance; pupillom...
1,Smart Production and Manufacturing: A Research...,The concepts of Smart Production and Industry ...,Cognitive workload; Industry 4.0; Mental fatig...
2,Evaluation of AI-Based Digital Assistants in S...,Industry 5.0 complements the Industry 4.0 para...,Industry 5.0; Evaluation methodology; Trustwor...
3,Assessment of a large language model based dig...,The use of Digital Intelligent Assistants (DIA...,Chatbot; Experimental design; Artificial intel...
4,A Decision Support System tailored to the Main...,Industry 5.0 addresses the human challenges of...,Industry 5.0; Operator 5.0; Maintenance tasks;...


In [5]:
# Import TfidfVectorizer from sklearn for keyword extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Define priority keywords that are relevant to cognitive load in manufacturing
priority_keywords = ["cognitive load", "workload", "assembly line", "industry 4.0", "manufacturing", "human-robot interaction"]

# Combine 'Title' and 'Abstract' columns into a single text field for analysis
filtered_data["combined_text"] = filtered_data["Article Title"] + " " + filtered_data["Abstract"]

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the combined text data
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_data["combined_text"])

# Get feature names (terms) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()


# Function to calculate relevance score based on priority keywords
def calculate_keyword_score(tfidf_vector, keywords):
    score = 0
    for keyword in keywords:
        # Check if the keyword is in the feature names
        if keyword in feature_names:
            # Add the TF-IDF score of the keyword to the score
            score += tfidf_vector[0, feature_names.tolist().index(keyword)]
    return score


# Apply the function to calculate scores for each article
filtered_data["keyword_score"] = [calculate_keyword_score(tfidf_matrix[i], priority_keywords) for i in range(tfidf_matrix.shape[0])]

# Sort the DataFrame by 'keyword_score' in descending order
filtered_data = filtered_data.sort_values(by="keyword_score", ascending=False).reset_index(drop=True)

# Display the top 5 articles by relevance
filtered_data[["Article Title", "keyword_score"]].head()

Unnamed: 0,Article Title,keyword_score
0,A theoretical framework for evaluating mental ...,0.374864
1,Profiling cognitive workload in an unmanned ve...,0.360587
2,Towards the Integration and Evaluation of Onli...,0.335313
3,Determining Cognitive Workload Using Physiolog...,0.330884
4,Using Past and Present Indicators of Human Wor...,0.312137


In [6]:
# Import the summarization pipeline from transformers
from transformers import pipeline

# Initialize the summarization model
# The summarizer will generate concise summaries of each abstract
summarizer = pipeline("summarization", device="cuda")


# Function to summarize abstract text
def summarize_text(text):
    try:
        # Generate summary for the text with specified length constraints
        summary = summarizer(text, max_length=50, min_length=25, do_sample=False)[0]["summary_text"]
    except Exception as e:
        # In case of an error (e.g., if text is too short), return original text as a fallback
        summary = text
    return summary


# Apply the summarization function to the 'Abstract' column
# Store the summaries in a new column called 'summary'
filtered_data["summary"] = filtered_data["Abstract"].apply(summarize_text)

# Display the top 5 articles with their titles, keyword scores, and summaries
filtered_data[["Article Title", "keyword_score", "summary"]].head()

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

In [None]:
# Adjust the threshold to increase the number of selected articles for a more comprehensive review
# Let's try a lower threshold to capture the top ~20-30 articles initially
#top_articles = filtered_data[filtered_data['keyword_score'] > 0.2].reset_index(drop=True)

# Display the count of articles and review them
#print(f"Total top articles selected with adjusted threshold: {len(top_articles)}")
#top_articles[['Article Title', 'keyword_score', 'summary']].head(30)  # Show top 30 if available

# Further adjust the threshold for a larger selection, aiming for around 45-70 articles
# Set a lower threshold to capture more articles for full-text review
top_articles = filtered_data[filtered_data['keyword_score'] > 0.12].reset_index(drop=True)

# Display the count of selected articles and review them
print(f"Total top articles selected with further adjusted threshold: {len(top_articles)}")
top_articles[['Article Title', 'keyword_score', 'summary']].head(60)  # Show top articles


Total top articles selected with further adjusted threshold: 53


Unnamed: 0,Article Title,keyword_score,summary
0,A theoretical framework for evaluating mental ...,0.374864,As the nature of manufacturing work is changi...
1,Profiling cognitive workload in an unmanned ve...,0.360587,"In the present study, we use Cognitive Metric..."
2,Towards the Integration and Evaluation of Onli...,0.335313,Adapting automation systems to the workload l...
3,Determining Cognitive Workload Using Physiolog...,0.330884,The adoption of Industry 4.0 technologies in ...
4,Using Past and Present Indicators of Human Wor...,0.312137,There is a lack of evidence for a direct rela...
5,Effect of cognitive automation in a material h...,0.311043,The application of advanced automation techno...
6,Overloaded and at Work: Investigating the Effe...,0.249787,Little evidence is available on the effect th...
7,Assessing the Relationship between Cognitive W...,0.245253,Collaborative robots are revolutionising the ...
8,What do subjective workload scales really meas...,0.242383,Lack of convergence of subjective scales with...
9,Effect of interface design on cognitive worklo...,0.224322,Unmanned Aerial Vehicle (UAV) control interfa...


In [None]:
# Load the initial dataset and verify row count
data = pd.read_csv('WOS.csv')
print(f"Total rows loaded: {len(data)}")  # Expected: 226

# Count rows after rule-based filtering
print(f"Rows after rule-based filtering: {len(filtered_data)}")


# Verify row count after TF-IDF and ensure all rows have a 'keyword_score'
print(f"Rows with keyword_score: {len(filtered_data)}")
print(f"Missing keyword_score values: {filtered_data['keyword_score'].isna().sum()}")


# Check that summaries exist for each row
print(f"Rows with summaries: {filtered_data['summary'].notna().sum()}")


# Verify final selection count for top articles
print(f"Total top articles selected: {len(top_articles)}")


Total rows loaded: 226
Rows after rule-based filtering: 218
Rows with keyword_score: 218
Missing keyword_score values: 0
Rows with summaries: 218
Total top articles selected: 53


In [None]:
# Extend your DataFrame to include tracking columns for full-text review
top_articles['Full Text Retrieved'] = False
top_articles['Objective/Purpose'] = ''
top_articles['Cognitive Load Measurement Methods'] = ''
top_articles['Key Findings'] = ''
top_articles['Relevance to Manufacturing Context'] = ''
top_articles['Strengths/Limitations'] = ''

# Save this as a CSV to track your progress
top_articles.to_csv("Full_Text_Review_Progress.csv", index=False)


In [35]:
# Revised function to add status of each DOI retrieval
def get_full_text_url(doi):
    if pd.isna(doi) or not isinstance(doi, str):
        # Skip rows with missing or non-string DOIs
        return None, "Invalid DOI"
    try:
        url = f"https://api.unpaywall.org/v2/{doi}?email=niloofar.rezaei1991@gmail.com"
        response = requests.get(url)
        response.raise_for_status()
        result = response.json()
        if result.get('is_oa'):
            # Return the PDF link if available, else the main open-access link
            return (result['best_oa_location'].get('url_for_pdf') or result['best_oa_location'].get('url')), "Found"
        else:
            return None, "Not Open Access"
    except Exception as e:
        print(f"Error retrieving DOI {doi}: {e}")
        return None, "Error"

# Apply function to get both link and status
data[['Full Text Link', 'Status']] = data['DOI'].apply(lambda doi: pd.Series(get_full_text_url(doi)))

# Save all data with status to a new CSV for tracking
data.to_csv("Full_Text_Links_Status.csv", index=False)

print("Full text links with status have been saved to 'Full_Text_Links_Status.csv'")

Error retrieving DOI 10.3233/978-1-61499-792-4-114: 404 Client Error: NOT FOUND for url: https://api.unpaywall.org/v2/10.3233/978-1-61499-792-4-114?email=niloofar.rezaei1991@gmail.com
Full text links with status have been saved to 'Full_Text_Links_Status.csv'
