In [1]:
SCO_csv = "./Scopus/scopus.csv"
SD_csv = "./ScienceDirect/ScienceDirect.csv"
WOS_csv = "./WOS/wos.csv"

In [2]:
# Import TfidfVectorizer from sklearn for keyword extraction
from sklearn.feature_extraction.text import TfidfVectorizer
# Import the summarization pipeline from transformers
from transformers import pipeline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
from glob import glob

In [4]:
data = pd.read_csv("./WOS/wos.csv")

In [5]:
# Define a list of essential columns to retain
# This focuses on the columns most relevant for systematic review and data analysis
essential_columns = [
    "Article Title",  # Title of the paper
    "Abstract",  # Abstract text
    "Author Keywords",  # Keywords provided by the authors
    "Keywords Plus",  # Additional keywords provided by Web of Science
    "Authors",  # Names of authors
    "Source Title",  # Journal or source title
    "Publication Year",  # Year of publication
    "Document Type",  # Type of document (e.g., research article, review)
    "DOI",  # DOI for unique identification
    "Times Cited, WoS Core",  # Number of times cited
    "Research Areas",  # Areas of research (e.g., manufacturing, engineering)
]

# Create a new DataFrame with only the essential columns
filtered_data = data[essential_columns]

# Display the first few rows of the new DataFrame to verify selected columns
filtered_data.head()

# Check the column names to ensure only essential columns are included
print("Columns in the filtered dataset:", filtered_data.columns.tolist())

Columns in the filtered dataset: ['Article Title', 'Abstract', 'Author Keywords', 'Keywords Plus', 'Authors', 'Source Title', 'Publication Year', 'Document Type', 'DOI', 'Times Cited, WoS Core', 'Research Areas']


In [6]:

# Define priority keywords that are relevant to cognitive load in manufacturing
priority_keywords = ["cognitive load", "workload", "assembly line", "industry 4.0", "manufacturing", "human-robot interaction"]

# Combine 'Title' and 'Abstract' columns into a single text field for analysis
filtered_data["combined_text"] = filtered_data["Article Title"] + " " + filtered_data["Abstract"]

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the combined text data
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_data["combined_text"])

# Get feature names (terms) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()


# Function to calculate relevance score based on priority keywords
def calculate_keyword_score(tfidf_vector, keywords):
    score = 0
    for keyword in keywords:
        # Check if the keyword is in the feature names
        if keyword in feature_names:
            # Add the TF-IDF score of the keyword to the score
            score += tfidf_vector[0, feature_names.tolist().index(keyword)]
    return score


# Apply the function to calculate scores for each article
filtered_data["keyword_score"] = [calculate_keyword_score(tfidf_matrix[i], priority_keywords) for i in range(tfidf_matrix.shape[0])]

# Sort the DataFrame by 'keyword_score' in descending order
filtered_data = filtered_data.sort_values(by="keyword_score", ascending=False).reset_index(drop=True)

# Display the top 5 articles by relevance
filtered_data[["Article Title", "keyword_score"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["combined_text"] = filtered_data["Article Title"] + " " + filtered_data["Abstract"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["keyword_score"] = [calculate_keyword_score(tfidf_matrix[i], priority_keywords) for i in range(tfidf_matrix.shape[0])]


Unnamed: 0,Article Title,keyword_score
0,Formalizing Human-Machine Interactions for Ada...,0.247906
1,Workforce scheduling considering physical and ...,0.241274
2,Identify eight aspects of ergonomics to determ...,0.236773
3,Social sustainability in manufacturing system:...,0.217838
4,Construction Worker Workload Assessment for Hu...,0.202768


In [7]:


# Initialize the summarization model
# The summarizer will generate concise summaries of each abstract
summarizer = pipeline("summarization", device="cuda")


# Function to summarize abstract text
def summarize_text(text):
    try:
        # Generate summary for the text with specified length constraints
        summary = summarizer(text, max_length=50, min_length=25, do_sample=False)[0]["summary_text"]
    except Exception as e:
        # In case of an error (e.g., if text is too short), return original text as a fallback
        summary = text
    return summary


# Apply the summarization function to the 'Abstract' column
# Store the summaries in a new column called 'summary'
filtered_data["summary"] = filtered_data["Abstract"].apply(summarize_text)

# Display the top 5 articles with their titles, keyword scores, and summaries
filtered_data[["Article Title", "keyword_score", "summary"]].head()

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
  return self.fget.__get__(instance, owner)()


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Unnamed: 0,Article Title,keyword_score,summary
0,Formalizing Human-Machine Interactions for Ada...,0.247906,Human-machine interaction is one of the most ...
1,Workforce scheduling considering physical and ...,0.241274,Manufacturing industry depend heavily on logi...
2,Identify eight aspects of ergonomics to determ...,0.236773,Manufacturing Industry is one of the industri...
3,Social sustainability in manufacturing system:...,0.217838,"In this paper, sustainability is addressed in..."
4,Construction Worker Workload Assessment for Hu...,0.202768,Recent advances in robotics and artificial in...


In [8]:
# Adjust the threshold to increase the number of selected articles for a more comprehensive review
# Let's try a lower threshold to capture the top ~20-30 articles initially
# top_articles = filtered_data[filtered_data['keyword_score'] > 0.2].reset_index(drop=True)

# Display the count of articles and review them
# print(f"Total top articles selected with adjusted threshold: {len(top_articles)}")
# top_articles[['Article Title', 'keyword_score', 'summary']].head(30)  # Show top 30 if available

# Further adjust the threshold for a larger selection, aiming for around 45-70 articles
# Set a lower threshold to capture more articles for full-text review
top_articles = filtered_data[filtered_data["keyword_score"] > 0.08].reset_index(drop=True)

# Display the count of selected articles and review them
print(f"Total top articles selected with further adjusted threshold: {len(top_articles)}")
top_articles[["Article Title", "keyword_score", "summary"]].head(60)  # Show top articles

Total top articles selected with further adjusted threshold: 44


Unnamed: 0,Article Title,keyword_score,summary
0,Formalizing Human-Machine Interactions for Ada...,0.247906,Human-machine interaction is one of the most ...
1,Workforce scheduling considering physical and ...,0.241274,Manufacturing industry depend heavily on logi...
2,Identify eight aspects of ergonomics to determ...,0.236773,Manufacturing Industry is one of the industri...
3,Social sustainability in manufacturing system:...,0.217838,"In this paper, sustainability is addressed in..."
4,Construction Worker Workload Assessment for Hu...,0.202768,Recent advances in robotics and artificial in...
5,Digital Workers in Cyber-Physical-Social Syste...,0.183355,Workers play a significant role in PCB manufa...
6,Overloaded and at Work: Investigating the Effe...,0.181002,Little evidence is available on the effect th...
7,A Mathematical Programming Approach for Multi-...,0.175752,A weighted mixed-integer linear mathematical ...
8,Cross-Trained Worker Assignment Problem in Cel...,0.174203,Cross-trained worker assignment has become in...
9,Balancing high operator's workload through a n...,0.163375,Research aims to smooth the daily workload by...
