In [1]:
# Import TfidfVectorizer from sklearn for keyword extraction
from sklearn.feature_extraction.text import TfidfVectorizer
# Import the summarization pipeline from transformers
from transformers import pipeline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
from glob import glob

In [2]:
SCO_csv = "./Scopus/scopus.csv"
SD_csv = "./ScienceDirect/ScienceDirect.csv"
WOS_csv = "./WOS/wos.csv"

all_csv = "./All.csv"
combined_csv = "./COMBINED.csv"

In [3]:
data = pd.read_csv(combined_csv)

In [4]:
data.columns

Index(['Unnamed: 0', 'Title', 'Abstract', 'Authors', 'Keywords', 'DOI', 'ISSN',
       'Publication Year'],
      dtype='object')

In [5]:
# Define a list of essential columns to retain
# This focuses on the columns most relevant for systematic review and data analysis
# essential_columns = [
#     "Article Title",  # Title of the paper
#     "Abstract",  # Abstract text
#     "Author Keywords",  # Keywords provided by the authors
#     "Keywords Plus",  # Additional keywords provided by Web of Science
#     "Authors",  # Names of authors
#     "Source Title",  # Journal or source title
#     "Publication Year",  # Year of publication
#     "Document Type",  # Type of document (e.g., research article, review)
#     "DOI",  # DOI for unique identification
#     "Times Cited, WoS Core",  # Number of times cited
#     "Research Areas",  # Areas of research (e.g., manufacturing, engineering)
# ]

essential_columns = ["Title", "Abstract", "Authors", "Keywords", "DOI", "ISSN", "Publication Year"]

# Create a new DataFrame with only the essential columns
filtered_data = data[essential_columns]

# Display the first few rows of the new DataFrame to verify selected columns
filtered_data.head()

# Check the column names to ensure only essential columns are included
print("Columns in the filtered dataset:", filtered_data.columns.tolist())

Columns in the filtered dataset: ['Title', 'Abstract', 'Authors', 'Keywords', 'DOI', 'ISSN', 'Publication Year']


In [6]:

# Define priority keywords that are relevant to cognitive load in manufacturing
priority_keywords = ["cognitive load", "workload", "assembly line", "industry 4.0", "manufacturing", "human-robot interaction"]

# Combine 'Title' and 'Abstract' columns into a single text field for analysis
filtered_data["combined_text"] = filtered_data["Title"] + " " + filtered_data["Abstract"]

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the combined text data
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_data["combined_text"])

# Get feature names (terms) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()


# Function to calculate relevance score based on priority keywords
def calculate_keyword_score(tfidf_vector, keywords):
    score = 0
    for keyword in keywords:
        # Check if the keyword is in the feature names
        if keyword in feature_names:
            # Add the TF-IDF score of the keyword to the score
            score += tfidf_vector[0, feature_names.tolist().index(keyword)]
    return score


# Apply the function to calculate scores for each article
filtered_data["keyword_score"] = [calculate_keyword_score(tfidf_matrix[i], priority_keywords) for i in range(tfidf_matrix.shape[0])]

# Sort the DataFrame by 'keyword_score' in descending order
filtered_data = filtered_data.sort_values(by="keyword_score", ascending=False).reset_index(drop=True)

# Display the top 5 articles by relevance
filtered_data[["Title", "keyword_score"]].head()

Unnamed: 0,Title,keyword_score
0,An automatic procedure based on virtual ergono...,0.310729
1,An automatic procedure based on virtual ergono...,0.293089
2,Simulation-based analysis of AGV workload used...,0.283515
3,Cognitive and metabolic workload assessment te...,0.278087
4,Workload analysis using the workload analysis ...,0.26731


In [None]:
from transformers import pipeline

# Initialize the summarization model
summarizer = pipeline("summarization", device=0)  # Use GPU (device=0)


# Function to summarize a batch of texts
def summarize_batch(texts, max_length=50, min_length=25):
    try:
        # Generate summaries for the batch
        summaries = summarizer(
            texts,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        # Extract the summary text from each result
        return [summary["summary_text"] for summary in summaries]
    except Exception as e:
        # If an error occurs, fallback to the original texts
        return texts


# Batch size for processing
batch_size = 64

# Apply summarization in batches
abstracts = filtered_data["Abstract"].tolist()
summaries = []

for i in range(0, len(abstracts), batch_size):
    batch = abstracts[i : i + batch_size]
    summaries.extend(summarize_batch(batch))

# Add the summaries to the DataFrame
filtered_data["summary"] = summaries

# Display the top 5 articles with their titles, keyword scores, and summaries
print(filtered_data[["Title", "keyword_score", "summary"]].head())

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


This cell takes ~6:30 to complete

In [None]:


# Initialize the summarization model
# The summarizer will generate concise summaries of each abstract
summarizer = pipeline("summarization", device="cuda")


# Function to summarize abstract text
def summarize_text(text):
    try:
        # Generate summary for the text with specified length constraints
        summary = summarizer(text, max_length=50, min_length=25, do_sample=False)[0]["summary_text"]
    except Exception as e:
        # In case of an error (e.g., if text is too short), return original text as a fallback
        summary = text
    return summary


# Apply the summarization function to the 'Abstract' column
# Store the summaries in a new column called 'summary'
filtered_data["summary"] = filtered_data["Abstract"].apply(summarize_text)

# Display the top 5 articles with their titles, keyword scores, and summaries
filtered_data[["Title", "keyword_score", "summary"]].head()

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
  return self.fget.__get__(instance, owner)()
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (1218 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,Title,keyword_score,summary
0,An automatic procedure based on virtual ergono...,0.310729,The paper presents a structured procedure to ...
1,An automatic procedure based on virtual ergono...,0.293089,The paper presents a structured procedure to ...
2,Simulation-based analysis of AGV workload used...,0.283515,Competitiveness in the aircraft manufacturing...
3,Cognitive and metabolic workload assessment te...,0.278087,Ergonomics assessment in the automotive indus...
4,Workload analysis using the workload analysis ...,0.26731,Hidup Baru is an Agro-industry business which...


In [None]:
# Adjust the threshold to increase the number of selected articles for a more comprehensive review
# Let's try a lower threshold to capture the top ~20-30 articles initially
# top_articles = filtered_data[filtered_data['keyword_score'] > 0.2].reset_index(drop=True)

# Display the count of articles and review them
# print(f"Total top articles selected with adjusted threshold: {len(top_articles)}")
# top_articles[['Article Title', 'keyword_score', 'summary']].head(30)  # Show top 30 if available

# Further adjust the threshold for a larger selection, aiming for around 45-70 articles
# Set a lower threshold to capture more articles for full-text review
top_articles = filtered_data[filtered_data["keyword_score"] > 0.08].reset_index(drop=True)

# Display the count of selected articles and review them
print(f"Total top articles selected with further adjusted threshold: {len(top_articles)}")
top_articles[["Title", "keyword_score", "summary"]].head(60)  # Show top articles

Total top articles selected with further adjusted threshold: 205


Unnamed: 0,Title,keyword_score,summary
0,An automatic procedure based on virtual ergono...,0.310729,The paper presents a structured procedure to ...
1,An automatic procedure based on virtual ergono...,0.293089,The paper presents a structured procedure to ...
2,Simulation-based analysis of AGV workload used...,0.283515,Competitiveness in the aircraft manufacturing...
3,Cognitive and metabolic workload assessment te...,0.278087,Ergonomics assessment in the automotive indus...
4,Workload analysis using the workload analysis ...,0.26731,Hidup Baru is an Agro-industry business which...
5,Combining Time and Physical Workload Analysis ...,0.254463,The research objective in this study is valid...
6,Efficiency enhancement in CNC industry using v...,0.238113,Lean manufacturing is a production strategy f...
7,Workforce Scheduling Considering Physical and ...,0.236518,Manufacturing industry depend heavily on logi...
8,Workforce Scheduling Considering Physical and ...,0.236105,Manufacturing industry depend heavily on logi...
9,Workforce scheduling considering physical and ...,0.235537,Manufacturing industry depend heavily on logi...
