In [3]:
SCO_csv = "./Scopus/scopus.csv"
SD_csv = "./ScienceDirect/ScienceDirect.csv"
WOS_csv = "./WOS/wos.csv"

In [10]:
# Import TfidfVectorizer from sklearn for keyword extraction
from sklearn.feature_extraction.text import TfidfVectorizer
# Import the summarization pipeline from transformers
from transformers import pipeline

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
from glob import glob

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [6]:
data = pd.read_csv("../Niloofar/WOS.csv")

In [7]:
# Define a list of essential columns to retain
# This focuses on the columns most relevant for systematic review and data analysis
essential_columns = [
    "Article Title",  # Title of the paper
    "Abstract",  # Abstract text
    "Author Keywords",  # Keywords provided by the authors
    "Keywords Plus",  # Additional keywords provided by Web of Science
    "Authors",  # Names of authors
    "Source Title",  # Journal or source title
    "Publication Year",  # Year of publication
    "Document Type",  # Type of document (e.g., research article, review)
    "DOI",  # DOI for unique identification
    "Times Cited, WoS Core",  # Number of times cited
    "Research Areas",  # Areas of research (e.g., manufacturing, engineering)
]

# Create a new DataFrame with only the essential columns
filtered_data = data[essential_columns]

# Display the first few rows of the new DataFrame to verify selected columns
filtered_data.head()

# Check the column names to ensure only essential columns are included
print("Columns in the filtered dataset:", filtered_data.columns.tolist())

Columns in the filtered dataset: ['Article Title', 'Abstract', 'Author Keywords', 'Keywords Plus', 'Authors', 'Source Title', 'Publication Year', 'Document Type', 'DOI', 'Times Cited, WoS Core', 'Research Areas']


In [8]:

# Define priority keywords that are relevant to cognitive load in manufacturing
priority_keywords = ["cognitive load", "workload", "assembly line", "industry 4.0", "manufacturing", "human-robot interaction"]

# Combine 'Title' and 'Abstract' columns into a single text field for analysis
filtered_data["combined_text"] = filtered_data["Article Title"] + " " + filtered_data["Abstract"]

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the combined text data
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_data["combined_text"])

# Get feature names (terms) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()


# Function to calculate relevance score based on priority keywords
def calculate_keyword_score(tfidf_vector, keywords):
    score = 0
    for keyword in keywords:
        # Check if the keyword is in the feature names
        if keyword in feature_names:
            # Add the TF-IDF score of the keyword to the score
            score += tfidf_vector[0, feature_names.tolist().index(keyword)]
    return score


# Apply the function to calculate scores for each article
filtered_data["keyword_score"] = [calculate_keyword_score(tfidf_matrix[i], priority_keywords) for i in range(tfidf_matrix.shape[0])]

# Sort the DataFrame by 'keyword_score' in descending order
filtered_data = filtered_data.sort_values(by="keyword_score", ascending=False).reset_index(drop=True)

# Display the top 5 articles by relevance
filtered_data[["Article Title", "keyword_score"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["combined_text"] = filtered_data["Article Title"] + " " + filtered_data["Abstract"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["keyword_score"] = [calculate_keyword_score(tfidf_matrix[i], priority_keywords) for i in range(tfidf_matrix.shape[0])]


Unnamed: 0,Article Title,keyword_score
0,A theoretical framework for evaluating mental ...,0.377296
1,Profiling cognitive workload in an unmanned ve...,0.366028
2,Towards the Integration and Evaluation of Onli...,0.34031
3,Determining Cognitive Workload Using Physiolog...,0.334163
4,Using Past and Present Indicators of Human Wor...,0.316948


In [None]:


# Initialize the summarization model
# The summarizer will generate concise summaries of each abstract
summarizer = pipeline("summarization", device="cuda")


# Function to summarize abstract text
def summarize_text(text):
    try:
        # Generate summary for the text with specified length constraints
        summary = summarizer(text, max_length=50, min_length=25, do_sample=False)[0]["summary_text"]
    except Exception as e:
        # In case of an error (e.g., if text is too short), return original text as a fallback
        summary = text
    return summary


# Apply the summarization function to the 'Abstract' column
# Store the summaries in a new column called 'summary'
filtered_data["summary"] = filtered_data["Abstract"].apply(summarize_text)

# Display the top 5 articles with their titles, keyword scores, and summaries
filtered_data[["Article Title", "keyword_score", "summary"]].head()