In [3]:
# Display column names in each dataset to identify discrepancies
print("Scopus columns:", scopus_data.columns)
print("Web of Science columns:", wos_data.columns)
print("ScienceDirect columns:", sd_data.columns)

Scopus columns: Index(['Title', 'Authors', 'Year', 'Source', 'Source', 'Abstract', 'Keywords',
       'DOI', 'Document_Type'],
      dtype='object')
Web of Science columns: Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group Authors', 'Article Title', 'Source', 'Book Series Title',
       'Book Series Subtitle', 'Language', 'Document_Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Conference Host', 'Keywords', 'Keywords', 'Abstract', 'Addresses',
       'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',
       'Funding Text', 'Cited References', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address'

In [4]:
# Define the columns we need
columns_to_keep = {
    "Title": ["Title", "primary_title", "Article Title"],
    "Authors": ["Authors", "Authors"],
    "Year": ["Year", "Year"],
    "Source": ["Source", "Source", "journal_name"],
    "Abstract": ["Abstract", "Abstract"],
    "Keywords": ["Keywords", "Keywords", "Keywords"],
    "DOI": ["DOI", "DOI", "DOI"],
    "Document_Type": ["Document_Type", "Publication Type", "type_of_reference"]
}

# Standardize column names in each dataframe
scopus_data.columns = [next((k for k, v in columns_to_keep.items() if col in v), col) for col in scopus_data.columns]
wos_data.columns = [next((k for k, v in columns_to_keep.items() if col in v), col) for col in wos_data.columns]
sd_data.columns = [next((k for k, v in columns_to_keep.items() if col in v), col) for col in sd_data.columns]


In [5]:
# Filter each dataset to retain only the specified columns
scopus_data = scopus_data[list(columns_to_keep.keys())]
wos_data = wos_data[list(columns_to_keep.keys())]
sd_data = sd_data[list(columns_to_keep.keys())]

# Confirming column alignment
print("Scopus data columns:", scopus_data.columns)
print("Web of Science data columns:", wos_data.columns)
print("ScienceDirect data columns:", sd_data.columns)

Scopus data columns: Index(['Title', 'Authors', 'Year', 'Source', 'Source', 'Abstract', 'Keywords',
       'DOI', 'Document_Type'],
      dtype='object')
Web of Science data columns: Index(['Title', 'Authors', 'Year', 'Source', 'Abstract', 'Keywords',
       'Keywords', 'DOI', 'Document_Type', 'Document_Type'],
      dtype='object')
ScienceDirect data columns: Index(['Title', 'Authors', 'Year', 'Source', 'Abstract', 'Keywords', 'DOI',
       'Document_Type'],
      dtype='object')


In [6]:
# Drop duplicate columns in Scopus, Web of Science, and ScienceDirect dataframes

# For Scopus - Drop the second occurrence of 'Source'
scopus_data = scopus_data.loc[:, ~scopus_data.columns.duplicated()]

# For Web of Science - Drop the second occurrence of 'Keywords' and 'Document_Type'
wos_data = wos_data.loc[:, ~wos_data.columns.duplicated()]

# ScienceDirect already looks clean, but this ensures no duplicates
sd_data = sd_data.loc[:, ~sd_data.columns.duplicated()]

# Confirm column alignment after cleaning
print("Scopus data columns:", scopus_data.columns)
print("Web of Science data columns:", wos_data.columns)
print("ScienceDirect data columns:", sd_data.columns)

Scopus data columns: Index(['Title', 'Authors', 'Year', 'Source', 'Abstract', 'Keywords', 'DOI',
       'Document_Type'],
      dtype='object')
Web of Science data columns: Index(['Title', 'Authors', 'Year', 'Source', 'Abstract', 'Keywords', 'DOI',
       'Document_Type'],
      dtype='object')
ScienceDirect data columns: Index(['Title', 'Authors', 'Year', 'Source', 'Abstract', 'Keywords', 'DOI',
       'Document_Type'],
      dtype='object')


In [7]:
# Concatenate all datasets
combined_data = pd.concat([scopus_data, wos_data, sd_data], ignore_index=True)

# Display basic information about the combined dataset to verify
print("Total records in combined dataset:", combined_data.shape[0])
print("Columns in combined dataset:", combined_data.columns)
combined_data.head()

Total records in combined dataset: 3649
Columns in combined dataset: Index(['Title', 'Authors', 'Year', 'Source', 'Abstract', 'Keywords', 'DOI',
       'Document_Type'],
      dtype='object')


Unnamed: 0,Title,Authors,Year,Source,Abstract,Keywords,DOI,Document_Type
0,A vision-enabled fatigue-sensitive human digit...,Chand S.; Zheng H.; Lu Y.,2024,Journal of Manufacturing Systems,Within a Human-centric Human-Robot Collaborati...,Ergonomics; Fatigue assessment; Human digital ...,10.1016/j.jmsy.2024.10.002,Article
1,Knowledge transfer in Digital Twins: The metho...,D'Amico R.D.; Sarkar A.; Karray M.H.; Addepall...,2024,CIRP Journal of Manufacturing Science and Tech...,"In the realm of Digital Twins (DTs), industry ...",Basic Formal Ontology (BFO); Common Core Ontol...,10.1016/j.cirpj.2024.06.007,Article
2,A Dualistic Perspective of Opportunity and Ris...,Liu J.; Yan X.; Gao W.,2024,Journal of Construction Engineering and Manage...,Head-mounted augmented reality (HMD AR) techno...,Cognitive behavior and performance; Cognitive ...,10.1061/JCEMD4.COENG-14684,Article
3,A Segmentation Framework based on Cognitive Sc...,Varni G.; Volpe G.,2024,ACM International Conference Proceeding Series,Industry 5.0 rethinks the role of human operat...,Cognitive Sciences; Human-Centricity; Industry...,10.1145/3656650.3656717,Conference paper
4,Putting workers’ safety front and center: Empl...,Kuang H.-X.; Pan W.; Sun L.-Y.,2024,Journal of Safety Research,Introduction: The global occupational accident...,Employee-organization exchange; Management com...,10.1016/j.jsr.2024.08.007,Article


In [8]:
# Drop duplicate entries based on DOI
deduped_data = combined_data.drop_duplicates(subset="DOI", keep="first")

# In case some articles don’t have a DOI, we can further deduplicate based on Title and Authors
deduped_data = deduped_data.drop_duplicates(subset=["Title", "Authors"], keep="first")

# Display the number of unique articles after deduplication
print("Total records after deduplication:", deduped_data.shape[0])
deduped_data.head()

Total records after deduplication: 2678


Unnamed: 0,Title,Authors,Year,Source,Abstract,Keywords,DOI,Document_Type
0,A vision-enabled fatigue-sensitive human digit...,Chand S.; Zheng H.; Lu Y.,2024,Journal of Manufacturing Systems,Within a Human-centric Human-Robot Collaborati...,Ergonomics; Fatigue assessment; Human digital ...,10.1016/j.jmsy.2024.10.002,Article
1,Knowledge transfer in Digital Twins: The metho...,D'Amico R.D.; Sarkar A.; Karray M.H.; Addepall...,2024,CIRP Journal of Manufacturing Science and Tech...,"In the realm of Digital Twins (DTs), industry ...",Basic Formal Ontology (BFO); Common Core Ontol...,10.1016/j.cirpj.2024.06.007,Article
2,A Dualistic Perspective of Opportunity and Ris...,Liu J.; Yan X.; Gao W.,2024,Journal of Construction Engineering and Manage...,Head-mounted augmented reality (HMD AR) techno...,Cognitive behavior and performance; Cognitive ...,10.1061/JCEMD4.COENG-14684,Article
3,A Segmentation Framework based on Cognitive Sc...,Varni G.; Volpe G.,2024,ACM International Conference Proceeding Series,Industry 5.0 rethinks the role of human operat...,Cognitive Sciences; Human-Centricity; Industry...,10.1145/3656650.3656717,Conference paper
4,Putting workers’ safety front and center: Empl...,Kuang H.-X.; Pan W.; Sun L.-Y.,2024,Journal of Safety Research,Introduction: The global occupational accident...,Employee-organization exchange; Management com...,10.1016/j.jsr.2024.08.007,Article


In [9]:
# Define priority keywords related to the research topic
priority_keywords = [
    "cognitive load", "human-robot interaction", "Industry 4.0", "Industry 5.0", 
    "fatigue", "ergonomics", "assembly line", "automation", "smart manufacturing",
    "mental workload", "cognitive workload", "worker safety", "human factors", 
    "manufacturing systems"
]

# Filter the dataset to retain only relevant articles
# This will look for matches in both the 'Keywords' and 'Abstract' columns
filtered_data = deduped_data[
    deduped_data['Keywords'].str.contains('|'.join(priority_keywords), case=False, na=False) |
    deduped_data['Abstract'].str.contains('|'.join(priority_keywords), case=False, na=False)
]

# Display the number of records after filtering and show a sample
print("Total relevant articles after keyword filtering:", filtered_data.shape[0])
filtered_data.head()

Total relevant articles after keyword filtering: 1245


Unnamed: 0,Title,Authors,Year,Source,Abstract,Keywords,DOI,Document_Type
0,A vision-enabled fatigue-sensitive human digit...,Chand S.; Zheng H.; Lu Y.,2024,Journal of Manufacturing Systems,Within a Human-centric Human-Robot Collaborati...,Ergonomics; Fatigue assessment; Human digital ...,10.1016/j.jmsy.2024.10.002,Article
2,A Dualistic Perspective of Opportunity and Ris...,Liu J.; Yan X.; Gao W.,2024,Journal of Construction Engineering and Manage...,Head-mounted augmented reality (HMD AR) techno...,Cognitive behavior and performance; Cognitive ...,10.1061/JCEMD4.COENG-14684,Article
3,A Segmentation Framework based on Cognitive Sc...,Varni G.; Volpe G.,2024,ACM International Conference Proceeding Series,Industry 5.0 rethinks the role of human operat...,Cognitive Sciences; Human-Centricity; Industry...,10.1145/3656650.3656717,Conference paper
6,Research on the Human–Robot Collaborative Disa...,Jiao J.; Feng G.; Yuan G.,2024,Batteries,The disassembly of spent lithium batteries is ...,disassembly; green manufacturing; human-factor...,10.3390/batteries10060196,Article
8,Multilinear principal component analysis-based...,Al Mamun A.; Islam M.I.; Shohag M.A.S.; Al-Kou...,2024,Pattern Analysis and Applications,Modern textile industry integrates video senso...,Computer vision; Fabric pattern recognition; F...,10.1007/s10044-024-01318-4,Article
