In [85]:
#Important Libraries
import pandas as pd

In [86]:
scopus_data = pd.read_csv("./scopus_final.csv")
wos_data = pd.read_csv("./wos_final.csv")
sd_data = pd.read_csv("./sciencedirect_final.csv")

In [87]:
# Display column names in each dataset to identify discrepancies
print("Scopus columns:", scopus_data.columns)
print("Web of Science columns:", wos_data.columns)
print("ScienceDirect columns:", sd_data.columns)

Scopus columns: Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS',
       'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts',
       'References', 'Correspondence Address', 'Editors', 'Publisher',
       'Sponsors', 'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Abbreviated Source Title',
       'Document Type', 'Publication Stage', 'Open Access', 'Source', 'EID'],
      dtype='object')
Web of Science columns: Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
  

In [88]:
# Rename columns for consistency
scopus_data.rename(columns={
    'Title': 'Title',
    'Authors': 'Authors',
    'Year': 'Year',
    'Source title': 'Source',
    'DOI': 'DOI',
    'Abstract': 'Abstract',
    'Author Keywords': 'Keywords',
    'Document Type': 'Document_Type'
}, inplace=True)

wos_data.rename(columns={
    'Article Title': 'Title',
    'Authors': 'Authors',
    'Publication Year': 'Year', 
    'Source Title': 'Source',
    'DOI': 'DOI',
    'Abstract': 'Abstract',
    'Author Keywords': 'Keywords', 
    'Document Type': 'Document_Type'
}, inplace=True)

sd_data.rename(columns={
    'primary_title': 'Title',
    'authors': 'Authors',
    'year': 'Year',
    'journal_name': 'Source',
    'doi': 'DOI',
    'abstract': 'Abstract',
    'keywords': 'Keywords',
    'type_of_reference': 'Document_Type'
}, inplace=True)

# Define required columns and add missing ones
columns_to_keep = ['Title', 'Abstract', 'Keywords', 'Source', 'Year', 'Document_Type', 'Authors', 'DOI']


In [89]:
# Drop duplicate columns in Scopus, Web of Science, and ScienceDirect dataframes

# For Scopus - Drop the second occurrence of 'Source'
scopus_data = scopus_data.loc[:, ~scopus_data.columns.duplicated()]

# For Web of Science - Drop the second occurrence of 'Keywords' and 'Document_Type'
wos_data = wos_data.loc[:, ~wos_data.columns.duplicated()]

# ScienceDirect already looks clean, but this ensures no duplicates
sd_data = sd_data.loc[:, ~sd_data.columns.duplicated()]

# Confirm column alignment after cleaning
print("Scopus data columns:", scopus_data.columns)
print("Web of Science data columns:", wos_data.columns)
print("ScienceDirect data columns:", sd_data.columns)

Scopus data columns: Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Keywords', 'Index Keywords',
       'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames',
       'Manufacturers', 'Funding Details', 'Funding Texts', 'References',
       'Correspondence Address', 'Editors', 'Publisher', 'Sponsors',
       'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Abbreviated Source Title',
       'Document_Type', 'Publication Stage', 'Open Access', 'EID'],
      dtype='object')
Web of Science data columns: Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group A

In [90]:
# Ensure each dataset has all required columns, filling missing columns with NaN
scopus_data = scopus_data.reindex(columns=columns_to_keep)
wos_data = wos_data.reindex(columns=columns_to_keep)
sd_data = sd_data.reindex(columns=columns_to_keep)

In [91]:
# Concatenate all datasets
combined_data = pd.concat([scopus_data, wos_data, sd_data], ignore_index=True)

# Display basic information about the combined dataset to verify
print("Total records in combined dataset:", combined_data.shape[0])
print("Columns in combined dataset:", combined_data.columns)
combined_data.head()

Total records in combined dataset: 3649
Columns in combined dataset: Index(['Title', 'Abstract', 'Keywords', 'Source', 'Year', 'Document_Type',
       'Authors', 'DOI'],
      dtype='object')


Unnamed: 0,Title,Abstract,Keywords,Source,Year,Document_Type,Authors,DOI
0,A vision-enabled fatigue-sensitive human digit...,Within a Human-centric Human-Robot Collaborati...,Ergonomics; Fatigue assessment; Human digital ...,Journal of Manufacturing Systems,2024,Article,Chand S.; Zheng H.; Lu Y.,10.1016/j.jmsy.2024.10.002
1,Knowledge transfer in Digital Twins: The metho...,"In the realm of Digital Twins (DTs), industry ...",Basic Formal Ontology (BFO); Common Core Ontol...,CIRP Journal of Manufacturing Science and Tech...,2024,Article,D'Amico R.D.; Sarkar A.; Karray M.H.; Addepall...,10.1016/j.cirpj.2024.06.007
2,A Dualistic Perspective of Opportunity and Ris...,Head-mounted augmented reality (HMD AR) techno...,Cognitive behavior and performance; Cognitive ...,Journal of Construction Engineering and Manage...,2024,Article,Liu J.; Yan X.; Gao W.,10.1061/JCEMD4.COENG-14684
3,A Segmentation Framework based on Cognitive Sc...,Industry 5.0 rethinks the role of human operat...,Cognitive Sciences; Human-Centricity; Industry...,ACM International Conference Proceeding Series,2024,Conference paper,Varni G.; Volpe G.,10.1145/3656650.3656717
4,Putting workers’ safety front and center: Empl...,Introduction: The global occupational accident...,Employee-organization exchange; Management com...,Journal of Safety Research,2024,Article,Kuang H.-X.; Pan W.; Sun L.-Y.,10.1016/j.jsr.2024.08.007


In [92]:
# Drop duplicate entries based on DOI
deduped_data = combined_data.drop_duplicates(subset="DOI", keep="first")

# In case some articles don’t have a DOI, we can further deduplicate based on Title and Authors
deduped_data = deduped_data.drop_duplicates(subset=["Title", "Authors"], keep="first")

# Display the number of unique articles after deduplication
print("Total records after deduplication:", deduped_data.shape[0])
deduped_data.head()

Total records after deduplication: 2678


Unnamed: 0,Title,Abstract,Keywords,Source,Year,Document_Type,Authors,DOI
0,A vision-enabled fatigue-sensitive human digit...,Within a Human-centric Human-Robot Collaborati...,Ergonomics; Fatigue assessment; Human digital ...,Journal of Manufacturing Systems,2024,Article,Chand S.; Zheng H.; Lu Y.,10.1016/j.jmsy.2024.10.002
1,Knowledge transfer in Digital Twins: The metho...,"In the realm of Digital Twins (DTs), industry ...",Basic Formal Ontology (BFO); Common Core Ontol...,CIRP Journal of Manufacturing Science and Tech...,2024,Article,D'Amico R.D.; Sarkar A.; Karray M.H.; Addepall...,10.1016/j.cirpj.2024.06.007
2,A Dualistic Perspective of Opportunity and Ris...,Head-mounted augmented reality (HMD AR) techno...,Cognitive behavior and performance; Cognitive ...,Journal of Construction Engineering and Manage...,2024,Article,Liu J.; Yan X.; Gao W.,10.1061/JCEMD4.COENG-14684
3,A Segmentation Framework based on Cognitive Sc...,Industry 5.0 rethinks the role of human operat...,Cognitive Sciences; Human-Centricity; Industry...,ACM International Conference Proceeding Series,2024,Conference paper,Varni G.; Volpe G.,10.1145/3656650.3656717
4,Putting workers’ safety front and center: Empl...,Introduction: The global occupational accident...,Employee-organization exchange; Management com...,Journal of Safety Research,2024,Article,Kuang H.-X.; Pan W.; Sun L.-Y.,10.1016/j.jsr.2024.08.007
