In [38]:
#Important Libraries
import pandas as pd

In [39]:
scopus_data = pd.read_csv("./scopus_final.csv")
wos_data = pd.read_csv("./wos_final.csv")
sd_data = pd.read_csv("./sciencedirect_final.csv")

In [40]:
# Display column names in each dataset to identify discrepancies
print("Scopus columns:", scopus_data.columns)
print("Web of Science columns:", wos_data.columns)
print("ScienceDirect columns:", sd_data.columns)

Scopus columns: Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS',
       'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts',
       'References', 'Correspondence Address', 'Editors', 'Publisher',
       'Sponsors', 'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Abbreviated Source Title',
       'Document Type', 'Publication Stage', 'Open Access', 'Source', 'EID'],
      dtype='object')
Web of Science columns: Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
  

In [41]:
# Rename columns for consistency
scopus_data.rename(columns={
    'Title': 'Title',
    'Authors': 'Authors',
    'Year': 'Year',
    'Source title': 'Source',
    'DOI': 'DOI',
    'Abstract': 'Abstract',
    'Author Keywords': 'Keywords',
    'Document Type': 'Document_Type'
}, inplace=True)

wos_data.rename(columns={
    'Article Title': 'Title',
    'Authors': 'Authors',
    'Publication Year': 'Year', 
    'Source Title': 'Source',
    'DOI': 'DOI',
    'Abstract': 'Abstract',
    'Author Keywords': 'Keywords', 
    'Document Type': 'Document_Type'
}, inplace=True)

sd_data.rename(columns={
    'primary_title': 'Title',
    'authors': 'Authors',
    'year': 'Year',
    'journal_name': 'Source',
    'doi': 'DOI',
    'abstract': 'Abstract',
    'keywords': 'Keywords',
    'type_of_reference': 'Document_Type'
}, inplace=True)

# Define required columns and add missing ones
columns_to_keep = ['Title', 'Abstract', 'Keywords', 'Source', 'Year', 'Document_Type', 'Authors', 'DOI']


In [42]:
# Drop duplicate columns in Scopus, Web of Science, and ScienceDirect dataframes

# For Scopus - Drop the second occurrence of 'Source'
scopus_data = scopus_data.loc[:, ~scopus_data.columns.duplicated()]

# For Web of Science - Drop the second occurrence of 'Keywords' and 'Document_Type'
wos_data = wos_data.loc[:, ~wos_data.columns.duplicated()]

# ScienceDirect already looks clean, but this ensures no duplicates
sd_data = sd_data.loc[:, ~sd_data.columns.duplicated()]

# Confirm column alignment after cleaning
print("Scopus data columns:", scopus_data.columns)
print("Web of Science data columns:", wos_data.columns)
print("ScienceDirect data columns:", sd_data.columns)

Scopus data columns: Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Keywords', 'Index Keywords',
       'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames',
       'Manufacturers', 'Funding Details', 'Funding Texts', 'References',
       'Correspondence Address', 'Editors', 'Publisher', 'Sponsors',
       'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Abbreviated Source Title',
       'Document_Type', 'Publication Stage', 'Open Access', 'EID'],
      dtype='object')
Web of Science data columns: Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group A

In [43]:
# Ensure each dataset has all required columns, filling missing columns with NaN
scopus_data = scopus_data.reindex(columns=columns_to_keep)
wos_data = wos_data.reindex(columns=columns_to_keep)
sd_data = sd_data.reindex(columns=columns_to_keep)

In [44]:
# Concatenate all datasets
combined_data = pd.concat([scopus_data, wos_data, sd_data], ignore_index=True)

# Display basic information about the combined dataset to verify
print("Total records in combined dataset:", combined_data.shape[0])
print("Columns in combined dataset:", combined_data.columns)
combined_data.head()

Total records in combined dataset: 3649
Columns in combined dataset: Index(['Title', 'Abstract', 'Keywords', 'Source', 'Year', 'Document_Type',
       'Authors', 'DOI'],
      dtype='object')


Unnamed: 0,Title,Abstract,Keywords,Source,Year,Document_Type,Authors,DOI
0,A vision-enabled fatigue-sensitive human digit...,Within a Human-centric Human-Robot Collaborati...,Ergonomics; Fatigue assessment; Human digital ...,Journal of Manufacturing Systems,2024,Article,Chand S.; Zheng H.; Lu Y.,10.1016/j.jmsy.2024.10.002
1,Knowledge transfer in Digital Twins: The metho...,"In the realm of Digital Twins (DTs), industry ...",Basic Formal Ontology (BFO); Common Core Ontol...,CIRP Journal of Manufacturing Science and Tech...,2024,Article,D'Amico R.D.; Sarkar A.; Karray M.H.; Addepall...,10.1016/j.cirpj.2024.06.007
2,A Dualistic Perspective of Opportunity and Ris...,Head-mounted augmented reality (HMD AR) techno...,Cognitive behavior and performance; Cognitive ...,Journal of Construction Engineering and Manage...,2024,Article,Liu J.; Yan X.; Gao W.,10.1061/JCEMD4.COENG-14684
3,A Segmentation Framework based on Cognitive Sc...,Industry 5.0 rethinks the role of human operat...,Cognitive Sciences; Human-Centricity; Industry...,ACM International Conference Proceeding Series,2024,Conference paper,Varni G.; Volpe G.,10.1145/3656650.3656717
4,Putting workers’ safety front and center: Empl...,Introduction: The global occupational accident...,Employee-organization exchange; Management com...,Journal of Safety Research,2024,Article,Kuang H.-X.; Pan W.; Sun L.-Y.,10.1016/j.jsr.2024.08.007


In [45]:
# Drop duplicate entries based on DOI
deduped_data = combined_data.drop_duplicates(subset="DOI", keep="first")

# In case some articles don’t have a DOI, we can further deduplicate based on Title and Authors
deduped_data = deduped_data.drop_duplicates(subset=["Title", "Authors"], keep="first")

# Display the number of unique articles after deduplication
print("Total records after deduplication:", deduped_data.shape[0])
deduped_data.head()

Total records after deduplication: 2678


Unnamed: 0,Title,Abstract,Keywords,Source,Year,Document_Type,Authors,DOI
0,A vision-enabled fatigue-sensitive human digit...,Within a Human-centric Human-Robot Collaborati...,Ergonomics; Fatigue assessment; Human digital ...,Journal of Manufacturing Systems,2024,Article,Chand S.; Zheng H.; Lu Y.,10.1016/j.jmsy.2024.10.002
1,Knowledge transfer in Digital Twins: The metho...,"In the realm of Digital Twins (DTs), industry ...",Basic Formal Ontology (BFO); Common Core Ontol...,CIRP Journal of Manufacturing Science and Tech...,2024,Article,D'Amico R.D.; Sarkar A.; Karray M.H.; Addepall...,10.1016/j.cirpj.2024.06.007
2,A Dualistic Perspective of Opportunity and Ris...,Head-mounted augmented reality (HMD AR) techno...,Cognitive behavior and performance; Cognitive ...,Journal of Construction Engineering and Manage...,2024,Article,Liu J.; Yan X.; Gao W.,10.1061/JCEMD4.COENG-14684
3,A Segmentation Framework based on Cognitive Sc...,Industry 5.0 rethinks the role of human operat...,Cognitive Sciences; Human-Centricity; Industry...,ACM International Conference Proceeding Series,2024,Conference paper,Varni G.; Volpe G.,10.1145/3656650.3656717
4,Putting workers’ safety front and center: Empl...,Introduction: The global occupational accident...,Employee-organization exchange; Management com...,Journal of Safety Research,2024,Article,Kuang H.-X.; Pan W.; Sun L.-Y.,10.1016/j.jsr.2024.08.007


In [46]:
deduped_data.to_csv("./deduped.csv")

In [47]:
#screened_data.to_csv("screened_data.csv", index=False)

In [48]:
source_counts = {}
for i, element in enumerate(deduped_data[deduped_data["Source"].notnull()]["Source"]):

    element = element.strip().lower()
    if element not in source_counts:
        source_counts[element] = 1
    else:
        source_counts[element] += 1
    # split = element.split(";") if type(element) is str else print("FUCK FUCK", element)
    # print(split)
    # for sc in element:
    # sc = sc.strip().lower()
    # if sc not in source_counts:
    # source_counts[sc] = 1
    # else:
    # source_counts[sc] += 1

# Convert the dictionary to a DataFrame
keywords_df = pd.DataFrame(list(source_counts.items()), columns=["sources", "count"])

# Sort the DataFrame by count in descending order (optional)
keywords_df = keywords_df.sort_values(by="count", ascending=False).reset_index(drop=True)

keywords_df.head(10)

Unnamed: 0,sources,count
0,procedia cirp,108
1,ifac-papersonline,65
2,acm international conference proceeding series,55
3,procedia computer science,55
4,procedia manufacturing,50
5,robotics and computer-integrated manufacturing,49
6,applied ergonomics,49
7,safety science,48
8,international journal of industrial ergonomics,36
9,advanced engineering informatics,31


In [127]:
# Group by Document_Type and Source, then count occurrences
source_counts = deduped_data.groupby(["Document_Type", "Source"]).size().reset_index(name="count")

# Sort the counts for better readability (optional)
source_counts = source_counts.sort_values(by="count", ascending=False).reset_index(drop=True)

# Separate the counts for journals and conferences (optional)
journal_counts = source_counts[source_counts["Document_Type"] == "Journal"]
conference_counts = source_counts[source_counts["Document_Type"] == "Conference"]

# Display the top 10 for each type (optional)
print("Top 10 Journals:")
print(journal_counts.head(20))

print("\nTop 10 Conferences:")
print(conference_counts.head(20))

Top 10 Journals:
   Document_Type                                             Source  count
0        Journal                                      Procedia CIRP     59
3        Journal                                     Safety Science     47
4        Journal                                 Applied Ergonomics     47
5        Journal     Robotics and Computer-Integrated Manufacturing     47
6        Journal                                  IFAC-PapersOnLine     35
7        Journal                   Advanced Engineering Informatics     31
9        Journal                   Journal of Manufacturing Systems     30
11       Journal                          Procedia Computer Science     27
12       Journal                             Procedia Manufacturing     26
13       Journal                     Applied Sciences (Switzerland)     26
14       Journal                       Sustainability (Switzerland)     26
16       Journal     International Journal of Industrial Ergonomics     23
17      

In [50]:
# Check distribution by year
print(deduped_data["Year"].value_counts().sort_index())

# # Check top journals
# print(deduped_data["journal"].value_counts().head(20))

# # Check top conferences
# print(deduped_data["conferences"].value_counts().head(20))

# Analyze frequent keywords
keywords = deduped_data["Keywords"].str.split(",").explode().value_counts().head(20)
print(keywords)

Year
2014     98
2015    124
2016    137
2017    170
2018    179
2019    277
2020    281
2021    288
2022    379
2023    354
2024    390
2025      1
Name: count, dtype: int64
Keywords
 'Industry 4.0'               20
['Industry 4.0'               14
 'Industry 5.0'               10
 'Ergonomics'                 10
['Augmented reality'          10
 'Human factors'               9
 'Virtual reality'             9
 'Manufacturing'               8
 'Artificial intelligence'     7
['Artificial intelligence'     7
 'Assembly'                    7
 'Eye-tracking'                6
 'Cognitive ergonomics'        6
 'Industry 4.0']               6
 'Operator 4.0'                5
 'Human error'                 5
 'Mental workload'             5
 'Smart manufacturing'         5
 'Industry 5.0']               5
 'Cognitive workload'          5
Name: count, dtype: int64


In [132]:
# Get unique journal names
unique_journals = deduped_data[deduped_data["Document_Type"] == "Journal"]["Source"].unique()

# Get unique conference names
unique_conferences = deduped_data[deduped_data["Document_Type"] == "Conference"]["Source"].unique()

# Display the results
print("Unique Journals:")
print(unique_journals)

print("\nUnique Conferences:")
print(unique_conferences)

Unique Journals:
['Journal of Manufacturing Systems'
 'CIRP Journal of Manufacturing Science and Technology'
 'Journal of Construction Engineering and Management'
 'Journal of Safety Research'
 'Journal of Civil Engineering and Management' 'Batteries'
 'Virtual Reality' 'Pattern Analysis and Applications'
 'Cognitive Computation' 'ACM Transactions on Human-Robot Interaction'
 'Manufacturing Letters' 'Behaviour Research and Therapy'
 'Technological Forecasting and Social Change'
 'Robotics and Computer-Integrated Manufacturing'
 'International Journal of Advanced Manufacturing Technology'
 'Soft Computing' 'Machines' 'Building and Environment' 'Buildings'
 'Management and Production Engineering Review' 'Applied Psychology'
 'Advanced Engineering Informatics' 'Safety Science'
 'Computers in Industry' 'BMC Psychology' 'Interacting with Computers'
 'Computer Applications in Engineering Education'
 'Sustainability (Switzerland) ' 'Personal and Ubiquitous Computing'
 'Mining, Metallurgy and 

In [135]:
from collections import Counter

# Filter out rows with null values in Keywords
keywords_data = deduped_data[deduped_data["Keywords"].notnull()]["Keywords"]

# Split keywords by semicolon and flatten the list
all_keywords = [keyword.strip().lower() for keywords in keywords_data for keyword in keywords.split(";")]

# Count the frequency of each keyword
keyword_counts = Counter(all_keywords)

# Convert the counts to a DataFrame
keywords_df = pd.DataFrame(keyword_counts.items(), columns=["Keyword", "Count"])

# Sort the DataFrame by frequency in descending order
keywords_df = keywords_df.sort_values(by="Count", ascending=False).reset_index(drop=True)

# Display the top 10 most frequent keywords
keywords_df

Unnamed: 0,Keyword,Count
0,industry 4.0,117
1,artificial intelligence,87
2,human factors,82
3,augmented reality,79
4,machine learning,65
...,...,...
6833,informal leadership,1
6834,frontline workers,1
6835,telerehabilitation,1
6836,stroke,1


In [139]:
keywords_df.to_csv("./Keywords sorted by frequency.csv")

Source
Procedia CIRP                                                                                                108
IFAC-PapersOnLine                                                                                             65
ACM International Conference Proceeding Series                                                                55
Procedia Computer Science                                                                                     55
Procedia Manufacturing                                                                                        50
                                                                                                            ... 
Journal of Southeast University (English Edition)                                                              1
Proceedings of International Conference on Innovative Practices in Technology and Management, ICIPTM 2021      1
6th International Forum on Research and Technology for Society and Industry, RTSI 2021 - 