In [20]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
from glob import glob
import rispy

In [21]:
# Load the raw exports from the three bibliographic sources.
# Web of Science is a single Excel export, Scopus a single CSV export.
WOS_papers = pd.read_excel("./WOS/wos.xls")
SCO_papers = pd.read_csv("./Scopus/scopus.csv")

# ScienceDirect is exported as several .ris files. glob() gives no ordering
# guarantee, so sort the paths to keep the row order of SD_papers (and of
# anything written from it) reproducible between runs and machines.
ris_files = sorted(glob("./ScienceDirect/*.ris"))
if not ris_files:
    # Without this guard pd.concat([]) fails with the far less helpful
    # "No objects to concatenate".
    raise FileNotFoundError("No .ris files found in ./ScienceDirect/")

# One DataFrame per .ris file
dfs = []
for ris_file in ris_files:
    with open(ris_file, "r", encoding="utf-8") as file:
        entries = rispy.load(file)  # Parse .ris file using rispy
        df = pd.DataFrame(entries)  # One row per parsed entry
        dfs.append(df)

# Stack the per-file frames into a single frame with a fresh 0..n-1 index
SD_papers = pd.concat(dfs, ignore_index=True)

In [22]:
def _join_keyword_columns(frame, first, second):
    """Return the two keyword columns of ``frame`` joined with "; ".

    Missing values are treated as empty strings, and any leading or trailing
    ';' / ' ' characters left over when one side is empty are stripped.
    """
    joined = frame[first].fillna("") + "; " + frame[second].fillna("")
    return joined.str.strip("; ")


# Web of Science: author-supplied keywords plus the "Keywords Plus" column
WOS_papers["Keywords"] = _join_keyword_columns(WOS_papers, "Author Keywords", "Keywords Plus")

# Scopus: author-supplied keywords plus the "Index Keywords" column
SCO_papers["Keywords"] = _join_keyword_columns(SCO_papers, "Author Keywords", "Index Keywords")

In [23]:
# Bring every source onto the shared column names used by the later cells.
# DataFrame.rename silently skips mapping keys that are not present, so only
# columns whose name actually differs from the target need to be listed.

# Web of Science: only the title column is named differently.
WOS_papers = WOS_papers.rename(columns={"Article Title": "Title"})

# Scopus: derive SC_papers from the frame loaded as SCO_papers. The previous
# code read an undefined SC_papers, which raises NameError on a fresh kernel.
# "Article Title" is kept from the original mapping; "Year" is added because
# Scopus CSV exports typically label the year column "Year".
# NOTE(review): confirm the column names against scopus.csv.
SC_papers = SCO_papers.rename(columns={"Article Title": "Title", "Year": "Publication Year"})

# ScienceDirect (.ris parsed by rispy): map the lower-case field names.
# NOTE(review): rispy appears to return authors/keywords as Python lists,
# whereas the other sources hold "; "-separated strings — confirm whether
# these should be normalised before combining.
SD_papers = SD_papers.rename(
    columns={
        "primary_title": "Title",
        "abstract": "Abstract",
        "authors": "Authors",
        "doi": "DOI",
        "issn": "ISSN",
        "year": "Publication Year",
        "keywords": "Keywords",
    }
)

In [24]:
# Fields that every source provides once the columns have been renamed.
common_columns = [
    "Title",
    "Abstract",
    "Authors",
    "Keywords",
    "DOI",
    "ISSN",
    "Publication Year",
]

# Project each source onto the shared schema, in the same column order.
# A missing column raises KeyError here.
WOS_papers_common, SC_papers_common, SD_papers_common = (
    source[common_columns] for source in (WOS_papers, SC_papers, SD_papers)
)

In [25]:
# Stack the three sources row-wise (WOS, then Scopus, then ScienceDirect)
# and renumber the rows 0..n-1. No de-duplication happens here.
combined_df = pd.concat(
    objs=[WOS_papers_common, SC_papers_common, SD_papers_common],
    ignore_index=True,
)

In [27]:
# Persist the combined table. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first CSV column.
combined_df.to_csv(path_or_buf="./COMBINED.csv")

In [28]:
# Number of rows in the combined table
len(combined_df)

886