In [4]:
import os
import pandas as pd

Define the path to the input dataset and load the data.

In [5]:
# Location of the dataset that the ASReview results are merged into
input_dataset_path = "../data/input_dataset.csv"

# Read the whole input dataset into a DataFrame
input_df = pd.read_csv(filepath_or_buffer=input_dataset_path)

Check the first few rows of the input dataset.

In [None]:
# Sanity check: show the top five rows of the freshly loaded input dataset
print("Input Dataset:")
display(input_df.head(n=5))

Step 2: Find and load the ASReview output files. Iterate through each subfolder of `asreview_output`, find the CSV files, and merge the `included` column of each file into the input dataset by matching on `title`.

In [7]:
asreview_output_path = "../data/asreview_output"

# Column names already handed out to a file during this run. Two subfolders may
# contain files with the same name; the second one gets a subfolder prefix.
claimed_columns = set()

# sorted() makes the order of the added columns reproducible; os.listdir()
# returns entries in arbitrary order.
for subfolder in sorted(os.listdir(asreview_output_path)):
    subfolder_path = os.path.join(asreview_output_path, subfolder)

    # Only subfolders are scanned; loose files next to them are ignored
    if not os.path.isdir(subfolder_path):
        continue

    for file_name in sorted(os.listdir(subfolder_path)):
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(subfolder_path, file_name)

        # Load the output CSV file
        output_df = pd.read_csv(file_path)

        # Files without both required columns cannot be merged and are ignored
        if 'title' not in output_df.columns or 'included' not in output_df.columns:
            continue

        # The new column is named after the file. splitext() removes only the
        # extension, whereas str.replace() would strip every ".csv" substring.
        column_name = os.path.splitext(file_name)[0]
        if column_name in claimed_columns:
            column_name = f"{subfolder}_{column_name}"
        claimed_columns.add(column_name)

        # Never merge a column that is already present: this keeps a re-run of
        # the cell a no-op and avoids pandas' "_x"/"_y" suffixes on name clashes.
        if column_name in input_df.columns:
            print(f"Skipping {file_path}: column '{column_name}' already exists.")
            continue

        # Keep one row per title (first non-null "included" value) so the left
        # merge cannot multiply rows of the input dataset. groupby() drops rows
        # whose title is missing, so untitled rows are never matched to each other.
        temp_df = (
            output_df[['title', 'included']]
            .rename(columns={"included": column_name})
            .groupby('title', as_index=False, sort=False)
            .first()
        )

        # Merge with the input dataset using the "title" column
        input_df = input_df.merge(temp_df, on='title', how='left')

Save the merged dataset as a .csv file and display its first rows.

In [None]:
output_path = "../output/merged_dataset.csv"

# DataFrame.to_csv() does not create missing folders, so make sure the
# output directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
input_df.to_csv(output_path, index=False)

print(f"Final merged dataset saved to: {output_path}")
display(input_df.head())

Merge any duplicate rows (rows that share the same title) into a single row.

In [None]:
# Step 1: Load the merged dataset
merged_dataset_path = "../output/merged_dataset.csv"
merged_df = pd.read_csv(merged_dataset_path)

# Step 2: Identify duplicates based on the 'title' column.
# Rows without a title are not treated as duplicates of each other.
has_title = merged_df['title'].notna()
duplicates = merged_df[has_title & merged_df.duplicated(subset='title', keep=False)]

if not duplicates.empty:
    print("\nDuplicated rows detected:")
    display(duplicates)

    # Step 3: Merge duplicated rows, keeping the first non-null value of every column.
    # Grouping directly on 'title' would silently drop rows with a missing title,
    # sort the rows and move 'title' to the front. Grouping on explicit ids avoids
    # all three: each distinct title gets one id, each untitled row its own id.
    title_codes, unique_titles = pd.factorize(merged_df['title'])
    group_ids = pd.Series(title_codes, index=merged_df.index)
    untitled = group_ids == -1  # factorize() marks missing values with -1
    if untitled.any():
        first_free_id = len(unique_titles)
        group_ids[untitled] = list(range(first_free_id, first_free_id + int(untitled.sum())))

    merged_df = (
        merged_df.groupby(group_ids, sort=False)  # sort=False keeps first-appearance order
        .first()                                  # first non-null value per column
        .reset_index(drop=True)
    )
else:
    print("\nNo duplicated rows detected.")

# Step 4: Save the cleaned dataset
cleaned_output_path = "../output/cleaned_merged_dataset.csv"
merged_df.to_csv(cleaned_output_path, index=False)

print(f"Cleaned dataset saved to: {cleaned_output_path}")
display(merged_df.head())

Create a file with the papers that aren't in any domain, i.e. papers that have no value in any of the merged ASReview columns.

In [None]:
# Step 1: Load the cleaned merged dataset
cleaned_merged_dataset_path = "../output/cleaned_merged_dataset.csv"
cleaned_merged_df = pd.read_csv(cleaned_merged_dataset_path)

# Step 2: Work out which columns were added by the merge.
# Reading only the header (nrows=0) of the input dataset gives its column names;
# every other column is a review column. This replaces a positional slice that
# assumed the input dataset has exactly four columns.
input_columns = set(pd.read_csv("../data/input_dataset.csv", nrows=0).columns)
review_columns = [col for col in cleaned_merged_df.columns if col not in input_columns]

# Step 3: Keep the papers whose review columns are all empty
has_no_review_value = cleaned_merged_df[review_columns].isnull().all(axis=1)
not_included_df = cleaned_merged_df[has_no_review_value]

# Step 4: Save the dataset with papers that are not included in any of the reviews
not_included_output_path = "../output/not_included_papers.csv"
not_included_df.to_csv(not_included_output_path, index=False)
