# checks for matches between issns

This works and saves out the raw data from the API to an excel fie.

– This opens an excel file of all titles from a given ISSN and finds duplicates by exact matching.
- reduced load on the crossref server by saving the raw file.
     - could be a pickle instead?




In [1]:
import requests
import pandas as pd

def fetch_metadata_by_issn(issn):
    api_url = f'https://api.crossref.org/works?filter=issn:{issn}&select=DOI,title,container-title,issued'
    response = requests.get(api_url)

    if response.status_code == 200:
        data = response.json()
        return data.get('message', {}).get('items', [])
    else:
        print(f"Error fetching metadata for ISSN {issn}. Status code: {response.status_code}")
        return []

def create_dataframe(metadata):
    rows = []

    for entry in metadata:
        doi = entry.get('DOI', '')
        title = entry.get('title', '')
        journal_title = entry.get('container-title', [''])[0]
        publication_date = entry.get('issued', {}).get('date-parts', [[None]])[0][0]

        rows.append({'DOI': doi, 'Title': title, 'Journal Title': journal_title, 'Publication Date': publication_date})

    return pd.DataFrame(rows)

def save_to_excel(df, file_path):
    df.to_excel(file_path, index=False)
    print(f"DataFrame saved to {file_path}")
    return file_path

def main(issn):
    file_path = ""
    # Fetch metadata for the given ISSN
    metadata = fetch_metadata_by_issn(issn)

    # Create dataframe
    df = create_dataframe(metadata)

    # save dataframe as Excel file
    save_to_excel(df, f'/content/drive/MyDrive/Colab Notebooks/Crossref_notebooks/raw_output_{issn}.xlsx')
    return file_path, df
issn = "18756883"
main(issn)



DataFrame saved to /content/drive/MyDrive/Colab Notebooks/Crossref_notebooks/raw_output_18756883.xlsx


('',
                               DOI  \
 0    10.1080/18756891.2013.865403   
 1   10.1080/18756891.2011.9727792   
 2   10.1080/18756891.2009.9727640   
 3      10.2991/ijcis.d.190614.001   
 4           10.2991/ijcis.11.1.84   
 5      10.1007/s44196-021-00020-1   
 6      10.1007/s44196-023-00354-y   
 7      10.2991/ijcis.d.200731.001   
 8        10.2991/ijcis.2011.4.4.1   
 9       10.2991/ijcis.2011.4.6.29   
 10  10.1080/18756891.2011.9727834   
 11  10.1080/18756891.2011.9727892   
 12  10.1080/18756891.2015.1113735   
 13     10.2991/ijcis.d.200926.001   
 14  10.1080/18756891.2016.1237187   
 15       10.2991/ijcis.2008.1.4.5   
 16       10.2991/ijcis.2008.1.4.9   
 17       10.2991/ijcis.2008.1.4.8   
 18       10.2991/ijcis.2010.3.1.1   
 19       10.2991/ijcis.2010.3.4.5   
 
                                                 Title  \
 0   [A New Adaptive and Self Organizing Fuzzy Poli...   
 1   [Incident Duration Prediction Based on Latent ...   
 2   [Accuracy Evalua

In [3]:
import pandas as pd
import re

def read_excel_file(file_path):
    # Read data from Excel file
    df = pd.read_excel(file_path)
    return df

def clean_title(title):
    # Remove specific characters and square brackets and their contents
    if isinstance(title, str):
        title = re.sub(r'\[[^\]]*\]', '', title)
        title = re.sub('[^a-zA-Z0-9\s]', '', title)
    elif isinstance(title, list):
        # Join the list elements into a string
        title = ' '.join(map(str, title))
        title = re.sub(r'\[[^\]]*\]', '', title)
        title = re.sub('[^a-zA-Z0-9\s]', '', title)
    return title

def create_dataframe(df):
    rows = []

    for index, row in df.iterrows():
        doi = row.get('DOI', '')  # Adjust column name based on your Excel file
        title = row.get('title', '')  # Adjust column name based on your Excel file
        title = clean_title(title)  # Clean the title
        journal_title = row.get('container-title', '')  # Adjust column name based on your Excel file
        publication_date = row.get('issued', None)  # Adjust column name based on your Excel file

        rows.append({'DOI': doi, 'Title': title, 'Journal Title': journal_title, 'Publication Date': publication_date})

    return pd.DataFrame(rows)

def find_duplicates(df):
    duplicates = df[df.duplicated(subset=['Title'], keep=False)]
    return duplicates

def save_to_excel(df, file_path):
    df.to_excel(file_path, index=False)
    print(f"DataFrame saved to {file_path}")

def main(file_path):
    # Read data from Excel file
    df = read_excel_file(file_path)

    # Create dataframe
    df_cleaned = create_dataframe(df)

    # Find and display duplicates based on exact title matching
    duplicates = find_duplicates(df_cleaned)
    print("Duplicates based on exact title matching:")
    print(duplicates)

    # Save duplicates to a new Excel file
    save_to_excel(duplicates, '/content/drive/MyDrive/Colab Notebooks/Crossref_notebooks/duplicates_output.xlsx')

# Replace 'your_excel_file.xlsx' with the path to your Excel file
excel_file_path = '/content/drive/MyDrive/Colab Notebooks/Crossref_notebooks/all_titles_for_ISSN_18756883.xlsx'
main(excel_file_path)


Duplicates based on exact title matching:
                                DOI  \
0           10.2991/jnmp.2008.1.3.1   
5          10.2991/ijcis.2010.3.2.7   
6     10.1080/18756891.2010.9727690   
11    10.1080/18756891.2011.9727879   
12        10.2991/ijcis.2011.4.6.22   
...                             ...   
2110   10.1080/18756891.2014.891369   
2111   10.1080/18756891.2014.963976   
2112  10.1080/18756891.2011.9727766   
2113       10.2991/ijcis.2011.4.1.8   
2114   10.1080/18756891.2013.808426   

                                                  Title  \
0                                                         
5                       A nave glance at Soft Computing   
6                       A nave glance at Soft Computing   
11    A Calibration Method for A Linear Structured L...   
12    A Calibration Method for A Linear Structured L...   
...                                                 ...   
2110                                                      
2111             