# Libraries

In [72]:
import pandas as pd
import os
import requests
import time
from tqdm import tqdm 

In [73]:
# Folder holding the source CSV files. Written as a raw string because the
# Windows path contains backslashes; in a normal string, sequences such as
# '\S' and '\G' are unrecognized escapes that trigger a warning on current
# Python versions.
folder_path = r'D:\School\GitHub\SMU_MSDS_Capstone\Data'

# Combining the DataFrames we have

In [74]:
# Collect every CSV file directly inside folder_path. os.listdir returns
# entries in arbitrary order, so the names are sorted to make the row order of
# combined_df reproducible from run to run.
all_files = sorted(
    name for name in os.listdir(folder_path)
    if name.endswith('.csv') and os.path.isfile(os.path.join(folder_path, name))
)

# pd.concat fails with an unhelpful "No objects to concatenate" on an empty
# list, so report the real problem instead.
if not all_files:
    raise ValueError(f"No CSV files found in {folder_path!r}")

# Read each CSV into its own DataFrame.
list_of_dfs = []
for file in all_files:
    file_path = os.path.join(folder_path, file)
    # errors='ignore' drops any bytes that are not valid UTF-8 rather than
    # failing, so oddly encoded characters disappear silently from the data.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        list_of_dfs.append(pd.read_csv(f))

# Stack all frames; columns missing from a given file are filled with NaN.
combined_df = pd.concat(list_of_dfs, ignore_index=True)

In [75]:
# Quick sanity check of the merged data. As the last expression of the cell,
# the frame is rendered with the notebook's rich table display instead of the
# wrapped plain-text output that print() produces.
combined_df.head()

  FirstName LastName  Order      PMID                         DOI Journal  \
0     Sarah   Melzer    1.0  34610277  10.1016/j.cell.2021.09.013    Cell   
1     Elena  Newmark    2.0  34610277  10.1016/j.cell.2021.09.013    Cell   
2     Grace   Mizuno    3.0  34610277  10.1016/j.cell.2021.09.013    Cell   
3    Minsuk     Hyun    4.0  34610277  10.1016/j.cell.2021.09.013    Cell   
4  Adrienne  Philson    5.0  34610277  10.1016/j.cell.2021.09.013    Cell   

     Field  CitationCount Gender Date.Received  ... year_max index  \
0  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   
1  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   
2  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   
3  Biology            4.0      M     23-Oct-20  ...      NaN   NaN   
4  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   

  CollectiveName  Initials order TitleSem Abstract Month  Day Jabbrev  
0            NaN       NaN   NaN      NaN   

In [76]:
# Persist the merged frame as a single CSV (without the row index).
# Raw string: the Windows path contains backslashes, and sequences such as
# '\S' and '\O' are unrecognized escapes in a normal string literal.
output_file_path = r'D:\School\GitHub\SMU_MSDS_Capstone\Output1.csv'
combined_df.to_csv(output_file_path, index=False)

# Compiling a list of DOIs to grab information for

In [77]:
# Distinct DOI values from the combined data, in first-seen order, both as a
# plain list and as a single-column DataFrame.
doi_list = combined_df['DOI'].drop_duplicates().tolist()
doi_df = pd.DataFrame({'DOI': doi_list})

In [79]:
# Step 1: pull one broad page of Crossref works and collect the distinct
# subject labels attached to them.
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL, timeout=60)
response.raise_for_status()  # fail here rather than on a confusing JSON/KeyError below
data = response.json()

subjects_set = set()
for item in data['message']['items']:
    subjects_set.update(item.get('subject', []))

# Sorted so the request order (and the progress bar) is the same on every run.
subjects_list = sorted(subjects_set)

# Step 2: for each subject, ask Crossref for up to 100 works and keep their DOIs.
# The results are NOT guaranteed to be unique across subjects; duplicates are
# expected and must be removed afterwards.
all_dois = []
for subject_name in tqdm(subjects_list, desc="Fetching DOIs"):
    try:
        response = requests.get(
            "https://api.crossref.org/works",
            # Passing params lets requests URL-encode subject names that
            # contain spaces, '&', '/', etc., which would otherwise corrupt
            # the query string.
            params={'query': f"subject:{subject_name}", 'rows': 100},
            timeout=60,
        )
        response.raise_for_status()
        items = response.json().get('message', {}).get('items', [])
    except (requests.RequestException, ValueError) as e:
        # One failing subject should not discard everything fetched so far.
        print(f"Error fetching DOIs for subject {subject_name!r}: {e}")
        continue

    # Keep only the DOI string of each returned work, not the whole record.
    all_dois.extend(item['DOI'] for item in items if item.get('DOI'))

# Total number of DOIs fetched (duplicates included).
print(len(all_dois))

temp_doi_df = pd.DataFrame({'DOI': all_dois})
# Append the newly fetched DOIs to the DOIs already collected. The original
# passed the Python list doi_list here, which pd.concat rejects with a
# TypeError; the frame built from the fetched DOIs is what must be appended.
doi_df = pd.concat([doi_df, temp_doi_df], ignore_index=True)

Fetching DOIs:  71%|███████   | 166/233 [03:07<00:51,  1.30it/s]

KeyboardInterrupt: 

In [60]:
# Drop repeated DOI rows and renumber the index 0..n-1 in one non-mutating
# step, so the cell rebinds doi_df instead of modifying the frame in place.
doi_df = doi_df.drop_duplicates(ignore_index=True)

Number of rows before deduplication: 840845
Number of rows after deduplication: 6049


# Pulling Info From the DOIs

In [None]:
def _first_or_none(values):
    """Return the first element of ``values``, or None if it is missing or empty."""
    return values[0] if values else None


def fetch_details(doi, cache, timeout=30):
    """Return one record per author for ``doi`` using the Crossref works API.

    Args:
        doi: DOI to look up.
        cache: Dict mapping DOI -> {'common': <shared fields>, 'authors': <author list>}.
            It is read on a hit and filled after a successful fetch, so a
            repeated DOI costs a single request.
        timeout: Seconds to wait for Crossref before giving up on this DOI.

    Returns:
        A list of dicts, one per author. Each dict holds the shared publication
        fields ('DOI', 'Title', 'Abstract', 'Journal', 'Field', 'Citation Count',
        'Date Received', 'Date Published', 'Address', 'Language') plus
        'Author First Name', 'Author Last Name' and 'Author Order' (1-based
        position in the Crossref author list). The list is empty when the
        request fails, the response lacks the expected structure, or the work
        lists no authors.
    """
    from urllib.parse import quote

    if doi in cache:
        # Already fetched during this run: reuse the stored fields.
        common_details = cache[doi]['common']
        authors = cache[doi]['authors']
    else:
        # DOIs may contain characters such as '#', '?' or '<' that are not safe
        # in a URL path, so percent-encode everything except '/'.
        url = f"https://api.crossref.org/works/{quote(str(doi))}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            message = response.json()['message']

            # Crossref can return these list-valued fields as empty lists.
            # Indexing [0] directly would raise IndexError and throw away the
            # whole DOI, so take the first element only when there is one.
            common_details = {
                'DOI': doi,
                'Title': _first_or_none(message.get('title')),
                'Abstract': message.get('abstract', None),
                'Journal': _first_or_none(message.get('container-title')),
                'Field': _first_or_none(message.get('subject')),
                'Citation Count': message.get('is-referenced-by-count', None),
                # NOTE(review): this is the 'created' timestamp of the Crossref
                # record; confirm it is the intended meaning of "Date Received".
                'Date Received': message.get('created', {}).get('date-time', None),
                'Date Published': _first_or_none(
                    message.get('published-online', {}).get('date-parts')
                ),
                'Address': message.get('publisher-location', None),
                'Language': message.get('language', None),
            }

            authors = message.get('author', [])
            cache[doi] = {'common': common_details, 'authors': authors}

        except requests.RequestException as e:
            print(f"Error fetching DOI {doi}: {e}")
            return []
        except (KeyError, IndexError, ValueError) as e:
            print(f"Unexpected structure in response for DOI {doi}: {e}")
            return []

    # Expand the shared fields into one row per author.
    details_list = []
    for author_order, author in enumerate(authors, start=1):
        details = common_details.copy()
        details['Author First Name'] = author.get('given', None)
        details['Author Last Name'] = author.get('family', None)
        details['Author Order'] = author_order
        details_list.append(details)

    return details_list

# Fetch per-author details for every DOI collected in doi_df.

# Cache handed to fetch_details so a DOI that shows up more than once in
# doi_df is only requested once during this run.
doi_cache = {}
data_list = []

# Iterate over the DOI column directly, skipping missing values; iterrows()
# would build a full Series per row just to read one field.
dois_to_fetch = doi_df['DOI'].dropna()
for doi in tqdm(dois_to_fetch, total=len(dois_to_fetch), desc="Fetching details"):
    # fetch_details returns a list of dicts (one per author), possibly empty.
    data_list.extend(fetch_details(doi, doi_cache))

# Convert the list of per-author dicts to a DataFrame.
new_data_df = pd.DataFrame(data_list)

# Append to full_df when an earlier run already created it; on a fresh kernel
# no such frame exists yet, so start it from the newly fetched rows instead of
# failing with NameError.
try:
    full_df = pd.concat([full_df, new_data_df], ignore_index=True)
except NameError:
    full_df = new_data_df

#### Cleaning

In [None]:
# Number of distinct DOIs versus total author rows before cleaning.
print(full_df['DOI'].nunique())
print(full_df.shape[0])

# An author row is identified by its DOI plus the author's first and last name.
subset_columns = ['DOI', 'Author First Name', 'Author Last Name']

# 'DOI' is already part of subset_columns, so a plain drop_duplicates removes
# exactly the within-DOI duplicates; no per-DOI groupby/apply is needed. The
# groupby version also relied on the grouping column being visible inside
# apply(), which newer pandas versions deprecate.
# Rows with a missing DOI are dropped explicitly because the groupby-based
# version discarded them as well. Rows keep their original order here rather
# than being regrouped by DOI.
deduplicated_df = (
    full_df
    .dropna(subset=['DOI'])
    .drop_duplicates(subset=subset_columns)
)

# Print the number of rows before and after deduplication
print(f"Number of rows before deduplication: {len(full_df)}")
print(f"Number of rows after deduplication: {len(deduplicated_df)}")