# Libraries

In [2]:
import pandas as pd
import os
import requests
import time
from tqdm import tqdm 

In [3]:
folder_path = 'D:\School\GitHub\SMU_MSDS_Capstone\Data'

# Combining DF's that we have

In [4]:
# Get a list of all the CSV files in that folder
all_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.csv')]

# Initialize an empty list to hold the dataframes
list_of_dfs = []

# Loop through each CSV file and read it into a DataFrame, then append to the list
for file in all_files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        df = pd.read_csv(f)
    list_of_dfs.append(df)

# Concatenate all the dataframes together
combined_df = pd.concat(list_of_dfs, ignore_index=True)

In [5]:
print(combined_df.head()) # Just print the first few rows as a check

  FirstName LastName  Order      PMID                         DOI Journal  \
0     Sarah   Melzer    1.0  34610277  10.1016/j.cell.2021.09.013    Cell   
1     Elena  Newmark    2.0  34610277  10.1016/j.cell.2021.09.013    Cell   
2     Grace   Mizuno    3.0  34610277  10.1016/j.cell.2021.09.013    Cell   
3    Minsuk     Hyun    4.0  34610277  10.1016/j.cell.2021.09.013    Cell   
4  Adrienne  Philson    5.0  34610277  10.1016/j.cell.2021.09.013    Cell   

     Field  CitationCount Gender Date.Received  ... year_max index  \
0  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   
1  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   
2  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   
3  Biology            4.0      M     23-Oct-20  ...      NaN   NaN   
4  Biology            4.0      F     23-Oct-20  ...      NaN   NaN   

  CollectiveName  Initials order TitleSem Abstract Month  Day Jabbrev  
0            NaN       NaN   NaN      NaN   

In [6]:
# After combining all your DataFrames into combined_df...
output_file_path = 'D:\School\GitHub\SMU_MSDS_Capstone\Output1.csv'
combined_df.to_csv(output_file_path, index=False)

# Compiling a list of DOI's to grab information for

In [7]:
doi_list = combined_df['DOI'].unique().tolist()
doi_df = pd.DataFrame(doi_list, columns=['DOI'])

In [8]:
# Step 1: Query Crossref for a broad set of results (you can refine this as needed)
URL = "https://api.crossref.org/works?rows=1000"
response = requests.get(URL)
data = response.json()

# Extract subjects from the returned works and add to a set for uniqueness
subjects_set = set()
for item in data['message']['items']:
    if 'subject' in item:
        subjects_set.update(item['subject'])

# Convert the set to a list
subjects_list = list(subjects_set)

# Initialize a list to store all the fetched DOIs
all_dois = []

# Initialize tqdm for the progress bar
pbar = tqdm(total=len(subjects_list), desc="Fetching DOIs")

# Step 2: Iterate through each subject title and fetch 100 unique DOIs
for subject_name in subjects_list:
    doi_url = f"https://api.crossref.org/works?query=subject:{subject_name}&rows=100"
    response = requests.get(doi_url)
    data = response.json()
    subject_dois = data.get('message', {}).get('items', [])
    
    # Add the fetched DOIs to the all_dois list
    all_dois.extend(subject_dois)
    
    # Update the progress bar
    pbar.update(1)
    
    # Break the loop if we have fetched 25 unique DOIs for each subject
    if len(all_dois) >= 25 * len(subjects_list):
        break

# Close the progress bar
pbar.close()

temp_doi_df = pd.DataFrame(all_dois, columns=['DOI'])
doi_df = pd.concat([doi_df, temp_doi_df], ignore_index=True)

Fetching DOIs:  33%|███▎      | 77/233 [01:09<02:20,  1.11it/s]


In [9]:
doi_df.drop_duplicates(inplace=True)
doi_df.reset_index(drop=True, inplace=True)

In [10]:
print(doi_df.describe)

<bound method NDFrame.describe of                                       DOI
0              10.1016/j.cell.2021.09.013
1              10.1016/j.cell.2021.09.018
2              10.1016/j.cell.2021.09.021
3              10.1016/j.cell.2021.09.022
4              10.1016/j.cell.2021.09.023
...                                   ...
5856           10.1057/9781137375469.0013
5857           10.1891/9780826125453.0005
5858           10.52419/3006-2023-4-1-159
5859  10.3828/dap.2021.2021.issue-january
5860    10.3828/dap.2021.2021.issue-march

[5861 rows x 1 columns]>


# Pulling Info From The DOI's

In [11]:
def fetch_details(doi, cache):
    # If the details for the DOI are already in the cache
    if doi in cache:
        common_details = cache[doi]['common']
        authors = cache[doi]['authors']
    else:
        url = f"https://api.crossref.org/works/{doi}"
        try:
            response = requests.get(url)
            response.raise_for_status() 
            data = response.json()
            message = data['message']

            # Extract common details
            title = message['title'][0] if 'title' in message else None
            abstract = message.get('abstract', None)
            journal = message.get('container-title', [None])[0]
            field = message.get('subject', [None])[0]
            citation_count = message.get('is-referenced-by-count', None)
            date_received = message.get('created', {}).get('date-time', None)
            date_published = message.get('published-online', {}).get('date-parts', [None])[0]
            address = message.get('publisher-location', None)
            language = message.get('language', None)

            common_details = {
                'DOI': doi,
                'Title': title,
                'Abstract': abstract,
                'Journal': journal,
                'Field': field,
                'Citation Count': citation_count,
                'Date Received': date_received,
                'Date Published': date_published,
                'Address': address,
                'Language': language
            }

            authors = message.get('author', [])
            cache[doi] = {'common': common_details, 'authors': authors}

        except requests.RequestException as e:
            print(f"Error fetching DOI {doi}: {e}")
            return []
        except (KeyError, IndexError) as e:
            print(f"Unexpected structure in response for DOI {doi}: {e}")
            return []

    # Getting authors details
    details_list = []
    for idx, author in enumerate(authors):
        author_first_name = author.get('given', None)
        author_last_name = author.get('family', None)
        author_order = idx + 1

        details = common_details.copy()
        details['Author First Name'] = author_first_name
        details['Author Last Name'] = author_last_name
        details['Author Order'] = author_order

        details_list.append(details)

    return details_list

# Assuming you have the DataFrame named combined_df

# Initialize a cache for storing details of already processed DOIs
doi_cache = {}
data_list = []

for idx, row in tqdm(doi_df.iterrows(), total=len(doi_df), desc="Fetching details"):
    if pd.notna(row['DOI']):  
        details = fetch_details(row['DOI'], doi_cache)
        data_list.extend(details)  # Since details is now a list of dictionaries

# Convert list of dictionaries to DataFrame
new_data_df = pd.DataFrame(data_list)

# Append new_data_df to full_df
full_df = pd.concat([full_df, new_data_df], ignore_index=True)

Fetching details:   8%|▊         | 483/5861 [01:51<21:23,  4.19it/s]

Error fetching DOI 10.1038/s41567-019-0765-5: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-019-0765-5
Error fetching DOI 10.1038/s41567-019-0765-6: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-019-0765-6


Fetching details:   8%|▊         | 485/5861 [01:52<20:00,  4.48it/s]

Error fetching DOI 10.1038/s41567-019-0765-7: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-019-0765-7
Error fetching DOI 10.1038/s41567-019-0765-8: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-019-0765-8


Fetching details:   8%|▊         | 487/5861 [01:52<22:09,  4.04it/s]

Error fetching DOI 10.1038/s41567-019-0765-9: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-019-0765-9


Fetching details:   8%|▊         | 488/5861 [01:52<21:38,  4.14it/s]

Error fetching DOI 10.1038/s41567-019-0765-10: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-019-0765-10


Fetching details:   8%|▊         | 489/5861 [01:52<21:16,  4.21it/s]

Error fetching DOI 10.1038/s41567-019-0765-11: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-019-0765-11


Fetching details:   8%|▊         | 490/5861 [01:53<20:44,  4.31it/s]

Error fetching DOI 10.1038/s41567-020-0894-9.: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1038/s41567-020-0894-9.


Fetching details:   9%|▊         | 510/5861 [01:57<22:33,  3.95it/s]

Error fetching DOI 10.1088/1361-6633/ac2c92: 404 Client Error: Not Found for url: https://api.crossref.org/works/10.1088/1361-6633/ac2c92


Fetching details:  10%|▉         | 575/5861 [02:13<21:57,  4.01it/s]

Unexpected structure in response for DOI 10.1002/csc2.v59.3: list index out of range
Unexpected structure in response for DOI 10.1002/csc2.v40.4: list index out of range


Fetching details:  10%|▉         | 577/5861 [02:14<20:12,  4.36it/s]

Unexpected structure in response for DOI 10.1002/csc2.v58.3: list index out of range


Fetching details:  10%|▉         | 578/5861 [02:14<19:38,  4.48it/s]

Unexpected structure in response for DOI 10.1002/csc2.v56.1: list index out of range


Fetching details:  10%|▉         | 579/5861 [02:14<19:55,  4.42it/s]

Unexpected structure in response for DOI 10.1002/csc2.v58.4: list index out of range


Fetching details:  10%|▉         | 583/5861 [02:15<19:14,  4.57it/s]

Unexpected structure in response for DOI 10.1002/csc2.v53.5: list index out of range


Fetching details:  10%|▉         | 584/5861 [02:15<18:59,  4.63it/s]

Unexpected structure in response for DOI 10.1002/csc2.v59.2: list index out of range


Fetching details:  10%|▉         | 585/5861 [02:15<18:38,  4.72it/s]

Unexpected structure in response for DOI 10.1002/csc2.v55.1: list index out of range


Fetching details:  10%|▉         | 586/5861 [02:15<18:46,  4.68it/s]

Unexpected structure in response for DOI 10.1002/csc2.v55.6: list index out of range


Fetching details:  10%|█         | 587/5861 [02:16<19:36,  4.48it/s]

Unexpected structure in response for DOI 10.1002/csc2.v52.2: list index out of range


Fetching details:  10%|█         | 588/5861 [02:16<19:36,  4.48it/s]

Unexpected structure in response for DOI 10.1002/csc2.v48.2: list index out of range


Fetching details:  10%|█         | 589/5861 [02:16<19:33,  4.49it/s]

Unexpected structure in response for DOI 10.1002/csc2.v56.5: list index out of range


Fetching details:  10%|█         | 592/5861 [02:17<19:47,  4.44it/s]

Unexpected structure in response for DOI 10.1002/csc2.v57.5: list index out of range


Fetching details:  10%|█         | 593/5861 [02:17<22:37,  3.88it/s]

Unexpected structure in response for DOI 10.1002/csc2.v55.4: list index out of range


Fetching details:  10%|█         | 594/5861 [02:17<24:40,  3.56it/s]

Unexpected structure in response for DOI 10.1002/csc2.v57.6: list index out of range


Fetching details:  10%|█         | 598/5861 [02:18<20:58,  4.18it/s]

Unexpected structure in response for DOI 10.1002/csc2.v54.2: list index out of range
Unexpected structure in response for DOI 10.1002/csc2.v48.4: list index out of range


Fetching details:  10%|█         | 600/5861 [02:19<19:57,  4.39it/s]

Unexpected structure in response for DOI 10.1002/csc2.v54.4: list index out of range


Fetching details:  10%|█         | 601/5861 [02:19<20:22,  4.30it/s]

Unexpected structure in response for DOI 10.1002/csc2.v53.3: list index out of range





KeyboardInterrupt: 

#### Cleaning

In [None]:
print(full_df['DOI'].nunique())
print(full_df.shape[0])

# Assuming your DataFrame is named 'full_df'
subset_columns = ['DOI', 'Author First Name', 'Author Last Name']

# Group by DOI and apply a function to remove duplicates within each group
deduplicated_df = full_df.groupby('DOI', group_keys=False).apply(
    lambda group: group.drop_duplicates(subset=subset_columns)
)

# Print the number of rows before and after deduplication
print(f"Number of rows before deduplication: {len(full_df)}")
print(f"Number of rows after deduplication: {len(deduplicated_df)}")