Visualization of author network

In [1]:
import bibtexparser
import logging
import numpy as np
import re

import os
os.chdir("..")

import pandas as pd

from src.db import Database
from src.tags import Tags

# Notebook-level logger; basicConfig sets the root level to INFO so that the
# LOGGER.info(...) progress messages emitted by load_metadata are shown.
LOGGER = logging.getLogger(__name__)
logging.basicConfig(level = logging.INFO)

## Load Pubmed

In [2]:
def load_metadata(db, filter=True):
    """Merge the paper table with the YAML tag file and log tagging progress.

    Parameters
    ----------
    db
        Paper table exposing ``merge`` and containing an ``id`` column
        (presumably a pandas DataFrame -- confirm against ``Database.db``).
    filter : bool, optional
        Currently unused. Kept so that existing calls such as
        ``load_metadata(db, filter=False)`` continue to work.

    Returns
    -------
    pandas.DataFrame
        Outer merge of ``db`` and the tagged papers on ``id``, with the index
        reset (the previous index is kept as an ``index`` column).
    """
    # Load the tag file and turn the tagged papers into a dataframe
    metadata = Tags()
    metadata.load("data/tag_files/tags.yaml")
    md = pd.DataFrame(metadata.tags["tagged_papers"])

    # Outer merge keeps rows that appear in only one of the two sources
    mddb = db.merge(md, on="id", how="outer")

    # Report current progress out of total
    total_papers = mddb.shape[0]
    tagged_papers = int(mddb["tag"].notna().sum())
    # An empty merge result would otherwise divide by zero
    if total_papers:
        percent = np.round((tagged_papers / total_papers) * 100, 2)
    else:
        percent = 0.0

    LOGGER.info(f"Currently {tagged_papers}/{total_papers} ({percent}%) papers tagged.")
    LOGGER.info(f"{total_papers-tagged_papers} papers remaining.")

    return mddb.reset_index()

In [3]:
# Names of the tables to load for the PubMed section
tables = ["active_inference", "bayesian_mechanics", "free_energy", "friston", "karl_friston", "predictive_coding", "predictive_processing"]

# Load the selected tables; `database.db` is what gets merged with the tag file below
database = Database()
database.load(tables=tables)
db = database.db

# Merge with the tag metadata and log tagging progress
# (load_metadata accepts `filter` but does not use it)
pubmed_db = load_metadata(db, filter=False)

INFO:src.db:Checking tables...
INFO:src.db:Loading tables...


INFO:src.db:Tables downloaded from PubMed on Thursday, Sept. 14, 2023.
INFO:src.tags:YAML tag file successfully loaded from data/tag_files/tags.yaml.
INFO:__main__:Currently 3585/3585 (100.0%) papers tagged.
INFO:__main__:0 papers remaining.


## Load BioArXiv

In [4]:
# Parse the BibTeX export and build a dataframe from its entries.
# NOTE(review): open() is called without an explicit encoding, so the platform default is used.
with open("examples/bioarxiv_aif_citations.bib") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)
    
bioarxiv_db = pd.DataFrame(bib_database.entries)

## Load ArXiv and PsyArXiv

In [5]:
# Both archives are read from one CSV and separated by the value of the "Archive" column
arxiv_psyarxiv_db = pd.read_csv("data/arxiv_papers.csv")
arxiv_db = arxiv_psyarxiv_db[arxiv_psyarxiv_db["Archive"] == "ArXiv"]
psyarxiv_db = arxiv_psyarxiv_db[arxiv_psyarxiv_db["Archive"] == "PsyArXiv"]

## Isolate author list

In [6]:
# Pull the author column out of each source; note the column name differs per source
# ("authors", "author", "Authors")
pubmed_authors = pubmed_db["authors"]
bioarxiv_authors = bioarxiv_db["author"]
arxiv_authors = arxiv_db["Authors"]
psyarxiv_authors = psyarxiv_db["Authors"]

## Standardize author list

Authors are currently in the following formats:
* Pubmed Authors: Last Name First Initial, Last Name First Initial, ...
* BioArXiv Authors: Last Name, First Name and Last Name, First Name and ...
* ArXiv Authors: [First Name Last Name, First Name Last Name, ...]
* PsyArXiv Authors: [First Name Last Name, First Name Last Name, ...]

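Based on this, the PubMed style (last name followed by initials) has to become the common format: PubMed provides only initials, so full first names cannot be recovered for those records, whereas initials can be derived for every other source.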

In [7]:
# Standardize BioArXiv list
def clean_special_characters(name):
    r"""Replace the LaTeX escapes ``{\'a}`` and ``{\~n}`` found in ``name``.

    ``{\'a}`` becomes ``a`` and ``{\~n}`` becomes ``~n``; all other text is
    returned unchanged.
    """
    # (pattern, replacement) pairs, applied in order
    substitutions = (
        (r'\{\\\'a\}', 'a'),
        (r'\{\\~n\}', '~n'),
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name

def convert_name_format(name):
    """Convert a BibTeX-style ``"Last, First Middle"`` name to ``"Last FM"``.

    Parameters
    ----------
    name : str
        One author name. The usual form is ``"Last, First"``; the BibTeX
        three-part form ``"Last, Jr, First"`` is also accepted.

    Returns
    -------
    str
        The last name followed by the initials of the given names. A name
        without a comma (e.g. a consortium) has no given names and is
        returned stripped and otherwise unchanged. A suffix segment between
        the last and given names is dropped.
    """
    # Split on bare commas and strip each piece, so "Last,First" and
    # "Last,  First" parse as well as the canonical "Last, First"
    segments = [segment.strip() for segment in name.split(',')]
    last_name = segments[0]
    # BibTeX always places the given names in the final segment
    first_names = segments[-1] if len(segments) > 1 else ''
    initials = ''.join(given[0] for given in first_names.split())
    # strip() removes the separator space when there are no initials
    return f"{last_name} {initials}".strip()

def convert_bioarxiv_names(row):
    """Convert a BibTeX author string into a comma-separated ``"Last FM"`` list.

    Parameters
    ----------
    row : str
        Author field of the form ``"Last, First and Last, First and ..."``.

    Returns
    -------
    str
        Each author converted with ``convert_name_format`` and joined with
        ``", "``.

    Notes
    -----
    Pieces that contain no comma after splitting on ``" and "`` are assumed to
    be fragments of one entry whose own text contains ``" and "``; consecutive
    fragments are re-joined with ``" and "`` and treated as a single author.
    """
    # Clean the malformed " , and " in the authors string
    cleaned_authors_string = re.sub(r' , and ', ' and ', row)

    author_list = []
    pending = []  # consecutive comma-less fragments belonging to one entry
    for part in cleaned_authors_string.split(' and '):
        if ',' in part:
            if pending:
                # Re-join with the delimiter that split them. Building the
                # entry from a list avoids a leading " and " and the
                # character-set semantics of str.lstrip(" and ").
                author_list.append(' and '.join(pending))
                pending = []
            author_list.append(part)
        else:
            pending.append(part)

    # Flush trailing fragments, if any
    if pending:
        author_list.append(' and '.join(pending))

    return ", ".join(convert_name_format(name) for name in author_list)

# Rows labelled 169 and 276 are excluded before conversion.
# NOTE(review): the reason is not recorded here -- presumably their author strings do not parse; confirm.
# The conversion starts from the raw "author" column rather than from `bioarxiv_authors`
# itself, so re-running this cell neither fails on the already-dropped labels nor
# re-converts names that are already in "Last FM" form.
bioarxiv_authors = bioarxiv_db["author"].drop(labels=[169, 276]).apply(convert_bioarxiv_names)

In [8]:
# Standardize ArXiv lists

def convert_name_format2(name):
    """Convert a ``"First Middle Last"`` name to ``"Last FM"``.

    Parameters
    ----------
    name : str
        One author name with whitespace-separated parts; the final part is
        taken as the last name.

    Returns
    -------
    str
        The last name followed by the initials of all preceding parts. A
        single-word name is returned on its own (no trailing space) and an
        empty or blank name yields ``""`` instead of raising ``IndexError``.
    """
    parts = name.split()
    if not parts:
        return ""

    # Everything before the final part counts as first/middle names
    *given_names, last_name = parts
    initials = ''.join(given[0] for given in given_names)
    # rstrip() removes the separator space when there are no initials
    return f"{last_name} {initials}".rstrip()

def convert_arxiv_psyarxiv_names(row):
    """Convert a bracketed ``"[First Last, First Last, ...]"`` string to ``"Last F, Last F, ..."``.

    Leading/trailing ``[`` and ``]`` characters are removed, the remainder is
    split on ``", "``, each name goes through ``convert_name_format2`` and the
    results are joined with ``", "``.
    """
    unbracketed = row.strip("[]")
    converted = map(convert_name_format2, unbracketed.split(", "))
    return ', '.join(converted)

# Convert from the raw "Authors" columns rather than from the variables being assigned:
# re-running this cell on already converted names would silently scramble them
# (e.g. "Sedlak B" would become "B S").
arxiv_authors = arxiv_db["Authors"].apply(convert_arxiv_psyarxiv_names)
psyarxiv_authors = psyarxiv_db["Authors"].apply(convert_arxiv_psyarxiv_names)

In [39]:
# Stack the four sources into one Series of author strings
authors = pd.concat(
    [pubmed_authors, bioarxiv_authors, psyarxiv_authors, arxiv_authors]
).reset_index(drop=True)

# One list of names per paper:
#   - non-string entries (e.g. missing values) are skipped,
#   - periods at either end of the whole author string are stripped,
#   - the string is split on commas and each name is trimmed of surrounding whitespace.
authors = [
    [piece.strip() for piece in entry.strip(".").split(",")]
    for entry in authors.tolist()
    if isinstance(entry, str)
]

In [44]:
# Co-authorship counts: entry (i, j) is the number of papers whose author list
# contains both unique_elem[i] and unique_elem[j]; the diagonal therefore holds
# each author's paper count.
# The earlier version scanned every paper for every pair of authors and took
# 335m 34.0s to generate; counting paper by paper yields the same matrix.
unique_elem = sorted({name for seq in authors for name in seq})  # sorted list of unique elements
n = len(unique_elem)
position = {name: idx for idx, name in enumerate(unique_elem)}  # author -> row/column
author_adj_mat = np.zeros((n, n), dtype=int)  # matrix (n,n)

for seq in authors:
    # De-duplicate within a paper: a name listed twice still counts once,
    # and unique indices keep the fancy-indexed increment exact
    idx = np.fromiter({position[name] for name in seq}, dtype=np.intp)
    author_adj_mat[np.ix_(idx, idx)] += 1

In [52]:
# Save and load
# np.save("data/author_adj_mat.npy", author_adj_mat, allow_pickle=True)

# with open("data/author_adj_mat.npy", "rb") as f:
#     author_adj_mat = np.load(f, allow_pickle=True)

## Create adjacency matrix

In [62]:
import matplotlib.pyplot as plt
import networkx as nx

from pyvis.network import Network

In [60]:
# nx.from_numpy_matrix was removed in NetworkX 3.0 and np.matrix is no longer
# recommended; from_numpy_array builds the same weighted directed graph directly
# from the 2-D array. np.asarray also accepts the matrix once it has been
# wrapped in a labelled DataFrame.
G = nx.from_numpy_array(np.asarray(author_adj_mat), create_using=nx.DiGraph)
# layout = nx.spring_layout(G)
# nx.draw(G, layout)
# nx.draw_networkx_edge_labels(G, pos=layout)
# plt.show()

In [63]:
# Build a pyvis network for the author graph
nt = Network('500px', '500px')
# populates the nodes and edges data structures
nt.from_nx(G)
# Render to nx.html.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt --
# presumably the full author graph is too large to render this way; confirm.
nt.show('nx.html')

KeyboardInterrupt: 

In [66]:
# Wrap the matrix in a DataFrame whose rows and columns are labelled with the sorted author names
author_adj_mat = pd.DataFrame(author_adj_mat, index=unique_elem, columns=unique_elem)

In [68]:
# Persist the labelled matrix as CSV (to_csv writes the index by default, so
# author names appear both in the header row and in the first column)
author_adj_mat.to_csv("data/author_adj_mat.csv")

## Scratch

In [33]:
# Inspect the author string stored under label 0
bioarxiv_authors[0]

'Crowl, Sam and Coleman, Maeve Bella and Chaphiv, Andrew and Naegle, Kristen M.'

In [38]:
# Split on " and " (with surrounding spaces): a bare "and" would also cut
# through any name that contains those letters (e.g. "Alexander")
[author.strip() for author in bioarxiv_authors[0].split(" and ")]

['Crowl, Sam', 'Coleman, Maeve Bella', 'Chaphiv, Andrew', 'Naegle, Kristen M.']

In [None]:
# Hand-written example: "Last, First" names (first line) and the
# corresponding "Last Initials" form (second line)
['Crowl, Sam', 'Coleman, Maeve Bella', 'Chaphiv, Andrew John', 'Naegle, Kristen M.']
['Crowl S', 'Coleman MB', 'Chaphiv AJ', 'Naegle KM']

In [39]:
def convert_name_format(name):
    """Turn ``"Last, First Middle"`` into ``"Last FM"``.

    ``name`` must contain exactly one ``", "`` separator; anything else makes
    the two-way unpacking fail with ``ValueError``.
    """
    surname, given_names = name.split(', ')
    # One initial per whitespace-separated given name
    letters = [word[0] for word in given_names.split()]
    initials = ''.join(letters)
    return f"{surname} {initials}"

# Sample names with one given name, two given names, and an abbreviated middle name
test_authors = ['Crowl, Sam', 'Coleman, Maeve Bella', 'Chaphiv, Andrew John', 'Naegle, Kristen M.']
formatted_authors = [convert_name_format(name) for name in test_authors]

# Show the converted names
print(formatted_authors)

['Crowl S', 'Coleman MB', 'Chaphiv AJ', 'Naegle KM']


In [None]:
def clean_special_characters(name):
    r"""Replace the LaTeX escapes ``{\'a}`` and ``{\~n}`` found in ``name``.

    ``{\'a}`` becomes ``a`` and ``{\~n}`` becomes ``~n``; all other text is
    returned unchanged.
    """
    # (pattern, replacement) pairs, applied in order
    substitutions = (
        (r'\{\\\'a\}', 'a'),
        (r'\{\\~n\}', '~n'),
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name

def convert_name_format(name):
    """Clean LaTeX escapes, then turn ``"Last, First Middle"`` into ``"Last FM"``.

    After cleaning, the name must contain exactly one ``", "`` separator;
    otherwise the unpacking raises ``ValueError``.
    """
    cleaned = clean_special_characters(name)
    surname, given_names = cleaned.split(', ')
    # One initial per whitespace-separated given name
    initials = ''.join(word[0] for word in given_names.split())
    return f"{surname} {initials}"

def convert_bioarxiv_names(row):
    """Split an ``"A and B and ..."`` author string and convert every name.

    Returns a list with one ``convert_name_format`` result per author.
    """
    # Split on " and " (with spaces): a bare "and" would also cut through
    # names that merely contain those letters
    stripped_row = [author.strip() for author in row.split(" and ")]
    return [convert_name_format(name) for name in stripped_row]

In [73]:
import re

# Sample author string mixing plain names with names that contain the
# LaTeX escapes {\'a} and {\~n}
authors_string = "Brandt, I and Meyniel, Florent and Ochoa, David and {\\'a}Carlo, Bob and O'Neil, Anne {\\'a}Marie and {\\~n}Gomez, Pedro and Garcia, Maria {\\~n}Luisa"

def clean_special_characters(name):
    r"""Replace the LaTeX escapes ``{\'a}`` and ``{\~n}`` found in ``name``.

    ``{\'a}`` becomes ``a`` and ``{\~n}`` becomes ``~n``; all other text is
    returned unchanged.
    """
    # (pattern, replacement) pairs, applied in order
    substitutions = (
        (r'\{\\\'a\}', 'a'),
        (r'\{\\~n\}', '~n'),
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name

def convert_name_format(name):
    """Clean LaTeX escapes, then turn ``"Last, First Middle"`` into ``"Last FM"``.

    After cleaning, the name must contain exactly one ``", "`` separator;
    otherwise the unpacking raises ``ValueError``.
    """
    cleaned = clean_special_characters(name)
    surname, given_names = cleaned.split(', ')
    # One initial per whitespace-separated given name
    letters = [word[0] for word in given_names.split()]
    return f"{surname} {''.join(letters)}"

# Split the authors string using " and " as a delimiter
parts = authors_string.split(' and ')

# Re-join pieces that were split incorrectly: consecutive pieces without a
# comma are treated as fragments of one entry that itself contains " and "
author_list = []
pending = []
for part in parts:
    if ',' in part:
        if pending:
            # Joining a list avoids both a leading " and " on the entry and
            # str.lstrip(" and "), which strips a set of characters rather
            # than a prefix
            author_list.append(' and '.join(pending))
            pending = []
        author_list.append(part)
    else:
        pending.append(part)

# Add the trailing fragments, if any
if pending:
    author_list.append(' and '.join(pending))

# Convert all the names in the list
formatted_authors = [convert_name_format(name) for name in author_list]

# Print the result
print(formatted_authors)


['Brandt I', 'Meyniel F', 'Ochoa D', 'aCarlo B', "O'Neil Aa", '~nGomez P', 'Garcia M~']


In [None]:
# Print the label, content and error of every row that fails to convert.
# The Series label is reported rather than the position from enumerate(),
# because the label is what `.drop(labels=[...])` needs and the two differ
# once rows have been dropped.
for label, row in bioarxiv_authors.items():
    try:
        convert_bioarxiv_names(row)
    except Exception as e:  # deliberately broad: this is a diagnostic scan
        print('Error at index {}: {!r}'.format(label, row))
        print(e)

In [None]:
# Experiment: split at every ", " that follows a word character and precedes
# a capital letter followed by lowercase letters
re.split(r'(?<=\w), (?=[A-Z][a-z]+)', bioarxiv_authors[25])

In [105]:
import re

# Sample author string in the bracketed, comma-separated "First [Middle] Last" form
authors_string = "[Boris Sedlak, Victor Casamayor Pujol, Praveen Kumar Donta, Schahram Dustdar]"

def clean_special_characters(name):
    r"""Replace the LaTeX escapes ``{\'a}`` and ``{\~n}`` found in ``name``.

    ``{\'a}`` becomes ``a`` and ``{\~n}`` becomes ``~n``; all other text is
    returned unchanged (the sample input in this cell contains neither).
    """
    # (pattern, replacement) pairs, applied in order
    substitutions = (
        (r'\{\\\'a\}', 'a'),
        (r'\{\\~n\}', '~n'),
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name

def convert_name_format(name):
    """Clean LaTeX escapes, then turn ``"First Middle Last"`` into ``"Last FM"``.

    Unlike the ``"Last, First"`` variants defined in earlier cells under the
    same name, this version treats the final whitespace-separated word as the
    last name and every preceding word as a given name.
    """
    words = clean_special_characters(name).split()
    surname = words[-1]
    # One initial for each word before the last name
    initials = ''.join(word[0] for word in words[:-1])
    return f"{surname} {initials}"

# Remove any leading/trailing "[" or "]" characters, then split on ", "
author_list = authors_string.strip("[]").split(", ")

# Convert every name with the convert_name_format defined in this cell
formatted_authors = [convert_name_format(name) for name in author_list]

# Join the formatted names with ", "
result = ', '.join(formatted_authors)

# Show the result
print(result)


Sedlak B, Pujol VC, Donta PK, Dustdar S
