In [1]:
import numpy as np
import pandas as pd
import pickle
import time
from scholarly import scholarly
import folium
import spacy

import seaborn as sns
import matplotlib.pyplot as plt

import json

In [2]:
DATA_FOLDER = '../../data/'

# Plot Organizations

In [3]:
author_organizations_df = pd.read_parquet(f'{DATA_FOLDER}author_organizations.parquet')

In [4]:
author_organizations_df

Unnamed: 0,author_id,organization
0,zkBXb_kAAAAJ,Biomedical Informatics
1,zkBXb_kAAAAJ,Shandong University
2,EHvA-IUAAAAJ,Tianjin University
3,EHvA-IUAAAAJ,Tsinghua University
4,EHvA-IUAAAAJ,City University of Hong Kong
...,...,...
111,1wloHDIAAAAJ,City University of Hong Kong
112,Tc_U_9YAAAAJ,Amazon.com
113,jV50Ks8AAAAJ,"Biostatistics, University of Michigan"
114,QVJvfz8AAAAJ,Computer Science and Engineeing


In [5]:
organizations_df = pd.read_pickle(f'{DATA_FOLDER}organizations_with_location.pickle')

In [6]:
organizations_df

Unnamed: 0,organization,location
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2..."
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市..."
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005..."
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40..."
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ..."
...,...,...
105,Data Scientist,
106,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central..."
107,"Biostatistics, University of Michigan",
108,Computer Science and Engineeing,


## Inspect organizations where the location is missing 

In [7]:
sum(organizations_df['location'].isna())

31

In [8]:
organizations_df[organizations_df['location'].isna()]

Unnamed: 0,organization,location
11,GrainGenes / Adj,
15,College of Mechanics and Materials,
17,Bioinformatics Research Group,
19,zhujiang hospital of southern medical university,
22,Department of Thyroid,
28,Obstetrics and Gynecology Hospital of Fudan Un...,
30,University of Science and Technology Liaoning,
31,Shanghai University of Medicine and Health Sci...,
33,"Biochemistry, Clemson University",
36,Central South University,


The records without a location can be summarised into three groups:

1. Records that are not organizations, e.g. `Data Scientist`

2. Records that are departments, e.g. `Computer Science and Engineeing`

3. Records that contain both the department and the organization, e.g. `Biostatistics, University of Michigan`

At this stage, the records with missing locations are dropped. Adding data cleaning further up the processing pipeline will address this in the future.

In [9]:
# Keep only the organizations that have a location (rebinds the name instead of mutating in place).
organizations_df = organizations_df.dropna(subset=['location'])

## Plot organizations

In [10]:
def plot_organizations(organizations, locations):
    """Build a world map with one marker per organization.

    Parameters
    ----------
    organizations : iterable of str
        Organization names; each becomes the clickable summary of its popup.
    locations : iterable
        Objects paired positionally with ``organizations``. Each must expose
        ``latitude``, ``longitude`` and ``address`` attributes.

    Returns
    -------
    folium.Map
        Map centred on (0, 0) at zoom level 2 with one marker per pair.
    """
    # Local import keeps this cell self-contained (no change to the import cell needed).
    from html import escape

    affiliation_map = folium.Map(location=[0, 0], zoom_start=2)

    for org, loc in zip(organizations, locations):
        # Names and addresses are free-form text; escape them so characters
        # such as '&', '<' or '>' cannot corrupt the popup markup.
        popup_html = (
            f'<details><summary>{escape(str(org))}</summary>'
            f'<p>{escape(str(loc.address))}</p></details>'
        )
        folium.Marker(
            [loc.latitude, loc.longitude],
            popup=popup_html
        ).add_to(affiliation_map)

    return affiliation_map

In [11]:
map_object = plot_organizations(organizations_df.organization.values, organizations_df.location.values)
map_object.save('organizations_map.html')  # Save to an HTML file

## Plot organizations and show authors

In [12]:
author_organizations_df = pd.read_parquet(f'{DATA_FOLDER}author_organizations.parquet')

In [13]:
author_organizations_df

Unnamed: 0,author_id,organization
0,zkBXb_kAAAAJ,Biomedical Informatics
1,zkBXb_kAAAAJ,Shandong University
2,EHvA-IUAAAAJ,Tianjin University
3,EHvA-IUAAAAJ,Tsinghua University
4,EHvA-IUAAAAJ,City University of Hong Kong
...,...,...
111,1wloHDIAAAAJ,City University of Hong Kong
112,Tc_U_9YAAAAJ,Amazon.com
113,jV50Ks8AAAAJ,"Biostatistics, University of Michigan"
114,QVJvfz8AAAAJ,Computer Science and Engineeing


In [14]:
# inner join
merged_df = pd.merge(organizations_df, author_organizations_df, on='organization', how='inner')

In [15]:
merged_df

Unnamed: 0,organization,location,author_id
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...",zkBXb_kAAAAJ
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市...",zkBXb_kAAAAJ
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",EHvA-IUAAAAJ
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40...",EHvA-IUAAAAJ
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",EHvA-IUAAAAJ
...,...,...,...
79,Sun Yat-sen University,"(中山大学广州校区南校园, 135, 新港西路, 旧凤凰, 新港街道, 海珠区, 广州市, ...",pu5CdXoAAAAJ
80,jilin university,"(吉林大学（朝阳校区）, 西朝阳南胡同, 清和街道, 朝阳区, 长春市, 绿园区, 吉林省,...",MeSogXgAAAAJ
81,Amazon,"(Amazon, Careiro da Várzea, Região Geográfica ...",7PVmb8MAAAAJ
82,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central...",Tc_U_9YAAAAJ


In [16]:
# Load the raw author records (presumably written by an earlier pipeline step - TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at pickles generated by a trusted source.
with open(f'{DATA_FOLDER}authors.pickle', 'rb') as handle:
    authors = pickle.load(handle)

In [17]:
authors_df = pd.DataFrame(authors)

In [18]:
authors_df

Unnamed: 0,author_id,name,affiliation
0,zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
1,EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
2,5RoxYhkAAAAJ,Jianjun Tan,北京工业大学
3,ap3FfWEAAAAJ,Caitlin Simopoulos,Roche
4,uxiJL_cAAAAJ,Hibah Shaath,Hamad Bin Khalifa University
...,...,...,...
104,jV50Ks8AAAAJ,Zongrui Dai,"Master Student in Biostatistics, University of..."
105,clJGV9UAAAAJ,Marwa Matboli Sayed,Professor of medical biochemistry and molecula...
106,AEaAOCQAAAAJ,Faroza Shamsheem,Assistant professor
107,QVJvfz8AAAAJ,TUNGA ARUNDHATHI Assistant Professor,Assistant Professor in Computer Science and En...


In [19]:
merged_df = pd.merge(merged_df, authors_df, on='author_id', how='inner')

In [20]:
merged_df

Unnamed: 0,organization,location,author_id,name,affiliation
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...",zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市...",zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
...,...,...,...,...,...
79,Sun Yat-sen University,"(中山大学广州校区南校园, 135, 新港西路, 旧凤凰, 新港街道, 海珠区, 广州市, ...",pu5CdXoAAAAJ,Yunfang Yu,"Sun Yat-sen Memorial Hospital, Sun Yat-sen Uni..."
80,jilin university,"(吉林大学（朝阳校区）, 西朝阳南胡同, 清和街道, 朝阳区, 长春市, 绿园区, 吉林省,...",MeSogXgAAAAJ,Nan Sheng (盛楠),jilin university
81,Amazon,"(Amazon, Careiro da Várzea, Região Geográfica ...",7PVmb8MAAAAJ,Rujira Achawanantakun,"Research Scientist, Amazon"
82,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central...",Tc_U_9YAAAAJ,Yuan Zhang,Applied Science Manager at Amazon.com


In [21]:
# Disambiguate the generic 'name' column as the author's name (rebinds instead of mutating in place).
merged_df = merged_df.rename(columns={'name': 'author_name'})

In [22]:
organization_author_df = merged_df[['organization', 'author_id', 'author_name']].drop_duplicates()

In [23]:
def plot_organizations_authors(organizations, locations):
    """Build a world map whose popups list the authors of each organization.

    Author names are looked up in the notebook-level ``organization_author_df``
    frame (columns ``organization`` and ``author_name`` are read).

    Parameters
    ----------
    organizations : iterable of str
        Organization names; each becomes the clickable summary of its popup.
    locations : iterable
        Objects paired positionally with ``organizations``. Each must expose
        ``latitude`` and ``longitude`` attributes.

    Returns
    -------
    folium.Map
        Map centred on (0, 0) at zoom level 2 with one marker per pair.
    """
    # Local import keeps this cell self-contained (no change to the import cell needed).
    from html import escape

    m = folium.Map(location=[0, 0], zoom_start=2)
    org_column = organization_author_df['organization']

    for org, loc in zip(organizations, locations):
        # Select with a boolean mask rather than an f-string .query(): a name
        # containing a double quote or backslash would break the query expression.
        author_names = organization_author_df.loc[org_column == org, 'author_name']

        # Escape free-form text so it cannot corrupt the popup markup.
        authors_html = ''.join(f'{escape(str(name))}<br>' for name in author_names)

        folium.Marker(
            [loc.latitude, loc.longitude],
            popup=f'<details><summary>{escape(str(org))}</summary><p>{authors_html}</p></details>'
        ).add_to(m)

    return m

In [24]:
map_object  = plot_organizations_authors(organizations_df.organization.values, organizations_df.location.values)

In [25]:
map_object.save('organizations_authors_map.html')  # Save to an HTML file

## Plot organizations and show papers

In [26]:
# Load the raw paper records (presumably written by an earlier pipeline step - TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at pickles generated by a trusted source.
with open(f'{DATA_FOLDER}papers.pickle', 'rb') as handle:
    papers = pickle.load(handle)

In [27]:
papers_df = pd.DataFrame(papers)

In [28]:
papers_df

Unnamed: 0,title,abstract,year,url,author_id
0,Predicting lncRNA-protein interactions by mach...,"Here, we aim to provide a review of machine-le...",2020,Unknown,[zkBXb_kAAAAJ]
1,Recent advances in predicting protein-lncRNA i...,classified into the deep learning-based method...,2022,Unknown,"[, , , EHvA-IUAAAAJ]"
2,Recent advances in machine learning methods fo...,machine learning prediction models of LDAs. Fi...,2022,https://www.frontiersin.org/articles/10.3389/f...,"[5RoxYhkAAAAJ, , , ]"
3,Prediction of plant lncRNA by ensemble machine...,Multiple machine learning approaches to lncRNA...,2018,https://link.springer.com/article/10.1186/s128...,"[ap3FfWEAAAAJ, , ]"
4,Machine learning-based integration develops an...,related lncRNAs remains largely unexplored. In...,2022,https://www.nature.com/articles/s41467-022-284...,"[, , , , , ]"
...,...,...,...,...,...
95,Machine-Learning-Based identification of key f...,330500) was used to assess the differential ex...,2024,Unknown,"[clJGV9UAAAAJ, , , ]"
96,Prediction of ncRNA from RNA-Seq data using ma...,ncRNAs or lncRNAs. By classifying coding and l...,2023,Unknown,"[AEaAOCQAAAAJ, QVJvfz8AAAAJ]"
97,A classification model for lncRNA and mRNA bas...,"For these four machine learning algorithms, we...",2019,https://link.springer.com/article/10.1186/s128...,"[, , , , , ]"
98,Integrating multiple machine learning algorith...,"from TCGA-STAD, we identified 26 prognostic ln...",2023,https://www.frontiersin.org/articles/10.3389/f...,"[, , , , , , ]"


`author_id` is a list in `papers_df`, so we generate an author-to-paper map with one row per (title, author) pair.

In [29]:
def generate_author_paper_map(titles, author_ids):
    """Flatten per-paper author-ID lists into one row per (title, author_id).

    Parameters
    ----------
    titles : iterable of str
        Paper titles.
    author_ids : iterable of iterable
        For each title (paired positionally), the author IDs of that paper.
        Empty strings, whitespace-only strings and non-string entries are
        treated as "unknown author" placeholders and skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``author_id``; papers with no usable author ID
        contribute no rows.
    """
    # A single, consistent blank test is applied to every ID. Previously a
    # whitespace-only ID was emitted whenever its paper had at least one real ID.
    pairs = [
        (title, author_id)
        for title, ids in zip(titles, author_ids)
        for author_id in ids
        if isinstance(author_id, str) and author_id.strip()
    ]

    data = {
        'title': [title for title, _ in pairs],
        'author_id': [author_id for _, author_id in pairs],
    }

    return pd.DataFrame.from_dict(data)
            

In [30]:
author_paper_df = generate_author_paper_map(papers_df['title'].values, papers_df['author_id'].values)

In [31]:
author_paper_df

Unnamed: 0,title,author_id
0,Predicting lncRNA-protein interactions by mach...,zkBXb_kAAAAJ
1,Recent advances in predicting protein-lncRNA i...,EHvA-IUAAAAJ
2,Recent advances in machine learning methods fo...,5RoxYhkAAAAJ
3,Prediction of plant lncRNA by ensemble machine...,ap3FfWEAAAAJ
4,Long non-coding RNA and RNA-binding protein in...,uxiJL_cAAAAJ
...,...,...
112,Machine-Learning-Based identification of key f...,clJGV9UAAAAJ
113,Prediction of ncRNA from RNA-Seq data using ma...,AEaAOCQAAAAJ
114,Prediction of ncRNA from RNA-Seq data using ma...,QVJvfz8AAAAJ
115,Linc2function: A Comprehensive Pipeline and We...,BP702PgAAAAJ


Check for duplicates

In [32]:
sum(author_paper_df.duplicated())

0

Good, there aren't any.

In [33]:
organization_author_df

Unnamed: 0,organization,author_id,author_name
0,Biomedical Informatics,zkBXb_kAAAAJ,Zhi-Ping Liu
1,Shandong University,zkBXb_kAAAAJ,Zhi-Ping Liu
2,Tianjin University,EHvA-IUAAAAJ,Pufeng Du
3,Tsinghua University,EHvA-IUAAAAJ,Pufeng Du
4,City University of Hong Kong,EHvA-IUAAAAJ,Pufeng Du
...,...,...,...
79,Sun Yat-sen University,pu5CdXoAAAAJ,Yunfang Yu
80,jilin university,MeSogXgAAAAJ,Nan Sheng (盛楠)
81,Amazon,7PVmb8MAAAAJ,Rujira Achawanantakun
82,Amazon.com,Tc_U_9YAAAAJ,Yuan Zhang


In [34]:
def plot_organizations_papers(organizations, locations):
    """Build a world map whose popups list the papers linked to each organization.

    An organization's papers are those of its authors: author IDs are looked up
    in the notebook-level ``organization_author_df`` frame and matched against
    the notebook-level ``author_paper_df`` frame.

    Parameters
    ----------
    organizations : iterable of str
        Organization names; each becomes the clickable summary of its popup.
    locations : iterable
        Objects paired positionally with ``organizations``. Each must expose
        ``latitude`` and ``longitude`` attributes.

    Returns
    -------
    folium.Map
        Map centred on (0, 0) at zoom level 2 with one marker per pair.
    """
    # Local import keeps this cell self-contained (no change to the import cell needed).
    from html import escape

    m = folium.Map(location=[0, 0], zoom_start=2)

    for org, loc in zip(organizations, locations):
        # Boolean masks instead of f-string .query(): values containing quotes
        # or backslashes would break the query expression.
        author_ids = organization_author_df.loc[
            organization_author_df['organization'] == org, 'author_id'
        ]
        titles = author_paper_df.loc[
            author_paper_df['author_id'].isin(author_ids), 'title'
        ]

        # De-duplicate while keeping first-seen order so the popup content is
        # identical from run to run (a set has no stable ordering for strings).
        unique_titles = list(dict.fromkeys(titles))

        # Escape free-form text so it cannot corrupt the popup markup.
        papers_html = ''.join(f'{escape(str(title))}<br>' for title in unique_titles)

        folium.Marker(
            [loc.latitude, loc.longitude],
            popup=f'<details><summary>{escape(str(org))}</summary><p>{papers_html}</p></details>'
        ).add_to(m)

    return m

In [35]:
map_object  = plot_organizations_papers(organizations_df.organization.values, organizations_df.location.values)

In [36]:
map_object.save('organizations_papers_map.html')  # Save to an HTML file

## Plot papers and show similar papers

In [37]:
organization_author_location_df = pd.merge(organizations_df, organization_author_df, on='organization', how='inner')

In [38]:
organization_author_location_df

Unnamed: 0,organization,location,author_id,author_name
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...",zkBXb_kAAAAJ,Zhi-Ping Liu
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市...",zkBXb_kAAAAJ,Zhi-Ping Liu
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",EHvA-IUAAAAJ,Pufeng Du
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40...",EHvA-IUAAAAJ,Pufeng Du
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",EHvA-IUAAAAJ,Pufeng Du
...,...,...,...,...
79,Sun Yat-sen University,"(中山大学广州校区南校园, 135, 新港西路, 旧凤凰, 新港街道, 海珠区, 广州市, ...",pu5CdXoAAAAJ,Yunfang Yu
80,jilin university,"(吉林大学（朝阳校区）, 西朝阳南胡同, 清和街道, 朝阳区, 长春市, 绿园区, 吉林省,...",MeSogXgAAAAJ,Nan Sheng (盛楠)
81,Amazon,"(Amazon, Careiro da Várzea, Região Geográfica ...",7PVmb8MAAAAJ,Rujira Achawanantakun
82,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central...",Tc_U_9YAAAAJ,Yuan Zhang


In [39]:
organization_paper_location_df = pd.merge(author_paper_df, organization_author_location_df, on='author_id', how='inner')

In [40]:
organization_paper_location_df

Unnamed: 0,title,author_id,organization,location,author_name
0,Predicting lncRNA-protein interactions by mach...,zkBXb_kAAAAJ,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...",Zhi-Ping Liu
1,Predicting lncRNA-protein interactions by mach...,zkBXb_kAAAAJ,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市...",Zhi-Ping Liu
2,Recent advances in predicting protein-lncRNA i...,EHvA-IUAAAAJ,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",Pufeng Du
3,Recent advances in predicting protein-lncRNA i...,EHvA-IUAAAAJ,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40...",Pufeng Du
4,Recent advances in predicting protein-lncRNA i...,EHvA-IUAAAAJ,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",Pufeng Du
...,...,...,...,...,...
87,LncRNA-ID: Long non-coding RNA IDentification ...,1wloHDIAAAAJ,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",Yanni Sun
88,LncRNA-ID: Long non-coding RNA IDentification ...,Tc_U_9YAAAAJ,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central...",Yuan Zhang
89,PRPI-SC: an ensemble deep learning model for p...,B6LqguIAAAAJ,Jomo Kenyatta University of Agriculture and Te...,(Jomo Kenyatta University of Agriculture and T...,"Jael Sanyanda Wekesa, PhD"
90,Linc2function: A Comprehensive Pipeline and We...,BP702PgAAAAJ,Monash University,"(Monash University, Mile Lane, International H...",Yashpal Ramakrishnaiah


This creates multiple records when a paper is authored by multiple people.

To resolve this, we drop the author-related columns, which were only needed to join papers to organizations, and then remove the duplicate (title, organization) rows.

In [41]:
organization_paper_location_df = organization_paper_location_df.drop(columns=['author_name', 'author_id'])

In [42]:
# Keep one row per (title, organization) pair (rebinds the name instead of mutating in place).
organization_paper_location_df = organization_paper_location_df.drop_duplicates(
    subset=['title', 'organization']
)

In [43]:
organization_paper_location_df

Unnamed: 0,title,organization,location
0,Predicting lncRNA-protein interactions by mach...,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2..."
1,Predicting lncRNA-protein interactions by mach...,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市..."
2,Recent advances in predicting protein-lncRNA i...,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005..."
3,Recent advances in predicting protein-lncRNA i...,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40..."
4,Recent advances in predicting protein-lncRNA i...,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ..."
...,...,...,...
87,LncRNA-ID: Long non-coding RNA IDentification ...,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ..."
88,LncRNA-ID: Long non-coding RNA IDentification ...,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central..."
89,PRPI-SC: an ensemble deep learning model for p...,Jomo Kenyatta University of Agriculture and Te...,(Jomo Kenyatta University of Agriculture and T...
90,Linc2function: A Comprehensive Pipeline and We...,Monash University,"(Monash University, Mile Lane, International H..."


Save Organization/Paper/Location dataframe

In [44]:
# Persist the de-duplicated title/organization/location frame under DATA_FOLDER.
# The highest pickle protocol available to this interpreter is requested explicitly.
# NOTE(review): pickle is presumably used because `location` holds Python objects
# rather than plain values - TODO confirm.
with open(f'{DATA_FOLDER}organization_paper_location_df.pickle', 'wb') as handle:
    pickle.dump(organization_paper_location_df, handle, protocol=pickle.HIGHEST_PROTOCOL)