latitude

longitude

author_name

affiliation / organization

paper_title

In [37]:
import pandas as pd
import pickle
import time
from scholarly import scholarly
import folium

In [2]:
DATA_FOLDER = '../../data/'

# Plot Organizations

In [3]:
author_organizations_df = pd.read_parquet(f'{DATA_FOLDER}author_organizations.parquet')

In [4]:
author_organizations_df

Unnamed: 0,author_id,organization
0,zkBXb_kAAAAJ,Biomedical Informatics
1,zkBXb_kAAAAJ,Shandong University
2,EHvA-IUAAAAJ,Tianjin University
3,EHvA-IUAAAAJ,Tsinghua University
4,EHvA-IUAAAAJ,City University of Hong Kong
...,...,...
111,1wloHDIAAAAJ,City University of Hong Kong
112,Tc_U_9YAAAAJ,Amazon.com
113,jV50Ks8AAAAJ,"Biostatistics, University of Michigan"
114,QVJvfz8AAAAJ,Computer Science and Engineeing


In [5]:
organizations_df = pd.read_pickle(f'{DATA_FOLDER}organizations_with_location.pickle')

In [6]:
organizations_df

Unnamed: 0,organization,location
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2..."
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市..."
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005..."
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40..."
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ..."
...,...,...
105,Data Scientist,
106,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central..."
107,"Biostatistics, University of Michigan",
108,Computer Science and Engineeing,


### Inspect organizations where the location is missing 

In [12]:
sum(organizations_df['location'].isna())

31

In [9]:
organizations_df[organizations_df['location'].isna()]

Unnamed: 0,organization,location
11,GrainGenes / Adj,
15,College of Mechanics and Materials,
17,Bioinformatics Research Group,
19,zhujiang hospital of southern medical university,
22,Department of Thyroid,
28,Obstetrics and Gynecology Hospital of Fudan Un...,
30,University of Science and Technology Liaoning,
31,Shanghai University of Medicine and Health Sci...,
33,"Biochemistry, Clemson University",
36,Central South University,


The records without a location can be summarised into three groups:

1. Records that are not organizations e.g. Data Scientist

2. Records which are departments e.g. Computer Science and Engineeing

3. Record that contains both the department and organization e.g. Biostatistics, University of Michigan

At this stage the records with missing locations will be dropped, adding data cleaning further up the processing pipeline will address this in the future

In [13]:
organizations_df.dropna(subset=['location'], inplace=True)

Plot organizations

In [47]:
def plot_organizations(organizations, locations):
    # Create a map
    affiliation_map = folium.Map(location=[0, 0], zoom_start=2)
    
    for org, loc in zip(organizations, locations):
        folium.Marker(
            [loc.latitude, loc.longitude],
            popup=f'{org}'
        ).add_to(affiliation_map)
    
    return affiliation_map

map_object = plot_organizations(organizations_df.organization.values, organizations_df.location.values)
map_object.save('organizations_map.html')  # Save to an HTML file

In [46]:
organizations_df['location'][0]

Location(Health & Biomedical Informatics Centre, 202-206, Berkeley Street, Carlton, Melbourne, City of Melbourne, Victoria, 3053, Australia, (-37.8006938, 144.9588263, 0.0))

In [41]:
organizations_df['organization'][0]

'Biomedical Informatics'

In [42]:
dir(organizations_df['location'][0])

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_address',
 '_point',
 '_raw',
 '_tuple',
 'address',
 'altitude',
 'latitude',
 'longitude',
 'point',
 'raw']

In [43]:
organizations_df['location'][0].address

'Health & Biomedical Informatics Centre, 202-206, Berkeley Street, Carlton, Melbourne, City of Melbourne, Victoria, 3053, Australia'

### Lets add the authors to organizations

In [20]:
author_organizations_df = pd.read_parquet(f'{DATA_FOLDER}author_organizations.parquet')

In [21]:
author_organizations_df

Unnamed: 0,author_id,organization
0,zkBXb_kAAAAJ,Biomedical Informatics
1,zkBXb_kAAAAJ,Shandong University
2,EHvA-IUAAAAJ,Tianjin University
3,EHvA-IUAAAAJ,Tsinghua University
4,EHvA-IUAAAAJ,City University of Hong Kong
...,...,...
111,1wloHDIAAAAJ,City University of Hong Kong
112,Tc_U_9YAAAAJ,Amazon.com
113,jV50Ks8AAAAJ,"Biostatistics, University of Michigan"
114,QVJvfz8AAAAJ,Computer Science and Engineeing


In [24]:
# inner join
df = pd.merge(organizations_df, author_organizations_df, on='organization', how='inner')

In [25]:
df

Unnamed: 0,organization,location,author_id
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...",zkBXb_kAAAAJ
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市...",zkBXb_kAAAAJ
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",EHvA-IUAAAAJ
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40...",EHvA-IUAAAAJ
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",EHvA-IUAAAAJ
...,...,...,...
79,Sun Yat-sen University,"(中山大学广州校区南校园, 135, 新港西路, 旧凤凰, 新港街道, 海珠区, 广州市, ...",pu5CdXoAAAAJ
80,jilin university,"(吉林大学（朝阳校区）, 西朝阳南胡同, 清和街道, 朝阳区, 长春市, 绿园区, 吉林省,...",MeSogXgAAAAJ
81,Amazon,"(Amazon, Careiro da Várzea, Região Geográfica ...",7PVmb8MAAAAJ
82,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central...",Tc_U_9YAAAAJ


In [27]:
with open(f'{DATA_FOLDER}authors.pickle', 'rb') as handle:
    authors = pickle.load(handle)

In [30]:
authors_df = pd.DataFrame(authors)

In [31]:
authors_df

Unnamed: 0,author_id,name,affiliation
0,zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
1,EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
2,5RoxYhkAAAAJ,Jianjun Tan,北京工业大学
3,ap3FfWEAAAAJ,Caitlin Simopoulos,Roche
4,uxiJL_cAAAAJ,Hibah Shaath,Hamad Bin Khalifa University
...,...,...,...
104,jV50Ks8AAAAJ,Zongrui Dai,"Master Student in Biostatistics, University of..."
105,clJGV9UAAAAJ,Marwa Matboli Sayed,Professor of medical biochemistry and molecula...
106,AEaAOCQAAAAJ,Faroza Shamsheem,Assistant professor
107,QVJvfz8AAAAJ,TUNGA ARUNDHATHI Assistant Professor,Assistant Professor in Computer Science and En...


In [32]:
df = pd.merge(df, authors_df, on='author_id', how='inner')

In [33]:
df

Unnamed: 0,organization,location,author_id,name,affiliation
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...",zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市...",zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
...,...,...,...,...,...
79,Sun Yat-sen University,"(中山大学广州校区南校园, 135, 新港西路, 旧凤凰, 新港街道, 海珠区, 广州市, ...",pu5CdXoAAAAJ,Yunfang Yu,"Sun Yat-sen Memorial Hospital, Sun Yat-sen Uni..."
80,jilin university,"(吉林大学（朝阳校区）, 西朝阳南胡同, 清和街道, 朝阳区, 长春市, 绿园区, 吉林省,...",MeSogXgAAAAJ,Nan Sheng (盛楠),jilin university
81,Amazon,"(Amazon, Careiro da Várzea, Região Geográfica ...",7PVmb8MAAAAJ,Rujira Achawanantakun,"Research Scientist, Amazon"
82,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central...",Tc_U_9YAAAAJ,Yuan Zhang,Applied Science Manager at Amazon.com


In [34]:
df.rename(columns={'name': 'author_name'}, inplace=True)

In [35]:
df

Unnamed: 0,organization,location,author_id,author_name,affiliation
0,Biomedical Informatics,"(Health & Biomedical Informatics Centre, 202-2...",zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
1,Shandong University,"(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市...",zkBXb_kAAAAJ,Zhi-Ping Liu,"Professor of Biomedical Informatics, Shandong ..."
2,Tianjin University,"(天津医科大学, 22号, 气象台路, 新兴街道, 天津市, 和平区, 天津市, 30005...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
3,Tsinghua University,"(清华大学, 30, 双清路, 东升镇, 海淀区, 北京市, 100084, 中国, (40...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
4,City University of Hong Kong,"(香港城市大學 City University of Hong Kong, 沙田區 Sha ...",EHvA-IUAAAAJ,Pufeng Du,Tianjin University; Tsinghua University; City ...
...,...,...,...,...,...
79,Sun Yat-sen University,"(中山大学广州校区南校园, 135, 新港西路, 旧凤凰, 新港街道, 海珠区, 广州市, ...",pu5CdXoAAAAJ,Yunfang Yu,"Sun Yat-sen Memorial Hospital, Sun Yat-sen Uni..."
80,jilin university,"(吉林大学（朝阳校区）, 西朝阳南胡同, 清和街道, 朝阳区, 长春市, 绿园区, 吉林省,...",MeSogXgAAAAJ,Nan Sheng (盛楠),jilin university
81,Amazon,"(Amazon, Careiro da Várzea, Região Geográfica ...",7PVmb8MAAAAJ,Rujira Achawanantakun,"Research Scientist, Amazon"
82,Amazon.com,"(Amazon.com Doppler, 2021, 7th Avenue, Central...",Tc_U_9YAAAAJ,Yuan Zhang,Applied Science Manager at Amazon.com


### Add paper_titles

In [None]:
import geopandas as gpd
import folium
from geopy.geocoders import Nominatim

#def geocode_affiliations(papers):
#    geolocator = Nominatim(user_agent="lncRNA-map")
#    locations = []

#    for paper in papers:
#        for author in paper["authors"]:
#            affiliation = author.get("affiliation")
#            if affiliation and affiliation != "Unknown":
#                location = geolocator.geocode(affiliation)
#                if location:
#                    locations.append({
#                        "paper_title": paper["title"],
#                        "author_name": author["name"],
#                        "affiliation": affiliation,
#                        "latitude": location.latitude,
#                        "longitude": location.longitude
#                    })
#    return locations



def plot_affiliations(locations):
    # Create a map
    affiliation_map = folium.Map(location=[0, 0], zoom_start=2)
    
    for loc in locations:
        folium.Marker(
            [loc["latitude"], loc["longitude"]],
            popup=f"{loc['author_name']} ({loc['affiliation']})<br>{loc['paper_title']}"
        ).add_to(affiliation_map)
    
    return affiliation_map

# Example usage
locations = geocode_affiliations(papers)
map_object = plot_affiliations(locations)
map_object.save("affiliations_map.html")  # Save to an HTML file

In [7]:
def search_papers(query, limit=5):
    search_query = scholarly.search_pubs(query)
    papers = []

    for _ in range(limit):
        try:
            paper = next(search_query)
                        
            papers.append({
                "title": paper.get("bib", {}).get("title"),
                "abstract": paper.get("bib", {}).get("abstract"),
                "year": paper.get("bib", {}).get("pub_year"),
                "url": paper.get("eprint_url", "Unknown"),
                "author_id": paper.get("author_id", [])
            })
        except StopIteration:
            break
        except Exception as e:
            print(f"Error processing paper : {e}")
            continue

    return papers

In [27]:
def search_authors(papers):
    author_ids = list()
    authors = list()
    for paper in papers:
        print
        if len(author_ids) > 10:
            break
        if "author_id" in paper:
            for author_id in paper["author_id"]:
                if author_id: # check the author_id is not empty
                    #time.sleep(2)  # Introduce delay
                    if author_id not in author_ids: # avoids duplicate searches                        
                        try:
                            print(f'Searching for author with id {author_id}...')
                            author = scholarly.search_author_id(author_id)
                            #author = scholarly.fill(scholarly.search_author_id(author_id), sections=['basics'])
                            authors.append({
                                "author_id": author_id,
                                "name": author.get("name"),
                                "affiliation": author.get("affiliation", "Unknown")
                            })
                            author_ids.append(author_id)
                        except Exception as e:
                            print(f"Error processing author with id {author_id}: {e}")
    return authors

In [39]:
papers[0]['title']

'Predicting lncRNA-protein interactions by machine learning methods: a review'

# Search for papers and save the results to disk

In [8]:
query = "Machine Learning lncRNA"

In [9]:
papers = search_papers(query, limit=100)

In [16]:
with open(f'{DATA_FOLDER}papers.pickle', 'wb') as handle:
    pickle.dump(papers, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Search for authors associated with papers and save the results to disk

In [36]:
%%timeit
authors = search_authors(papers)

Searching for author with id zkBXb_kAAAAJ...
Searching for author with id EHvA-IUAAAAJ...
Searching for author with id 5RoxYhkAAAAJ...
Searching for author with id ap3FfWEAAAAJ...
Searching for author with id uxiJL_cAAAAJ...
Searching for author with id QV_7inYAAAAJ...
Searching for author with id JNrJo8cAAAAJ...
Searching for author with id o70NT8IAAAAJ...
Searching for author with id ZeGca3cAAAAJ...
Searching for author with id o3DdNZMAAAAJ...
Searching for author with id Ydo9ResAAAAJ...
Searching for author with id 4IRe3WYAAAAJ...
Searching for author with id zkBXb_kAAAAJ...
Searching for author with id EHvA-IUAAAAJ...
Searching for author with id 5RoxYhkAAAAJ...
Searching for author with id ap3FfWEAAAAJ...
Searching for author with id uxiJL_cAAAAJ...
Searching for author with id QV_7inYAAAAJ...
Searching for author with id JNrJo8cAAAAJ...
Searching for author with id o70NT8IAAAAJ...
Searching for author with id ZeGca3cAAAAJ...
Searching for author with id o3DdNZMAAAAJ...
Searching 

KeyboardInterrupt: 

In [29]:
authors

[{'author_id': 'zkBXb_kAAAAJ',
  'name': 'Zhi-Ping Liu',
  'affiliation': 'Professor of Biomedical Informatics, Shandong University'},
 {'author_id': 'EHvA-IUAAAAJ',
  'name': 'Pufeng Du',
  'affiliation': 'Tianjin University; Tsinghua University; City University of Hong Kong'},
 {'author_id': '5RoxYhkAAAAJ', 'name': 'Jianjun Tan', 'affiliation': '北京工业大学'},
 {'author_id': 'ap3FfWEAAAAJ',
  'name': 'Caitlin Simopoulos',
  'affiliation': 'Roche'},
 {'author_id': 'uxiJL_cAAAAJ',
  'name': 'Hibah Shaath',
  'affiliation': 'Hamad Bin Khalifa University'},
 {'author_id': 'QV_7inYAAAAJ',
  'name': 'Vishnubalaji radhakrishnan',
  'affiliation': 'HBKU-QBRI, Qatar Foundation'},
 {'author_id': 'JNrJo8cAAAAJ',
  'name': 'Ramesh Elango',
  'affiliation': 'QBRI, Hamad bin Khalifa University'},
 {'author_id': 'o70NT8IAAAAJ', 'name': 'Qiqi Xie', 'affiliation': 'IUB'},
 {'author_id': 'ZeGca3cAAAAJ',
  'name': 'Halise Busra Cagirici',
  'affiliation': 'Stanford University'},
 {'author_id': 'o3DdNZMAAAAJ

In [31]:
x = scholarly.search_author_id('4IRe3WYAAAAJ')

In [32]:
x

{'container_type': 'Author',
 'filled': ['basics'],
 'scholar_id': '4IRe3WYAAAAJ',
 'source': <AuthorSource.AUTHOR_PROFILE_PAGE: 'AUTHOR_PROFILE_PAGE'>,
 'name': 'Hikmet Budak',
 'url_picture': 'https://scholar.googleusercontent.com/citations?view_op=view_photo&user=4IRe3WYAAAAJ&citpid=10',
 'affiliation': 'Arizona Western College',
 'interests': ['wheat genomics',
  'wheat genetics',
  'wheat',
  'gene editing',
  'miRNA'],
 'email_domain': '@unl.edu',
 'homepage': 'https://loop.frontiersin.org/people/59677/overview',
 'citedby': 36995}

In [34]:
y = scholarly.fill(x, sections=['basics'])

In [35]:
y

{'container_type': 'Author',
 'filled': ['basics'],
 'scholar_id': '4IRe3WYAAAAJ',
 'source': <AuthorSource.AUTHOR_PROFILE_PAGE: 'AUTHOR_PROFILE_PAGE'>,
 'name': 'Hikmet Budak',
 'url_picture': 'https://scholar.googleusercontent.com/citations?view_op=view_photo&user=4IRe3WYAAAAJ&citpid=10',
 'affiliation': 'Arizona Western College',
 'interests': ['wheat genomics',
  'wheat genetics',
  'wheat',
  'gene editing',
  'miRNA'],
 'email_domain': '@unl.edu',
 'homepage': 'https://loop.frontiersin.org/people/59677/overview',
 'citedby': 36995}

In [1]:
import pandas as pd

1. Search for articles by keywords
2. Save article and associated author in separate dataframes

Create empty dataframes

pd.DataFrame()

In [14]:
sq = scholarly.search_pubs("Machine Learning lncRNA")

In [15]:
p = next(sq)

In [16]:
p

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'Predicting lncRNA-protein interactions by machine learning methods: a review',
  'author': ['ZP Liu'],
  'pub_year': '2020',
  'venue': 'Current Bioinformatics',
  'abstract': 'Here, we aim to provide a review of machine-learning-based methods for predicting lncRNA  of predicting lncRNA-protein interactions into a general framework of machine learning. We'},
 'filled': False,
 'gsrank': 1,
 'pub_url': 'https://www.ingentaconnect.com/content/ben/cbio/2020/00000015/00000008/art00005',
 'author_id': ['zkBXb_kAAAAJ'],
 'url_scholarbib': '/scholar?hl=en&q=info:wfC5OKxQqHMJ:scholar.google.com/&output=cite&scirp=0&hl=en',
 'url_add_sclib': '/citations?hl=en&xsrf=&continue=/scholar%3Fq%3DMachine%2BLearning%2BlncRNA%26hl%3Den%26as_sdt%3D0,33&citilm=1&update_op=library_add&info=wfC5OKxQqHMJ&ei=NyxfZ_frIcyR6rQPhb_isQU&json=',
 'num_citations': 16,
 'citedby

In [17]:
list(p.keys())

['container_type',
 'source',
 'bib',
 'filled',
 'gsrank',
 'pub_url',
 'author_id',
 'url_scholarbib',
 'url_add_sclib',
 'num_citations',
 'citedby_url',
 'url_related_articles']

In [18]:
author_id = 'zkBXb_kAAAAJ'
author = scholarly.fill(scholarly.search_author_id(author_id))

In [19]:
author

{'container_type': 'Author',
 'filled': ['basics',
  'indices',
  'counts',
  'coauthors',
  'publications',
  'public_access'],
 'scholar_id': 'zkBXb_kAAAAJ',
 'source': <AuthorSource.AUTHOR_PROFILE_PAGE: 'AUTHOR_PROFILE_PAGE'>,
 'name': 'Zhi-Ping Liu',
 'affiliation': 'Professor of Biomedical Informatics, Shandong University',
 'organization': 16033752023248436093,
 'interests': ['Bioinformatics',
  'Computational Biology',
  'Systems Biomedicine',
  'Machine Learning',
  'Operations Research'],
 'email_domain': '@sdu.edu.cn',
 'citedby': 5975,
 'citedby5y': 3478,
 'hindex': 37,
 'hindex5y': 28,
 'i10index': 75,
 'i10index5y': 62,
 'cites_per_year': {2011: 34,
  2012: 109,
  2013: 261,
  2014: 328,
  2015: 378,
  2016: 407,
  2017: 458,
  2018: 442,
  2019: 432,
  2020: 445,
  2021: 577,
  2022: 685,
  2023: 621,
  2024: 713},
 'coauthors': [{'container_type': 'Author',
   'filled': [],
   'scholar_id': 'Uoqv8rkAAAAJ',
   'source': <AuthorSource.CO_AUTHORS_LIST: 'CO_AUTHORS_LIST'>,
 

----

In [20]:
p2 = next(sq)

In [21]:
p2

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'Recent advances in predicting protein-lncRNA interactions using machine learning methods',
  'author': ['H Yu', 'ZA Shen', 'YK Zhou', 'PF Du'],
  'pub_year': '2022',
  'venue': 'Current Gene Therapy',
  'abstract': 'classified into the deep learning-based method, the ensemble learning-based method,  and  learning methods. We summarized the state-of-the-art methods in predicting lncRNA-protein'},
 'filled': False,
 'gsrank': 2,
 'pub_url': 'https://www.ingentaconnect.com/content/ben/cgt/2022/00000022/00000003/art00005',
 'author_id': ['', '', '', 'EHvA-IUAAAAJ'],
 'url_scholarbib': '/scholar?hl=en&q=info:WeRErDw2Ap8J:scholar.google.com/&output=cite&scirp=1&hl=en',
 'url_add_sclib': '/citations?hl=en&xsrf=&continue=/scholar%3Fq%3DMachine%2BLearning%2BlncRNA%26hl%3Den%26as_sdt%3D0,33&citilm=1&update_op=library_add&info=WeRErDw2Ap8J&ei=NyxfZ_frIcyR6r

In [23]:
for a in p2['author_id']:
    if a:
        print(a)

EHvA-IUAAAAJ


----

In [2]:
import spacy

# Load the spaCy language model
nlp = spacy.load("en_core_web_sm")

def extract_organizations(affiliation):
    """Extracts organization names from a affiliation value of an author record."""
    organizations = []
    
    doc = nlp(affiliation)
    # Extract entities labeled as ORG (organization)
    for ent in doc.ents:
        if ent.label_ == "ORG":
            organizations.append(ent.text)
    return organizations

In [13]:
for paper in papers:
    print(f"Title: {paper['title']}")
    print(f"Abstract: {paper['abstract']}")
    print(f"Year: {paper['year']}")
    print(f"URL: {paper['url']}")
    print("Authors:")
    for author in paper["authors"]:
        print(f"  - {author['name']}: {author['affiliation']}")
    print("-" * 80)

Title: Predicting lncRNA-protein interactions by machine learning methods: a review
Abstract: Here, we aim to provide a review of machine-learning-based methods for predicting lncRNA  of predicting lncRNA-protein interactions into a general framework of machine learning. We
Year: 2020
URL: Unknown
Authors:
  - Zhi-Ping Liu: Professor of Biomedical Informatics, Shandong University
--------------------------------------------------------------------------------
Title: Recent advances in predicting protein-lncRNA interactions using machine learning methods
Abstract: classified into the deep learning-based method, the ensemble learning-based method,  and  learning methods. We summarized the state-of-the-art methods in predicting lncRNA-protein
Year: 2022
URL: Unknown
Authors:
  - Pufeng Du: Tianjin University; Tsinghua University; City University of Hong Kong
--------------------------------------------------------------------------------
Title: Recent advances in machine learning methods 

In [28]:
for paper in papers:
    print(f"Title: {paper['title']}")
#    print(f"Abstract: {paper['abstract']}")
#    print(f"Year: {paper['year']}")
#    print(f"URL: {paper['url']}")
    print("Authors:")
    for author in paper["authors"]:
        organizations = extract_organizations(author['affiliation'])
        org = ""
        if len(organizations) > 0:
            org = organizations[0]
            
        print(f"  - {author['name']}: {author['affiliation']}: {organizations}")
        #print(f"  - {author['name']}: {org}")
    print("-" * 80)

Title: Predicting lncRNA-protein interactions by machine learning methods: a review
Authors:
  - Zhi-Ping Liu: Professor of Biomedical Informatics, Shandong University: ['Biomedical Informatics', 'Shandong University']
--------------------------------------------------------------------------------
Title: Recent advances in predicting protein-lncRNA interactions using machine learning methods
Authors:
  - Pufeng Du: Tianjin University; Tsinghua University; City University of Hong Kong: ['Tianjin University', 'Tsinghua University', 'City University of Hong Kong']
--------------------------------------------------------------------------------
Title: Recent advances in machine learning methods for predicting LncRNA and disease associations
Authors:
  - Jianjun Tan: 北京工业大学: []
--------------------------------------------------------------------------------
Title: Prediction of plant lncRNA by ensemble machine learning classifiers
Authors:
  - Caitlin Simopoulos: Roche: ['Roche']
---------

In [29]:
import geopandas as gpd
import folium
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="lncRNA-map")

In [32]:
geolocator.geocode("IUB")

Location(Iub, Dollo, ሶማሌ ክልል / Somali, ኢትዮጵያ, (8.23333, 45.68333, 0.0))

In [32]:
author

{'name': 'Zhi-Ping Liu',
 'affiliation': 'Professor of Biomedical Informatics, Shandong University'}

In [30]:
import geopandas as gpd
import folium
from geopy.geocoders import Nominatim

def geocode_affiliations(papers):
    geolocator = Nominatim(user_agent="lncRNA-map")
    locations = []

    for paper in papers:
        for author in paper["authors"]:
            affiliation = author.get("affiliation")
            if affiliation and affiliation != "Unknown":
                location = geolocator.geocode(affiliation)
                if location:
                    locations.append({
                        "paper_title": paper["title"],
                        "author_name": author["name"],
                        "affiliation": affiliation,
                        "latitude": location.latitude,
                        "longitude": location.longitude
                    })
    return locations

def plot_affiliations(locations):
    # Create a map
    affiliation_map = folium.Map(location=[0, 0], zoom_start=2)
    
    for loc in locations:
        folium.Marker(
            [loc["latitude"], loc["longitude"]],
            popup=f"{loc['author_name']} ({loc['affiliation']})<br>{loc['paper_title']}"
        ).add_to(affiliation_map)
    
    return affiliation_map

# Example usage
locations = geocode_affiliations(papers)
map_object = plot_affiliations(locations)
map_object.save("affiliations_map.html")  # Save to an HTML file


In [33]:
geolocator.geocode("Shandong University")

Location(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市, 山东省, 266200, 中国, (36.36553935, 120.68458591160784, 0.0))

In [34]:
geolocator.geocode("Professor of Biomedical Informatics, Shandong University")

TODO:

Need to extract the named entity of organization/university from the affilication field

In [9]:
import arxiv

def search_papers_arxiv(query, limit=20):
    """
    Search for papers on arXiv based on a query using the updated Client.results method.
    
    Args:
        query (str): Search query for arXiv (e.g., "Machine Learning lncRNA").
        limit (int): Maximum number of results to retrieve.
    
    Returns:
        List[Dict]: A list of dictionaries containing paper metadata.
    """
    client = arxiv.Client(
        page_size=limit  # Defines the number of results per API call
    )
    search = arxiv.Search(
        query=query,
        max_results=limit,
        sort_by=arxiv.SortCriterion.Relevance
    )
    
    papers = []
    for result in client.results(search):
        papers.append({
            "title": result.title,
            "abstract": result.summary,
            "institution": result.authors[0].affiliation if result.authors and result.authors[0].affiliation else "Unknown",
            "year": result.published.year,
            "url": result.entry_id
        })
    
    return papers



In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")
model_keywords = ["SVM", "Random Forest", "Neural Network", "Deep Learning", "Gradient Boosting"]

def categorize_papers(papers):
    for paper in papers:
        doc = nlp(paper["abstract"])
        paper["ml_model"] = [
            keyword for keyword in model_keywords if keyword in doc.text
        ]
    return papers


In [4]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geoapi")

def geocode_institution(papers):
    for paper in papers:
        location = geolocator.geocode(paper["institution"])
        if location:
            paper["latitude"] = location.latitude
            paper["longitude"] = location.longitude
    return papers


In [5]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

def create_geodataframe(papers):
    df = pd.DataFrame(papers)
    geometry = [Point(xy) for xy in zip(df["longitude"], df["latitude"])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry)
    return gdf


In [6]:
import geopandas as gpd
import matplotlib.pyplot as plt

def plot_papers_on_map(gdf):
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    ax = world.plot(color="white", edgecolor="black")
    gdf.plot(ax=ax, color="red", markersize=5)
    plt.show()


In [11]:
query = "Machine Learning lncRNA"

In [12]:
papers = search_papers(query, limit=50)

In [26]:
papers = categorize_papers(papers)

In [10]:
query = "Machine Learning lncRNA"
papers = search_papers_arxiv(query, limit=10)
for paper in papers:
    print(f"Title: {paper['title']}")
    print(f"Abstract: {paper['abstract']}")
    print(f"Institution: {paper['institution']}")
    print(f"Year: {paper['year']}")
    print(f"URL: {paper['url']}")
    print("-" * 80)



AttributeError: 'Author' object has no attribute 'affiliation'