In [24]:
import time
from scholarly import scholarly

def search_papers(query, limit=5):
    search_query = scholarly.search_pubs(query)
    papers = []

    for _ in range(limit):
        try:
            paper = next(search_query)
            authors = []
            if "author_id" in paper:
                for author_id in paper["author_id"]:
                    time.sleep(2)  # Introduce delay
                    author = scholarly.fill(scholarly.search_author_id(author_id))
                    authors.append({
                        "name": author.get("name"),
                        "affiliation": author.get("affiliation", "Unknown")
                    })
            papers.append({
                "title": paper.get("bib", {}).get("title"),
                "abstract": paper.get("bib", {}).get("abstract"),
                "authors": authors,
                "year": paper.get("bib", {}).get("pub_year"),
                "url": paper.get("eprint_url", "Unknown")
            })
        except StopIteration:
            break
        except Exception as e:
            print(f"Error processing paper or author: {e}")
            continue

    return papers


In [25]:
query = "Machine Learning lncRNA"
papers = search_papers(query, limit=5)

for paper in papers:
    print(f"Title: {paper['title']}")
    print(f"Abstract: {paper['abstract']}")
    print(f"Year: {paper['year']}")
    print(f"URL: {paper['url']}")
    print("Authors:")
    for author in paper["authors"]:
        print(f"  - {author['name']}: {author['affiliation']}")
    print("-" * 80)


Error processing paper or author: 'NoneType' object has no attribute 'get'
Error processing paper or author: 'NoneType' object has no attribute 'get'
Error processing paper or author: 'NoneType' object has no attribute 'get'
Error processing paper or author: 'NoneType' object has no attribute 'get'
Title: Predicting lncRNA-protein interactions by machine learning methods: a review
Abstract: Here, we aim to provide a review of machine-learning-based methods for predicting lncRNA  of predicting lncRNA-protein interactions into a general framework of machine learning. We
Year: 2020
URL: Unknown
Authors:
  - Zhi-Ping Liu: Professor of Biomedical Informatics, Shandong University
--------------------------------------------------------------------------------


In [32]:
author

{'name': 'Zhi-Ping Liu',
 'affiliation': 'Professor of Biomedical Informatics, Shandong University'}

In [30]:
import geopandas as gpd
import folium
from geopy.geocoders import Nominatim

def geocode_affiliations(papers):
    geolocator = Nominatim(user_agent="lncRNA-map")
    locations = []

    for paper in papers:
        for author in paper["authors"]:
            affiliation = author.get("affiliation")
            if affiliation and affiliation != "Unknown":
                location = geolocator.geocode(affiliation)
                if location:
                    locations.append({
                        "paper_title": paper["title"],
                        "author_name": author["name"],
                        "affiliation": affiliation,
                        "latitude": location.latitude,
                        "longitude": location.longitude
                    })
    return locations

def plot_affiliations(locations):
    # Create a map
    affiliation_map = folium.Map(location=[0, 0], zoom_start=2)
    
    for loc in locations:
        folium.Marker(
            [loc["latitude"], loc["longitude"]],
            popup=f"{loc['author_name']} ({loc['affiliation']})<br>{loc['paper_title']}"
        ).add_to(affiliation_map)
    
    return affiliation_map

# Example usage
locations = geocode_affiliations(papers)
map_object = plot_affiliations(locations)
map_object.save("affiliations_map.html")  # Save to an HTML file


In [33]:
geolocator.geocode("Shandong University")

Location(山东大学（青岛校区）, 72, 滨海公路, 青岛蓝谷高新技术产业开发区, 即墨区, 青岛市, 山东省, 266200, 中国, (36.36553935, 120.68458591160784, 0.0))

In [34]:
geolocator.geocode("Professor of Biomedical Informatics, Shandong University")

TODO:

Need to extract the named entity of organization/university from the affilication field

In [9]:
import arxiv

def search_papers_arxiv(query, limit=20):
    """
    Search for papers on arXiv based on a query using the updated Client.results method.
    
    Args:
        query (str): Search query for arXiv (e.g., "Machine Learning lncRNA").
        limit (int): Maximum number of results to retrieve.
    
    Returns:
        List[Dict]: A list of dictionaries containing paper metadata.
    """
    client = arxiv.Client(
        page_size=limit  # Defines the number of results per API call
    )
    search = arxiv.Search(
        query=query,
        max_results=limit,
        sort_by=arxiv.SortCriterion.Relevance
    )
    
    papers = []
    for result in client.results(search):
        papers.append({
            "title": result.title,
            "abstract": result.summary,
            "institution": result.authors[0].affiliation if result.authors and result.authors[0].affiliation else "Unknown",
            "year": result.published.year,
            "url": result.entry_id
        })
    
    return papers



In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")
model_keywords = ["SVM", "Random Forest", "Neural Network", "Deep Learning", "Gradient Boosting"]

def categorize_papers(papers):
    for paper in papers:
        doc = nlp(paper["abstract"])
        paper["ml_model"] = [
            keyword for keyword in model_keywords if keyword in doc.text
        ]
    return papers


In [4]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geoapi")

def geocode_institution(papers):
    for paper in papers:
        location = geolocator.geocode(paper["institution"])
        if location:
            paper["latitude"] = location.latitude
            paper["longitude"] = location.longitude
    return papers


In [5]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

def create_geodataframe(papers):
    df = pd.DataFrame(papers)
    geometry = [Point(xy) for xy in zip(df["longitude"], df["latitude"])]
    gdf = gpd.GeoDataFrame(df, geometry=geometry)
    return gdf


In [6]:
import geopandas as gpd
import matplotlib.pyplot as plt

def plot_papers_on_map(gdf):
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    ax = world.plot(color="white", edgecolor="black")
    gdf.plot(ax=ax, color="red", markersize=5)
    plt.show()


In [11]:
query = "Machine Learning lncRNA"

In [12]:
papers = search_papers(query, limit=50)

In [26]:
papers = categorize_papers(papers)

In [10]:
query = "Machine Learning lncRNA"
papers = search_papers_arxiv(query, limit=10)
for paper in papers:
    print(f"Title: {paper['title']}")
    print(f"Abstract: {paper['abstract']}")
    print(f"Institution: {paper['institution']}")
    print(f"Year: {paper['year']}")
    print(f"URL: {paper['url']}")
    print("-" * 80)



AttributeError: 'Author' object has no attribute 'affiliation'