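**Purpose:** Make search better — prototype a case-insensitive title search over PubMed Central articles parsed with `pubmed_parser`, printing a short citation line and a PMC link for each hit.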

In [1]:
import os
import pandas as pd
import numpy as np
import pubmed_parser as pp
from tqdm.notebook import tqdm

In [2]:
# Ask pubmed_parser for the XML files located under the local 'data/' directory.
# NOTE(review): presumably returns a list of file paths (it is sliced and
# iterated further down) — confirm against the pubmed_parser documentation.
path_all = pp.list_xml_path('data/')

In [3]:
# Fields kept for every article, in the order they should appear in the table.
columns = [
    'full_title',
    'abstract',
    'journal',
    'pmid',
    'pmc',
    'doi',
    'publisher_id',
    'author_list',
    'affiliation_list',
    'publication_year',
    'publication_date',
    'subjects',
    'path_to_file',
]

# Start from an empty table that already carries those columns.
data = pd.DataFrame(columns=columns)

In [4]:
# Only the first 10,000 files are parsed (presumably to keep the run time of
# this exploratory notebook manageable).
path_some = path_all[:10000]

# Parse every file first and build the table in one go afterwards.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# and calling it once per row copies the whole frame on every iteration.
# Each parsed article was appended as a row before, so it is presumably a
# dict keyed by field name — TODO confirm against pubmed_parser.
records = [
    pp.parse_pubmed_xml(path, include_path = True)
    for path in tqdm(path_some)
]

# Passing `columns` keeps the declared column order (and yields an empty frame
# with those columns when nothing was parsed). Rebuilding `data` from scratch
# also means that re-running this cell no longer duplicates rows.
data = pd.DataFrame(records, columns = columns)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [10]:
def get_authors(author_list, max_authors = 2):
    """Build a short author credit such as ``'A B, C D, et al.'``.

    Parameters
    ----------
    author_list : list
        Author records. Each record is indexed as ``record[0]`` and
        ``record[1]``; the displayed name is ``record[1] + ' ' + record[0]``
        (presumably given name followed by surname — TODO confirm against the
        parser). Any further fields of a record are ignored.
    max_authors : int, optional
        Maximum number of distinct names to list (default 2, which is what the
        function always used before this parameter existed).

    Returns
    -------
    str
        The first ``max_authors`` distinct names in their original order,
        joined by ``', '``. ``', et al.'`` is appended only when more distinct
        names exist than are shown. An empty or missing list gives ``''``.
    """
    if not author_list:
        return ''

    names = []
    for record in author_list:
        # Skip missing name parts so a lone surname is not rendered as "None Smith".
        parts = [str(part).strip() for part in (record[1], record[0]) if part is not None]
        name = ' '.join(part for part in parts if part)
        if name:
            names.append(name)

    # The same person can occur in several records; dict.fromkeys removes the
    # repeats while preserving first-seen order.
    unique_names = list(dict.fromkeys(names))

    authors = ', '.join(unique_names[:max_authors])
    # Previously ', et al.' was appended unconditionally, which mislabelled
    # papers that have only one or two authors.
    if len(unique_names) > max_authors:
        authors += ', et al.'
    return authors

def get_cover(article):
    """Return the headline pair ``(title, info)`` used to display an article.

    Parameters
    ----------
    article : mapping
        Must provide the keys ``'full_title'``, ``'author_list'``,
        ``'journal'`` and ``'publication_year'`` (a DataFrame row works).

    Returns
    -------
    tuple
        ``(title, info)`` where ``title`` is ``article['full_title']``
        unchanged and ``info`` is the author credit, journal and year
        separated by single spaces.
    """
    title = article['full_title']
    authors = get_authors(article['author_list'])

    # Join only the pieces that are present. Plain concatenation left a leading
    # space when there were no authors and raised TypeError when the journal or
    # year was missing.
    info = ' '.join(
        str(part)
        for part in (authors, article['journal'], article['publication_year'])
        # `part == part` is False only for NaN, the marker pandas uses for
        # missing cells.
        if part is not None and part == part and part != ''
    )
    return title, info

def get_url(article):
    """Return the PubMed Central web address of ``article``.

    The value stored under ``article['pmc']`` is placed between the fixed PMC
    article prefix and a closing slash.
    """
    pmc_id = article['pmc']
    return 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC' + pmc_id + '/'

In [87]:
def search_title(query, articles = None, regex = True):
    """Return the articles whose title contains ``query``, ignoring case.

    Parameters
    ----------
    query : str
        Text to look for in the ``'full_title'`` column.
    articles : pandas.DataFrame, optional
        Table to search. Defaults to the notebook-level ``data`` frame, which
        is what the function always searched before this parameter existed.
    regex : bool, optional
        When True (the default, and the previous behaviour) ``query`` is
        interpreted as a regular expression. Pass False to match it literally,
        e.g. for queries that contain parentheses or other metacharacters.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``articles``, with their original index labels.
    """
    if articles is None:
        articles = data

    # na=False treats a missing title as "no match"; without it the mask
    # contains NaN and boolean indexing raises.
    matches = articles['full_title'].str.contains(
        query, case = False, regex = regex, na = False
    )
    searched = articles[matches]
    return searched

In [90]:
# Term to look for in the article titles.
query = "alzheimer's"

# Subset of the article table whose titles match the term.
searched = search_title(query)

In [91]:
# Show every hit as three lines (title, citation line, PMC link) followed by
# an empty line that separates it from the next entry.
for row_label, article in searched.iterrows():
    title, info = get_cover(article)
    url = get_url(article)
    print(title, info, url, '', sep = '\n')

Haplotype-based association analysis of the MAPT locus in Late Onset Alzheimer's disease
Odity Mukherjee, John SK Kauwe, et al. BMC Genetics 2007
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1800865/

Defective mitochondrial respiration, altered dNTP pools and reduced AP endonuclease 1 activity in peripheral blood mononuclear cells of Alzheimer's disease patients
Scott Maynard, Anne-Mette Hejl, et al. Aging (Albany NY) 2015
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4637207/

Beneficial effects of novel antagonists of GHRH in different models of Alzheimer's disease
Miklos Jaszberenyi, Ferenc G. Rick, et al. Aging (Albany NY) 2012
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3560443/

Phosphorylation of amyloid beta (Aβ) peptides – A trigger for formation of toxic aggregates in Alzheimer's disease
Sathish Kumar, Jochen Walter, et al. Aging (Albany NY) 2011
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3184981/

STARD1 and NPC1 expression as pathological markers associated with astrogli