# Data extraction

In [None]:
# Install pymed: a Python library that provides access to PubMed through the PubMed API.
# https://github.com/gijswobben/pymed

pip install pymed

In [42]:
from pymed import PubMed
import pandas as pd
import numpy as np
import csv

In [2]:
# Query form generator for pubmed: https://pubmed.ncbi.nlm.nih.gov/advanced/

# PubMed client from pymed, created with its default settings.
pubmed = PubMed()

# Search expression: articles of The Journal of Neuroscience whose publication
# date falls in the range 2011 to 2020.
my_query = """
("The Journal of neuroscience : the official journal of the Society for Neuroscience"[Journal])
AND ("2011"[Date - Publication] : "2020"[Date - Publication])
"""

# Run the search, asking for at most 15000 records.
# `results` prints as an itertools.chain object (see the next cell), i.e. a lazy
# iterator that can only be walked through once.
results = pubmed.query(my_query, max_results=15000)

In [3]:
# Print the repr of `results` to see what kind of object the query returned.
print(results)

<itertools.chain object at 0x7f1aa8dbc6d0>


- 'results' is an itertools.chain object (a lazy, single-pass iterator) that yields more than 10,000 records

In [None]:
# Write a new CSV file using 'csv.writer'
# First step: convert each result into a dictionary with its 'toDict' method
# Second step: write the dictionary's values as a new row of the .csv file by using the 'writerow' method
# Result: at every iteration, a new row will be written to the csv
# Why do this: it's faster than appending each row to a list or a dictionary...

def create_csv(iterable, filename):
    """Write every item of ``iterable`` to ``<filename>.csv``, one row per item.

    Each item is converted with its ``toDict()`` method and the values of that
    dictionary are written as a single CSV row, in the dictionary's own order.
    No header row is written. Values that are not strings are stored using the
    csv module's default conversion (``str()``; ``None`` becomes an empty field).

    Parameters
    ----------
    iterable : iterable of objects exposing ``toDict()``
        Walked through exactly once; a one-shot iterator is exhausted afterwards.
    filename : str
        Output path without extension; ``'.csv'`` is appended to it.

    Returns
    -------
    None
    """
    filename_csv = filename + '.csv'

    # newline='' is required by the csv module: without it the writer's '\r\n'
    # row terminator is translated again on some platforms (blank lines between
    # rows) and newlines embedded in quoted fields are not preserved.
    # An explicit UTF-8 encoding keeps non-ASCII text writable regardless of the
    # locale's default encoding.
    with open(filename_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        for item in iterable:
            writer.writerow(item.toDict().values())


# Dump every queried record to 'jneuro.csv' in the current working directory.
# NOTE(review): create_csv iterates `results` to the end; since `results` is a
# single-pass iterator, any later loop over it will see no items.
# NOTE(review): the dataframe cell further down reads '../data/jneuro.csv', not
# './jneuro.csv' - confirm whether the file is moved by hand or align the paths.
create_csv(results, 'jneuro')

In [4]:
# Extract column names

# `results` is a single-pass iterator: once it has been written to the CSV it is
# exhausted, and reading from it here beforehand would silently remove records
# from that CSV. Take the column names from a small, fresh query instead, so
# this cell works regardless of what has already happened to `results`.
result_dict = {}
for result in pubmed.query(my_query, max_results=10):
    result_dict = result.toDict()
    if len(result_dict) > 0:
        break

# Fail with a clear message instead of a NameError / empty header further down.
if not result_dict:
    raise RuntimeError("The PubMed query returned no usable record to read column names from.")

# Keys in the order toDict() reports them.
columns = list(result_dict)
print(columns)

['pubmed_id', 'title', 'abstract', 'keywords', 'journal', 'publication_date', 'authors', 'methods', 'conclusions', 'results', 'copyrights', 'doi', 'xml']


In [5]:
# Import the csv as a dataframe

# The CSV is written without a header row (create_csv writes only the values of
# each record), so the column names collected above are supplied via `names=`.
# NOTE(review): this reads '../data/jneuro.csv' while create_csv(results, 'jneuro')
# writes 'jneuro.csv' to the working directory - confirm the file location.
jneuro = pd.read_csv('../data/jneuro.csv', names=columns)

# Sanity check: number of rows/columns, then a preview of the first rows.
print(jneuro.shape)
jneuro.head()

(12734, 13)


Unnamed: 0,pubmed_id,title,abstract,keywords,journal,publication_date,authors,methods,conclusions,results,copyrights,doi,xml
0,32737169,"Erratum: Salido and Ramamurthy, ""Proteoglycan ...",,[],The Journal of neuroscience : the official jou...,2020-08-02,[],,,,,10.1523/JNEUROSCI.1846-20.2020,<Element 'PubmedArticle' at 0x7f18e0bb0a10>
1,32737168,"Erratum: Keitel et al., ""Medial Nucleus Accumb...",,[],The Journal of neuroscience : the official jou...,2020-08-02,[],,,,,10.1523/JNEUROSCI.1829-20.2020,<Element 'PubmedArticle' at 0x7f18e07a1050>
2,32737167,The neocortical progenitor specification progr...,Neuronal progenitors in the developing forebra...,[],The Journal of neuroscience : the official jou...,2020-08-02,"[{'lastname': 'Yabut', 'firstname': 'Odessa R'...",,,,Copyright © 2020 the authors.,10.1523/JNEUROSCI.2888-19.2020,<Element 'PubmedArticle' at 0x7f18e07d5b90>
3,32732324,Flexible coordinator and switcher hubs for ada...,Functional connectivity studies have identifie...,[],The Journal of neuroscience : the official jou...,2020-08-01,"[{'lastname': 'Cocuzza', 'firstname': 'Carrisa...",,,,Copyright © 2020 the authors.,10.1523/JNEUROSCI.2559-19.2020,<Element 'PubmedArticle' at 0x7f18e07c2a10>
4,32727820,Responses to Visual Speech in Human Posterior ...,Experimentalists studying multisensory integra...,[],The Journal of neuroscience : the official jou...,2020-07-31,"[{'lastname': 'Metzger', 'firstname': 'Brian A...",,,,Copyright © 2020 Metzger et al.,10.1523/JNEUROSCI.0279-20.2020,<Element 'PubmedArticle' at 0x7f18e07cc710>


## Observation:
- 12734 articles from 2011-01-07 to 2020-08-02
- The data is sorted by 'publication_date' (YYYY-MM-DD) in descending order by default

###   