# Parsing Files

Example based on code from Ryan Mitchell's book *Web Scraping with Python*.

In [1]:
import pdfminer # remember to pip install PDFMiner3K if this is your first time using the package
import re
import pandas as pd
import string
from collections import OrderedDict
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

In [2]:
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as a single string.

    Parameters
    ----------
    pdfFile : binary file-like object
        An open PDF stream (for example the result of ``urlopen(...)`` or
        ``open(path, 'rb')``). This function does not close it; the caller
        remains responsible for that.

    Returns
    -------
    str
        Everything pdfminer's ``TextConverter`` wrote while processing the
        document.
    """
    rsrcmgr = PDFResourceManager()
    # The context manager releases the text buffer even if parsing fails.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Always close the converter, including on a parse error.
            device.close()
        # Read the buffer before the `with` block closes it.
        return retstr.getvalue()

In [4]:
# Fetch the article PDF over HTTP. urlopen returns a binary file-like response,
# which is passed to readPDF() in the next cell and closed there.
pdfFile = urlopen("http://journals.plos.org/ploscompbiol/article/file?id=10.1371/journal.pcbi.1005871&type=printable")
# To parse a local PDF instead, open it in binary mode (replace the path with your own file):
# pdfFile = open("/Users/johnmclevey/Desktop/journal.pcbi.1003906.PDF", 'rb')

In [5]:
# Extract the text, making sure the response/file handle is released even if
# parsing raises.
try:
    outputString = readPDF(pdfFile)
finally:
    pdfFile.close()
print(outputString)

EDITORIAL

Ten simple rules for biologists learning to
program

Maureen A. Carey1, Jason A. Papin2*

1 Department of Microbiology, Immunology, and Cancer Biology, University of Virginia School of Medicine,
Charlottesville, Virginia, United States of America, 2 Department of Biomedical Engineering, University of
Virginia, Charlottesville, Virginia, United States of America

* papin@virginia.edu

Introduction

As big data and multi-omics analyses are becoming mainstream, computational proficiency
and literacy are essential skills in a biologist’s tool kit. All “omics” studies require computa-
tional biology: the implementation of analyses requires programming skills, while experimen-
tal design and interpretation require a solid understanding of the analytical approach. While
academic cores, commercial services, and collaborations can aid in the implementation of
analyses, the computational literacy required to design and interpret omics studies cannot be
replaced or supplemented. Howeve

Let's start with a naive approach: split the text on single spaces, with no cleaning yet, and extract n-grams.

In [7]:
def ngrams(input, n):
    """Return every run of ``n`` consecutive tokens found in ``input``.

    The text is split on single space characters only, so newlines and
    punctuation stay attached to their tokens and consecutive spaces produce
    empty-string tokens. The result is a list of ``n``-element lists in
    document order; it is empty when the text has fewer than ``n`` tokens.
    """
    tokens = input.split(' ')
    window_count = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(window_count)]

In [8]:
# Keep the result under its own name: assigning it back to `ngrams` would
# replace the function with a list, and this cell could not be run again.
raw_bigrams = ngrams(outputString, 2)

print("There are " + str(len(raw_bigrams)) + " 2-grams.")

There are 3207 2-grams.


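Better cleaning: collapse newlines and repeated spaces, drop bracketed citation markers such as `[12]`, remove non-ASCII characters, strip punctuation from each token, and discard single-character tokens other than "a" and "I".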

In [9]:
def cleanInput(input):
    """Normalize raw text and return it as a list of cleaned tokens.

    Steps, in order:
      1. Replace each run of newlines with a single space.
      2. Remove bracketed numeric markers such as ``[12]`` (and empty ``[]``).
      3. Collapse runs of spaces into one space.
      4. Drop every non-ASCII character.
      5. Split on spaces and strip leading/trailing punctuation per token.
      6. Keep tokens longer than one character, plus the one-letter words
         "a" and "i" in either case.

    Parameters
    ----------
    input : str
        The text to clean.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Raw strings keep the regex escapes (\[ and \]) from being read as
    # invalid Python string escapes.
    input = re.sub(r'\n+', " ", input)
    input = re.sub(r'\[[0-9]*\]', "", input)
    input = re.sub(r' +', " ", input)
    # Round-trip through bytes so that non-ASCII characters are discarded.
    input = input.encode("UTF-8").decode("ascii", "ignore")

    tokens = []
    for item in input.split(' '):
        item = item.strip(string.punctuation)
        if len(item) > 1 or item.lower() in ('a', 'i'):
            tokens.append(item)
    return tokens


Redefine the n-grams function so that it cleans the text before building the n-grams:

In [10]:
def ngrams(input, n):
    """Clean ``input`` with cleanInput() and return its n-grams.

    The result is a list of ``n``-element token lists in document order; it is
    empty when fewer than ``n`` tokens survive cleaning.
    """
    tokens = cleanInput(input)
    window_count = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(window_count)]


In [11]:
# NOTE: this rebinds the name `ngrams` from the function to its result (a list
# of two-token lists). The DataFrame cell below reads `ngrams` as that list.
# Re-running this cell fails with "'list' object is not callable" unless the
# cell that defines ngrams() is run again first.
ngrams = ngrams(outputString, 2)
ngrams

[['EDITORIAL', 'Ten'],
 ['Ten', 'simple'],
 ['simple', 'rules'],
 ['rules', 'for'],
 ['for', 'biologists'],
 ['biologists', 'learning'],
 ['learning', 'to'],
 ['to', 'program'],
 ['program', 'Maureen'],
 ['Maureen', 'A'],
 ['A', 'Carey1'],
 ['Carey1', 'Jason'],
 ['Jason', 'A'],
 ['A', 'Papin2'],
 ['Papin2', 'Department'],
 ['Department', 'of'],
 ['of', 'Microbiology'],
 ['Microbiology', 'Immunology'],
 ['Immunology', 'and'],
 ['and', 'Cancer'],
 ['Cancer', 'Biology'],
 ['Biology', 'University'],
 ['University', 'of'],
 ['of', 'Virginia'],
 ['Virginia', 'School'],
 ['School', 'of'],
 ['of', 'Medicine'],
 ['Medicine', 'Charlottesville'],
 ['Charlottesville', 'Virginia'],
 ['Virginia', 'United'],
 ['United', 'States'],
 ['States', 'of'],
 ['of', 'America'],
 ['America', 'Department'],
 ['Department', 'of'],
 ['of', 'Biomedical'],
 ['Biomedical', 'Engineering'],
 ['Engineering', 'University'],
 ['University', 'of'],
 ['of', 'Virginia'],
 ['Virginia', 'Charlottesville'],
 ['Charlottesville'

Let's put the 2-grams into a DataFrame, one row per 2-gram.

In [13]:
# Build the two-column table in a single expression. rename() returns a new
# frame, so nothing is mutated in place and the cell gives the same result
# every time it is run.
df = pd.DataFrame(ngrams).rename(columns={0: 'first', 1: 'second'})

In [14]:
# Preview the first rows to check the 'first' / 'second' column layout.
df.head()

Unnamed: 0,first,second
0,EDITORIAL,Ten
1,Ten,simple
2,simple,rules
3,rules,for
4,for,biologists


In [15]:
# Count how often each (first, second) pair occurs. reset_index() turns the
# counts into a column labelled 0, which is used to rank the pairs from most
# to least frequent before keeping the first 50 rows.
top_50 = (
    df.groupby(['first', 'second'])
    .size()
    .reset_index()
    .sort_values(by=0, ascending=False)
    .head(50)
)
top_50

Unnamed: 0,first,second,0
190,Computational,Biology,12
359,January,2018,12
442,PLOS,Computational,12
1049,can,be,11
152,Biology,https://doi.org/10.1371/journal.pcbi.1005871,11
1567,https://doi.org/10.1371/journal.pcbi.1005871,January,11
58,2018,11,10
548,Simple,Rules,10
2645,to,learn,10
572,Ten,Simple,10
