In [2]:
import os
import string
import numpy as np

In [65]:
class BibItem:

    """A bibliography item built from the lines of a single BibTeX entry.

    From the text will parse the bibitem type, the keywords and their
    arguments, and the citekey. From the author list a more readable citekey
    of the form author_ea_year is generated.

    BibItem.text : list of strings, the lines of the entry as passed in.
    BibItem.bitype : string, BibItem type in lower case, e.g. 'article'.
    BibItem.keywords : dictionary, returns the keyword value for a given
        lower-case keyword, e.g. 'year'. The 'author' value is a list of the
        surnames found in curly braces, every other value is a string.
    BibItem.citekey : string, original citekey parsed from the first line.
    BibItem.oldcitekey : string, copy of the citekey as parsed from the
        first line.
    BibItem.newcitekey : string, citekey following the more readable
        author_ea_year format. May not be unique.
    BibItem.refstring : string, LaTeX string used for the reference section.

    Constructing a BibItem raises KeyError when the entry has no 'author' or
    'year' keyword and IndexError when the author value holds no surname in
    curly braces.
    """

    def __init__(self, text):
        self.text = text
        self.bitype = self.parseType()
        self.citekey = self.parseCiteKey()
        self.oldcitekey = self.citekey
        self.keywords = self.getKeywords()
        self.newcitekey = self.writeCiteKey()
        self.refstring = self.writeRefString()

    def writeRefString(self):
        """Write the string for reference section.

        The string is 'Author year, ' for one author, 'Author1 \\& Author2
        year, ' for two and 'Author et al. year, ' otherwise. For articles
        the journal, volume and pages that are present are appended,
        separated by ', '.
        """

        # Author names.
        authors = self.keywords['author']
        rs = authors[0] + ' '
        if len(authors) == 2:
            rs += '\\& ' + authors[1] + ' '
        elif len(authors) > 2:
            rs += 'et al. '

        # Year.
        rs += self.keywords['year'] + ', '

        # Journal type, etc. Keywords missing from the entry are skipped
        # rather than raising, so incomplete articles still get a string.
        # TODO: Other bibitem types.
        if self.bitype == 'article':
            present = [self.keywords[key]
                       for key in ('journal', 'volume', 'pages')
                       if key in self.keywords]
            rs += ', '.join(present)
        return rs

    def parseType(self):
        """Parse the bibitem type, i.e. the text between '@' and '{'."""
        bitype = self.text[0].split('{')[0][1:]
        return bitype.lower()

    def parseCiteKey(self):
        """Parse the citekey to help replacement."""
        citekey = self.text[0].split('{')[-1]
        return citekey.split(',')[0]

    def getKeywords(self):
        """Create a dictionary with all the keywords and their values."""
        keywords = {}
        for key in self.parseKeyWords():
            keywords[key] = self.parseKeyValue(key)
        return keywords

    def _fieldStarts(self):
        """Return (keyword, line index) pairs for the lines opening a keyword.

        A line opens a keyword when it contains '=' and sits directly inside
        the outer braces of the entry. Lines inside a multi-line value are at
        a deeper brace level and are therefore never mistaken for keywords.
        """
        starts = []
        depth = 0
        for index, line in enumerate(self.text):
            if depth == 1 and '=' in line:
                starts.append((line.split('=')[0].strip().lower(), index))
            depth += line.count('{') - line.count('}')
        return starts

    def parseKeyValue(self, keyword):
        """Parse the information for the given keyword.

        The keyword has to match the name in front of the '=' exactly, so
        'title' is not confused with 'booktitle' or with a value that merely
        contains the word. Returns a list of surnames for 'author' and a
        string otherwise. Raises KeyError if the entry lacks the keyword.
        """
        for key, index in self._fieldStarts():
            if key == keyword:
                break
        else:
            raise KeyError(keyword)
        s = self.stripBraces(self.getKeyWordValue(index))
        if keyword == 'author':
            return self.parseBraces(s)
        return s

    def parseKeyWords(self):
        """Return the different keyword used in the bibliography item."""
        return [key for key, _ in self._fieldStarts()]

    def getKeyWordValue(self, i):
        """Return the value of the keyword starting on line i.

        Lines are collected until the curly braces balance, then everything
        after the first '=' is returned so that a '=' inside the value (for
        example in a URL) is kept.
        """
        lb, rb = 0, 0
        end = i
        for end, line in enumerate(self.text[i:], i):
            lb += line.count('{')
            rb += line.count('}')
            if lb == rb:
                break
        s = ''.join(self.text[i:end + 1])
        _, found, value = s.partition('=')
        return value if found else s

    def stripBraces(self, s):
        """Remove the curly braces at the start and end of the string.

        Everything between the first '{' and the last '}' is returned. A
        value without braces that is wrapped in double quotes is returned
        without the quotes; any other value is cleaned with stripNames.
        """
        if '{' not in s and '}' not in s:
            bare = s.strip().rstrip(',').strip()
            if len(bare) >= 2 and bare[0] == '"' and bare[-1] == '"':
                return bare[1:-1]
            return self.stripNames(s)
        start = s.find('{') + 1
        end = s.rfind('}')
        if end < 0:
            end = len(s)
        return s[start:end]

    def parseBraces(self, s):
        """Parse all the surnames in curly braces in a string of authors."""
        names = []
        lb, rb = 0, 0
        start = 0
        for i, char in enumerate(s):
            if char == '{':
                # Only an outermost opening brace starts a new name.
                if lb == rb:
                    start = i + 1
                lb += 1
            elif char == '}':
                rb += 1
                if lb == rb:
                    names.append(self.stripNames(s[start:i]))
        return names

    def stripNames(self, s):
        """Remove spurious characters in a string but leave hyphens."""
        for char in ('\\', '"', '{', '}', '`', '\n', '\t', '=', ',', "'", ' '):
            s = s.replace(char, '')
        return s.strip()

    def writeCiteKey(self):
        """Replace the citekey with a standard, more readable format:
        single author: author_year,
        two authors: author1_author2_year,
        multiple authors: author1_ea_year.
        """
        authors = self.keywords['author']
        year = self.keywords['year']
        if len(authors) == 1:
            return '%s_%s' % (authors[0], year)
        if len(authors) == 2:
            return '%s_%s_%s' % (authors[0], authors[1], year)
        return '%s_ea_%s' % (authors[0], year)


In [70]:
class Bibliography:

    """Bibliography class. Main features are a dictionary with the citekey
    providing the individual BibItems. If the update flag is true, overwrite
    the bibliography, otherwise write to a new file.

    -- Commonly Used --
    bibpath : string, path to the file including the trailing '/', or ''.
    bibfile : string, filename of bibliography.
    update : bool, whether writeBibliography overwrites the input file.
    citekeys : list of citekeys, in the order the items were read.
    bibitems : dictionary, returns a BibItem for the citekey it had in the
        file when it was read.

    -- Others --
    bib : strings, bibliography file ignoring blank lines and comments.
    raw : strings, bibliography file with no parsing.
    months : dictionary, for a month, returns the order, e.g. months['mar'] = 3.
    monthnames : list, three letter month names in calendar order.
    alphabet : list, alphabet.
    """

    def __init__(self, filename='bibliography.bib', update=False):

        # Split the filename into the directory part and the file itself.

        cut = filename.rfind('/') + 1
        self.bibpath = filename[:cut]
        self.bibfile = filename[cut:]
        self.update = update

        # Read in the bibliography file. Ignore blank lines and comments for
        # easier parsing.

        with open(filename) as f:
            self.raw = f.readlines()
        self.bib = [l for l in self.raw
                    if l.strip() and l.strip()[0] != '%']

        # Declare ordering of the months and an alphabet for citekey suffixes.

        self.monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
        self.months = dict((month, m)
                           for m, month in enumerate(self.monthnames, 1))
        self.alphabet = list(string.ascii_lowercase)

        # Parse into individual bibliography items. Each item becomes a
        # BibItem stored in a dictionary keyed by the citekey of the item.

        self.citekeys, self.bibitems = self.parseBibItems()
        print('Read in %s. Found %d BibItems.'
              % (self.bibfile, len(self.bibitems)))

    def reorderBibliography(self):
        """Reorder the bibliography into alphabetical order of citekeys.

        Every item first takes its generated author_ea_year citekey, which is
        then verified, made unique and written into the first line of the
        item. The bibitems dictionary keeps the citekeys read from the file
        as its keys, so they stay available for replaceCitations.
        """
        for bi in self.bibitems.values():
            bi.citekey = bi.newcitekey
        self.verifyMonthKeys()
        self.verifyCiteKeys()
        self.uniqueCiteKeys()
        self.updateCiteKeys()

    def replaceCitations(self, filename, update=False):
        """Update the citekeys in filename. First replace into dummy variables,
        then from those into the new citekeys to prevent getting confused with
        old and new citekeys which may be the same.

        The old citekey of an item is its key in the bibitems dictionary, the
        new one is its current citekey attribute.
        """
        with open(filename) as f:
            document = f.read()

        # Longest old citekeys first, so a citekey that is contained in a
        # longer one cannot clobber it. The placeholders are delimited so
        # that number 1 is not a prefix of number 10.

        pairs = [(old, bi.citekey) for old, bi in self.bibitems.items() if old]
        pairs.sort(key=lambda pair: (-len(pair[0]), pair[0]))
        for i, (old, _) in enumerate(pairs):
            document = document.replace(old, '\x00citekey%d\x00' % i)
        for i, (_, new) in enumerate(pairs):
            document = document.replace('\x00citekey%d\x00' % i, new)

        # If update is True, overwrite the input file, otherwise write to a
        # second file with _updated in the name.

        if update:
            fileout = filename
        else:
            root, extension = os.path.splitext(filename)
            fileout = root + '_updated' + extension
        with open(fileout, 'w') as fo:
            fo.write(document)
        print('Written to %s' % fileout)

    def parseBibItems(self):
        """Returns the citekeys and a dictionary of bibitems in the file.

        Lines are gathered until the curly braces balance. A balanced run of
        lines without any brace cannot hold an entry and is skipped.
        """
        bibitems = {}
        citekeys = []
        lb, rb, start = 0, 0, 0
        for l, line in enumerate(self.bib):
            lb += line.count('{')
            rb += line.count('}')
            if lb != rb:
                continue
            if lb:
                x = BibItem(self.bib[start:l + 1])
                citekeys.append(x.citekey)
                bibitems[x.citekey] = x
            lb, rb = 0, 0
            start = l + 1
        return citekeys, bibitems

    def _validCiteKey(self, citekey):
        """Return True if citekey reads name_year or name_name_year.

        Names may hold letters and hyphens. The year is digits, optionally
        followed by one lower-case letter.
        """
        parts = citekey.split('_')
        if len(parts) not in (2, 3):
            return False
        for name in parts[:-1]:
            if not all(c in self.alphabet or c == '-' for c in name.lower()):
                return False
        year = parts[-1]
        if not year or not year[:-1].isdigit():
            return False
        return year[-1].isdigit() or year[-1] in self.alphabet

    def verifyCiteKeys(self):
        """Check that the citekeys follow the correct format style:
        author_ea/author_year. If it doesn't conform, ask for user input.
        """
        try:
            ask = raw_input
        except NameError:
            # raw_input was renamed to input in Python 3.
            ask = input
        for bi in self.bibitems.values():
            if self._validCiteKey(bi.citekey):
                continue
            print('Unable to create citekey for:\n')
            for l in bi.text:
                print(l.replace('\n', '').replace('\t', ''))
            print('\n')
            bi.citekey = ask("New citekey: ")

    def verifyMonthKeys(self):
        """Check for the months, if there are none, assume January.

        Month values are reduced to their first three letters in lower case,
        anything that is then not a known month name becomes 'jan'.
        """
        for bi in self.bibitems.values():
            month = bi.keywords.get('month', '').lower()[:3]
            if month not in self.monthnames:
                month = 'jan'
            bi.keywords['month'] = month

    def uniqueCiteKeys(self):
        """Check there are no duplicate citekeys. If they are, append them a, b,
        c etc. depending on month published.
        """
        items = list(self.bibitems.values())
        self.citekeys = [bi.citekey for bi in items]
        for ck in self.findDuplicates():

            # Order the clashing BibItems by month, the earliest gets 'a'.
            # Items without a usable month are treated as January.

            clashing = [bi for bi in items if bi.citekey == ck]
            clashing.sort(
                key=lambda bi: self.months.get(bi.keywords.get('month'), 1))
            for rank, bi in enumerate(clashing):
                bi.citekey += self.alphabet[rank]
        self.citekeys = [bi.citekey for bi in items]

    def findDuplicates(self):
        """Find the duplicate citekey values."""
        seen, dupl = set(), set()
        for citekey in self.citekeys:
            if citekey in seen:
                dupl.add(citekey)
            else:
                seen.add(citekey)
        return sorted(dupl)

    def updateCiteKeys(self):
        """Change the citekey with the shortened, unique values."""
        for bibitem in self.bibitems.values():
            header = bibitem.text[0].split('{')[0]
            bibitem.text[0] = header + '{' + bibitem.citekey + ',\n'

    def getOutputFilename(self):
        """Return the output filename for the new bibliography.

        With update set this is the input file. Otherwise a number is
        appended to the name, cycling through 1, 2, ... until a name is found
        that does not exist yet. The file is placed next to the input file.
        """
        if self.update:
            return self.bibpath + self.bibfile
        stem = self.bibpath + os.path.splitext(self.bibfile)[0]
        suffix = 1
        while os.path.exists('%s%d.bib' % (stem, suffix)):
            suffix += 1
        return '%s%d.bib' % (stem, suffix)

    def writeBibliography(self):
        """Write in a new bibliography file with alphabetically ordered
        citekeys.
        """
        fileout = self.getOutputFilename()
        ordered = sorted(self.bibitems.values(), key=lambda bi: bi.citekey)
        with open(fileout, 'w') as fo:
            for bi in ordered:
                fo.writelines(bi.text)
                fo.write('\n')
        print('Written to %s' % fileout)


In [93]:
class Document:

    """A LaTeX document whose \\cite commands are turned into numbers.

    On construction the bibliography and the document are read, every
    \\cite*{refs} is replaced by a numeric citation such as [1, 2], the
    \\bibliography{...} command is replaced by a written out reference
    section and the result is saved.

    bibliography : Bibliography, read from the given bibliography file.
    filename : string, path of the document.
    update : bool, overwrite the document if true, otherwise write to a
        new numbered file.
    document : list of strings, the lines of the document.
    citedict : dictionary, returns the reference number (as a string) for a
        given citekey. Numbers follow the order of first citation.
    """

    def __init__(self, document, bibliography='bibliography.bib', update=False):

        # Upon initialisation, read in the bibliography file.
        # Parse all the citekeys, include \cite{}, \citep{} and \citet{}.

        self.bibliography = Bibliography(bibliography)
        self.filename = document
        self.update = update

        # Read in the document.
        with open(self.filename) as f:
            self.document = f.readlines()

        # Replace all citations in the document with numerals.
        self.writeNewCitations()

    def checkCitationOverrun(self, k, incitation=False):
        """Check to see if the citation overruns onto the next line.

        k counts lines from the end of the document, so the line looked at
        is self.document[-k]. As long as a citation is still open at the end
        of a line the following line is examined as well. Returns the k of
        the line on which the citation closes, or 1 (the last line) if it
        never does.
        """
        while k >= 1:
            line = self.document[-k]
            for c, char in enumerate(line):
                if c >= 5 and line[c-5:c] == '\\cite':
                    incitation = True
                elif incitation and char == '}':
                    incitation = False
            if not incitation:
                return k
            k -= 1
        return 1

    def splitText(self):
        """Split the text into three parts, the main text containing all the citations.

        Returns the strings preamble, maintext and postamble. The main text
        runs from the first line holding a citation to the line on which the
        last citation closes. Without any citation the whole document is
        returned as the preamble.
        """
        lines = self.document
        cited = [n for n, line in enumerate(lines) if '\\cite' in line]
        if not cited:
            return ''.join(lines), '', ''
        first = cited[0]

        # Translate between list indices and the counted-from-the-end k.
        k = self.checkCitationOverrun(len(lines) - cited[-1])
        last = len(lines) - k

        preamble = ''.join(lines[:first])
        maintext = ''.join(lines[first:last + 1])
        postamble = ''.join(lines[last + 1:])
        return preamble, maintext, postamble

    def writeNewCitations(self):
        """Change citation style. First split into pre, post and main text sections."""

        preamble, maintext, postamble = self.splitText()

        # For the maintext, generate a list of all '\cite*{refs}' substrings.

        toreplace = self.findReplacements(maintext)

        # For each '\cite*{refs}' substring, split into individual references.
        # Number the references in the order in which they are first cited.

        references = [self.parseCitation(citation) for citation in toreplace]
        self.citedict = {}
        for refs in references:
            for ref in refs:
                if ref not in self.citedict:
                    self.citedict[ref] = '%d' % (len(self.citedict) + 1)

        # For each citation, replace for a numeric citation.

        for citation, refs in zip(toreplace, references):
            new = '[' + ', '.join(self.citedict[ref] for ref in refs) + ']'
            maintext = maintext.replace(citation, new)

        # Write the bibliography.

        oldbib = os.path.basename(self.bibliography.bibfile)
        oldbib = '\\bibliography{%s}' % os.path.splitext(oldbib)[0]
        newbib = self.writeBibString()
        postamble = postamble.replace(oldbib, newbib)

        # Save as a new file.
        self.writeUpdatedText([preamble, maintext, postamble])

    def writeBibString(self):
        """Return the LaTeX reference section for the cited references.

        The entries are listed by their reference number so that they agree
        with the numbers used in the text. Raises KeyError if a cited key is
        not in the bibliography.
        """
        entries = []
        for key in sorted(self.citedict, key=lambda ref: int(self.citedict[ref])):
            if key not in self.bibliography.bibitems:
                raise KeyError("citation '%s' is not in the bibliography" % key)
            entries.append('[\\textbf{%d}] %s'
                           % (int(self.citedict[key]),
                              self.bibliography.bibitems[key].refstring))
        bib = '\\section*{References}\n'
        bib += '\\footnotesize\n'
        bib += ' $\\cdot$ '.join(entries)
        bib += '\n'
        return bib

    def findBibliography(self, text):
        """Return the first '\\bibliography{...}' command found in text.

        Returns an empty string if there is none. If the closing brace is
        missing everything from the command onwards is returned.
        """
        start = text.find('\\bibliography{')
        if start < 0:
            return ''
        end = text.find('}', start)
        if end < 0:
            return text[start:]
        return text[start:end + 1]

    def parseCitation(self, citation):
        """Parses \\cite*{reference1,reference2} into a list of [reference1, reference2].

        Only the text inside the curly braces is split, so commas in optional
        arguments such as \\citep[e.g.,][]{reference1} are left alone.
        Returns an empty list if there is no opening brace.
        """
        start = citation.rfind('{')
        if start < 0:
            return []
        end = citation.find('}', start)
        if end < 0:
            end = len(citation)
        inner = citation[start + 1:end]
        return [ref.replace('\n', '').strip() for ref in inner.split(',')]

    def findReplacements(self, text):
        """Initially splits the main text into a list of \\cite*{refs}."""
        incitation = False
        toreplace = []
        cs = 0
        for c, char in enumerate(text):
            if c >= 5 and text[c-5:c] == '\\cite':
                incitation = True
                cs = c - 5
            elif incitation and char == '}':
                toreplace.append(text[cs:c+1])
                incitation = False
        return toreplace

    def getOutputFilename(self):
        """Return the output filename for the new document.

        With update set this is the input file. Otherwise a number is
        appended to the name, cycling through 1, 2, ... until a name is found
        that does not exist yet.
        """
        if self.update:
            return self.filename
        stem = os.path.splitext(self.filename)[0]
        suffix = 1
        while os.path.exists('%s%d.tex' % (stem, suffix)):
            suffix += 1
        return '%s%d.tex' % (stem, suffix)

    def writeUpdatedText(self, lines):
        """Write an updated text with references."""
        fileout = self.getOutputFilename()
        with open(fileout, 'w') as fo:
            for line in lines:
                fo.write(line)
        print('Written to %s' % fileout)

In [94]:
# Convert the example document: constructing a Document reads both files,
# replaces the citations and writes the updated text to disk.
test = Document('../Examples/main.tex', '../Examples/bibliography.bib')

Read in bibliography.bib. Found 5 BibItems.
Written to ../Examples/main1.tex


In [None]:
# Show the dictionary giving the reference number for each cited key.
test.citedict