In [5]:
%qtconsole

In [175]:
class BibItem:
    
    def __init__(self, text):
        
        # Find the keywords avaialble.
        # For each key in keywords, cut out the line and remove bounding braces.
        
        self.text = text
        
        self.getKeys()
        
        self.writeCiteKey()
        
        return

    # Create a dictionary with all the keywords and their values from the bibitem.
    def getKeys(self):
        self.keywords = self.parseKeyWords()
        self.keys = {}
        for key in self.keywords:
            self.keys[key] = self.parseKeyValue(key)
        return
    
    
    # Find the entry type, e.g. arcticle, or inbook. Not currently used.
    def parseType(self):
        self.itemtype = None
        for i in range(1, len(self.text[0])):
            if self.text[0][i] == u'{':
                self.itemtype = self.text[0][1:i]
                break
        return

    
    # Parse the information for the given key.
    def parseKeyValue(self, key):
        
        # Use the special author function.
        if key == 'author':
            return self.parseAuthors()

        # Find the line with the key in.
        for l, line in enumerate(self.text):
            if key in line.lower():
                break
                
        # Parse the appropriate section from the keyword. Strip the curly braces.
        return self.stripBraces(self.getFullLine(l))

        
    # Find the different keys used in the bibliography item.
    def parseKeyWords(self):
        return [l.split('=')[0].split()[0].lower() for l in self.text if '=' in l]

    
    # Find the lines containing the author list, then parse the string
    # of author names, returning those in braces.
    def parseAuthors(self):
        for i, line in enumerate(self.text):
            if 'author' in line.lower():
                break
        lb, rb = 0, 0
        for l, line in enumerate(self.text[i:]):
            lb += line.count(u'{')
            rb += line.count(u'}')
            if lb == rb:
                break
        s = ''.join([l for l in self.text[i:i+l+1]])
        s = s.replace('\n', '')
        names = []
        lb = 0
        rb = 0
        i = 0
        for c, char in enumerate(s):
            if char == u'{':
                lb += 1
                i = c + 1
            elif char == u'}':
                rb += 1
                if (lb - 1 == rb and rb > 0):
                    names.append(self.stripString(s[i:c]))
                    i = c
        return names

    
    # Return the joined strings for one keyword entry.
    def getFullLine(self, i):
        lb, rb = 0, 0
        for l, line in enumerate(self.text[i:]):
            lb += line.count(u'{')
            rb += line.count(u'}')
            if lb == rb:
                break
        s = ''.join([l for l in self.text[i:i+l+1]])
        return s.split('=')[-1]
    

    # Strip a string down to the braces.
    def stripBraces(self, s):
        if (u'{' not in s and u'}' not in s):
            return s.strip()
        for i in range(len(s)):
            if s[i] == u'{':
                i += 1
                break
        for j in range(1, len(s)):
            if s[-j] == u'}':
                j *= -1
                break
        return s[i:j]

    
    # Remove spurious characters in a string.
    def stripString(self, s):
        s = s.replace(u'\\', '')
        s = s.replace(u'"', '')
        s = s.replace(u'{', '')
        s = s.replace(u'}', '')
        s = s.replace(u'`', '')
        s = s.replace(u'\n', '')
        s = s.replace(u'\t', '')
        s = s.replace(u'=', '')
        return s
    
    
    # Replace the citekey with a standard format: 1 author, author_year; 2 authors, author1_author_2_year and 3+ authors: author1_ea_year
    def writeCiteKey(self):
        if len(self.keys['author']) == 1:
            self.citekey = '%s_%s' % (self.keys['author'][0], self.keys['year'])
        elif len(self.keys['author']) == 2:
            self.citekey = '%s_%s_%s' % (self.keys['author'][0], self.keys['author'][1], self.keys['year'])
        else:
            self.citekey = '%s_ea_%s' % (self.keys['author'][0], self.keys['year'])
        return self.citekey

In [176]:
class ArticleItem:
    def __init(self, text):
        self.text = text
        return

class Bibliography:
    
    def __init__(self, filename='bibliography.bib'):
        
        # Upon intialisation, read in the file as a text file and parse into BibItems.
        # Remove all the lines which are empty.
        
        with open(filename) as f:
            self.bib = f.readlines()
            self.bib = [l for l in self.bib if l != '\n']
            
        # Parse into individual bibliography items. Each item becomes a BibItem.
        # Each BibItem gets assigned a citekey, check that there are no duplicates. If so, 
        # append the citekey with a, b, c etc.
        
        self.parseBibItems()
        
        self.citekeys = [bibitem.citekey for bibitem in self.bibitems]
        print self.citekeys
        self.checkCiteKeys()
             
        return
    
    # Parse the different bib items.
    def parseBibItems(self):
        self.bibitems = []  
        lb = 0
        rb = 0
        article = 0
        for l, line in enumerate(self.bib):
            lb += line.count(u'{')
            rb += line.count(u'}')
            if lb == rb:
                self.bibitems.append(BibItem(self.bib[article:l+1]))
                lb = 0
                rb = 0
                article = l
        print 'Found %d bibliography items.' % len(self.bibitems)
        return
    
    # Check there are no duplicate citekeys. If they are, append them a, b, c etc. depending on month published.
    def checkCiteKeys(self):
        if len(self.citekeys) == len(set(self.citekeys)):
            print 'No duplicates.'
        else:
            print 'Nooo!'
        return 

In [177]:
test = Bibliography()

Found 5 bibliography items.
[u'Qi_ea_2004,', u'Huang_Oberg_2015,', u'Gammie_2001,', u'Cleeves_ea_2013,', u'Cleeves_ea_2013,']
Nooo!


In [135]:
test.bibitems[0].keys['year']

'}'

In [None]:
test.bib_items[0].authors