In [2]:
import itertools
import re

'''
  This method takes an element and a label and produces the proper BIO label for each token
  element -> Can be an XML token, a word token or a punctuation token
'''
def getTagLabelPairs(element, label=None):
  """
    Turn one raw citation element into a list of (token, tag) tuples.

    element -> either a string starting with an XML tag
               (e.g. '<title>Some title</title>') or a bare word / punctuation string
    label   -> label name (e.g. 'AUTHOR') applied to the words of an XML element;
               ignored when element does not start with an XML tag

    Returns a list of (token, tag) tuples:
      * XML element with a label: tags are stripped, the text is split into
        words and single punctuation characters; punctuation gets 'B-PUNC',
        the first non-punctuation token gets 'B-<label>', later ones 'I-<label>'.
      * anything else: a single tuple with the element unchanged, tagged
        'B-PUNC' when it starts with a punctuation character, otherwise 'O'.
  """
  punc = re.compile(r"[\/ ,.:;()\"'?!+@#{\[}\]\\$^&*-]")
  xmlTag = re.compile("<[^>]+>")

  if xmlTag.match(element) and label:
    # Drop every XML tag, then split the remaining text into words and
    # single non-word, non-space characters.
    text = xmlTag.sub("", element)
    resultList = []
    labelStarted = False
    for item in re.findall(r"\w+|[^\w\s]", text):
      # Test the token against the punctuation class itself rather than doing
      # a substring search in the pattern's source text.
      if punc.fullmatch(item):
        resultList.append((item, 'B-PUNC'))
      elif not labelStarted:
        labelStarted = True
        resultList.append((item, 'B-' + label))
      else:
        resultList.append((item, 'I-' + label))
    return resultList

  # Not a labelled XML element: a word or punctuation found outside the
  # known tags (or an XML element for which no label was supplied).
  if punc.match(element):
    return [(element, 'B-PUNC')]
  return [(element, 'O')]

# Utility method : Remove non-ASCII (unicode) characters
def strip_non_ascii(string):
  '''Return the string keeping only characters whose code point is between 1 and 126.'''
  kept = filter(lambda ch: 1 <= ord(ch) <= 126, string)
  return ''.join(kept)

'''
  Input:
  citation string -> XML tagged citation string from GIANT dataset
'''
def tokenizeCitation(citationString):
  """
    Tokenize one XML-annotated citation string into parallel token / tag lists.

    Steps:
      1. Reduce the text to ASCII (strip_non_ascii).
      2. Remove <URL>...</URL> and <DOI>...</DOI> spans completely.
         TODO: Currently keeping out of URL/DOI. Will work on this later
      3. Scan the string character by character: every tagged span
         (<x>...</x>) becomes one raw token, untagged text becomes word tokens
         and single-character punctuation tokens.
         e.g. ['<author><family>Mehrinfar</family></author>', ',', 'Ramona']
      4. Drop the space tokens and label each raw token with getTagLabelPairs.

    Input:
      citationString -> XML tagged citation string

    Output:
      (tokens, tags) -> two lists of the same length
  """
  # '‚Äì' is the mis-decoded form of an en dash found in the training data;
  # map it to '-' before everything non-ASCII is discarded.
  citationString = strip_non_ascii(citationString.replace('‚Äì','-').encode("ascii", "ignore").decode()) # Fixing GIANT training data citation string error and removing unicode

  # URL and DOI spans are removed outright. re.sub is a no-op when the span is
  # missing or unclosed, so malformed input no longer raises IndexError.
  for droppedTag in ("URL", "DOI"):
    citationString = re.sub(r"<" + droppedTag + r">.+<\/" + droppedTag + r">", "", citationString)

  charPattern = re.compile(r"[\w\d]")
  noCharPattern = re.compile(r"[\/ ,.:;()\"'?!+@#{\[}\]\\$^&*-]")

  unprocessedTokens = []
  word = ""       # untagged word currently being accumulated
  tagName = ""    # raw text of the tag currently being read, '<' and '>' included
  tagStart = 0    # index of the '<' that opened the tag currently being read
  inTag = False
  endPtr = 0

  while endPtr < len(citationString):
    ch = citationString[endPtr]

    if ch == '<':
      if not inTag:
        tagStart = endPtr
      inTag = True

    if inTag:
      tagName += ch
      if ch == '>':
        inTag = False
        name = tagName[1:-1]  # Tag name excluding < & >
        tagName = ""          # always reset, even when the tag stays unmatched
        # e.g. <title>.+<\/title> ; the name is escaped so that stray text
        # between '<' and '>' cannot form an invalid pattern.
        tagPattern = re.compile(r"<" + re.escape(name) + r">.+<\/" + re.escape(name) + r">")
        result = tagPattern.search(citationString)
        if result:
          # Emit a pending word first so it is neither lost nor reordered.
          if word:
            unprocessedTokens.append(word)
            word = ""
          unprocessedTokens.append(result.group())
          citationString = tagPattern.sub('', citationString)
          # The tagged span is gone: resume exactly where it started, so the
          # character that followed the span is not skipped.
          endPtr = tagStart
          continue
    else:
      # Outside of a tag: word characters extend the current word, a
      # punctuation character closes it and is emitted as its own token.
      # Any other character is ignored.
      if charPattern.search(ch):
        word += ch
      elif noCharPattern.search(ch):
        if word:
          unprocessedTokens.append(word)
        unprocessedTokens.append(ch)
        word = ""
    endPtr += 1

  # A citation may end with a word that has no punctuation after it.
  if word:
    unprocessedTokens.append(word)

  # Spaces only serve as separators; build a new list instead of removing
  # items from the list that is being iterated.
  unprocessedTokens = [item for item in unprocessedTokens if item != ' ']

  # Tag-name keyword -> label, checked in this order (so 'issued' yields DATE
  # before 'issue' can yield ISSUE, and 'container-title' yields CT before
  # 'title' can yield TITLE). Tokens matching nothing get no label.
  labelRules = (
    (("author", "family", "given"), "AUTHOR"),
    (("container",), "CT"),
    (("title",), "TITLE"),
    (("year", "issued"), "DATE"),
    (("volume",), "VOL"),
    (("issue",), "ISSUE"),
    (("page",), "PAGE"),
    (("publisher",), "PUBLISHER"),
    (("source",), "SOURCE"),
    (("issn",), "ISSN"),
  )

  sentence = []
  for item in unprocessedTokens:
    # Look at the tag names only, not at the tagged text, so that e.g. a title
    # containing the word "author" is not labelled AUTHOR.
    tagNames = " ".join(re.findall(r"<([^<>]+)>", item))
    label = None
    for keywords, candidate in labelRules:
      if any(keyword in tagNames for keyword in keywords):
        label = candidate
        break
    sentence.append(getTagLabelPairs(item, label))

  sentence = list(itertools.chain.from_iterable(sentence))

  # Split the (token, tag) pairs into two parallel lists.
  tokens = [pair[0] for pair in sentence]
  tags = [pair[1] for pair in sentence]
  return (tokens,tags)

In [3]:
""" 
    Parallel process the dataset for building training data
    Utilize n cores and chunk the task into n pieces [n = n_proc]
"""
# import necessary libraries
import pandas as pd
import os
import glob
import numpy as np
import multiprocessing
import string
import random
import time

# Output directory: multiprocessing_func appends a random file name to this path and writes its text file there.
PATH = "/home/muddi004/muddi/GIANT/downsized-1564-v2/Block-text-files/"
def getDataTuples(dataset):
    """
        Input# dataset: n rows of annotated citation strings, read from the
               "citationStringAnnotated" column

        Process# Tokenize each (stripped) citation string with tokenizeCitation,
                 which returns a token list and a tag list, and pair them up.
                 Space tokens are left out.

        Output# List(List(word-label tuple of one citation string))
    """
    # Build the result directly: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame row by row copies it on every step.
    datatuple = []
    for citation in dataset["citationStringAnnotated"]:
        tokens, tags = tokenizeCitation(citation.strip())
        datatuple.append(
            [(token, tag) for token, tag in zip(tokens, tags) if token != ' ']
        )
    return datatuple


def multiprocessing_func(x):
    """
        Worker body: convert one chunk of the dataset with getDataTuples and
        write the result to a randomly named text file under PATH, one
        "token<TAB>tag" line per token and an empty line after each citation.
    """
    print('Chunked dataset shape in this process: {}'.format(x.shape))

    # Five random letters/digits form the file name.
    alphabet = string.ascii_letters+string.digits
    filename = ''.join(random.choices(alphabet, k=5)) +'.txt'
    filepath = PATH + filename

    datatuple = getDataTuples(x)
    print('Datatuple {}'.format(len(datatuple)))

    with open(filepath, 'a+') as fileValid:
        for sentence in datatuple:
            fileValid.writelines(
                "{}\t{}\n".format(pair[0], pair[1]) for pair in sentence
            )
            fileValid.write("\n")

    print('Done Writing {}'.format(filename))


# Collect every csv file of the input folder.
csvDirectory = r"/home/muddi004/muddi/GIANT/downsized-1564-v2/" #TODO: Change path here
csv_files = glob.glob(os.path.join(csvDirectory, "*.csv"))

start = time.time()
print("Starting now")
for csv in csv_files:
    print("Now working on: {}".format(csv))
    dataset = pd.read_csv(csv)
    #dataset = dataset.sample(n=250000) # Random sampling to 250000 data
    n_proc = 1 # TODO: Change number of available processors
    chunks = np.array_split(dataset, n_proc)

    # One worker process per chunk: start them all, then wait for all of them.
    processes = [
        multiprocessing.Process(target=multiprocessing_func, args=(chunk,))
        for chunk in chunks
    ]
    for process in processes:
        process.start()
    for process in processes:
        process.join()

end = time.time()
print("Total time required: {}".format(end - start))

Starting now
Now working on: /home/muddi004/muddi/GIANT/downsized-1564-v2/qesdE.csv
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Done Writing Dfwl9.txt
Now working on: /home/muddi004/muddi/GIANT/downsized-1564-v2/naT5N.csv
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Done Writing JSAk9.txt
Now working on: /home/muddi004/muddi/GIANT/downsized-1564-v2/omVcP.csv
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Done Writing D012x.txt
Now working on: /home/muddi004/muddi/GIANT/downsized-1564-v2/p5xFx.csv
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Done Writing 5AA4Z.txt
Now working on: /home/muddi004/muddi/GIANT/downsized-1564-v2/7dWb1.csv
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Done Writing C50ia.txt
Now working on: /home/muddi004/muddi/GIANT/downsized-1564-v2/sRMJg.csv
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Done Writing Z0Opa.txt
Now working on: /home/muddi004/muddi/GI

In [2]:
""" Parallel process the dataset for building validation data"""
# import necessary libraries
import pandas as pd
import os
import glob
import numpy as np
import multiprocessing
import string
import random
import time
#from blockTokenizer import tokenizeCitation, strip_non_ascii, getTagLabelPairs

# Output directory: multiprocessing_func appends a random file name to this path and writes its text file there.
PATH_TO_TEXT_FILES = "/home/muddi004/muddi/GIANT/data-validation/"
def getDataTuples(dataset):
    """
        Input# dataset: n rows of annotated citation strings, read from the
               "citationStringAnnotated" column

        Process# Tokenize each (stripped) citation string with tokenizeCitation,
                 which returns a token list and a tag list, and pair them up.
                 Space tokens are left out.

        Output# List(List(word-label tuple of one citation string))
    """
    # Build the result directly: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame row by row copies it on every step.
    datatuple = []
    for citation in dataset["citationStringAnnotated"]:
        tokens, tags = tokenizeCitation(citation.strip())
        datatuple.append(
            [(token, tag) for token, tag in zip(tokens, tags) if token != ' ']
        )
    return datatuple

def multiprocessing_func(x):
    """
        Worker body: convert one chunk of the dataset with getDataTuples and
        write the result to a randomly named text file under
        PATH_TO_TEXT_FILES, one "token<TAB>tag" line per token and an empty
        line after each citation.
    """
    print('Chunked dataset shape in this process: {}'.format(x.shape))

    # Five random letters/digits form the file name.
    alphabet = string.ascii_letters+string.digits
    filename = ''.join(random.choices(alphabet, k=5)) +'.txt'
    filepath = PATH_TO_TEXT_FILES + filename

    datatuple = getDataTuples(x)
    print('Datatuple {}'.format(len(datatuple)))

    with open(filepath, 'a+') as fileValid:
        for sentence in datatuple:
            fileValid.writelines(
                "{}\t{}\n".format(pair[0], pair[1]) for pair in sentence
            )
            fileValid.write("\n")

    print('Done Writing {}'.format(filename))


# Collect every csv file of the validation folder.
csvDirectory = r"/home/muddi004/muddi/GIANT/data-validation/"
csv_files = glob.glob(os.path.join(csvDirectory, "*.csv"))

start = time.time()
print("Starting now")
for csv in csv_files:
    print("Now working on: {}".format(csv))
    dataset = pd.read_csv(csv)
    #dataset = dataset.sample(n=250000) # Random sampling to 250000 data
    n_proc = 10 # TODO: Change number of available processors
    chunks = np.array_split(dataset, n_proc)

    # One worker process per chunk: start them all, then wait for all of them.
    processes = [
        multiprocessing.Process(target=multiprocessing_func, args=(chunk,))
        for chunk in chunks
    ]
    for process in processes:
        process.start()
    for process in processes:
        process.join()

end = time.time()
print("Total time required: {}".format(end - start))

Starting now
Now working on: /home/muddi004/muddi/GIANT/data-validation/output_sampleCrossref.csv
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Datatuple 1564
Datatuple 1564
Done Writing Srwen.txt
Datatuple 1564Datatuple 1564

Datatuple 1564
Datatuple 1564
Done Writing 31bWM.txt
Datatuple 1564
Done Writing r2xwr.txt
Datatuple 1564
Done Writing Vdeni.txt
Done Writing zyEfb.txtDone Writing 42yvx.txt

Done Writing SjZtL.txt
Done Writing E5sQT.txt
Done Writing HQ33f.txt
Datatuple 1564
Done Writing Qdb3g.txt
Total time required: 8.187945842

In [2]:
""" Parallel process the dataset for building Test data"""

# import necessary libraries
import pandas as pd
import os
import glob
import numpy as np
import multiprocessing
import string
import random
import time

def getDataTuples(dataset):
    """
        Input# dataset: n rows of annotated citation strings, read from the
               "citationStringAnnotated" column

        Process# Tokenize each (stripped) citation string with tokenizeCitation,
                 which returns a token list and a tag list, and pair them up.
                 Space tokens are left out.

        Output# List(List(word-label tuple of one citation string))
    """
    # Build the result directly: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame row by row copies it on every step.
    datatuple = []
    for citation in dataset["citationStringAnnotated"]:
        tokens, tags = tokenizeCitation(citation.strip())
        datatuple.append(
            [(token, tag) for token, tag in zip(tokens, tags) if token != ' ']
        )
    return datatuple


def multiprocessing_func(x):
    """
        Worker body: convert one chunk of the dataset with getDataTuples and
        write the result to a randomly named '-block.txt' file in the test
        samples folder, one "token<TAB>tag" line per token and an empty line
        after each citation.
    """
    print('Chunked dataset shape in this process: {}'.format(x.shape))

    # Five random letters/digits form the file name prefix.
    alphabet = string.ascii_letters+string.digits
    filename = ''.join(random.choices(alphabet, k=5)) +'-block.txt'
    filepath = r"/home/muddi004/muddi/citationParser/data/testsamples/" + filename

    datatuple = getDataTuples(x)
    print('Datatuple {}'.format(len(datatuple)))

    with open(filepath, 'a+') as fileValid:
        for sentence in datatuple:
            fileValid.writelines(
                "{}\t{}\n".format(pair[0], pair[1]) for pair in sentence
            )
            fileValid.write("\n")

    print('Done Writing {}'.format(filename))


# Collect every csv file of the test samples folder.
csvDirectory = r"/home/muddi004/muddi/citationParser/data/testsamples/"
csv_files = glob.glob(os.path.join(csvDirectory, "*.csv"))

start = time.time()
print("Starting now")
for csv in csv_files:
    print("Now working on: {}".format(csv))
    dataset = pd.read_csv(csv)
    #dataset = dataset.sample(n=250000) # Random sampling to 250000 data
    n_proc = 1 # TODO: Change number of available processors
    chunks = np.array_split(dataset, n_proc)

    # One worker process per chunk: start them all, then wait for all of them.
    processes = [
        multiprocessing.Process(target=multiprocessing_func, args=(chunk,))
        for chunk in chunks
    ]
    for process in processes:
        process.start()
    for process in processes:
        process.join()

end = time.time()
print("Total time required: {}".format(end - start))

Starting now
Now working on: /home/muddi004/muddi/citationParser/data/testsamples/sample_one.csv
Chunked dataset shape in this process: (1564, 5)
Datatuple 1564
Done Writing BMdEr-block.txt
Total time required: 4.991891145706177


In [None]:
""" 
    Random Testing block 

"""

import time
import glob
import numpy as np
import pandas as pd


# This cell calls os.path.join but its own import list does not include os.
import os

# Scratch check: read every csv file from the sixth one onwards and report
# its shape before and after random sampling.
print("Starting")
start = time.time()
csvDirectory = r"/home/muddi004/muddi/GIANT/"
csv_files = glob.glob(os.path.join(csvDirectory, r"*.csv"))
filecount = 5
for csv in csv_files[filecount:]:
    print("Now working on file-no:{} name:{}".format(filecount+1,csv))
    dataset = pd.read_csv(csv)
    print("Original Dataset shape: {}".format(dataset.shape))
    # sample() raises ValueError when asked for more rows than the file has,
    # so cap the sample size at the number of available rows.
    dataset = dataset.sample(n=min(250000, len(dataset)))
    print("Sample dataset shape: {}".format(dataset.shape))
    #n_proc = 40 # TODO: Change number of available processors
    #chunks = np.array_split(dataset, n_proc)
    #print("First chunk size:{}".format(chunks[0].shape))
    filecount = filecount + 1

end = time.time()
print("Total time required: {}".format(end - start))

In [None]:
"""
    Random Testing Block
"""

# import necessary libraries
import pandas as pd
import os
import glob
import numpy as np
import multiprocessing
import string
import random
import time

# Scratch conversion of two explicitly listed test csv files: every file is
# tokenized citation by citation and written to its own randomly named text file.
test1 = "/home/muddi004/muddi/GIANT/data-test/test-1.csv"
test2 = "/home/muddi004/muddi/GIANT/data-test/test-2.csv"
csv_files = [test1,test2]#glob.glob(os.path.join(r"/home/muddi004/muddi/GIANT/data-test/", "*.csv"))
print(csv_files)

for f in csv_files:
    # read the csv file
    print(f)

    df = pd.read_csv(f, encoding='unicode_escape')
    print(df.shape)

    # One list of (token, tag) tuples per citation; space tokens are left out.
    # Built directly instead of through DataFrame.append, which pandas 2.x no
    # longer provides and which re-scanned all earlier rows on every step.
    datatuple = []
    for citation in df["citationStringAnnotated"]:
        print("Now on: {}".format(citation))
        # tokenizeCitation is the tokenizer used by the other cells; the name
        # `tokenizer` is not defined anywhere in this notebook.
        tokens, tags = tokenizeCitation(citation.strip())
        datatuple.append(
            [(token, tag) for token, tag in zip(tokens, tags) if token != ' ']
        )

    length = 5 # length of filename
    randomstr = ''.join(random.choices(string.ascii_letters+string.digits,k=length))
    filename = randomstr +'.txt'
    filepath = "/home/muddi004/muddi/GIANT/data-test/" + filename
    # Use datatuples to write into text file: one "token<TAB>tag" line per
    # token, one empty line after every citation.
    print('Datatuple {}'.format(len(datatuple)))
    with open(filepath, 'a+') as fileValid:
        for sentence in datatuple:
            for tuples in sentence:
                fileValid.write("{}\t{}\n".format(tuples[0],tuples[1]))
            fileValid.write("\n")

    print('Done Writing {}'.format(filename))
  