In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Preprocessing of Trial Dataset

In [2]:
def find_source(data, ls = []):
    if isinstance(data,dict):
        for key in data.keys():
            if key=="from sentence":
                sentences=data[key].split('\n') # The value can be one or more sentences
                for s in sentences:
                    ls.append(s.strip())
            elif isinstance(data[key], dict):
                find_source(data[key],ls)
            elif isinstance(data[key], list):
                for i in data[key]:
                    find_source(i,ls)
    elif isinstance(data,list):
        for i in data:
            find_source(i,ls)
    return ls    # might have repeated sentences

# determine if a triple is contained in the trace of traversing the json object in a depth-firsr manner
def is_contained(trace, triple):
    if len(trace) >= len(triple):
        for i in range(len(trace)-2):
            if trace[i:i+3] == triple:
                return True
        return False
    else:
        return False

# Get the previous two words in a trace,
# to be prefixed to the coordinated items in a list or a dictionary
def get_prefix(data, trace):
    if isinstance(data, dict) or isinstance(data, list):
        return trace[-2:]

# traverse the json object recursively,
# to find the source sentence of the triple
def find_tri_sent(data, triple, trace=[], ls=[], prefix=[]):
    # Parse the json file recursively and return a list of source sentences
    if isinstance(data, dict):
        for i, key in enumerate(data.keys()):
            if key != "from sentence":
                if prefix and i != 0:
                    trace += prefix
                trace.append(key)
                find_tri_sent(data[key], triple, trace, ls,
                              get_prefix(data[key], trace))
            else:
                if is_contained(trace, triple):
                    ls.append(data[key].strip())
    elif isinstance(data, list):
        for i, item in enumerate(data):
            if prefix and i != 0:
                trace += prefix
            find_tri_sent(item, triple, trace, ls, prefix)
    elif isinstance(data, str):
        trace.append(data)
    return ls

In [3]:
'''
Data preprocessing and cleaning
get a dataframe of all sentences, together with relevant information to the tasks
'''
import os
import re
import math
import pandas as pd
import json


base_dir = '/content/drive/MyDrive/Colab Notebooks/Scispace/training_data'
sep = os.path.sep

def get_dir(topic_ls=None, paper_ls=None):
    # Get the list of paper directories
    dir_ls = []
    if topic_ls is None:
        topic_ls = os.listdir(base_dir)
        topic_ls.remove('train-README.md')
        topic_ls.remove('trial-README.md')
    if paper_ls is None:
        for topic in topic_ls:
            paper_ls = os.listdir(os.path.join(base_dir, topic))
            for i in paper_ls:
                dir_ls.append(os.path.join(base_dir, topic, i))
    else:
        for topic in topic_ls:
            for i in paper_ls:
                dir_ls.append(os.path.join(base_dir, topic, str(i)))
    return dir_ls

def get_file_path(dirs):
    # Get the relevant files from each directory of paper.
    rx = '(.*Stanza-out.txt$)|(^sentences.txt$)'
    file_path = []
    for dir in dirs:
        new = ['', '']  # stores the paths of the sentence file and the label file
        for file in os.listdir(dir):
            res = re.match(rx, file)
            if res:
                if res.group(1):
                    new[0] = os.path.join(dir, file)
                if res.group(2):
                    new[1] = os.path.join(dir, file)
        file_path.append(new)
    return file_path

def is_heading(line):
    # Determine if a line is a heading
    ls = line.split(' ')
    # Titles rarely end with these words
    False_end = ['by', 'as', 'in', 'and', 'that']
    if len(ls) < 10 and ls[-1] not in False_end:
        rx = '^[A-Z][^?]*[^?:]$|^title$|^abstract$'  # regex heuristic rules
        res = re.match(rx, line)
        return True if res else False
    return False

def is_main_heading(line, judge_mask=False):
    '''
    Assume that the line is a heading, determine if it is a main heading
    A main heading is either a typical main section heading, or it contains lexical cues that are considered important for judgement.
    '''
    if len(line.split(' ')) <= 4:
        if judge_mask:    # if the aim is to judge whether the sentence should be skipped
            lex_cue = 'background|related|conclusion'  # |related work
        else:
            lex_cue = 'title|abstract|introduction|background|related|conclusion|model|models|method|methods|approach|architecture|system|application|experiment|experiments|experimental setup|implementation|hyperparameters|training|result|results|ablation|baseline|evaluation'  # |related work
        exp = re.compile(lex_cue)
        # Decide if it is a main heading
        return True if exp.search(line.lower()) else False
    else:
        return False

# Determin if a sentence conforms to a specific case method.
# There are three case methods in all, eg: Attention Is All You Need; ATTENTION IS ALL YOU NEED; Attention is all you need

def check_case(line, flag):
    if flag == 1:
        match = re.search(r'[a-z]', line)
        if match:
            return False
        return True
    else:
        wd_num = 0
        words = line.split(' ')
        if flag == 0:
            stp_wd = ['a', 'an', 'and', 'the', 'or', 'if', 'by', 'as', 'to', 
            'of', 'for', 'in', 'on', 'but', 'via', 'nor', 'with']
            if not words[0].istitle():
                wd_num += 1
            if len(words) > 1:
                if not words[-1].istitle():
                    wd_num += 1
                for word in words[1:-1]:
                    if not word.istitle() and word not in stp_wd:
                        wd_num += 1
            return wd_num <= math.ceil(len(words)/5)
        if flag == 2:
            if not words[0].istitle():
                wd_num += 1
            for word in words[1:]:
                if re.match(r'[A-Z]', word):
                    wd_num += 1
            return wd_num <= math.ceil(len(words)/3)

# read the relevant files from the folder of one paper, and produce a data table for that paper.
def load_paper_sentence(sent_path, label_path):
    sent = []
    count = [0, 0, 0]
    task, index = sent_path.split(sep)[-3:-1]
    # Decide the case type of the titles in this paper, by counting over the main headings and find the maximum
    with open(sent_path, 'r') as f:
        while(True):
            line = f.readline().rstrip("\n")
            if line:
                if is_heading(line) and is_main_heading(line):
                    for m in range(3):
                        if check_case(line, m):
                            count[m] += 1
            else:
                break
    ocr_path = sent_path[:-14]+'Grobid-out.txt'
    with open(ocr_path, 'r') as f:
        fl=f.readlines()
    title_ls = []
    for i in range(len(fl)):
        if fl[i]=='\n':
            if i<(len(fl)-1):
                title_ls.append(fl[i+1].rstrip())
        if fl[i].rstrip().lower() in ['title','abstract','introduction']:
            title_ls.append(fl[i].rstrip())

    with open(sent_path, 'r') as f:
        i = 0
        flg = count.index(max(count))
        # two string buffers, storing the heading and the main heading respectively
        heading, main_h = '', ''
        ofs1 = ofs3 = 0
        while(True):
            i += 1
            line = f.readline().rstrip("\n")
            if line:
                if line in title_ls:
                    ofs3 = 0
                else:
                    ofs3 += 1
                if is_heading(line) and check_case(line, flg):
                    heading = line    # update the heading buffer
                    if is_main_heading(line):
                        ofs1 = 0
                        main_h = line    # update the main heading buffer too
                        # The line itself is a main heading, no heading needs to be stored.
                        sent.append(
                            [i, line, '', '', task, index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 1, 0, None])
                    else:
                        ofs1 += 1
                        # for plain headings, store the main heading it belongs to.
                        # judge if it should be masked
                        if is_main_heading(main_h, judge_mask=True):
                            sent.append([i, line, main_h, '', task,
                                         index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 0, 0, None])
                        else:
                            sent.append([i, line, main_h, '', task,
                                         index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 1, 0, None])
                else:
                    # For plain text line, store both the heading and the main heading.
                    ofs1 += 1
                    if is_main_heading(main_h, judge_mask=True):
                        sent.append([i, line, main_h, heading, 
                                     task, index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 0, 0, None])
                    else:
                        sent.append([i, line, main_h, heading, 
                                     task, index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 1, 0, None])
            else:
                break
    for i in range(1,len(sent)):
        if sent[i][9]==0:
            sof = sent[i-1][9]
            if sof>1:
                for j in range(i-sof,i):
                    sent[j][10] = sent[j][9]/sof
        if sent[i][13] == 0:
            sof = sent[i-1][13]
            if sof>1:
                for j in range(i-sof, i):
                    sent[j][14] = sent[j][13]/sof
        if i == len(sent)-1:
            sof = sent[i][9]
            if sof > 1:
                for j in range(i-sof+1, i+1):
                    sent[j][10] = sent[j][9]/sof
            sof = sent[i][13]
            if sof > 1:
                for j in range(i-sof+1, i+1):
                    sent[j][14] = sent[j][13]/sof
        sent[i][12] = sent[i][11]/len(sent)

    # slice the sentence with the span of characters, returns the span of words
    def get_word_idx(sent, start, end):
        ls = sent.split(' ')
        if isinstance(start, str):
            start = int(start)
        if isinstance(end, str):
            end = int(end)
        # if the span of characters doesn't conform to word boundaries, 'st' and 'en' will remain 0.
        st, en = 0, 0
        length = [len(word) for word in ls]
        count = 0
        for i in range(len(ls)):
            if start == count:
                st = i
                break
            count += (length[i]+1)
        for j in range(st, len(ls)):
            count += (length[j]+1)
            if end == (count-1):
                en = j + 1
                break
        return st, en

    # Mark the label of contribution-ralated sentences, and initialize their BIO tag sequences.
    with open(label_path, 'r') as f:
        while(True):
            line = f.readline().rstrip("\n")
            if line:
                sent[int(line)-1][-2] = 1
                sent[int(line)-1][6] = ['O'] * \
                    len(sent[int(line)-1][1].split(' '))
            else:
                break

    # go over the entities and change the corresponding part of BIO sequences
    ent_path = sep.join(label_path.split(sep)[:-1]+['entities.txt'])
    with open(ent_path, 'r') as f:
        while(True):
            line = f.readline().rstrip("\n")
            if line:
                info = line.split('\t')
                sentence = sent[int(info[0])-1][1]
                # if sentence.split(' ')[0].lower()[1:] == sentence.split(' ')[0][1:]:
                # sentence = sentence[0].lower() + sentence[1:]
                st, en = get_word_idx(sentence, info[1], info[2])
                phrase = info[3].strip()
                # If the span of characters does not match the given phrase, use the given phrase instead
                if ' '.join(sentence.split(' ')[st: en]).strip() != phrase:
                    st_char = (' ' + sentence).find(' ' + phrase + ' ')
                    st, en = get_word_idx(
                        sentence, st_char, st_char + len(phrase))
                    if st == 0 and en == 0:
                        print(
                            f'Could not find the phrase \'{info[3]}\' in the {int(info[0])}th sentence of \'{task}\' paper {index}')
                        continue
                    else:
                        print(
                            f'In the {int(info[0])}th sentence of \'{task}\' paper {index}, the entity \'{info[3]}\' is not in the span ({info[1]}, {info[2]})')
                for j in range(st, en):
                    if sent[int(info[0])-1][6] is None:
                        print(
                            f'A phrase exists in the {int(info[0])}th sentence of \'{task}\' paper {index}, which is not labeled as a contribution sentence.')
                        sent[int(info[0])-1][6] = ['O'] * \
                            len(sent[int(info[0])-1][1].split(' '))
                    if j == st:
                        sent[int(info[0])-1][6][j] = 'B'
                    else:
                        sent[int(info[0])-1][6][j] = 'I'
            else:
                break

    # decide which information unit each positive sentence belongs to.
    j_dir = sep.join(sent_path.split(sep)[:-1]) + sep + 'info-units'
    for unit in os.listdir(j_dir):  # For each json file representing an information unit
        js_file = os.path.join(j_dir, unit)
        try:
            with open(js_file, 'r') as f:
                data = json.load(f, strict=False)
            lst = find_source(data, [])
            if "TITLE" in lst:  # When the title is a source sentence, sometimes it is abbreviated as 'TITLE'
                sent[1][-1] = unit[:-5]
            for j in range(len(sent)):
                if sent[j][1] in lst:
                    sent[j][-1] = unit[:-5]
        except json.JSONDecodeError as e:
            js_position = sep.join(js_file.split(sep)[-4:])
            print(f'JSONDecodeError in {js_position}\n', e)
            continue

    # given a sequence of BIO tags, get the list of tuples representing spans of entities
    def get_entity_spans(ls):
        spans = []
        for i in range(len(ls)):
            st, ed = 0, 0
            if ls[i] == 'B':
                st, ed = i, i + 1
                for j in range(i+1, len(ls)):
                    if ls[j] == 'I':
                        ed += 1
                    else:
                        break
                spans.append((st, ed))
        return spans

    for i in range(len(sent)):
        if sent[i][6] is not None:
            sent[i][8] = sent[i][7] = sent[i][6]

    # try to find the SPO(Subject, Predicate, Object) type of each phrase
    aux = []
    for i in range(len(sent)):
        if sent[i][6] is not None:
            tup_ls = get_entity_spans(sent[i][6])
            # use three booleans to indicate if the phrase has ever been a subject, predicate or object
            tuple_ls = [[0, 0, 0, tup] for tup in tup_ls]
            word_ls = sent[i][1].split(' ')
            phrase_ls = [' '.join(word_ls[st:en]) for st, en in tup_ls]
            # store the sentence idx, tuple_ls, and phrase_ls.
            aux.append([i, tuple_ls, phrase_ls, []])        
    t_dir = sep.join(sent_path.split(sep)[:-1]) + sep + 'triples'
    paper_triple_stat = [0] * 5
    for unit in os.listdir(t_dir):
        t_file = os.path.join(t_dir, unit)
        js_file = os.path.join(j_dir, unit.replace('.txt', '.json'))
        try:
            with open(js_file,'r') as g:
                js = json.load(g, strict=False)
                js = {'Contribution': js}
        except json.JSONDecodeError as e:
            js_position = sep.join(js_file.split(sep)[-4:])
            print(f'JSONDecodeError in {js_position}\n', e)
            continue
        except FileNotFoundError as fe:
            print(fe)
            continue
        with open(t_file, 'r') as f:
            while(True):
                line = f.readline().rstrip("\n")
                if line:
                    # empty the temporary buffer
                    for a in range(len(aux)):
                        aux[a][3] = []
                    if line[0] == '(':
                        line = line[1:]
                    if line[-1] == ')':
                        line = line[:-1]
                    triple = line.split('||')
                    evidence = find_tri_sent(
                        js, triple, [], [], [])  # unit[:-4]
                    if not evidence:
                        js_position = sep.join(js_file.split(sep)[-4:]) #
                        paper_triple_stat[0] += 1
                        print(f'the triple \'{triple}\' not found in {js_position}')
                    else:                       
                        cands = evidence[0].split('\n')
                        for i in range(len(cands)):
                            for j in range(len(aux)):
                                if cands[i].strip() == sent[aux[j][0]][1]:
                                    for w in range(3):
                                        for k in range(len(aux[j][2])):
                                            if aux[j][2][k] == triple[w]:
                                                aux[j][3].append((w, k))
                                                break
                                    break
                        lens = [len(aux[j][3]) for j in range(len(aux))]
                        try:
                            paper_triple_stat[max(lens)] += 1
                        except IndexError:
                            print(f'List index out of range. The actual number of max is {max(lens)} for triple \'{triple}\' in\n', t_file)
                        if max(lens)!=0:
                            idx = lens.index(max(lens))
                            found = [0, 0, 0]
                            for t in range(len(aux[idx][3])):
                                w, k = aux[idx][3][t]
                                aux[idx][1][k][w] = 1
                                found[w] = 1
                            for i in range(3):
                                if found[i] == 0:
                                    for j in range(len(aux)):
                                        for w, k in aux[j][3]:
                                            if w == i and triple[w] == aux[j][2][k]:
                                                aux[j][1][k][w] = 1
                                                break
                                        else:
                                            continue
                                        break
                else:
                    break

    # An S-P-O type corresponds to a combination of boolean indicators
    # The 4 keys stand for 'predicate', 'subject', 'object', 'both subject and object' respectively.
    good_state = {'p': [0, 1, 0], 's': [1, 0, 0],
                    'ob': [0, 0, 1], 'b': [1, 0, 1]}
    for i in range(len(aux)):
        for item in aux[i][1]:
            if item[:3] not in good_state.values():
                # if the label of any phrase in the sentence cannot be decided,
                # delete the tag sequence to filter out this sentence
                sent[aux[i][0]][7] = sent[aux[i][0]][8] = None
                break

    for i in range(len(aux)):
        '''
        interprete the boolean states to phrase types according to the BIO_type setting,
        and change the corresponding parts in BIO sequences
        BIO_type=1: decide whether it is a predicate
        BIO_type=2: decide which of the four keys in 'good_state' it belongs to
        '''
        if sent[aux[i][0]][7] is not None:
            sent[aux[i][0]][7] = ['O']*len(sent[aux[i][0]][7])
            sent[aux[i][0]][8] = ['O']*len(sent[aux[i][0]][8])
            for item in aux[i][1]:
                st, en = item[3]
                if item[:3] == good_state['p']:
                    sent[aux[i][0]][7][st] = 'B-p'
                    for j in range(st+1, en):
                        sent[aux[i][0]][7][j] = 'I-p'
                else:
                    sent[aux[i][0]][7][st] = 'B-n'
                    for j in range(st+1, en):
                        sent[aux[i][0]][7][j] = 'I-n'
                for key, value in good_state.items():
                    if item[:3] == value:
                        sent[aux[i][0]][8][st] = 'B-'+key
                        for j in range(st+1, en):
                            sent[aux[i][0]][8][j] = 'I-'+key
    # print(f'paper triple stat: {paper_triple_stat}')
    return sent, paper_triple_stat

def load_data_sentence(file_path):
    # Get the data table of all the papers in file_path
    triple_stat = [0] * 5
    data = []
    for tuple in file_path:
        sentence_path, label_path = tuple
        paper_data, paper_triple_stat = load_paper_sentence(
        sentence_path, label_path)
        for i in range(5):
            triple_stat[i] += paper_triple_stat[i]
        data += paper_data
    return data

dirs = get_dir()
file_path = get_file_path(dirs)
data = load_data_sentence(file_path)

df = pd.DataFrame(data)
df.columns = ['idx', 'text', 'main_heading', 'heading',
              'topic', 'paper_idx', 'BIO', 'BIO_1', 'BIO_2', 'offset1', 'pro1', 'offset2', 'pro2', 'offset3', 'pro3', 'mask', 'bi_labels', 'labels']

df.to_csv('all_sent.csv', index=False)


Could not find the phrase 'https://github.com/allenai/scibert/' in the 9th sentence of 'relation-classification' paper 9

"all_sent.csv" and "pos_sent.csv" have been saved to ./interim


In [4]:
df1 = pd.read_csv('all_sent.csv')

In [5]:
df1

Unnamed: 0,idx,text,main_heading,heading,topic,paper_idx,BIO,BIO_1,BIO_2,offset1,pro1,offset2,pro2,offset3,pro3,mask,bi_labels,labels
0,1,title,,,text-classification,6,,,,0,0.000000,0,0.000000,0,0.000000,1,0,
1,2,Universal Sentence Encoder,title,,text-classification,6,"['B', 'I', 'I']","['B-n', 'I-n', 'I-n']","['B-ob', 'I-ob', 'I-ob']",1,0.000000,1,0.006757,1,0.000000,1,1,research-problem
2,3,abstract,,,text-classification,6,,,,0,0.000000,2,0.013514,0,0.000000,1,0,
3,4,We present models for encoding sentences into ...,abstract,abstract,text-classification,6,"['O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', ...","['O', 'O', 'O', 'O', 'B-n', 'I-n', 'I-n', 'I-n...","['O', 'O', 'O', 'O', 'B-ob', 'I-ob', 'I-ob', '...",1,0.111111,3,0.020270,1,0.111111,1,1,research-problem
4,5,The models are efficient and result in accurat...,abstract,abstract,text-classification,6,,,,2,0.222222,4,0.027027,2,0.222222,1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1875,268,We use almost the same biattention classificat...,Model,A.8 Sentiment classification,named-entity-recognition,4,,,,46,0.920000,267,0.981618,1,0.200000,1,0,
1876,269,A BCN model with a batch - normalized maxout n...,Model,A.8 Sentiment classification,named-entity-recognition,4,,,,47,0.940000,268,0.985294,2,0.400000,1,0,
1877,270,"To match the CoVe training setup , we only tra...",Model,A.8 Sentiment classification,named-entity-recognition,4,,,,48,0.960000,269,0.988971,3,0.600000,1,0,
1878,271,We use 300 -d hidden states for the biLSTM and...,Model,A.8 Sentiment classification,named-entity-recognition,4,,,,49,0.980000,270,0.992647,4,0.800000,1,0,


In [None]:
df1.head()

Unnamed: 0,idx,text,main_heading,heading,topic,paper_idx,BIO,BIO_1,BIO_2,offset1,pro1,offset2,pro2,offset3,pro3,mask,bi_labels,labels
0,1,title,,,text-classification,6,,,,0,0.0,0,0.0,0,0.0,1,0,
1,2,Universal Sentence Encoder,title,,text-classification,6,"['B', 'I', 'I']","['B-n', 'I-n', 'I-n']","['B-ob', 'I-ob', 'I-ob']",1,0.0,1,0.006757,1,0.0,1,1,research-problem
2,3,abstract,,,text-classification,6,,,,0,0.0,2,0.013514,0,0.0,1,0,
3,4,We present models for encoding sentences into ...,abstract,abstract,text-classification,6,"['O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', ...","['O', 'O', 'O', 'O', 'B-n', 'I-n', 'I-n', 'I-n...","['O', 'O', 'O', 'O', 'B-ob', 'I-ob', 'I-ob', '...",1,0.111111,3,0.02027,1,0.111111,1,1,research-problem
4,5,The models are efficient and result in accurat...,abstract,abstract,text-classification,6,,,,2,0.222222,4,0.027027,2,0.222222,1,0,


In [10]:
df1 = df1.drop(columns=['BIO','BIO_1','BIO_2','offset1','pro1','offset2','pro2','offset3','pro3'])

In [None]:
df1.fillna("nan")

Unnamed: 0,idx,text,main_heading,heading,topic,paper_idx,mask,bi_labels,labels
0,1,title,,,text_generation,5,1,0,
1,2,Improved Variational Autoencoders for Text Mod...,title,title,text_generation,5,1,1,research-problem
2,3,abstract,,,text_generation,5,1,0,
3,4,Recent work on generative text modeling has fo...,abstract,abstract,text_generation,5,1,1,research-problem
4,5,This negative result is so far poorly understo...,abstract,abstract,text_generation,5,1,0,
...,...,...,...,...,...,...,...,...,...
57076,268,We use almost the same biattention classificat...,Model,A.8 Sentiment classification,named-entity-recognition,4,1,0,
57077,269,A BCN model with a batch - normalized maxout n...,Model,A.8 Sentiment classification,named-entity-recognition,4,1,0,
57078,270,"To match the CoVe training setup , we only tra...",Model,A.8 Sentiment classification,named-entity-recognition,4,1,0,
57079,271,We use 300 -d hidden states for the biLSTM and...,Model,A.8 Sentiment classification,named-entity-recognition,4,1,0,


In [None]:
summary = ''
abstract = ''
sum_list = []
abs_list = []
score = []
y = 1
x = 0
k = 0
while k < 100:
  summary = ''
  abstract = ''
  y = 1
  
  while df1['idx'].iloc[x] == y:
    if df1['bi_labels'].iloc[x] == 1:
      summary = summary + '' + df1['text'].iloc[x]
    if df1['main_heading'].iloc[x]=='abstract':
      abstract = abstract + '' + df1['text'].iloc[x]
    x += 1
    y += 1
  print(summary)
  score.append(rouge.get_scores(summary, abstract))
  sum_list.append(summary)
  abs_list.append(abstract)
  k += 1

  




  


Improved Variational Autoencoders for Text Modeling using Dilated ConvolutionsRecent work on generative text modeling has found that variational autoencoders ( VAE ) with LSTM decoders perform worse than simpler LSTM language models ( Bowman et al. , 2015 ) .We propose the use of a dilated CNN as a decoder in VAE , inspired by the recent success of using CNNs for audio , image and language modeling ( van den .In contrast with prior work where extremely large CNNs are used , we exploit the dilated CNN for its flexibility in varying the amount of conditioning context .We use an LSTM as an encoder for VAE and explore LSTMs and CNNs as decoders .For CNNs , we explore several different configurations .We set the convolution filter size to be 3 and gradually increase the depth and dilation from [ 1 , 2 , 4 ] , ] to .We use Gumbel - softmax to sample y from q ( y|x ) .We use a vocabulary size of 20 k for both data sets and set the word embedding dimension to be 512 .The number of channels for

In [None]:
import csv 
header = ["SN","Abstract", "Summary", "Rogue Score"]
x = 0

with open("out.csv", 'w',encoding='UTF8',newline='') as f:
  writer = csv.writer(f)
  writer.writerow(header)
  while x < len(score):
    writer.writerow([x,abs_list[x],sum_list[x],score[x]])
    x += 1

In [6]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge
rouge = Rouge()
rouge.get_scores(summary, abstract)

[{'rouge-1': {'r': 0.5254237288135594,
   'p': 0.2897196261682243,
   'f': 0.3734939713216722},
  'rouge-2': {'r': 0.23976608187134502,
   'p': 0.09808612440191387,
   'f': 0.13921901115942834},
  'rouge-l': {'r': 0.5084745762711864,
   'p': 0.2803738317757009,
   'f': 0.3614457785505879}}]

Using BERT for model training

In [7]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 5.0 MB/s 
[?25hCollecting transformers>=4.6.0
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 53.9 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.8 MB/s 
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.13.2-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 47.4 MB/s 
Collecting streamlit
  Downloading streamlit-1.12.2-py2.py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 48.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014

In [8]:
import logging

import pandas as pd
import sklearn
import random
from simpletransformers.classification import ClassificationModel, ClassificationArgs


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

df = pd.read_csv('/content/all_sent.csv')
df = df.drop(columns=['BIO', 'BIO_1', 'BIO_2', 'labels']).rename(
    columns={'bi_labels': 'labels'})
df['title'] = df['main_heading'] + ': ' + df['heading']
df.loc[((df['main_heading'] == df['heading']) | (
    pd.isnull(df['heading']))), 'title'] = df['main_heading']
df['title'] = df['title'].fillna('')
df['paper'] = df['topic'] + df['paper_idx'].astype(str)
ids = df["paper"].unique()
random.seed(1)
random.shuffle(ids)
bound = int(0.9*len(ids))

train_df = df.set_index("paper").loc[ids[:bound]].reset_index()
eval_df = df.set_index("paper").loc[ids[bound:]].reset_index()
train_df = train_df.sample(frac=1, random_state=1)

# Some sentences are in the 'related work' or 'conclusion' section, and should be masked out.
train_df = train_df[train_df['mask'] == 1]
eval_df = eval_df[eval_df['mask'] == 1]

train_pos = train_df[train_df['labels'] == 1]
train_neg = train_df[train_df['labels'] == 0]
imbalance_ratio = len(train_neg) / len(train_pos)

# Create a ClassificationModel
model_args = ClassificationArgs()
model_args.use_early_stopping = True
model_args.early_stopping_metric = "F1_score"
model_args.early_stopping_metric_minimize = False
model_args.early_stopping_patience = 2
model_args.early_stopping_consider_epochs = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True

model_args.downsample = 1.0
model_args.normalize_ofs = True
model_args.out_learning_rate = 0.001

model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.output_dir = 'binary/'
model_args.best_model_dir = 'binary/best_model'
model_args.save_model_every_epoch = True
model_args.save_steps = -1
model_args.manual_seed = 1
model_args.fp16 = False
model_args.num_train_epochs = 3
model_args.train_batch_size = 64
model_args.use_multiprocessing = False  # set to True if cpu memory is enough
model_args.gradient_accumulation_steps = 4
model_args.learning_rate = 0.001
model_args.scheduler = "polynomial_decay_schedule_with_warmup"
model_args.polynomial_decay_schedule_power = 0.5
model_args.warmup_steps = 200
model_args.do_lower_case = True


# Create a TransformerModel
model = ClassificationModel(
    "bert",
    "allenai/scibert_scivocab_uncased",
    weight=[1, imbalance_ratio/model_args.downsample],
    use_cuda=False,
    args=model_args,
)

model.train_model(train_df, eval_df=eval_df,
                    F1_score=sklearn.metrics.f1_score)



Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Downloading vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

(15,
 defaultdict(list,
             {'global_step': [5, 10, 15],
              'train_loss': [0.6684433817863464,
               0.6455604434013367,
               0.26190826296806335],
              'mcc': [0.17600664594905818,
               0.35262491742120977,
               0.30488250292212377],
              'tp': [36, 33, 20],
              'tn': [39, 109, 152],
              'fp': [148, 78, 35],
              'fn': [1, 4, 17],
              'auroc': [0.6461916461916463,
               0.7823384882208412,
               0.8025726261020378],
              'auprc': [0.26833763755825285,
               0.39514813676764005,
               0.4736273347754718],
              'F1_score': [0.3257918552036199,
               0.44594594594594594,
               0.4347826086956522],
              'eval_loss': [0.7000990786722728,
               0.582463946725641,
               0.45661281182297636]}))

###Preprocessing of Test Data

In [48]:
def find_source(data, ls = []):
    if isinstance(data,dict):
        for key in data.keys():
            if key=="from sentence":
                sentences=data[key].split('\n') # The value can be one or more sentences
                for s in sentences:
                    ls.append(s.strip())
            elif isinstance(data[key], dict):
                find_source(data[key],ls)
            elif isinstance(data[key], list):
                for i in data[key]:
                    find_source(i,ls)
    elif isinstance(data,list):
        for i in data:
            find_source(i,ls)
    return ls    # might have repeated sentences

# determine if a triple is contained in the trace of traversing the json object in a depth-firsr manner
def is_contained(trace, triple):
    if len(trace) >= len(triple):
        for i in range(len(trace)-2):
            if trace[i:i+3] == triple:
                return True
        return False
    else:
        return False

# Get the previous two words in a trace,
# to be prefixed to the coordinated items in a list or a dictionary
def get_prefix(data, trace):
    if isinstance(data, dict) or isinstance(data, list):
        return trace[-2:]

# traverse the json object recursively,
# to find the source sentence of the triple
def find_tri_sent(data, triple, trace=[], ls=[], prefix=[]):
    # Parse the json file recursively and return a list of source sentences
    if isinstance(data, dict):
        for i, key in enumerate(data.keys()):
            if key != "from sentence":
                if prefix and i != 0:
                    trace += prefix
                trace.append(key)
                find_tri_sent(data[key], triple, trace, ls,
                              get_prefix(data[key], trace))
            else:
                if is_contained(trace, triple):
                    ls.append(data[key].strip())
    elif isinstance(data, list):
        for i, item in enumerate(data):
            if prefix and i != 0:
                trace += prefix
            find_tri_sent(item, triple, trace, ls, prefix)
    elif isinstance(data, str):
        trace.append(data)
    return ls

In [51]:
'''
Data preprocessing and cleaning
get a dataframe of all sentences, together with relevant information to the tasks
'''
import os
import re
import math
import pandas as pd
import json


base_dir = '/content/drive/MyDrive/Colab Notebooks/Scispace/Test_Data'
sep = os.path.sep

def get_dir(topic_ls=None, paper_ls=None):
    # Get the list of paper directories
    dir_ls = []
    if topic_ls is None:
        topic_ls = os.listdir(base_dir)
        #topic_ls.remove('train-README.md')
        #topic_ls.remove('trial-README.md')
    if paper_ls is None:
        for topic in topic_ls:
            paper_ls = os.listdir(os.path.join(base_dir, topic))
            for i in paper_ls:
                dir_ls.append(os.path.join(base_dir, topic, i))
    else:
        for topic in topic_ls:
            for i in paper_ls:
                dir_ls.append(os.path.join(base_dir, topic, str(i)))
    return dir_ls

def get_file_path(dirs):
    # Get the relevant files from each directory of paper.
    rx = '(.*Stanza-out.txt$)|(^sentences.txt$)'
    file_path = []
    for dir in dirs:
        new = ['', '']  # stores the paths of the sentence file and the label file
        for file in os.listdir(dir):
            res = re.match(rx, file)
            if res:
                if res.group(1):
                    new[0] = os.path.join(dir, file)
                if res.group(2):
                    new[1] = os.path.join(dir, file)
        file_path.append(new)
    return file_path

def is_heading(line):
    # Determine if a line is a heading
    ls = line.split(' ')
    # Titles rarely end with these words
    False_end = ['by', 'as', 'in', 'and', 'that']
    if len(ls) < 10 and ls[-1] not in False_end:
        rx = '^[A-Z][^?]*[^?:]$|^title$|^abstract$'  # regex heuristic rules
        res = re.match(rx, line)
        return True if res else False
    return False

def is_main_heading(line, judge_mask=False):
    '''
    Assume that the line is a heading, determine if it is a main heading
    A main heading is either a typical main section heading, or it contains lexical cues that are considered important for judgement.
    '''
    if len(line.split(' ')) <= 4:
        if judge_mask:    # if the aim is to judge whether the sentence should be skipped
            lex_cue = 'background|related|conclusion'  # |related work
        else:
            lex_cue = 'title|abstract|introduction|background|related|conclusion|model|models|method|methods|approach|architecture|system|application|experiment|experiments|experimental setup|implementation|hyperparameters|training|result|results|ablation|baseline|evaluation'  # |related work
        exp = re.compile(lex_cue)
        # Decide if it is a main heading
        return True if exp.search(line.lower()) else False
    else:
        return False

# Determin if a sentence conforms to a specific case method.
# There are three case methods in all, eg: Attention Is All You Need; ATTENTION IS ALL YOU NEED; Attention is all you need

def check_case(line, flag):
    if flag == 1:
        match = re.search(r'[a-z]', line)
        if match:
            return False
        return True
    else:
        wd_num = 0
        words = line.split(' ')
        if flag == 0:
            stp_wd = ['a', 'an', 'and', 'the', 'or', 'if', 'by', 'as', 'to', 
            'of', 'for', 'in', 'on', 'but', 'via', 'nor', 'with']
            if not words[0].istitle():
                wd_num += 1
            if len(words) > 1:
                if not words[-1].istitle():
                    wd_num += 1
                for word in words[1:-1]:
                    if not word.istitle() and word not in stp_wd:
                        wd_num += 1
            return wd_num <= math.ceil(len(words)/5)
        if flag == 2:
            if not words[0].istitle():
                wd_num += 1
            for word in words[1:]:
                if re.match(r'[A-Z]', word):
                    wd_num += 1
            return wd_num <= math.ceil(len(words)/3)

# read the relevant files from the folder of one paper, and produce a data table for that paper.
def load_paper_sentence(sent_path, label_path):
    sent = []
    count = [0, 0, 0]
    task, index = sent_path.split(sep)[-3:-1]
    # Decide the case type of the titles in this paper, by counting over the main headings and find the maximum
    with open(sent_path, 'r') as f:
        while(True):
            line = f.readline().rstrip("\n")
            if line:
                if is_heading(line) and is_main_heading(line):
                    for m in range(3):
                        if check_case(line, m):
                            count[m] += 1
            else:
                break
    ocr_path = sent_path[:-14]+'Grobid-out.txt'
    with open(ocr_path, 'r') as f:
        fl=f.readlines()
    title_ls = []
    for i in range(len(fl)):
        if fl[i]=='\n':
            if i<(len(fl)-1):
                title_ls.append(fl[i+1].rstrip())
        if fl[i].rstrip().lower() in ['title','abstract','introduction']:
            title_ls.append(fl[i].rstrip())

    with open(sent_path, 'r') as f:
        i = 0
        flg = count.index(max(count))
        # two string buffers, storing the heading and the main heading respectively
        heading, main_h = '', ''
        ofs1 = ofs3 = 0
        while(True):
            i += 1
            line = f.readline().rstrip("\n")
            if line:
                if line in title_ls:
                    ofs3 = 0
                else:
                    ofs3 += 1
                if is_heading(line) and check_case(line, flg):
                    heading = line    # update the heading buffer
                    if is_main_heading(line):
                        ofs1 = 0
                        main_h = line    # update the main heading buffer too
                        # The line itself is a main heading, no heading needs to be stored.
                        sent.append(
                            [i, line, '', '', task, index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 1, 0, None])
                    else:
                        ofs1 += 1
                        # for plain headings, store the main heading it belongs to.
                        # judge if it should be masked
                        if is_main_heading(main_h, judge_mask=True):
                            sent.append([i, line, main_h, '', task,
                                         index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 0, 0, None])
                        else:
                            sent.append([i, line, main_h, '', task,
                                         index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 1, 0, None])
                else:
                    # For plain text line, store both the heading and the main heading.
                    ofs1 += 1
                    if is_main_heading(main_h, judge_mask=True):
                        sent.append([i, line, main_h, heading, 
                                     task, index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 0, 0, None])
                    else:
                        sent.append([i, line, main_h, heading, 
                                     task, index, None, None, None, ofs1, 0, i-1, 0, ofs3, 0, 1, 0, None])
            else:
                break
    for i in range(1,len(sent)):
        if sent[i][9]==0:
            sof = sent[i-1][9]
            if sof>1:
                for j in range(i-sof,i):
                    sent[j][10] = sent[j][9]/sof
        if sent[i][13] == 0:
            sof = sent[i-1][13]
            if sof>1:
                for j in range(i-sof, i):
                    sent[j][14] = sent[j][13]/sof
        if i == len(sent)-1:
            sof = sent[i][9]
            if sof > 1:
                for j in range(i-sof+1, i+1):
                    sent[j][10] = sent[j][9]/sof
            sof = sent[i][13]
            if sof > 1:
                for j in range(i-sof+1, i+1):
                    sent[j][14] = sent[j][13]/sof
        sent[i][12] = sent[i][11]/len(sent)

    # slice the sentence with the span of characters, returns the span of words
    def get_word_idx(sent, start, end):
        ls = sent.split(' ')
        if isinstance(start, str):
            start = int(start)
        if isinstance(end, str):
            end = int(end)
        # if the span of characters doesn't conform to word boundaries, 'st' and 'en' will remain 0.
        st, en = 0, 0
        length = [len(word) for word in ls]
        count = 0
        for i in range(len(ls)):
            if start == count:
                st = i
                break
            count += (length[i]+1)
        for j in range(st, len(ls)):
            count += (length[j]+1)
            if end == (count-1):
                en = j + 1
                break
        return st, en

    # Mark the label of contribution-ralated sentences, and initialize their BIO tag sequences.
    with open(label_path, 'r') as f:
        while(True):
            line = f.readline().rstrip("\n")
            if line:
                sent[int(line)-1][-2] = 1
                sent[int(line)-1][6] = ['O'] * \
                    len(sent[int(line)-1][1].split(' '))
            else:
                break

    # go over the entities and change the corresponding part of BIO sequences
    ent_path = sep.join(label_path.split(sep)[:-1]+['entities.txt'])
    with open(ent_path, 'r') as f:
        while(True):
            line = f.readline().rstrip("\n")
            if line:
                info = line.split('\t')
                sentence = sent[int(info[0])-1][1]
                # if sentence.split(' ')[0].lower()[1:] == sentence.split(' ')[0][1:]:
                # sentence = sentence[0].lower() + sentence[1:]
                st, en = get_word_idx(sentence, info[1], info[2])
                phrase = info[3].strip()
                # If the span of characters does not match the given phrase, use the given phrase instead
                if ' '.join(sentence.split(' ')[st: en]).strip() != phrase:
                    st_char = (' ' + sentence).find(' ' + phrase + ' ')
                    st, en = get_word_idx(
                        sentence, st_char, st_char + len(phrase))
                    if st == 0 and en == 0:
                        print(
                            f'Could not find the phrase \'{info[3]}\' in the {int(info[0])}th sentence of \'{task}\' paper {index}')
                        continue
                    else:
                        print(
                            f'In the {int(info[0])}th sentence of \'{task}\' paper {index}, the entity \'{info[3]}\' is not in the span ({info[1]}, {info[2]})')
                for j in range(st, en):
                    if sent[int(info[0])-1][6] is None:
                        print(
                            f'A phrase exists in the {int(info[0])}th sentence of \'{task}\' paper {index}, which is not labeled as a contribution sentence.')
                        sent[int(info[0])-1][6] = ['O'] * \
                            len(sent[int(info[0])-1][1].split(' '))
                    if j == st:
                        sent[int(info[0])-1][6][j] = 'B'
                    else:
                        sent[int(info[0])-1][6][j] = 'I'
            else:
                break

    # decide which information unit each positive sentence belongs to.
    j_dir = sep.join(sent_path.split(sep)[:-1]) + sep + 'info-units'
    for unit in os.listdir(j_dir):  # For each json file representing an information unit
        js_file = os.path.join(j_dir, unit)
        try:
            with open(js_file, 'r') as f:
                data = json.load(f, strict=False)
            lst = find_source(data, [])
            if "TITLE" in lst:  # When the title is a source sentence, sometimes it is abbreviated as 'TITLE'
                sent[1][-1] = unit[:-5]
            for j in range(len(sent)):
                if sent[j][1] in lst:
                    sent[j][-1] = unit[:-5]
        except json.JSONDecodeError as e:
            js_position = sep.join(js_file.split(sep)[-4:])
            print(f'JSONDecodeError in {js_position}\n', e)
            continue

    # given a sequence of BIO tags, get the list of tuples representing spans of entities
    def get_entity_spans(ls):
        spans = []
        for i in range(len(ls)):
            st, ed = 0, 0
            if ls[i] == 'B':
                st, ed = i, i + 1
                for j in range(i+1, len(ls)):
                    if ls[j] == 'I':
                        ed += 1
                    else:
                        break
                spans.append((st, ed))
        return spans

    for i in range(len(sent)):
        if sent[i][6] is not None:
            sent[i][8] = sent[i][7] = sent[i][6]

    # try to find the SPO(Subject, Predicate, Object) type of each phrase
    aux = []
    for i in range(len(sent)):
        if sent[i][6] is not None:
            tup_ls = get_entity_spans(sent[i][6])
            # use three booleans to indicate if the phrase has ever been a subject, predicate or object
            tuple_ls = [[0, 0, 0, tup] for tup in tup_ls]
            word_ls = sent[i][1].split(' ')
            phrase_ls = [' '.join(word_ls[st:en]) for st, en in tup_ls]
            # store the sentence idx, tuple_ls, and phrase_ls.
            aux.append([i, tuple_ls, phrase_ls, []])        
    t_dir = sep.join(sent_path.split(sep)[:-1]) + sep + 'triples'
    paper_triple_stat = [0] * 5
    for unit in os.listdir(t_dir):
        t_file = os.path.join(t_dir, unit)
        js_file = os.path.join(j_dir, unit.replace('.txt', '.json'))
        try:
            with open(js_file,'r') as g:
                js = json.load(g, strict=False)
                js = {'Contribution': js}
        except json.JSONDecodeError as e:
            js_position = sep.join(js_file.split(sep)[-4:])
            print(f'JSONDecodeError in {js_position}\n', e)
            continue
        except FileNotFoundError as fe:
            print(fe)
            continue
        with open(t_file, 'r') as f:
            while(True):
                line = f.readline().rstrip("\n")
                if line:
                    # empty the temporary buffer
                    for a in range(len(aux)):
                        aux[a][3] = []
                    if line[0] == '(':
                        line = line[1:]
                    if line[-1] == ')':
                        line = line[:-1]
                    triple = line.split('||')
                    evidence = find_tri_sent(
                        js, triple, [], [], [])  # unit[:-4]
                    if not evidence:
                        js_position = sep.join(js_file.split(sep)[-4:]) #
                        paper_triple_stat[0] += 1
                        print(f'the triple \'{triple}\' not found in {js_position}')
                    else:                       
                        cands = evidence[0].split('\n')
                        for i in range(len(cands)):
                            for j in range(len(aux)):
                                if cands[i].strip() == sent[aux[j][0]][1]:
                                    for w in range(3):
                                        for k in range(len(aux[j][2])):
                                            if aux[j][2][k] == triple[w]:
                                                aux[j][3].append((w, k))
                                                break
                                    break
                        lens = [len(aux[j][3]) for j in range(len(aux))]
                        try:
                            paper_triple_stat[max(lens)] += 1
                        except IndexError:
                            print(f'List index out of range. The actual number of max is {max(lens)} for triple \'{triple}\' in\n', t_file)
                        if max(lens)!=0:
                            idx = lens.index(max(lens))
                            found = [0, 0, 0]
                            for t in range(len(aux[idx][3])):
                                w, k = aux[idx][3][t]
                                aux[idx][1][k][w] = 1
                                found[w] = 1
                            for i in range(3):
                                if found[i] == 0:
                                    for j in range(len(aux)):
                                        for w, k in aux[j][3]:
                                            if w == i and triple[w] == aux[j][2][k]:
                                                aux[j][1][k][w] = 1
                                                break
                                        else:
                                            continue
                                        break
                else:
                    break

    # An S-P-O type corresponds to a combination of boolean indicators
    # The 4 keys stand for 'predicate', 'subject', 'object', 'both subject and object' respectively.
    good_state = {'p': [0, 1, 0], 's': [1, 0, 0],
                    'ob': [0, 0, 1], 'b': [1, 0, 1]}
    for i in range(len(aux)):
        for item in aux[i][1]:
            if item[:3] not in good_state.values():
                # if the label of any phrase in the sentence cannot be decided,
                # delete the tag sequence to filter out this sentence
                sent[aux[i][0]][7] = sent[aux[i][0]][8] = None
                break

    for i in range(len(aux)):
        '''
        interprete the boolean states to phrase types according to the BIO_type setting,
        and change the corresponding parts in BIO sequences
        BIO_type=1: decide whether it is a predicate
        BIO_type=2: decide which of the four keys in 'good_state' it belongs to
        '''
        if sent[aux[i][0]][7] is not None:
            sent[aux[i][0]][7] = ['O']*len(sent[aux[i][0]][7])
            sent[aux[i][0]][8] = ['O']*len(sent[aux[i][0]][8])
            for item in aux[i][1]:
                st, en = item[3]
                if item[:3] == good_state['p']:
                    sent[aux[i][0]][7][st] = 'B-p'
                    for j in range(st+1, en):
                        sent[aux[i][0]][7][j] = 'I-p'
                else:
                    sent[aux[i][0]][7][st] = 'B-n'
                    for j in range(st+1, en):
                        sent[aux[i][0]][7][j] = 'I-n'
                for key, value in good_state.items():
                    if item[:3] == value:
                        sent[aux[i][0]][8][st] = 'B-'+key
                        for j in range(st+1, en):
                            sent[aux[i][0]][8][j] = 'I-'+key
    # print(f'paper triple stat: {paper_triple_stat}')
    return sent, paper_triple_stat

def load_data_sentence(file_path):
    # Get the data table of all the papers in file_path
    triple_stat = [0] * 5
    data = []
    for tuple in file_path:
        sentence_path, label_path = tuple
        paper_data, paper_triple_stat = load_paper_sentence(
        sentence_path, label_path)
        for i in range(5):
            triple_stat[i] += paper_triple_stat[i]
        data += paper_data
    return data

dirs = get_dir()
file_path = get_file_path(dirs)
data = load_data_sentence(file_path)

df = pd.DataFrame(data)
df.columns = ['idx', 'text', 'main_heading', 'heading',
              'topic', 'paper_idx', 'BIO', 'BIO_1', 'BIO_2', 'offset1', 'pro1', 'offset2', 'pro2', 'offset3', 'pro3', 'mask', 'bi_labels', 'labels']

df.to_csv('test_data.csv', index=False)

Predicting Contribution statement based on test data

In [52]:
import logging
import pandas as pd
import sklearn
from simpletransformers.classification import ClassificationModel, ClassificationArgs

logging.basicConfig(level=logging.WARNING)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

df = pd.read_csv('./test_data.csv')
df = df.drop(columns=['BIO_2', 'labels']).rename(
    columns={'bi_labels': 'labels'})
df['title'] = df['main_heading'] + ': ' + df['heading']
df.loc[((df['main_heading'] == df['heading']) | (
    pd.isnull(df['heading']))), 'title'] = df['main_heading']
df['title'] = df['title'].fillna('')

model_args = ClassificationArgs()

model_args.normalize_ofs = True
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.use_multiprocessing = False
model_args.manual_seed = 1
model_args.fp16 = False
model_args.do_lower_case = True

# Create a TransformerModel
model = ClassificationModel(
    "bert",
    "./binary/best_model",
    use_cuda = False,
    args=model_args,
)

result, model_outputs, wrong_predictions = model.eval_model(
    df, F1_score=sklearn.metrics.f1_score)

predictions = model_outputs.argmax(axis=1)
# select the sentences that are predicted positive, to be the input for subtask 2
mask = df['mask'].values
# sentences in the 'related work' or 'conclusion' sections are forced to be negative
predictions = predictions * mask
pos = df[predictions == 1]
pos.to_csv('pos_predict_out.csv', index=False)

  0%|          | 0/267 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/34 [00:00<?, ?it/s]

In [14]:
df

Unnamed: 0,idx,text,main_heading,heading,topic,paper_idx,BIO,BIO_1,offset1,pro1,offset2,pro2,offset3,pro3,mask,labels,title
0,1,title,,,text-classification,6,,,0,0.000000,0,0.000000,0,0.000000,1,0,
1,2,Universal Sentence Encoder,title,,text-classification,6,"['B', 'I', 'I']","['B-n', 'I-n', 'I-n']",1,0.000000,1,0.006757,1,0.000000,1,1,title
2,3,abstract,,,text-classification,6,,,0,0.000000,2,0.013514,0,0.000000,1,0,
3,4,We present models for encoding sentences into ...,abstract,abstract,text-classification,6,"['O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', ...","['O', 'O', 'O', 'O', 'B-n', 'I-n', 'I-n', 'I-n...",1,0.111111,3,0.020270,1,0.111111,1,1,abstract
4,5,The models are efficient and result in accurat...,abstract,abstract,text-classification,6,,,2,0.222222,4,0.027027,2,0.222222,1,0,abstract
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,144,The sentence level embeddings surpass the perf...,Conclusion,Conclusion,text-classification,6,,,2,0.333333,143,0.966216,2,0.333333,0,0,Conclusion
144,145,Models that make use of sentence and word leve...,Conclusion,Conclusion,text-classification,6,,,3,0.500000,144,0.972973,3,0.500000,0,0,Conclusion
145,146,We observe that transfer learning is most help...,Conclusion,Conclusion,text-classification,6,,,4,0.666667,145,0.979730,4,0.666667,0,0,Conclusion
146,147,The encoding models make different trade - off...,Conclusion,Conclusion,text-classification,6,,,5,0.833333,146,0.986486,5,0.833333,0,0,Conclusion


In [53]:
df1=pd.read_csv('/content/pos_predict_out.csv')

In [25]:
df1['labels']

0      1
1      0
2      0
3      0
4      0
      ..
904    0
905    0
906    0
907    0
908    0
Name: labels, Length: 909, dtype: int64

Extratcting all the contribution statement from prediction and generating abstractive summary using google pegasus.




In [54]:
from rouge import Rouge
rouge = Rouge()

In [43]:
len(abstract)

934

In [17]:
len(df1['idx'])

909

In [16]:
!pip install transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

In [1]:
src_text =["Universal Sentence EncoderWe present models for encoding sentences into embedding vectors that specifically target transfer learning to other NLP tasks .We find that transfer learning using sentence embeddings tends to outperform word level transfer .With transfer learning via sentence embeddings , we observe surprisingly good performance with minimal amounts of supervised training data for a transfer task .Both models are implemented in TensorFlow and are available to download from TF Hub : 1 https://tfhub.dev/google/universal-sentence-encoder/1The transformer based sentence encoding model constructs sentence embeddings using the encoding sub - graph of the transformer architecture .This sub - graph uses attention to compute context aware representations of words in a sentence that take into account both the ordering and identity of all the other words .The context aware word representations are converted to a fixed length sentence encoding vector by computing the element - wise sum of the representations at each word position .This is accomplished by using multi-task learning whereby a single encoding model is used to feed multiple downstream tasks .Deep Averaging Network ( DAN )The second encoding model makes use of a deep averaging network ( DAN ) whereby input embeddings for words and bi-grams are first averaged together and then passed through a feedforward deep neural network ( DNN ) to produce sentence embeddings .Similar to the Transformer encoder , the DAN encoder takes as input a lowercased PTB tokenized string and outputs a 512 dimensional sentence embedding .We make use of mul-titask learning whereby a single DAN encoder is used to supply sentence embeddings for multiple downstream tasks .The primary advantage of the DAN encoder is that compute time is linear in the length of the input sequence .MR : Movie review snippet sentiment on a five star scale .CR : Sentiment of sentences mined from customer reviews .SUBJ : Subjectivity of sentences from movie reviews and plot summaries .MPQA : Phrase level opinion polarity from news data .TREC : Fine grained question classification sourced from TREC .SST : Binary phrase level sentiment classification .STS Benchmark : Semantic textual similarity ( STS ) between sentence pairs scored by Pearson correlation with human judgments .WEAT : Word pairs from the psychology literature on implicit association tests ( IAT ) that are used to characterize model bias .For word level transfer , we use word embeddings from a word2 vec skip - gram model trained on a corpus of news data .The pretrained word embeddings are included as input to two model types : a convolutional neural network models ( CNN ) ; a DAN .Additional baseline CNN and DAN models are trained without using any pretrained word or sentence embeddings ."]

In [54]:
len(abstract)

934

In [55]:
src_text

['Universal Sentence EncoderWe present models for encoding sentences into embedding vectors that specifically target transfer learning to other NLP tasks .We find that transfer learning using sentence embeddings tends to outperform word level transfer .With transfer learning via sentence embeddings , we observe surprisingly good performance with minimal amounts of supervised training data for a transfer task .Both models are implemented in TensorFlow and are available to download from TF Hub : 1 https://tfhub.dev/google/universal-sentence-encoder/1The transformer based sentence encoding model constructs sentence embeddings using the encoding sub - graph of the transformer architecture .This sub - graph uses attention to compute context aware representations of words in a sentence that take into account both the ordering and identity of all the other words .The context aware word representations are converted to a fixed length sentence encoding vector by computing the element - wise sum

In [55]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [56]:
model_name = 'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)


In [57]:
batch = tokenizer(src_text,truncation=True, padding='longest', return_tensors="pt").to(device)
translated = model.generate(**batch,min_length = 500,max_length= 1024)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [58]:
print(tgt_text)

We present two models for encoding sentences into embedding vectors that specifically target transfer learning to other NLP tasks and find that transfer learning using sentence embeddings tends to outperform word level transfer with minimal amounts of supervised training data for transfer learning via STS , with surprisingly good performance with minimal amounts of supervised training data for transfer learning via STS, with surprisingly good performance with minimal amounts of supervised training data for transfer learning via STS, with surprisingly good performance with minimal amounts of supervised training data for transfer learning via STS, with surprisingly good performance with minimal amounts of supervised training data for transfer learning via STS, with surprisingly good performance with minimal amounts of supervised training data for transfer learning via STS, with surprisingly good performance with minimal amounts of supervised training data for transfer learning via STS, w

In [25]:
abstract

'We present models for encoding sentences into embedding vectors that specifically target transfer learning to other NLP tasks .For both variants , we investigate and report the relationship between model complexity , resource consumption , the availability of transfer task training data , and task performance .Comparisons are made with baselines that use word level transfer learning via pretrained word embeddings as well as baselines do not use any transfer learning .We find that transfer learning using sentence embeddings tends to outperform word level transfer .With transfer learning via sentence embeddings , we observe surprisingly good performance with minimal amounts of supervised training data for a transfer task .We obtain encouraging results on Word Embedding Association Tests ( WEAT ) targeted at detecting model bias .Our pre-trained sentence encoding models are made freely available for download and on TF Hub .'

In [59]:
rouge.get_scores(tgt_text, abstract)

[{'rouge-1': {'r': 0.43333333333333335,
   'p': 0.9285714285714286,
   'f': 0.590909086570248},
  'rouge-2': {'r': 0.2992125984251969,
   'p': 0.7037037037037037,
   'f': 0.4198894985757457},
  'rouge-l': {'r': 0.43333333333333335,
   'p': 0.9285714285714286,
   'f': 0.590909086570248}}]

In [23]:
df1.head()

Unnamed: 0,idx,text,main_heading,heading,topic,paper_idx,BIO,BIO_1,offset1,pro1,offset2,pro2,offset3,pro3,mask,labels,title
0,2,Robust Multilingual Part - of - Speech Tagging...,title,title,part-of-speech_tagging,0,"['O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'O', ...","['O', 'O', 'B-n', 'I-n', 'I-n', 'I-n', 'I-n', ...",1,0.0,1,0.004065,1,0.0,1,1,title
1,4,Adversarial training ( AT ) 1 is a powerful re...,abstract,abstract,part-of-speech_tagging,0,,,1,0.142857,3,0.012195,1,0.142857,1,0,abstract
2,7,In our experiments on the Penn Treebank WSJ co...,abstract,abstract,part-of-speech_tagging,0,,,4,0.571429,6,0.02439,4,0.571429,1,0,abstract
3,8,We also demonstrate that 3 ) the improved tagg...,abstract,abstract,part-of-speech_tagging,0,,,5,0.714286,7,0.028455,5,0.714286,1,0,abstract
4,10,These positive results motivate further use of...,abstract,abstract,part-of-speech_tagging,0,,,7,1.0,9,0.036585,7,1.0,1,0,abstract


In [57]:
summary = ''
abstract = ''
sum_list = []
abs_list = []
score = []
y = 1
x = 0
k = 0
while df1['idx'].iloc[x]< len(df1['idx']):
  if df1['labels'].iloc[x] == 1:
    summary = summary + '' + df1['text'].iloc[x]
  if df1['main_heading'].iloc[x]=='abstract':
    abstract = abstract + '' + df1['text'].iloc[x]
  x += 1
sum_list.append(summary)
abs_list.append(abstract)
batch = tokenizer(summary,truncation=True, padding='longest', return_tensors="pt").to(device)
translated = model.generate(**batch,min_length = len(abstract.split())-50,max_length= len(abstract.split())+50)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
score.append(rouge.get_scores(summary, abstract))
print("Summary:",summary)
print("Abstractive Summary:",tgt_text)
print("Abstract:",abstract)
print(rouge.get_scores(tgt_text, abstract))

Summary: Structural Scaffolds for Citation Intent Classification in Scientific PublicationsOur code and data are available at : https://github.com/ allenai/scicite .In this work , we approach the problem of citation intent classification by modeling the language expressed in the citation context .To this end , we propose a neural multitask learning framework to incorporate knowledge into citations from the structure of scientific papers .In particular , we propose two auxiliary tasks as structural scaffolds to improve citation intent prediction : 1 ( 1 ) predicting the section title in which the citation occurs and ( 2 ) predicting whether a sentence needs a citation .On two datasets , we show that the proposed neural scaffold model outperforms existing methods by large margins .Our contributions are : ( i ) we propose a neural scaffold framework for citation intent classification to incorporate into citations knowledge from structure of scientific papers ; ( ii ) we achieve a new stat

In [46]:
len(abstract.split())

156

In [43]:
len(summary.split())

189

In [58]:
import csv 
header = ["SN","Abstract", "Summary", "Rogue Score"]
x = 0

with open("out.csv", 'w',encoding='UTF8',newline='') as f:
  writer = csv.writer(f)
  writer.writerow(header)
  while x < len(score):
    writer.writerow([x,abs_list[x],sum_list[x],score[x]])
    x += 1