# Scientific Paper Text Analysis

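CoreSC (Core Scientific Concepts) is a sentence-level annotation scheme for scientific papers: each sentence is labelled with the scientific concept it expresses, such as Background, Method or Result.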


IMAGE HERE

In [16]:
import os
import csv
import xml.etree.ElementTree as ET

from collections import Counter,defaultdict



def load_sentences(corpus_dir):
    """Collect the sentences of every annotated paper under ``corpus_dir``.

    Every file found by walking ``corpus_dir`` is parsed as XML. For each
    ``<s>`` element the text of its ``<text>`` child is gathered and filed
    under the ``type`` attribute of its ``CoreSc1`` child, or under
    ``'Other'`` when the sentence carries no ``CoreSc1`` child.

    Args:
        corpus_dir: Root directory containing the annotated XML papers.

    Returns:
        A ``defaultdict(list)`` mapping label -> list of sentence strings.

    Raises:
        xml.etree.ElementTree.ParseError: if a file is not well-formed XML.
    """
    grouped = defaultdict(list)

    for rootdir, _dirs, files in os.walk(corpus_dir):
        for file in files:
            fullname = os.path.join(rootdir, file)

            # open and parse the paper
            root = ET.parse(fullname).getroot()

            # let's find all sentences in the paper
            for sent_el in root.iter("s"):
                text = sent_el.find("text")
                if text is None:
                    # An annotation without any text cannot yield a sentence;
                    # skip it instead of failing on ``None.itertext()``.
                    continue

                anno = sent_el.find('CoreSc1')
                # Compare against None explicitly: the truth value of an
                # Element reflects its number of children, not its presence.
                label = anno.get('type') if anno is not None else 'Other'
                grouped[label].append("".join(text.itertext()))

    return grouped


sentences = load_sentences("consensus_annotated/")



# Corpus size: the number of sentences summed over every label.
sentcount = sum(map(len, sentences.values()))
print("There are {} sentences in this corpus".format(sentcount))

# Per-label breakdown: absolute count and share of the whole corpus.
print ("Scientific Concepts:")
for label, label_sentences in sentences.items():
    count = len(label_sentences)
    share = count / sentcount * 100
    print("There are {} {} ( {}% )sentences".format(count, label, share))


There are 40180 sentences in this corpus
Scientific Concepts:
There are 3858 Exp ( 9.60179193628671% )sentences
There are 8407 Res ( 20.923344947735192% )sentences
There are 1358 Obj ( 3.3797909407665507% )sentences
There are 3656 Mod ( 9.09905425584868% )sentences
There are 541 Mot ( 1.3464410154305626% )sentences
There are 5410 Obs ( 13.464410154305625% )sentences
There are 783 Hyp ( 1.948730711796914% )sentences
There are 4285 Met ( 10.664509706321553% )sentences
There are 3646 Con ( 9.0741662518666% )sentences
There are 629 Goa ( 1.5654554504728722% )sentences
There are 7607 Bac ( 18.93230462916874% )sentences


In [15]:
import random
import math

# Fraction of each label's sentences that goes into the training split;
# the remaining sentences form the test split.
TRAIN = 0.8

def write_gt(filename, gt_dict):
    """Write labelled sentences to ``filename`` as a two-column CSV.

    Each row is ``label,sentence``. Rows are emitted label by label, in the
    iteration order of ``gt_dict``.

    Args:
        filename: Path of the CSV file to create (an existing file is
            overwritten).
        gt_dict: Mapping from label to an iterable of sentence strings.
    """
    # newline="" leaves line endings entirely to the csv writer, so the "\n"
    # terminator is written verbatim on every platform and newlines embedded
    # in a sentence are not translated.
    with open(filename, encoding='utf-8', mode="w", newline="") as f:
        csvw = csv.writer(f, lineterminator='\n')

        for lbl, sents in gt_dict.items():
            csvw.writerows([lbl, sent] for sent in sents)

def split_train_test(sentences, proportion, seed=None):
    """Randomly partition ``sentences`` into a training and a held-out list.

    Args:
        sentences: Indexable sequence of items to split.
        proportion: Fraction of the items to sample for training;
            ``floor(len(sentences) * proportion)`` items are drawn without
            replacement.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same input always produces the same split. When
            ``None`` (the default) the module-level ``random`` generator is
            used.

    Returns:
        A ``(train, test)`` tuple of lists. ``train`` holds the sampled items
        in sampling order; ``test`` holds every remaining item in its
        original order.

    Raises:
        ValueError: propagated from ``random.sample`` when the computed
            sample size is negative or larger than ``len(sentences)``.
    """
    indices = range(len(sentences))

    samplesize = math.floor(len(indices) * proportion)

    rng = random if seed is None else random.Random(seed)
    trainidx = rng.sample(indices, samplesize)

    # Walk the indices in order rather than iterating a set difference, so
    # the held-out list has a well-defined, stable order.
    chosen = set(trainidx)
    testidx = [i for i in indices if i not in chosen]

    return [sentences[x] for x in trainidx], [sentences[y] for y in testidx]
    
# Seed the global generator so that re-running the notebook from a fresh
# kernel writes the same train/test files.
# NOTE(review): this also assumes `sentences` was filled in the same order on
# every run — confirm for the corpus loader.
SEED = 42
random.seed(SEED)

# Per-label splits: label -> list of sentences.
train = {}
test = {}

for idx, sents in sentences.items():
    # Split each label separately so both files keep the label proportions.
    train[idx], test[idx] = split_train_test(sents, TRAIN)
    print(idx, len(train[idx]), len(test[idx]))

write_gt("train.csv", train)
write_gt("test.csv", test)

Exp 3086 772
Res 6725 1682
Obj 1086 272
Mod 2924 732
Mot 432 109
Obs 4328 1082
Hyp 626 157
Met 3428 857
Con 2916 730
Goa 503 126
Bac 6085 1522
