# Scientific Paper Text Analysis

CoreSC is a sentence-level annotation scheme for scientific papers that *"recognizes the main components of scientific investigations as represented in articles"* [Liakata et al 2012](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3315721/?report=classic). 

![alt text](assets/coresc.jpg "Examples of CoreSC designations")

We spent months building a CRF classifier that gets ~51% accuracy over 11 classes of sentence. 

I spent 2 days training and testing an NLC instance with 63% accuracy out of the box.


In [53]:
import os
import csv
import xml.etree.ElementTree as ET

from collections import Counter,defaultdict



sentences = defaultdict(lambda: [])
for rootdir, dirs, files in os.walk("consensus_annotated/"):

    for file in files:
        fullname = os.path.join(rootdir, file)

        #open and parse the paper
        tree = ET.parse(fullname)
        root = tree.getroot()

        #lets find all sentences in the paper
        for sentEl in root.iter("s"):
            annoArt = sentEl.find('CoreSc1')
            text = sentEl.find("text")
            if annoArt != None:
                sentences[annoArt.get('type')].append("".join(sentEl.itertext()))

            else:
                    sentences['Other'].append("".join(sentEl.itertext()))



sentcount = sum( [len(x) for x in sentences.values() ])
print("There are {} sentences in this corpus".format(sentcount))

print ("Scientific Concepts:")
for lbl,sents in sentences.items():

    pc = len(sents) / sentcount * 100
    print("There are {} {} ( {}% )sentences".format(len(sents),lbl, pc))


There are 8501 sentences in this corpus
Scientific Concepts:
There are 168 Goa ( 1.9762380896365135% )sentences
There are 1839 Exp ( 21.63274908834255% )sentences
There are 631 Obs ( 7.4226561580990476% )sentences
There are 54 Other ( 0.6352193859545936% )sentences
There are 1826 Bac ( 21.47982590283496% )sentences
There are 4 Mod ( 0.04705328784848841% )sentences
There are 293 Obj ( 3.446653334901776% )sentences
There are 586 Con ( 6.893306669803552% )sentences
There are 305 Mot ( 3.5878131984472414% )sentences
There are 929 Met ( 10.928126102811433% )sentences
There are 1718 Res ( 20.209387130925773% )sentences
There are 148 Hyp ( 1.7409716503940715% )sentences


In [55]:
import random
import math

TRAIN = 0.7

def write_gt(filename, gt_dict):
    
    with open(filename,encoding='utf-8',mode="w") as f:
        csvw = csv.writer(f, lineterminator='\n')
        
        for lbl,sents in gt_dict.items():
            for sent in sents:
                if len(sent) > 1024:
                    continue
                else:
                    csvw.writerow([sent,lbl])

def split_train_test(sentences, proportion):
    
    indices = range(0,len(sentences))
    
    samplesize = math.floor(len(indices) * proportion)
    
    trainidx = random.sample(indices,samplesize)
    
    testidx = list(set(indices) - set(trainidx))
    
    return [sentences[x] for x in trainidx], [sentences[y] for y in testidx]
    
train = { x:None for x in sentences.keys() }
test = { x:None for x in sentences.keys() }

for idx, sents in sentences.items():
    train[idx],test[idx] = split_train_test(sentences[idx], TRAIN)
    print(idx,len(train[idx]),len(test[idx]))
    
write_gt("train.csv", train)
write_gt("test.csv",test)

Goa 117 51
Exp 1287 552
Obs 441 190
Other 37 17
Bac 1278 548
Mod 2 2
Obj 205 88
Con 410 176
Mot 213 92
Met 650 279
Res 1202 516
Hyp 103 45


## Training a classifier




In [56]:
import json

with open("creds.json") as f:
    creds = json.load(f)['credentials']
    username,password = creds['username'], creds['password']
    endpoint = creds['url']
    
    print("Using endpoint {} with username {} ".format(endpoint,username))

Using endpoint https://gateway.watsonplatform.net/natural-language-classifier/api with username 59b52427-a680-49cf-aa23-50531cbcf1d4 


In [57]:
import requests

In [58]:
meta = { "language":"en", "name":"CoreSC Classifier"  } 

r = requests.post(endpoint + "/v1/classifiers", 
              auth=(username,password),
             files={ "training_data" : open("train.csv"), "training_metadata" : json.dumps(meta)   })

print (r.text)

classifierID = r.json()['classifier_id']

{
  "classifier_id" : "3a84d1x62-nlc-1038",
  "name" : "CoreSC Classifier",
  "language" : "en",
  "created" : "2016-04-28T08:28:00.125Z",
  "url" : "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/3a84d1x62-nlc-1038",
  "status" : "Training",
  "status_description" : "The classifier instance is in its training phase, not yet ready to accept classify requests"
}


In [63]:
classifierID="3a84d1x62-nlc-1038" #"3a84d1x62-nlc-1027" #3a84cfx63-nlc-886"
r = requests.get(endpoint + "/v1/classifiers/" + classifierID, 
              auth=(username,password))

print (r.json())

{'name': 'CoreSC Classifier', 'url': 'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/3a84d1x62-nlc-1038', 'classifier_id': '3a84d1x62-nlc-1038', 'created': '2016-04-28T08:28:00.125Z', 'language': 'en', 'status': 'Available', 'status_description': 'The classifier instance is now available and is ready to take classifier requests.'}


## Test classifier

In [64]:
from collections import Counter
from IPython.display import display
from ipywidgets import FloatProgress

tps = Counter()
fns = Counter()


fp = FloatProgress(min=0, max=sum(len(x) for x in test.values()))
display(fp)
resuls = []

f = open("results.txt","w")

for label,sents in test.items():
    
    for sent in sents:
        r = requests.get(
            endpoint + "/v1/classifiers/" + classifierID + "/classify",
            auth=(username,password),
            params={"text" : sent}
        )
        try:
            result = r.json()['classes'][0]['class_name']

            results.append((label, result))

            f.write("{},{}\n".format(label,result))
            
        except:
            pass

        fp.value += 1

f.close()
    

print (r)
print (r.content)

<Response [200]>
b'{\n  "classifier_id" : "3a84d1x62-nlc-1038",\n  "url" : "https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/3a84d1x62-nlc-1038",\n  "text" : "Aryl hydrocarbon receptor (AhR) agonists suppress interleukin-6 expression by bone marrow stromal cells: an immunotoxicology study",\n  "top_class" : "Bac",\n  "classes" : [ {\n    "class_name" : "Bac",\n    "confidence" : 0.9851534941025172\n  }, {\n    "class_name" : "Obj",\n    "confidence" : 0.0037460404592158138\n  }, {\n    "class_name" : "Hyp",\n    "confidence" : 0.0021243958290101186\n  }, {\n    "class_name" : "Other",\n    "confidence" : 0.0018484181209979776\n  }, {\n    "class_name" : "Con",\n    "confidence" : 0.001608292345229142\n  }, {\n    "class_name" : "Goa",\n    "confidence" : 0.0013573938282026969\n  }, {\n    "class_name" : "Mod",\n    "confidence" : 8.968194007939619E-4\n  }, {\n    "class_name" : "Res",\n    "confidence" : 8.561750682276065E-4\n  }, {\n    "class_name" : 

In [76]:
correct = sum([1 for true,exp in results if true == exp])
total = len(results)

print (correct,total)

tp = Counter()
fp = Counter()
fn = Counter()

for true, predictedLabel in results:
    #logger.info("%s, %s, %s", true, predictedLabel, probability)
    if true == predictedLabel:
        tp[true] += 1
    else:
        fp[predictedLabel] += 1
        fn[true] += 1

print ("Label\t\tPrecision\tRecall\t\tF-measure")
for label in sentences.keys():
    #logger.info(label)
    if tp[label] == 0:
        prec = 0
        rec = 0
    else:
        prec = tp[label] / (tp[label] + fp[label])
        rec = tp[label] / (tp[label] + fn[label])

    if (prec + rec) > 0:
        fm = (2 * prec * rec ) / (prec + rec)
    else:
        fm = 0

    #logger.info('prec: %d tp / (%d tp + %d fp) = %f', tp[label], tp[label], fp[label], prec)
    #logger.info('rec: %d tp / (%d tp + %d fn) = %f', tp[label], tp[label], fn[label], rec)
    #logger.info('F-measure: %f',fm)

   
    print("{}\t\t{:.2%}\t\t{:.2%}\t\t{:.2%}".format(label,prec,rec,fm))
    #csvw.writerow([label, prec, rec, fm])
    

print (correct/total*100)

21647 34168
Label		Precision	Recall		F-measure
Goa		78.47%		43.04%		55.59%
Exp		81.15%		82.59%		81.87%
Obs		68.47%		64.65%		66.50%
Other		93.75%		88.24%		90.91%
Bac		59.81%		71.75%		65.24%
Mod		64.03%		66.28%		65.13%
Obj		58.77%		50.63%		54.40%
Con		71.50%		52.08%		60.26%
Mot		61.35%		36.73%		45.95%
Met		54.18%		53.92%		54.05%
Res		55.51%		68.86%		61.47%
Hyp		58.87%		36.87%		45.34%
63.35460079606649


## Alchemy enrichment and classifier

In [60]:
from alchemyapi import AlchemyAPI

import os
import hashlib
import json
from IPython.display import display
from ipywidgets import FloatProgress

fp = FloatProgress(min=0, max=sum(len(x) for x in sentences.values()))

alchemyapi = AlchemyAPI()

display(fp)

for idx, sents in sentences.items():
    
    datapath = os.path.join("data",idx)
    
    for sent in sents:
        fp.value += 1
        m = hashlib.md5()
        m.update(sent.encode('utf-8'))
        
        if not os.path.exists(datapath):
            os.makedirs(datapath)
            
        sent_file = os.path.join(datapath, m.hexdigest())
        
        if os.path.exists(sent_file) and (os.path.getsize(sent_file) > 69):
            continue

        result = alchemyapi.combined("text", sent, options={ "showSourceText":1})

        with open(sent_file,"w") as f:
            json.dump(result, f)

        
        

# Managing Classifiers

In [61]:

print (requests.get(endpoint + "/v1/classifiers", 
              auth=(username,password)).json())


{'classifiers': [{'url': 'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/3a84d1x62-nlc-1027', 'name': 'CoreSC Classifier', 'classifier_id': '3a84d1x62-nlc-1027', 'created': '2016-04-28T06:44:28.406Z', 'language': 'en'}, {'url': 'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/3a84d1x62-nlc-1038', 'name': 'CoreSC Classifier', 'classifier_id': '3a84d1x62-nlc-1038', 'created': '2016-04-28T08:28:00.125Z', 'language': 'en'}, {'url': 'https://gateway.watsonplatform.net/natural-language-classifier/api/v1/classifiers/3a84cfx63-nlc-886', 'name': 'CoreSC Classifier', 'classifier_id': '3a84cfx63-nlc-886', 'created': '2016-04-27T09:07:18.634Z', 'language': 'en'}]}


In [41]:
print (requests.delete(endpoint + "/v1/classifiers/3a84d1x62-nlc-962", 
              auth=(username,password)).json())


{}
