In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
import pandas as pd
# Show full titles instead of truncating wide text columns.  None is the
# supported "no limit" value; the old -1 sentinel was deprecated in pandas 1.0
# and is rejected by newer releases.  Very old pandas versions only accept an
# int here, so fall back to -1 for those.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)


In [9]:
import os
from nltk.tag.stanford import StanfordPOSTagger
# Java binary advertised through the JAVAHOME environment variable.
# A JAVAHOME that is already exported (and non-empty) takes precedence so the
# notebook is not tied to one user's machine; the original hard-coded location
# is kept as the fallback.
java_path = os.environ.get('JAVAHOME') or "/home/sreekumar_s/jdk1.8.0_131/bin/java"
os.environ['JAVAHOME'] = java_path


import CMUTweetTagger

In [10]:
# Stanford POS tagger set-up: model and jar are given as relative paths, and
# the JVM option '-mx4096m' is supplied when the tagger object is created.
path_to_model = "models/english-bidirectional-distsim.tagger"
path_to_jar = "models/stanford-postagger.jar"
tagger = StanfordPOSTagger(
    path_to_model,
    path_to_jar,
    java_options='-mx4096m',
)

In [11]:
raw_data_path = "/mnt/disks/vault/analysis-data/nature-science-data-full/nature_science_journal_data.pql"

# Load the pickled frame and keep only the rows whose Journal is "NATURE".
data = pd.read_pickle(raw_data_path).loc[lambda frame: frame["Journal"] == "NATURE"]

In [12]:
# Preview rows 50-69 (by position) of the Title column; the slice itself
# already limits the result to at most 20 rows.
data[["Title"]].iloc[50:70]

Unnamed: 0,Title
93,MOUSE IGG3 ANTIBODIES ARE HIGHLY PROTECTIVE AGAINST INFECTION WITH STREPTOCOCCUS-PNEUMONIAE
95,3-DIMENSIONAL STRUCTURE OF THE ENZYME CATALASE
96,SELLING DARWIN
102,HYDROLYSIS OF ATP AND REVERSIBLE BINDING TO F-ACTIN BY MYOSIN HEAVY-CHAINS FREE OF ALL LIGHT-CHAINS
103,CHONDROITIN SULFATE FROM FOSSILIZED ANTLERS
105,UNIVERSITY STAFFING
109,PHOTOCHEMICAL CLEAVAGE OF WATER BY PHOTOCATALYSIS
112,ATMOSPHERIC TRANSPORT OF CONTINENTALLY DERIVED LIPIDS TO THE TROPICAL NORTH PACIFIC
117,TUMOR PROMOTERS INDUCE MITOTIC ANEUPLOIDY IN YEAST
121,REGULATION OF TRANSCRIPTION IN EXPRESSED AND UNEXPRESSED MATING TYPE CASSETTES OF YEAST


In [13]:
# First 100 titles (positional slice) used as the sample for both taggers.
titles = data['Title'].iloc[:100]

In [14]:
# Tag every title in a single batch.  Calling tagger.tag() once per title
# starts a separate Java process for each title; tag_sents() hands the whole
# list over in one call and returns one list of (token, tag) pairs per input
# title, in the same order.  Titles are tokenized on whitespace only.
stanford_tagger_outputs = tagger.tag_sents([sentence.split() for sentence in titles])

__Numbers: 38,__

In [22]:
# Stanford tagger output for title 69.
# Original note: "try the twitter one" -- presumably a reminder to compare
# against the ARK/Twitter tagger, which is run in a later cell.
stanford_tagger_outputs[69]

[('BACKSCATTERING', 'VBG'),
 ('METHOD', 'NN'),
 ('FOR', 'IN'),
 ('THE', 'DT'),
 ('STUDY', 'NN'),
 ('OF', 'IN'),
 ('BLISTERING', 'VBG'),
 ('WITH', 'IN'),
 ('ENERGY-DISTRIBUTED', 'NN'),
 ('HE', 'PRP'),
 ('PARTICLES', 'NNS')]

In [12]:
# Run the ARK (CMU Twitter) tagger over the same titles.  As the outputs
# displayed in this notebook show, each title yields a list of
# (token, tag, probability) triples.
ark_tagger_outputs = CMUTweetTagger.runtagger_parse(titles)

In [31]:
ark_tagger_outputs[38]

[('GETTERING', 'V', 0.4628),
 ('OF', 'P', 0.93),
 ('CRYSTALLINE', 'A', 0.6794),
 ('DEFECTS', 'N', 0.981),
 ('IN', 'P', 0.9914),
 ('SI', '^', 0.9848),
 ('BY', '^', 0.4777),
 ('BENDING', 'V', 0.8552)]

In [33]:
## titles with VBG tag in the first 100 titles

def is_verb_present_in_tag_stanford(tags):
    """Report whether any (token, tag) pair in ``tags`` has the tag 'VBG'.

    ``tags`` is an iterable of 2-tuples, as returned by the Stanford tagger.
    Scanning stops at the first match; an empty input gives False.
    """
    return any(pos == 'VBG' for _token, pos in tags)


def is_verb_present_in_tag_ark(tags):
    """Report whether any (token, tag, prob) triple in ``tags`` has the tag 'V'.

    ``tags`` is an iterable of 3-tuples, as returned by the ARK tagger.  The
    probability is ignored.  Scanning stops at the first match; an empty
    input gives False.
    """
    for _token, label, _prob in tags:
        if label == 'V':
            return True
    return False

# List positions of the titles flagged by each tagger.
# NOTE: the ARK predicate matches the tag 'V', not 'VBG', so the two lists are
# not filtered on the same criterion despite the shared "_with_vbg" suffix.
stanford_indices_with_vbg = [
    index
    for index, title_tags in enumerate(stanford_tagger_outputs)
    if is_verb_present_in_tag_stanford(title_tags)
]
ark_indices_with_vbg = [
    index
    for index, title_tags in enumerate(ark_tagger_outputs)
    if is_verb_present_in_tag_ark(title_tags)
]

In [34]:
stanford_indices_with_vbg

[9, 31, 38, 47, 66, 69, 99]

In [35]:
ark_indices_with_vbg

[1,
 7,
 15,
 16,
 17,
 19,
 26,
 28,
 29,
 31,
 32,
 33,
 38,
 41,
 47,
 49,
 51,
 56,
 59,
 66,
 68,
 71,
 73,
 75,
 78,
 85,
 86,
 87,
 91,
 93,
 95,
 99]

In [36]:
ark_tagger_outputs[1]

[('WAVELENGTH-MULTIPLEXED', 'N', 0.7234),
 ('AND', '&', 0.9986),
 ('GATE', 'N', 0.907),
 ('-', ',', 0.9765),
 ('A', 'D', 0.9758),
 ('BUILDING', 'N', 0.976),
 ('BLOCK', 'N', 0.6508),
 ('FOR', 'P', 0.9981),
 ('MONOLITHIC', 'A', 0.8152),
 ('OPTICALLY', 'R', 0.9165),
 ('COUPLED-CIRCUITS', 'V', 0.9201)]

In [42]:
ark_tagger_outputs[7], stanford_tagger_outputs[7]

([('SUB-DOPPLER', 'N', 0.9157),
  ('SUBMILLIMETER', 'N', 0.3751),
  ('SPECTROSCOPY', 'N', 0.9359),
  ('USING', 'V', 0.9785),
  ('MOLECULAR-BEAMS', 'A', 0.6122)],
 [('SUB-DOPPLER', 'NNP'),
  ('SUBMILLIMETER', 'NNP'),
  ('SPECTROSCOPY', 'NNP'),
  ('USING', 'NNP'),
  ('MOLECULAR-BEAMS', 'NN')])

In [41]:
# ARK vs. Stanford tags for title 15, side by side.
ark_tagger_outputs[15], stanford_tagger_outputs[15]

([('PHOTO-LUMINESCENCE', 'N', 0.9641),
  ('TECHNIQUE', 'N', 0.9862),
  ('FOR', 'P', 0.9999),
  ('THE', 'D', 0.9963),
  ('DETERMINATION', 'N', 0.9998),
  ('OF', 'P', 0.9949),
  ('MINORITY-CARRIER', '^', 0.7474),
  ('DIFFUSION', 'N', 0.8752),
  ('LENGTH', 'N', 0.9927),
  ('IN', 'P', 0.9961),
  ('GAAS', 'N', 0.6978),
  ('GROWN', 'V', 0.7759),
  ('BY', 'P', 0.7184),
  ('MOLECULAR-BEAM', 'A', 0.8227),
  ('EPITAXY', 'N', 0.5114)],
 [('PHOTO-LUMINESCENCE', 'NN'),
  ('TECHNIQUE', 'NN'),
  ('FOR', 'IN'),
  ('THE', 'DT'),
  ('DETERMINATION', 'NN'),
  ('OF', 'IN'),
  ('MINORITY-CARRIER', 'NNP'),
  ('DIFFUSION', 'NNP'),
  ('LENGTH', 'NNP'),
  ('IN', 'IN'),
  ('GAAS', 'NNP'),
  ('GROWN', 'NNP'),
  ('BY', 'NNP'),
  ('MOLECULAR-BEAM', 'NNP'),
  ('EPITAXY', 'NNP')])

In [40]:
# ARK vs. Stanford tags for title 17, side by side.
ark_tagger_outputs[17], stanford_tagger_outputs[17]

([('GROWTH-KINETICS', 'N', 0.9609),
  ('OF', 'P', 0.9914),
  ('OXIDATION-INDUCED', 'N', 0.5556),
  ('STACKING-FAULTS', 'V', 0.8346),
  ('IN', 'P', 0.9594),
  ('SILICON', 'N', 0.7339),
  ('-', ',', 0.9501),
  ('A', 'D', 0.9899),
  ('NEW', 'A', 0.9889),
  ('CONCEPT', 'N', 0.9891)],
 [('GROWTH-KINETICS', 'NN'),
  ('OF', 'IN'),
  ('OXIDATION-INDUCED', 'NNP'),
  ('STACKING-FAULTS', 'NNP'),
  ('IN', 'IN'),
  ('SILICON', 'NNP'),
  ('-', ':'),
  ('A', 'DT'),
  ('NEW', 'JJ'),
  ('CONCEPT', 'NN')])

__Titles in which the Stanford tagger found a VBG token (compared with the ARK output for the same title)__

In [43]:
# ARK vs. Stanford tags for title 9, side by side.
ark_tagger_outputs[9], stanford_tagger_outputs[9]

([('SELF-ANNEALING', 'N', 0.9477),
  ('OF', 'P', 0.9881),
  ('ION-IMPLANTED', '^', 0.6364),
  ('SILICON', '^', 0.8923),
  ('-', ',', 0.9414),
  ('1ST', 'A', 0.5284),
  ('EXPERIMENTAL', 'A', 0.9649),
  ('RESULTS', 'N', 0.9968)],
 [('SELF-ANNEALING', 'VBG'),
  ('OF', 'IN'),
  ('ION-IMPLANTED', 'NNP'),
  ('SILICON', 'NNP'),
  ('-', ':'),
  ('1ST', 'JJ'),
  ('EXPERIMENTAL', 'JJ'),
  ('RESULTS', 'NNS')])

In [44]:
# ARK vs. Stanford tags for title 31, side by side.
ark_tagger_outputs[31], stanford_tagger_outputs[31]

([('X-RAY-IMAGING', 'V', 0.5846),
  ('WITH', 'P', 0.9983),
  ('A', 'D', 0.9828),
  ('CHARGE-COUPLED', 'N', 0.9073),
  ('DEVICE', 'N', 0.9927),
  ('FABRICATED', 'V', 0.9706),
  ('ON', 'P', 0.9575),
  ('A', 'D', 0.9847),
  ('HIGH-RESISTIVITY', 'A', 0.7691),
  ('SILICON', 'N', 0.9054),
  ('SUBSTRATE', 'N', 0.9981)],
 [('X-RAY-IMAGING', 'VBG'),
  ('WITH', 'IN'),
  ('A', 'NNP'),
  ('CHARGE-COUPLED', 'NNP'),
  ('DEVICE', 'NNP'),
  ('FABRICATED', 'NNP'),
  ('ON', 'NNP'),
  ('A', 'NNP'),
  ('HIGH-RESISTIVITY', 'NNP'),
  ('SILICON', 'NNP'),
  ('SUBSTRATE', 'NNP')])

In [45]:
# ARK vs. Stanford tags for title 47, side by side.
ark_tagger_outputs[47], stanford_tagger_outputs[47]

([('AN', 'D', 0.8473),
  ('IMPROVED', 'A', 0.9319),
  ('TECHNIQUE', 'N', 0.9928),
  ('FOR', 'P', 0.9996),
  ('CALCULATING', 'V', 0.9964),
  ('THE', 'D', 0.9985),
  ('RESOLUTION', 'N', 0.9988),
  ('OF', 'P', 0.9902),
  ('TROCHOIDAL', '^', 0.6994),
  ('ELECTRON', '^', 0.5661),
  ('MONOCHROMATORS', '^', 0.9082)],
 [('AN', 'DT'),
  ('IMPROVED', 'JJ'),
  ('TECHNIQUE', 'NN'),
  ('FOR', 'IN'),
  ('CALCULATING', 'VBG'),
  ('THE', 'DT'),
  ('RESOLUTION', 'NN'),
  ('OF', 'IN'),
  ('TROCHOIDAL', 'FW'),
  ('ELECTRON', 'FW'),
  ('MONOCHROMATORS', 'NNS')])

In [46]:
# ARK vs. Stanford tags for title 66, side by side.
ark_tagger_outputs[66], stanford_tagger_outputs[66]

([('POLY(METHYL', 'G', 0.3424),
  ('METHACRYLATE', 'N', 0.41),
  ('SENSITIVITY', 'N', 0.9932),
  ('VARIATION', 'N', 0.9973),
  ('VERSUS', 'P', 0.8781),
  ('THE', 'D', 0.9933),
  ('ELECTRONIC', '^', 0.3872),
  ('STOPPING', 'V', 0.8133),
  ('POWER', 'N', 0.9402),
  ('AT', 'P', 0.967),
  ('ION', '^', 0.323),
  ('LITHOGRAPHY', '^', 0.7867),
  ('EXPOSURE', '^', 0.6361)],
 [('POLY(METHYL', 'NNP'),
  ('METHACRYLATE', 'NNP'),
  ('SENSITIVITY', 'NNP'),
  ('VARIATION', 'NNP'),
  ('VERSUS', 'NNPS'),
  ('THE', 'DT'),
  ('ELECTRONIC', 'NNS'),
  ('STOPPING', 'VBG'),
  ('POWER', 'NN'),
  ('AT', 'IN'),
  ('ION', 'NN'),
  ('LITHOGRAPHY', 'NN'),
  ('EXPOSURE', 'NN')])