# In [None]:
#---------------------------------------------------------------------
#This code tags a corpus with the Stanford tagger
#---------------------------------------------------------------------

import jpype
import re
import ast

#---------------------------------------------------------------------
#Parts of Stanford parser from:
#https://github.com/ayushjaiswal/multipass4coreference/tree/master/stanford-parser-python-r22186
#
#Modified from the original: added 'wordTokenPairs', a list of
#(word, POS tag) tuples, to the Dependencies result
#---------------------------------------------------------------------

class TextStandoff:
    """A (start, end) range into a text, kept together with the whole text.

    `range` is a two-element (start, end) pair; the `text` property is the
    slice entireText[start:end]. Iterating an instance yields start and
    then end, so an instance can be unpacked like a (start, end) tuple.
    """

    def __init__(self, text, range):
        self.range = range
        self.entireText = text

    def asPrimitives(self):
        """Return the (entireText, range) pair this standoff is built from."""
        return (self.entireText, self.range)

    @staticmethod
    def fromPrimitives(args):
        """Inverse of asPrimitives: build a standoff from (text, range)."""
        return TextStandoff(*args)

    def isNull(self):
        """True when the range is exactly (0, 0)."""
        return (0, 0) == self.range

    @property
    def text(self):
        """The slice of the whole text covered by this standoff."""
        begin, stop = self.range
        return self.entireText[begin:stop]

    @property
    def length(self):
        """Size of the range (end - start)."""
        begin, stop = self.range
        return stop - begin

    @property
    def end(self):
        """Second element of the range."""
        _, stop = self.range
        return stop

    @property
    def start(self):
        """First element of the range."""
        begin, _ = self.range
        return begin

    def overlaps(self, standoff):
        """True when the two ranges share at least one position."""
        return self.start < standoff.end and standoff.start < self.end

    def contains(self, standoff):
        """True when `standoff` (anything unpackable as start, end) lies inside self."""
        other_start, other_end = standoff
        return self.start <= other_start and self.end >= other_end

    def before(self, standoff):
        """True when this range ends at or before the start of `standoff`."""
        return self.end <= standoff.start

    def degreeOfOverlap(self, standoff):
        """
        Size of the range shared by the two standoffs, or zero when they
        do not overlap.
        """
        other_start, other_end = standoff
        if not self.overlaps(standoff):
            return 0
        return min(other_end, self.end) - max(other_start, self.start)

    def __iter__(self):
        bounds = (self.start, self.end)
        return iter(bounds)

    def toXml(self, standoff):
        """Store start and end as string attributes on the given element."""
        for attr_name, value in (("start", self.start), ("end", self.end)):
            standoff.setAttribute(attr_name, str(value))

    def __repr__(self):
        fields = (self.entireText, self.start, self.end)
        return 'TextStandoff("%s", (%d, %d))' % fields

    def __str__(self):
        fields = (self.text, self.start, self.end)
        return '("%s", (%d, %d))' % fields

    def __eq__(self, obj):
        # Equal only to another TextStandoff with the same range and text.
        if not isinstance(obj, TextStandoff):
            return False
        return self.range == obj.range and self.entireText == obj.entireText

    def __hash__(self):
        return 17 * hash(self.entireText) + hash(self.range)

class ParserError(Exception):
    """Error raised by Parser.parse when a sentence could not be parsed."""

    def __init__(self, *args, **margs):
        # Forward everything unchanged to the base Exception.
        super().__init__(*args, **margs)

def standoffFromToken(txt, token):
    """Build a TextStandoff over `txt` from a token's begin/end positions.

    `token` must provide beginPosition() and endPosition(); their results
    become the (start, end) range of the returned standoff.
    """
    span = (token.beginPosition(), token.endPosition())
    return TextStandoff(txt, span)

class Dependencies:
    """Dependency parse of one sentence plus lookup tables over it.

    Attributes set by the constructor:
        sentence, tokens, posTags, wordTokenPairs -- stored as given.
        tokensToPosTags -- dict built by zipping tokens with posTags.
        dependencies -- list of (relation, gov, dep) triples with
            self-dependencies (gov == dep) removed.
        govToDeps -- gov -> list of its deps.
        depToGov -- dep -> gov (a dep that occurs more than once keeps
            only the last governor seen).
        constituentsToRelation -- (gov, dep) -> relation.
    """

    def __init__(self, sentence, tokens, posTags, dependencies, wordTokenPairs):
        self.wordTokenPairs = wordTokenPairs
        self.sentence = sentence
        self.posTags = posTags
        self.tokens = tokens
        self.tokensToPosTags = dict(zip(self.tokens, self.posTags))

        # there is a bug where sometimes there is a self-dependence
        self.dependencies = [(relation, gov, dep)
                             for relation, gov, dep in dependencies
                             if gov != dep]

        self.govToDeps = {}
        self.depToGov = {}
        self.constituentsToRelation = {}
        for relation, gov, dep in self.dependencies:
            self.govToDeps.setdefault(gov, []).append(dep)
            self.depToGov[dep] = gov
            self.constituentsToRelation[(gov, dep)] = relation

        self.checkRep()

    def tagForTokenStandoff(self, tokenStandoff):
        """Return the POS tag recorded for the given token (KeyError if unknown)."""
        return self.tokensToPosTags[tokenStandoff]

    def checkRep(self):
        """Assert the representation invariants of this object.

        There must be one POS tag per token, and every token must refer
        to this object's sentence as its entire text.
        """
        # Previously compared len(posTags) with itself, which can never fail.
        assert len(self.tokens) == len(self.posTags)
        for t in self.tokens:
            assert t.entireText == self.sentence

    def govForDep(self, dep):
        """Return the governor of `dep` (KeyError if it has none)."""
        return self.depToGov[dep]

    def depsForGov(self, gov):
        """Return the list of dependents of `gov` (KeyError if it has none)."""
        return self.govToDeps[gov]

    def relForConstituents(self, gov, dep):
        """Return the relation name for the (gov, dep) pair (KeyError if absent)."""
        # Subscript the dict; the original called it, which raised TypeError.
        return self.constituentsToRelation[(gov, dep)]

    def __str__(self):
        lines = ["sentence=" + repr(self.sentence) + "\n"]
        for relation, gov, dep in self.dependencies:
            lines.append(relation + "(" + gov.text + ", " + dep.text + ")\n")
        return "".join(lines)

# Directory of the Stanford parser distribution; filled in by startJvm().
stanford_parser_home = None

def startJvm():
    """Record the Stanford parser location and start the JVM through JPype.

    The parser directory is taken from the STANFORD_PARSER_HOME environment
    variable, which is set to a relative default when it is not already
    defined, and is stored in the module-level `stanford_parser_home`.
    The JVM is started with assertions enabled (-ea) and stanford-parser.jar
    from that directory on the class path.

    If a JVM is already running (for example when the notebook cell is
    re-run), the JVM is left as it is; only `stanford_parser_home` is
    refreshed.
    """
    import os
    global stanford_parser_home
    os.environ.setdefault("STANFORD_PARSER_HOME", r"..\stanford-parser-python-r22186\3rdParty\stanford-parser\stanford-parser-2010-08-20")
    stanford_parser_home = os.environ["STANFORD_PARSER_HOME"]
    # Starting a second JVM in the same Python process fails, so skip it.
    if jpype.isJVMStarted():
        return
    jpype.startJVM(jpype.getDefaultJVMPath(),
                   "-ea",
                   "-Djava.class.path=%s/stanford-parser.jar" % (stanford_parser_home),)

class Parser:
    """Wrapper around the Stanford LexicalizedParser, accessed through JPype.

    startJvm() must have been called before an instance is constructed: the
    constructor resolves Java packages and, when no model file is given,
    builds the default model path from the module-level stanford_parser_home.
    """

    def __init__(self, pcfg_model_fname=None):
        """Load the PCFG model and set up the tokenizer and preprocessor.

        pcfg_model_fname -- path of the serialized grammar; when None, the
            englishPCFG.July-2010.ser file one level above
            stanford_parser_home is used.
        """
        if pcfg_model_fname is None:
            self.pcfg_model_fname = "%s/../englishPCFG.July-2010.ser" % stanford_parser_home
        else:
            self.pcfg_model_fname = pcfg_model_fname

        self.package_lexparser = jpype.JPackage("edu.stanford.nlp.parser.lexparser")

        self.parser = self.package_lexparser.LexicalizedParser(self.pcfg_model_fname)
        self.package = jpype.JPackage("edu.stanford.nlp")

        # The factory is a nested Java class, so its name contains '$' and
        # cannot be written as a plain attribute access.
        tokenizerFactoryClass = self.package.process.__getattribute__("PTBTokenizer$PTBTokenizerFactory")
        self.tokenizerFactory = tokenizerFactoryClass.newPTBTokenizerFactory(True, True)

        self.documentPreprocessor = self.package.process.DocumentPreprocessor(self.tokenizerFactory)

        self.parser.setOptionFlags(["-retainTmpSubcategories"])

    def printInfo(self):
        """Print grammar statistics and the parser's configuration to stdout."""
        Numberer = self.package.util.Numberer
        # The counts are not strings, so convert each one before joining;
        # concatenating them directly to a str raises TypeError.
        grammar_fields = [
            "Grammar",
            Numberer.getGlobalNumberer("states").total(),
            Numberer.getGlobalNumberer("tags").total(),
            Numberer.getGlobalNumberer("words").total(),
            self.parser.pparser.ug.numRules(),
            self.parser.pparser.bg.numRules(),
            self.parser.pparser.lex.numRules(),
        ]
        print("\t".join(str(field) for field in grammar_fields))

        print("ParserPack is ", self.parser.op.tlpParams.getClass())
        print("Lexicon is ", self.parser.pd.lex.getClass())
        print("Tags are: ", Numberer.getGlobalNumberer("tags"))
        self.parser.op.display()
        print("Test parameters")
        self.parser.op.tlpParams.display()
        self.package_lexparser.Test.display()

    def parse(self, sentence):
        """
        Parses the sentence string, returning the tokens and the parse tree as a tuple.
        tokens, tree = parser.parse(sentence)

        Two tags are forced before parsing: the exact word "down" is tagged
        IN, and "bot" (compared case-insensitively) is tagged NN.

        Raises ParserError when the underlying parser reports failure.
        """
        tokens = self.documentPreprocessor.getWordsFromString(sentence)

        for token in tokens:
            word = token.word()
            if word == "down":
                token.setTag("IN")
            if word.lower() == "bot":
                token.setTag("NN")

        wasParsed = self.parser.parse(tokens)

        if not wasParsed:
            raise ParserError("Could not parse " + sentence)
        return tokens, self.parser.getBestParse()

    def parseToStanfordDependencies(self, sentence):
        """Parse `sentence` and return the result as a Dependencies object.

        The result carries one TextStandoff per token, the POS tags read
        from the parse tree's tagged yield, the collapsed-tree typed
        dependencies as (relation name, governor standoff, dependent
        standoff) triples, and wordTokenPairs, a list of
        (token text, POS tag) tuples.

        Raises ParserError when the sentence cannot be parsed.
        """
        tokens, tree = self.parse(sentence)
        standoffTokens = [standoffFromToken(sentence, token) for token in tokens]
        posTags = [token.tag() for token in tree.taggedYield()]
        wordTokenPairs = [(word.text, tag) for word, tag in zip(standoffTokens, posTags)] #added for ANLP project
        result = self.package.trees.EnglishGrammaticalStructure(tree)

        returnList = []
        for dependency in result.typedDependenciesCollapsedTree():
            # index() is shifted down by one to address standoffTokens;
            # presumably the Stanford word indices are 1-based.
            govStandoff = standoffTokens[dependency.gov().index() - 1]
            depStandoff = standoffTokens[dependency.dep().index() - 1]

            returnList.append((str(dependency.reln()),
                               govStandoff,
                               depStandoff))

        return Dependencies(sentence, standoffTokens, posTags, returnList, wordTokenPairs)

# Only one JVM can exist per Python process, so start it once up front.
startJvm()

#---------------------------------------------------------------------
#Read corpus file into list
#---------------------------------------------------------------------

sentence_corpus = []

# One entry per line of the input file, with surrounding whitespace removed.
with open('..\\data\\example_topic_corpus_edited.txt') as f:
    sentence_corpus = [raw_line.strip() for raw_line in f.readlines()]

#---------------------------------------------------------------------
#Write Stanford tagged corpus as file
#---------------------------------------------------------------------

stanford_tagged_sentences = []
parser = Parser()

# The output is a Python list literal: '[' + one "[(word, tag), ...]," line
# per sentence + ']'. It is read back later with ast.literal_eval.
# The 'with' block closes the file even when a sentence fails to parse.
with open('..\\data\\example_tagged_topic_corpus.txt', 'w', encoding = 'utf-8') as file:
    file.write('[')
    for sentence in sentence_corpus:
        file.write(str(parser.parseToStanfordDependencies(sentence).wordTokenPairs))
        file.write(',' + '\n')

    file.write(']')

#---------------------------------------------------------------------
#Rejoin each "@" token with the token that follows it ("@", "user" -> "@user")
#---------------------------------------------------------------------

# Load the tagged corpus written earlier: a list of sentences, each a list
# of (word, tag) tuples.
with open('..\\data\\example_tagged_topic_corpus.txt', 'r', encoding = 'utf-8') as file:
    tag_list = ast.literal_eval(file.read())

new_tag_list = []

for sentence in tag_list:
    new_tag_sentence = []
    skip_next = False  # True when the current pair was already merged into an "@" token

    for tuple_idx, pair in enumerate(sentence):
        if skip_next:
            skip_next = False
            continue

        # A stand-alone "@" is glued onto the following word and takes that
        # word's tag. An "@" that ends the sentence has nothing to join, so
        # it is kept unchanged instead of raising IndexError.
        if pair[0] == "@" and tuple_idx + 1 < len(sentence):
            following_word, following_tag = sentence[tuple_idx + 1][0], sentence[tuple_idx + 1][1]
            new_tag_sentence.append(("@" + following_word, following_tag))
            skip_next = True
        else:
            new_tag_sentence.append(pair)

    new_tag_list.append(new_tag_sentence)

# Overwrite the tagged corpus with the merged version.
with open('..\\data\\example_tagged_topic_corpus.txt', 'w', encoding = 'utf-8') as file:
    file.write(str(new_tag_list))