In [1]:
from stanfordcorenlp import StanfordCoreNLP

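Note: you need to download the Stanford CoreNLP distribution and save it on your local disk. The `STANFORD_CORE_LOCAL_PATH` constant in the next cell must point to that folder.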

In [2]:
# Folder of the locally saved Stanford CoreNLP distribution; this value is passed to
# StanfordCoreNLP(...) whenever a Preprocessing object is created.
# NOTE(review): machine-specific absolute Windows path — change it to your own download location.
STANFORD_CORE_LOCAL_PATH = r'C:\Users\febrah1\AppData\Local\Programs\Python\Python37\Lib\site-packages\stanfordcorenlp\stanford-corenlp-full-2018-10-05'
class Preprocessing:
    """Read a text file line by line and annotate its lines with Stanford CoreNLP.

    Every line of the input file (trailing newline included) is stored as one
    entry of ``self.sentences``. The ``Get*`` methods run one StanfordCoreNLP
    annotator over a range of those entries and return one result per entry.

    The object can be used as a context manager so the CoreNLP wrapper is
    closed automatically::

        with Preprocessing('corpus.txt') as corpus:
            tokens = corpus.GetTokens(0, 2)

    Attributes:
        path: the ``dataPath`` given to the constructor.
        sentences: list of the raw lines read from ``path``.
        stanfordNlp: the ``StanfordCoreNLP`` wrapper used by all ``Get*`` methods.
    """

    def __init__(self, dataPath, corenlpPath=None, encoding=None):
        """Load the lines of ``dataPath`` and create the CoreNLP wrapper.

        Args:
            dataPath: path of the text file to load, one entry per line.
            corenlpPath: folder of the Stanford CoreNLP distribution. ``None``
                (default) falls back to the module-level
                ``STANFORD_CORE_LOCAL_PATH``.
            encoding: text encoding forwarded to ``open``. ``None`` (default)
                keeps the platform default encoding.

        Raises:
            OSError: if ``dataPath`` cannot be opened (e.g. ``FileNotFoundError``).
        """
        self.path = dataPath
        # Read the data before creating the CoreNLP wrapper: if the file is
        # missing or unreadable we fail here, with no wrapper left open.
        with open(self.path, encoding=encoding) as file:
            self.sentences = list(file)
        if corenlpPath is None:
            corenlpPath = STANFORD_CORE_LOCAL_PATH
        self.stanfordNlp = StanfordCoreNLP(corenlpPath)

    def close(self):
        """Close the StanfordCoreNLP wrapper held by this object."""
        self.stanfordNlp.close()

    def __enter__(self):
        """Return ``self`` so the object can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the CoreNLP wrapper on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def _annotate_range(self, annotate, startIndex, endIndex):
        """Apply ``annotate`` to ``self.sentences[startIndex:endIndex]`` entry by entry.

        ``endIndex`` of ``0`` or ``None`` means "up to the last loaded entry".
        Entries are accessed by index, so an ``endIndex`` beyond the number of
        loaded entries raises ``IndexError``.
        """
        if endIndex is None or endIndex == 0:
            endIndex = len(self.sentences)
        return [annotate(self.sentences[index]) for index in range(startIndex, endIndex)]

    def GetTokens(self, startIndex = 0, endIndex = 0):
        """Return the ``word_tokenize`` result for each entry in the index range.

        Args:
            startIndex: index of the first entry to process.
            endIndex: index one past the last entry; ``0`` (default) or ``None``
                means all remaining entries.

        Returns:
            A list with one ``word_tokenize`` result per processed entry.
        """
        return self._annotate_range(self.stanfordNlp.word_tokenize, startIndex, endIndex)

    def GetPosTags(self, startIndex = 0, endIndex = 0):
        """Return the ``pos_tag`` result for each entry in the index range.

        Arguments behave as in ``GetTokens``.
        """
        return self._annotate_range(self.stanfordNlp.pos_tag, startIndex, endIndex)

    def GetNER(self, startIndex = 0, endIndex = 0):
        """Return the ``ner`` result for each entry in the index range.

        Arguments behave as in ``GetTokens``.
        """
        return self._annotate_range(self.stanfordNlp.ner, startIndex, endIndex)

    def GetParses(self, startIndex = 0, endIndex = 0):
        """Return the ``parse`` result for each entry in the index range.

        Arguments behave as in ``GetTokens``.
        """
        return self._annotate_range(self.stanfordNlp.parse, startIndex, endIndex)

    def GetDependencyParse(self, startIndex = 0, endIndex = 0):
        """Return the ``dependency_parse`` result for each entry in the index range.

        Arguments behave as in ``GetTokens``.
        """
        return self._annotate_range(self.stanfordNlp.dependency_parse, startIndex, endIndex)

In [3]:
# Load 'tmVarCorpus.txt'; every line of the file becomes one entry to annotate.
tmVarObject = Preprocessing(dataPath='tmVarCorpus.txt')

In [6]:
# Tokenize the first two corpus entries (indices 0 and 1).
print(tmVarObject.GetTokens(startIndex=0, endIndex=2))

[['A', 'novel', 'missense', 'mutation', 'Asp506Gly', 'in', 'Exon', '13', 'of', 'the', 'F11', 'gene', 'in', 'an', 'asymptomatic', 'Korean', 'woman', 'with', 'mild', 'factor', 'XI', 'deficiency', '.'], ['Factor', 'XI', '(', 'FXI', ')', 'deficiency', 'is', 'a', 'rare', 'autosomal', 'recessive', 'coagulation', 'disorder', 'most', 'commonly', 'found', 'in', 'Ashkenazi', 'and', 'Iraqi', 'Jews', ',', 'but', 'it', 'is', 'also', 'found', 'in', 'other', 'ethnic', 'groups', '.', 'It', 'is', 'a', 'trauma', 'or', 'surgery-related', 'bleeding', 'disorder', ',', 'but', 'spontaneous', 'bleeding', 'is', 'rarely', 'seen', '.', 'The', 'clinical', 'manifestation', 'of', 'bleeding', 'in', 'FXI', 'deficiency', 'cases', 'is', 'variable', 'and', 'seems', 'to', 'poorly', 'correlate', 'with', 'plasma', 'FXI', 'levels', '.', 'The', 'molecular', 'pathology', 'of', 'FXI', 'deficiency', 'is', 'mutation', 'in', 'the', 'F11', 'gene', 'on', 'the', 'chromosome', 'band', '4q35', '.', 'We', 'report', 'a', 'novel', 'mutat

In [5]:
# Part-of-speech tags for the first two corpus entries (indices 0 and 1).
print(tmVarObject.GetPosTags(startIndex=0, endIndex=2))

[[('A', 'DT'), ('novel', 'JJ'), ('missense', 'NN'), ('mutation', 'NN'), ('Asp506Gly', 'NN'), ('in', 'IN'), ('Exon', 'NNP'), ('13', 'CD'), ('of', 'IN'), ('the', 'DT'), ('F11', 'NN'), ('gene', 'NN'), ('in', 'IN'), ('an', 'DT'), ('asymptomatic', 'JJ'), ('Korean', 'JJ'), ('woman', 'NN'), ('with', 'IN'), ('mild', 'JJ'), ('factor', 'NN'), ('XI', 'NN'), ('deficiency', 'NN'), ('.', '.')], [('Factor', 'NN'), ('XI', 'NN'), ('(', '-LRB-'), ('FXI', 'NN'), (')', '-RRB-'), ('deficiency', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('rare', 'JJ'), ('autosomal', 'JJ'), ('recessive', 'JJ'), ('coagulation', 'NN'), ('disorder', 'NN'), ('most', 'RBS'), ('commonly', 'RB'), ('found', 'VBN'), ('in', 'IN'), ('Ashkenazi', 'NNP'), ('and', 'CC'), ('Iraqi', 'JJ'), ('Jews', 'NNPS'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('is', 'VBZ'), ('also', 'RB'), ('found', 'VBN'), ('in', 'IN'), ('other', 'JJ'), ('ethnic', 'JJ'), ('groups', 'NNS'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('trauma', 'NN'), ('or', 'CC')

In [7]:
# Named-entity labels for the first two corpus entries (indices 0 and 1).
print(tmVarObject.GetNER(startIndex=0, endIndex=2))

[[('A', 'O'), ('novel', 'O'), ('missense', 'O'), ('mutation', 'O'), ('Asp506Gly', 'O'), ('in', 'O'), ('Exon', 'O'), ('13', 'NUMBER'), ('of', 'O'), ('the', 'O'), ('F11', 'O'), ('gene', 'O'), ('in', 'O'), ('an', 'O'), ('asymptomatic', 'O'), ('Korean', 'NATIONALITY'), ('woman', 'O'), ('with', 'O'), ('mild', 'O'), ('factor', 'O'), ('XI', 'O'), ('deficiency', 'O'), ('.', 'O')], [('Factor', 'O'), ('XI', 'O'), ('(', 'O'), ('FXI', 'ORGANIZATION'), (')', 'O'), ('deficiency', 'O'), ('is', 'O'), ('a', 'O'), ('rare', 'O'), ('autosomal', 'O'), ('recessive', 'O'), ('coagulation', 'O'), ('disorder', 'O'), ('most', 'O'), ('commonly', 'O'), ('found', 'O'), ('in', 'O'), ('Ashkenazi', 'MISC'), ('and', 'O'), ('Iraqi', 'NATIONALITY'), ('Jews', 'MISC'), (',', 'O'), ('but', 'O'), ('it', 'O'), ('is', 'O'), ('also', 'O'), ('found', 'O'), ('in', 'O'), ('other', 'O'), ('ethnic', 'O'), ('groups', 'O'), ('.', 'O'), ('It', 'O'), ('is', 'O'), ('a', 'O'), ('trauma', 'O'), ('or', 'O'), ('surgery-related', 'O'), ('blee

In [8]:
# Dependency parses for the first two corpus entries (indices 0 and 1).
print(tmVarObject.GetDependencyParse(startIndex=0, endIndex=2))

[[('ROOT', 0, 5), ('det', 5, 1), ('amod', 5, 2), ('compound', 5, 3), ('compound', 5, 4), ('case', 7, 6), ('nmod', 5, 7), ('nummod', 7, 8), ('case', 12, 9), ('det', 12, 10), ('compound', 12, 11), ('nmod', 7, 12), ('case', 17, 13), ('det', 17, 14), ('amod', 17, 15), ('amod', 17, 16), ('nmod', 5, 17), ('case', 22, 18), ('amod', 22, 19), ('compound', 22, 20), ('compound', 22, 21), ('nmod', 17, 22), ('punct', 5, 23)], [('ROOT', 0, 13), ('compound', 2, 1), ('compound', 6, 2), ('punct', 4, 3), ('appos', 2, 4), ('punct', 4, 5), ('nsubj', 13, 6), ('cop', 13, 7), ('det', 13, 8), ('amod', 13, 9), ('amod', 13, 10), ('amod', 13, 11), ('compound', 13, 12), ('advmod', 15, 14), ('advmod', 16, 15), ('acl', 13, 16), ('case', 18, 17), ('nmod', 16, 18), ('cc', 18, 19), ('amod', 21, 20), ('conj', 18, 21), ('punct', 13, 22), ('cc', 13, 23), ('nsubjpass', 27, 24), ('auxpass', 27, 25), ('advmod', 27, 26), ('conj', 13, 27), ('case', 31, 28), ('amod', 31, 29), ('amod', 31, 30), ('nmod', 27, 31), ('punct', 13, 3

In [9]:
# Close the StanfordCoreNLP wrapper created for tmVarObject; a new wrapper is created
# each time a Preprocessing object is constructed.
tmVarObject.stanfordNlp.close()

In [10]:
# Load 'all_dictionary_phrases.txt'; every line of the file becomes one entry to annotate.
phrasesObject = Preprocessing(dataPath='all_dictionary_phrases.txt')

In [11]:
# Tokenize the eleven phrase entries at indices 10559 through 10569.
print(phrasesObject.GetTokens(startIndex=10559, endIndex=10570))

[['abscisic', 'acid', 'stress', 'ripening'], ['abscisic', 'aldehyde'], ['abscisic', 'aldehyde', 'oxidase'], ['abscission', 'layer'], ['abscission', 'zone'], ['abscission', 'zone', 'cells'], ['abscission', 'zones'], ['abscopal', 'effect'], ['abscopal', 'responses'], ['absence', 'epilepsies'], ['absence', 'epilepsy']]


In [12]:
# Part-of-speech tags for the phrase entries at indices 10559 through 10569.
print(phrasesObject.GetPosTags(startIndex=10559, endIndex=10570))

[[('abscisic', 'JJ'), ('acid', 'NN'), ('stress', 'NN'), ('ripening', 'VBG')], [('abscisic', 'JJ'), ('aldehyde', 'NN')], [('abscisic', 'JJ'), ('aldehyde', 'NN'), ('oxidase', 'NN')], [('abscission', 'NN'), ('layer', 'NN')], [('abscission', 'NN'), ('zone', 'NN')], [('abscission', 'NN'), ('zone', 'NN'), ('cells', 'NNS')], [('abscission', 'NN'), ('zones', 'NNS')], [('abscopal', 'JJ'), ('effect', 'NN')], [('abscopal', 'JJ'), ('responses', 'NNS')], [('absence', 'NN'), ('epilepsies', 'NNS')], [('absence', 'NN'), ('epilepsy', 'NN')]]


In [13]:
# Named-entity labels for the phrase entries at indices 10559 through 10569.
print(phrasesObject.GetNER(startIndex=10559, endIndex=10570))

[[('abscisic', 'O'), ('acid', 'O'), ('stress', 'O'), ('ripening', 'O')], [('abscisic', 'O'), ('aldehyde', 'O')], [('abscisic', 'O'), ('aldehyde', 'O'), ('oxidase', 'O')], [('abscission', 'O'), ('layer', 'TITLE')], [('abscission', 'O'), ('zone', 'O')], [('abscission', 'O'), ('zone', 'O'), ('cells', 'O')], [('abscission', 'O'), ('zones', 'O')], [('abscopal', 'O'), ('effect', 'O')], [('abscopal', 'O'), ('responses', 'O')], [('absence', 'O'), ('epilepsies', 'O')], [('absence', 'O'), ('epilepsy', 'CAUSE_OF_DEATH')]]


In [14]:
# Dependency parses for the phrase entries at indices 10559 through 10569.
print(phrasesObject.GetDependencyParse(startIndex=10559, endIndex=10570))

[[('ROOT', 0, 3), ('amod', 3, 1), ('compound', 3, 2), ('dep', 3, 4)], [('ROOT', 0, 2), ('amod', 2, 1)], [('ROOT', 0, 3), ('amod', 3, 1), ('compound', 3, 2)], [('ROOT', 0, 2), ('compound', 2, 1)], [('ROOT', 0, 2), ('compound', 2, 1)], [('ROOT', 0, 3), ('compound', 3, 1), ('compound', 3, 2)], [('ROOT', 0, 2), ('compound', 2, 1)], [('ROOT', 0, 2), ('amod', 2, 1)], [('ROOT', 0, 2), ('amod', 2, 1)], [('ROOT', 0, 2), ('compound', 2, 1)], [('ROOT', 0, 2), ('compound', 2, 1)]]


In [15]:
# Close the StanfordCoreNLP wrapper created for phrasesObject now that the demo is finished.
phrasesObject.stanfordNlp.close()