In [4]:
from spacy.lang.en import English
import stanfordnlp, time, csv, re, os
from stanfordnlp.server import CoreNLPClient
import nltk
from tqdm import tqdm_notebook as tqdm

ModuleNotFoundError: No module named 'spacy'

In [2]:
#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Base tokenizer/tokens classes and utilities."""

import copy


class Tokens(object):
    """A class to represent a list of tokenized text.

    Each element of ``data`` is a tuple; the class constants below are the
    positions of the individual fields inside that tuple.
    """
    TEXT = 0      # token text, see words()
    TEXT_WS = 1   # token text used to rebuild the original, see untokenize()
    SPAN = 2      # [start, end) character offsets, see offsets()
    POS = 3
    LEMMA = 4
    NER = 5

    def __init__(self, data, annotators, opts=None, output = None):
        """
        Args:
            data: list of per-token tuples indexed by the class constants.
            annotators: collection of annotator names ('pos', 'lemma', 'ner')
              that were run; the accessors return None for missing ones.
            opts: optional dict of options (only 'non_ent' is read here).
            output: optional raw annotator output, a dict with a 'sentences'
              list; it is only needed by ssplit().
        """
        self.data = data
        self.annotators = annotators
        self.opts = opts or {}
        # Always define the attribute so ssplit() can test for it instead of
        # failing with AttributeError on instances built without output.
        self.output = output

    def __len__(self):
        """The number of tokens."""
        return len(self.data)

    def slice(self, i=None, j=None):
        """Return a view of the list of tokens from [i, j)."""
        new_tokens = copy.copy(self)
        new_tokens.data = self.data[i: j]
        return new_tokens

    def ssplit(self):
        """Returns the text split into sentences.

        Sentence boundaries come from the character offsets of the first and
        last token of every entry in ``self.output['sentences']``. When no
        annotator output is attached, the whole text is returned as a single
        sentence.

        Returns:
            A list of sentence strings (empty when there are no tokens).
        """
        if not self.data:
            return []

        text = self.untokenize()
        if not self.output:
            return [text] if text else []

        # untokenize() starts at the first token of this view, while the
        # annotator offsets are absolute, so shift them by that token's start.
        base = self.data[0][self.SPAN][0]

        s_list = []
        for sentence in self.output['sentences']:
            sent_tokens = sentence['tokens']
            if not sent_tokens:
                continue
            # The end offset is exclusive, so no "+ 1" is needed. Clamp to
            # the text so sentences outside a sliced view are dropped.
            begin = max(sent_tokens[0]['characterOffsetBegin'] - base, 0)
            end = min(sent_tokens[-1]['characterOffsetEnd'] - base, len(text))
            if begin < end:
                s_list.append(text[begin:end].strip())

        return s_list

    def untokenize(self):
        """Returns the original text (with whitespace reinserted)."""
        return ''.join([t[self.TEXT_WS] for t in self.data]).strip()

    def words(self, uncased=False):
        """Returns a list of the text of each token

        Args:
            uncased: lower cases text
        """
        if uncased:
            return [t[self.TEXT].lower() for t in self.data]
        else:
            return [t[self.TEXT] for t in self.data]

    def offsets(self):
        """Returns a list of [start, end) character offsets of each token."""
        return [t[self.SPAN] for t in self.data]

    def pos(self):
        """Returns a list of part-of-speech tags of each token.
        Returns None if this annotation was not included.
        """
        if 'pos' not in self.annotators:
            return None
        return [t[self.POS] for t in self.data]

    def lemmas(self):
        """Returns a list of the lemmatized text of each token.
        Returns None if this annotation was not included.
        """
        if 'lemma' not in self.annotators:
            return None
        return [t[self.LEMMA] for t in self.data]

    def entities(self):
        """Returns a list of named-entity-recognition tags of each token.
        Returns None if this annotation was not included.
        """
        if 'ner' not in self.annotators:
            return None
        return [t[self.NER] for t in self.data]

    def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True):
        """Returns a list of all ngrams from length 1 to n.

        Args:
            n: upper limit of ngram length
            uncased: lower cases text
            filter_fn: user function that takes in an ngram list and returns
              True or False to keep or not keep the ngram
            as_strings: return the ngram as a string vs. a (start, end) pair
              of token indices
        """
        def _skip(gram):
            if not filter_fn:
                return False
            return filter_fn(gram)

        words = self.words(uncased)
        ngrams = [(s, e + 1)
                  for s in range(len(words))
                  for e in range(s, min(s + n, len(words)))
                  if not _skip(words[s:e + 1])]

        # Concatenate into strings
        if as_strings:
            ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams]

        return ngrams

    def entity_groups(self):
        """Group consecutive entity tokens with the same NER tag.

        Returns:
            A list of (text, tag) pairs, or None when there are no NER tags.
        """
        entities = self.entities()
        if not entities:
            return None
        non_ent = self.opts.get('non_ent', 'O')
        groups = []
        idx = 0
        while idx < len(entities):
            ner_tag = entities[idx]
            # Check for entity tag
            if ner_tag != non_ent:
                # Chomp the sequence
                start = idx
                while (idx < len(entities) and entities[idx] == ner_tag):
                    idx += 1
                groups.append((self.slice(start, idx).untokenize(), ner_tag))
            else:
                idx += 1
        return groups


class Tokenizer(object):
    """Abstract base for tokenizers.

    Concrete tokenizers override ``tokenize`` (which should return a Tokens
    object) and, when they hold resources, ``shutdown``.
    """

    def __del__(self):
        """Run shutdown() when the instance is garbage collected."""
        self.shutdown()

    def shutdown(self):
        """Release resources held by the tokenizer; nothing to do here."""
        return None

    def tokenize(self, text):
        """Tokenize ``text``; must be provided by subclasses."""
        raise NotImplementedError


In [3]:
#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Simple wrapper around the Stanford CoreNLP pipeline.

Serves commands to a java subprocess running the jar. Requires java 8.
"""

import copy
import json
import pexpect

# from .tokenizer import Tokens, Tokenizer
# from . import DEFAULTS


class CoreNLPTokenizer(Tokenizer):
    """Tokenizer that feeds text to a CoreNLP java subprocess via pexpect."""

    # Classpath used when the caller does not pass ``classpath``.
    DEFAULT_CLASSPATH = '/home/chao/stanfordnlp/demo/corenlp/*'

    # CoreNLP bracket placeholders and the characters they stand for.
    _BRACKETS = {
        '-LRB-': '(',
        '-RRB-': ')',
        '-LSB-': '[',
        '-RSB-': ']',
        '-LCB-': '{',
        '-RCB-': '}',
    }

    def __init__(self, **kwargs):
        """
        Args:
            annotators: set that can include pos, lemma, and ner.
            classpath: Path to the corenlp directory of jars
              (defaults to DEFAULT_CLASSPATH).
            mem: Java heap memory
        """
        # Honour the documented ``classpath`` argument instead of always
        # using the hard-coded location.
        self.classpath = kwargs.get('classpath') or self.DEFAULT_CLASSPATH
        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
        self.mem = kwargs.get('mem', '2g')
        self._launch()

    def _launch(self):
        """Start the CoreNLP jar with pexpect."""
        annotators = ['tokenize', 'ssplit']
        # Each requested annotator also pulls in the ones listed before it.
        if 'ner' in self.annotators:
            annotators.extend(['pos', 'lemma', 'ner'])
        elif 'lemma' in self.annotators:
            annotators.extend(['pos', 'lemma'])
        elif 'pos' in self.annotators:
            annotators.extend(['pos'])
        annotators = ','.join(annotators)
        options = ','.join(['untokenizable=noneDelete',
                            'invertible=true'])
        cmd = ['java', '-mx' + self.mem, '-cp', '"%s"' % self.classpath,
               'edu.stanford.nlp.pipeline.StanfordCoreNLP', '-annotators',
               annotators, '-tokenize.options', options,
               '-outputFormat', 'json', '-prettyPrint', 'false']

        # We use pexpect to keep the subprocess alive and feed it commands.
        # Because we don't want to get hit by the max terminal buffer size,
        # we turn off canonical input processing to have unlimited bytes.
        self.corenlp = pexpect.spawn('/bin/bash', maxread=100000, timeout=60)
        self.corenlp.setecho(False)
        self.corenlp.sendline('stty -icanon')
        self.corenlp.sendline(' '.join(cmd))
        self.corenlp.delaybeforesend = 0
        self.corenlp.delayafterread = 0
        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

    @staticmethod
    def _convert(token):
        """Map a bracket placeholder such as -LRB- back to its character."""
        return CoreNLPTokenizer._BRACKETS.get(token, token)

    def tokenize(self, text):
        """Tokenize ``text`` with the running CoreNLP process.

        Args:
            text: the string to tokenize; newlines are sent as spaces.

        Returns:
            A Tokens object. Except for the single-token "q" override below,
            it carries the parsed CoreNLP output.

        Raises:
            RuntimeError: if ``text`` contains the prompt marker 'NLP>'.
            ValueError: if no JSON document is found in the process output.
        """
        # Since we're feeding text to the commandline, we're waiting on seeing
        # the NLP> prompt. Hacky!
        if 'NLP>' in text:
            raise RuntimeError('Bad token (NLP>) in text!')

        # Sending q will cause the process to quit -- manually override
        if text.lower().strip() == 'q':
            token = text.strip()
            index = text.index(token)
            data = [(token, text[index:], (index, index + 1), 'NN', 'q', 'O')]
            return Tokens(data, self.annotators)

        # Minor cleanup before tokenizing.
        clean_text = text.replace('\n', ' ')

        self.corenlp.sendline(clean_text.encode('utf-8'))
        self.corenlp.expect_exact('NLP>', searchwindowsize=100)

        # Skip to start of output (may have been stderr logging messages)
        output = self.corenlp.before
        start = output.find(b'{\r\n  "sentences":')
        if start < 0:
            # '-prettyPrint false' is requested, so accept compact JSON too.
            start = output.find(b'{"sentences":')
        if start < 0:
            # find() returned -1; slicing from there would silently parse
            # only the last byte of the output.
            raise ValueError('No CoreNLP JSON found in subprocess output')
        output = json.loads(output[start:].decode('utf-8'))

        data = []
        tokens = [t for s in output['sentences'] for t in s['tokens']]
        for i in range(len(tokens)):
            # Get whitespace: a token's text runs up to the start of the next
            # token, or to its own end when it is the last one.
            start_ws = tokens[i]['characterOffsetBegin']
            if i + 1 < len(tokens):
                end_ws = tokens[i + 1]['characterOffsetBegin']
            else:
                end_ws = tokens[i]['characterOffsetEnd']

            data.append((
                self._convert(tokens[i]['word']),
                text[start_ws: end_ws],
                (tokens[i]['characterOffsetBegin'],
                 tokens[i]['characterOffsetEnd']),
                tokens[i].get('pos', None),
                tokens[i].get('lemma', None),
                tokens[i].get('ner', None)
            ))
        return Tokens(data, self.annotators, output = output)


NameError: name 'Tokenizer' is not defined

In [2]:
# Create the CoreNLP-backed tokenizer with its default settings; the
# constructor launches the java subprocess.
tok = CoreNLPTokenizer()


NameError: name 'CoreNLPTokenizer' is not defined

In [5]:
# Build the pipeline here so the cell works on a fresh kernel and can be
# re-run without adding the sentencizer a second time.
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# nlp = spacy.load('en_core_web_sm')

def spacy_sentence_splitter(text):
    """Return the text of every sentence that the ``nlp`` pipeline finds."""
    return [sentence.text for sentence in nlp(text).sents]

def spacy_tokenizer(text):
    """Return the text of every token that the ``nlp`` pipeline produces."""
    return [tok_.text for tok_ in nlp(text)]

In [6]:
# Directory holding the article text files. It must end with "/" because it
# is concatenated directly with the file name when the articles are loaded.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
article_path = "/home/chao/research_1_newsela_alignment_898/newsela_article_corpus_2016-01-29/articles/"
# Pipe-delimited CSV from which the article names are extracted.
newsela_data_path = "/home/chao/research_1_newsela_alignment_898/data/current_2019_10_11_v12.csv"


In [7]:
def read_csv_file_newsela(path):
    """Read a pipe-delimited CSV file, skipping its header row.

    Quoting is disabled, so quote characters are kept as ordinary text.

    Args:
        path: path of the CSV file.

    Returns:
        A list of rows, each row being a list of field strings.
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, newline='') as f:
        # quotechar=None rather than '': Python 3.11+ rejects an empty
        # quotechar, and None is valid together with QUOTE_NONE.
        reader = csv.reader(f, delimiter='|', quotechar=None, quoting=csv.QUOTE_NONE)
        next(reader, None)  # drop the header row (no-op on an empty file)
        return list(reader)

In [8]:
def extract_all_article_names(path):
    """Collect the distinct article names found in the first CSV column.

    The name is the part of the first field that lies between its first dash
    and its sixth-from-last dash. The header row and blank rows are skipped.

    Args:
        path: path of the pipe-delimited CSV file.

    Returns:
        A sorted list of unique article names.

    Raises:
        IndexError: if a first field contains fewer than six dashes.
    """
    names = set()
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, newline='') as f:
        # quotechar=None rather than '': Python 3.11+ rejects an empty
        # quotechar, and None is valid together with QUOTE_NONE.
        reader = csv.reader(f, delimiter='|', quotechar=None, quoting=csv.QUOTE_NONE)
        next(reader, None)  # drop the header row
        for line in reader:
            if not line:
                # A blank line is parsed as an empty row; nothing to extract.
                continue
            sent_id = line[0]
            dashes = [pos for pos, val in enumerate(sent_id) if val == "-"]
            names.add(sent_id[dashes[0] + 1:dashes[-6]])

    # Sorted so repeated runs give the same order (set order is arbitrary).
    return sorted(names)

In [9]:
def read_article(path):
    """Load a text file as a list of its whitespace-trimmed, non-empty lines."""
    with open(path) as handle:
        trimmed = (raw.strip() for raw in handle)
        return [text for text in trimmed if text != ""]

In [10]:
# Unique article names taken from the first column of the CSV file.
all_names = extract_all_article_names(newsela_data_path)

In [11]:
# Maps each article file path to the list of its non-blank lines.
path_to_article = {}

for i in all_names:
    # Five files are read per article name, suffixed .0 through .4.
    for j in range(5):
        path = "{}{}.{}.txt".format(article_path, i, j)
        article = read_article(path)
        path_to_article[path] = article
        

In [5]:
# Quick manual check of sentence splitting on a short text that contains
# several periods and a parenthesised phrase.
tok.tokenize("Hi. I am Chao. I come from No.1 Middle School (XX School).").ssplit()

NameError: name 'tok' is not defined

In [13]:
# Count the lines whose sentences, re-joined with single spaces, no longer
# reproduce the whitespace-normalised line, and print each such case.
count = 0
error_cases = []

for k, v in tqdm(path_to_article.items()):

    for line in v:
        # Lines starting with "##" are not checked.
        if line.startswith("##"):
            continue

        # This step is quite important: the original text contains \xa0,
        # which looks like a white space.
        line = " ".join(line.split())
        ssplit = tok.tokenize(line).ssplit()
        ssplit_join = " ".join(ssplit)
        if ssplit_join == line:
            continue

        # Same length but different content: report every differing position
        # as D-<length>-<index>-<re-joined char>-<original char>-.
        if len(ssplit_join) == len(line):
            for char_idx, char in enumerate(ssplit_join):
                if char != line[char_idx]:
                    print("D-{}-{}-{}-{}-".format(
                        len(line),
                        char_idx,
                        char.encode('unicode_escape').decode(),
                        line[char_idx].encode('unicode_escape').decode()))

        # Every mismatch is counted and dumped the same way.
        count += 1
        print(line)
        print(ssplit_join)
        print("\n")
        for i in ssplit:
            print(i)
        print("\n\n")


print(count)
            


HBox(children=(IntProgress(value=0, max=250), HTML(value='')))


0
