In [1]:
import pandas as pd
import re
import os, csv, sys
import pyfasttext as ft
import multiprocessing
import nltk
from nltk.tokenize import WordPunctTokenizer

# Read the existing datasets

The Discovery dataset is distributed using InferSent compatible format;

You can use the following cells in order to convert it to csv/tsv with desired format:

In [2]:
data_path = "/data/name/Discovery/data/" # replace with repository data path
data_path = "/data/sileo/libs/Discovery/data/"
dataset_path = os.path.join(data_path,"DiscoveryBase")

In [3]:
l_df=[]
for cv in ["train","test","dev"]:
    df_cv=pd.DataFrame(list(zip(
        open(f"{dataset_path}/s1.{cv}").read().splitlines(),
        open(f"{dataset_path}/s2.{cv}").read().splitlines(),
        open(f"{dataset_path}/labels.{cv}").read().splitlines()))
        ,columns=["s1","s2","y"])
    df_cv["set"]=cv
    l_df+=[df_cv]
    
df_discovery = pd.concat(l_df)
del l_df, df_cv
df_discovery.head()

Unnamed: 0,s1,s2,y,set
0,He helped to found the Mexican American ...,Sanchez became involved with the American...,"subsequently,",train
1,Then click the `` Paper Clip '' button ...,You can use any handheld device that ru...,"alternately,",train
2,That 's a long way for a program that ...,FAU is a program that under Cooney and ...,"presently,",train
3,"FORT DRUM , N.Y. - Throughout its histo...",Soldiers receive assignments to multiple ...,"typically,",train
4,They continued to dig noting that there ...,Every ten feet they found a layer of l...,"curiously,",train


### Export in another format:

In [4]:
data_dir = "/data/name/Discovery/data/DiscoveryBig/"  # replace with desired output path
data_dir = "/data/sileo/libs/Discovery/data/DiscoveryBig/"

for cv in ["train","dev","test"]:
    df_discovery[df_discovery.set==cv].to_csv(data_dir+f"/{cv}.tsv", index=False, sep="\t")

## BERT Processor
To be used with BERT Huggingface package

In [5]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
        
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines
        
class DiscoveryProcessor(DataProcessor):
    """Processor for the Discovery data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return list(disc_label.keys())

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = line[1].lower()
            text_b = line[0].lower()
            label = line[2]
            
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

In [6]:
disc = DiscoveryProcessor()
x = disc.get_dev_examples(data_dir)[0]
print(x.text_a, x.text_b, x.label)

what  you  are  asking  them  to  do  costs  tons  of  money  . you  have  no  idea  how  much  work  goes  into  outreach  for  a  school  . first,


# Mining markers on a new corpus

### Remove low quality sentences

In [7]:
# download lid from https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

path_to_lid = "your_path/lib.176.bin" # replace with lid path
path_to_lid = "/data/sileo/fasttext/lid.176.bin"
lid = ft.FastText(path_to_lid)

def lid_scorer(s, lang="en"):
    y=lid.predict_proba([s])[0]
    if not y:
        return 0.0
    y=y[0]
    if y[0]!=lang:
        return 0.0
    else:
        return y[1]

def matched(s, c1, c2):
    count = 0
    for i in s:
        if i == c1:
            count += 1
        elif i == c2:
            count -= 1
        if count < 0:
            return False
    return count == 0


def is_valid(s, lang="en", badchars="", len_range=range(2,32)): 
    good_len = len(s.split()) in len_range
    if not good_len: #avoid computations on long sequences
        return False
    is_en = lid_scorer(s, lang=lang)>0.75
    if not is_en:
        return False
    letters_prop = (len(re.sub("[^a-zA-Z]+", "", s))/len(s))>0.5
    parenthesis_balance = matched(s,"(",")")
    quote_balance = s.count('"')%2==0
    no_bad_char = not any(x in s for x in badchars)
    has_lowercase = sum(x.islower() for x in s)>sum(x.isupper() for x in s)
    has_lowercase = has_lowercase and sum(x.islower() for x in s)>sum(x.istitle() for x in s) 
    return letters_prop and parenthesis_balance and quote_balance and no_bad_char and has_lowercase and is_en and good_len


### Extract adverbs or known markers

In [8]:
tokenizer = WordPunctTokenizer().tokenize

# initial list of markers; our list of 174 markers can be used instead. 
pdtb_markers = set(open(os.path.join(data_path, "pdtb_markers_list")).read().splitlines())

def get_marker(s):
    s = s.replace(" ,", ",")
    y=[w for w in pdtb_markers if s.lower().startswith(w)]
    if y:
        return y[0]
    else:
        return ""
    
# The mining of new markers can be disabled.
def get_adverb_or_marker(s):
    t_s = tokenizer(s)[:12]
    if len(t_s)>2 and nltk.pos_tag(t_s)[0][1] in ["RB",] and t_s[1]=="," and t_s[0].istitle():
        return t_s[0]
    else:
        return get_marker(s)

### Example on depcc corpus
depcc is a web corpus  built upon common crawl web data https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/depcc.html

In [9]:
import glob, wget, gzip, random
from nltk.tokenize import sent_tokenize

In [10]:
corpus_path = "" #path where you want to download depcc parts
corpus_path = "/data/sileo/libs/Discovery/data/"

def build_l(k):
    c=0
    d={}
    l=[]
    filename=f"part-m-{str(k).zfill(5)}.gz"

    while len(glob.glob(f"{corpus_path}/*.gz"))>100:
        time.sleep(20)
        
    print(f"downloading {k}")
    url = f"http://ltdata1.informatik.uni-hamburg.de/depcc/corpus/parsed/{filename}"
    wget.download(url, out=corpus_path)
    print(f"downloaded {k}")
    
    sentences=[]
    s_prev=""
    for line in gzip.open(os.path.join(corpus_path, filename), 'rb'):
        line = line.decode("utf8")
        if line[0]!="#":
            continue
        c+=1
        
        tag = line[2:20].split(" =")[0]
        if tag=="parser":
            continue
        content = line[line.find("=")+2:].strip()
        if tag=="sent_id":
            content=content.split("#")[-1]
        if tag=="text":
            sentences+=[content]

        if tag=="newdoc	url":
            d["doc_id"]=content
            full_text = " ".join(sentences)
            sentences = sent_tokenize(full_text)
            for i, pair in enumerate((zip([""]+sentences, sentences))): # iterate over sentence pairs
                marker = get_adverb_or_marker(pair[1])
                if marker.title() in ["And", "But"] and random.random()>0.1: # "And" and "But" are downsampled
                    continue
                if (marker or random.random()<0.002) and is_valid(pair[0]) and is_valid(pair[1]): # Examples with no adverbs are also downsampled
                    d["s2"]=pair[1]
                    d["s1"]=pair[0]
                    d["sent_id"]=i
                    d["marker"]=marker
                    l+=[dict(d)]
            sentences=[]
            s_prev=""
    pd.DataFrame(l).to_csv(f"{corpus_path}/df-{k}.csv", index=False)
    os.remove(f"{corpus_path}/{filename}")
    print(f"done {k}")
    return l

In [11]:
n_threads = 2
nb_files = 4 # experiment on a small subset of depcc
pool = multiprocessing.Pool(n_threads)
out = pool.map(build_l, range(0, 4))

downloading 1
downloading 0
downloaded 0
downloaded 1
done 1
downloading 2
downloaded 2
done 0
downloading 3
downloaded 3
done 2
done 3


### Remove marker at the beginning of s2 and capitalize the resulting sentence

In [12]:
df_depcc_sample = pd.read_csv(f"{corpus_path}/df-1.csv")

In [13]:
df_depcc_sample["marker"]=df_depcc_sample["marker"].map(str)
df_depcc_sample["s2"] = df_depcc_sample.apply(lambda x: str(x["s2"])[len(x["marker"]):].lstrip(", ").capitalize(), axis=1)

In [14]:
df_depcc_sample.head()

Unnamed: 0,doc_id,marker,s1,s2,sent_id
0,http://pandasthumb.org/archives/evolution/irre...,Firstly,"The more I looked at this sentence, the crasse...",How many male presidents would be defined by t...,2
1,http://pandasthumb.org/archives/evolution/irre...,Instead,"In an ideal world, a cigar-chomping, apoplecti...",The sentence gets syndicated around the world.,8
2,http://sportspressnw.com/2144381/2012/whos-the...,So,When you make these kinds of claims in the tee...,Try again.,2
3,http://sportspressnw.com/2144381/2012/whos-the...,Apparently,I never got to meet him...but now there is You...,Michael behe just doesn't know when to pack it...,10
4,http://sportspressnw.com/2144381/2012/whos-the...,for example,Note that the unpublished version has a few mi...,It has more emphases which were kind of my way...,35


In [15]:
list(set(df_depcc_sample.marker))[:10]

['Yep',
 'Superficially',
 'Ever',
 'Humbly',
 'Disturbingly',
 'Offensively',
 'therefore',
 'otherwise,',
 'Particularly',
 'Acutally']