In [None]:
from glob import glob
import re
import xml.dom.minidom
from collections import namedtuple
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize
import random

nltk.download('punkt')


In [None]:
# From https://github.com/pan-webis-de/pan-code/blob/master/clef13/text-alignment/pan13-text-alignment-eval.py

# A plagiarism annotation: a passage in "this" (suspicious) document, the
# matching passage in a source document, and whether a source was given.
Annotation = namedtuple(
    'Annotation',
    [
        'this_reference', 'this_offset', 'this_length',
        'source_reference', 'source_offset', 'source_length',
        'is_external',
    ],
)

# Positional indices into an Annotation, e.g. ann[SREF] is the source reference.
TREF, TOFF, TLEN, SREF, SOFF, SLEN, EXT = range(7)

def extract_annotations_from_file(xmlfile, tagname):
    """Collect the plagiarism annotations stored in one XML file.

    The root element must carry a 'reference' attribute; without it an empty
    set is returned. Otherwise every direct child element whose 'name'
    attribute ends with `tagname` is converted with
    `extract_annotation_from_node`, and the truthy results are gathered.

    Args:
        xmlfile: Filename or file object accepted by `xml.dom.minidom.parse`.
        tagname: Suffix the child's 'name' attribute must end with.

    Returns:
        A set of the annotations found (possibly empty).
    """
    root = xml.dom.minidom.parse(xmlfile).documentElement
    found = set()
    if root.hasAttribute('reference'):
        this_ref = root.getAttribute('reference')
        for child in root.childNodes:
            # Text and comment nodes carry no attributes; skip them.
            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if not child.hasAttribute('name'):
                continue
            if not child.getAttribute('name').endswith(tagname):
                continue
            annotation = extract_annotation_from_node(child, this_ref)
            if annotation:
                found.add(annotation)
    return found


def extract_annotation_from_node(xmlnode, t_ref):
    """Build an Annotation from a single XML feature element.

    Args:
        xmlnode: DOM element to read the attributes from.
        t_ref: Reference of the document the annotation belongs to.

    Returns:
        False when 'this_offset' or 'this_length' is missing. Otherwise an
        Annotation; its source fields are filled in (and the external flag set
        to True) only when 'source_reference', 'source_offset' and
        'source_length' are all present, else they are '', 0, 0 and False.
    """
    if not xmlnode.hasAttribute('this_offset') or \
       not xmlnode.hasAttribute('this_length'):
        return False

    this_off = int(xmlnode.getAttribute('this_offset'))
    this_len = int(xmlnode.getAttribute('this_length'))

    source_keys = ('source_reference', 'source_offset', 'source_length')
    if not all(xmlnode.hasAttribute(key) for key in source_keys):
        # No complete source description: not an external annotation.
        return Annotation(t_ref, this_off, this_len, '', 0, 0, False)

    src_ref = xmlnode.getAttribute('source_reference')
    src_off = int(xmlnode.getAttribute('source_offset'))
    src_len = int(xmlnode.getAttribute('source_length'))
    return Annotation(t_ref, this_off, this_len, src_ref, src_off, src_len, True)

In [None]:
def get_substring(file, offset, length):
    """Read a UTF-8 text file and return a slice of its content.

    Args:
        file: Path of the file to read.
        offset: Start index of the slice within the decoded text.
        length: Number of characters to return; 0 means "the whole file",
            in which case `offset` is ignored.

    Returns:
        The requested substring, or the full text when `length` is 0.
    """
    with open(file, encoding='utf-8') as handle:
        content = handle.read()
    if length == 0:
        return content
    return content[offset:offset + length]

def split_in_sentences(text):
    """Split `text` into sentences after collapsing all whitespace runs.

    Newlines, tabs and repeated spaces are first reduced to single spaces,
    then the result is passed to NLTK's `sent_tokenize`.
    """
    collapsed = ' '.join(text.split())
    sentences = sent_tokenize(collapsed)
    return sentences

def get_string_without_paraphrase(file, offset, length):
    """Return the text of a UTF-8 file with one passage cut out.

    Args:
        file: Path of the file to read.
        offset: Start index of the passage to remove.
        length: Number of characters to remove.

    Returns:
        Everything before `offset` joined with everything from
        `offset + length` onwards.
    """
    with open(file, encoding='utf-8') as handle:
        content = handle.read()
    return content[:offset] + content[offset + length:]

In [None]:
# Build one CSV of labelled sentence pairs per obfuscation subset of corpus1.
#
# For each plagiarism annotation:
#   * label 1: every sentence of the annotated source passage paired with
#     every sentence of the annotated suspicious passage;
#   * label 0: up to 5 randomly chosen sentences from the rest of the source
#     document (annotated passage removed) paired with the same suspicious
#     sentences.
#
# NOTE: sampling uses the global `random` state; seed it beforehand if the
# output needs to be reproducible.
corpurs_pre = "pan13-text-alignment-test-"  # (sic) name kept for other cells
corpus_append = "corpus1"
corpus_dir = corpurs_pre + corpus_append + "/"
subdirs = ["02-no-obfuscation", "03-random-obfuscation", "04-translation-obfuscation"]

for subdir in subdirs:
    out_dirs = []  # one dict per output CSV row
    all_xmls = glob(corpus_dir + subdir + "/*.xml")
    annotation_number = 0
    for file in sorted(all_xmls):
        annotations = extract_annotations_from_file(file, "plagiarism")
        # The annotations come back as a set; sort them so that numbering and
        # row order do not depend on hash ordering.
        for ann in sorted(annotations):
            source_path = corpus_dir + "src/" + ann[SREF]
            sus_path = corpus_dir + "susp/" + ann[TREF]

            source_text = get_substring(source_path, ann[SOFF], ann[SLEN])
            sus_text = get_substring(sus_path, ann[TOFF], ann[TLEN])
            source_text_no_para = get_string_without_paraphrase(source_path, ann[SOFF], ann[SLEN])

            sentences_source = split_in_sentences(source_text)
            sentences_sus = split_in_sentences(sus_text)
            sentences_source_no_para = split_in_sentences(source_text_no_para)

            # random.sample raises ValueError when asked for more items than
            # exist; with fewer than 5 sentences keep them all, unsampled.
            if len(sentences_source_no_para) >= 5:
                sentences_source_no_para = random.sample(sentences_source_no_para, 5)

            # Positive pairs first, then negative pairs.
            for label, source_sentences in ((1, sentences_source),
                                            (0, sentences_source_no_para)):
                for src in source_sentences:
                    for sus in sentences_sus:
                        out_dirs.append({
                            "annotation_number": annotation_number,
                            "sentence1": src,
                            "sentence2": sus,
                            "label": label,
                            "source": ann[SREF],
                            "suspicion": ann[TREF]
                        })
            annotation_number += 1
    pd.DataFrame(out_dirs).to_csv(corpus_append + "-" + subdir + ".csv")

In [None]:
# Build a CSV of negative (label 0) sentence pairs from the no-plagiarism
# subset of corpus2.
#
# Each XML file name identifies one source/suspicious document pair. Up to 10
# randomly chosen source sentences are paired with up to 5 randomly chosen
# suspicious sentences.
#
# NOTE: sampling uses the global `random` state; seed it beforehand if the
# output needs to be reproducible.
corpurs_pre = "pan13-text-alignment-test-"  # (sic) name kept for other cells
corpus_append = "corpus2"
corpus_dir = corpurs_pre + corpus_append + "/"
subdirs = ["01-no-plagiarism"]

for subdir in subdirs:
    out_dirs = []  # one dict per output CSV row
    all_xmls = glob(corpus_dir + subdir + "/*.xml")
    annotation_number = 0
    for file in sorted(all_xmls):
        # Document names are embedded in the XML file name; raw strings keep
        # `\d` from being treated as a string escape sequence.
        source = re.search(r'source-document\d*', file).group()
        suspicion = re.search(r'suspicious-document\d*', file).group()

        # length == 0 makes get_substring return the whole document.
        source_text = get_substring(corpus_dir + "src/" + source + '.txt', 0, 0)
        sus_text = get_substring(corpus_dir + "susp/" + suspicion + '.txt', 0, 0)
        sentences_source = split_in_sentences(source_text)
        sentences_sus = split_in_sentences(sus_text)

        # random.sample raises ValueError when asked for more items than
        # exist; shorter documents keep all their sentences, unsampled.
        if len(sentences_source) >= 10:
            sentences_source = random.sample(sentences_source, 10)
        if len(sentences_sus) >= 5:
            sentences_sus = random.sample(sentences_sus, 5)

        for src in sentences_source:
            for sus in sentences_sus:
                out_dirs.append({
                    "annotation_number": annotation_number,
                    "sentence1": src,
                    "sentence2": sus,
                    "label": 0,
                    "source": source + '.txt',
                    "suspicion": suspicion + '.txt'
                })
        annotation_number += 1
    pd.DataFrame(out_dirs).to_csv(corpus_append + "-" + subdir + ".csv")