# define input requirements
- format: UTF-8
- line termination: CRLF

In [1]:
import sys
sys.path.append('../')

In [2]:
import deer.preprocessing as prep

In [1]:
input_files = [
    '../tests/data/check_format/ok/test1.txt',                # utf-8      with CRLF
    '../tests/data/check_format/error/ISO-8859_CRLF_NEL.txt', # iso-8859-1 with CRLF
    '../tests/data/check_format/error/UTF-8_LF.txt',          # utf-8      with LF
    '../tests/data/check_format/error/UTF-8_LF_NEL.txt'       # utf-8      with LF, NEL
]

In [3]:
for input_file in input_files:
    print('candidate: {} '.format(input_file), end = '')
    try:
        prep.check_dataset(input_file)
    except AssertionError as e:
        print('\t'+str(e))
    else:
        print('\tOK')


candidate: ../tests/data/check_format/ok/test1.txt 	OK
candidate: ../tests/data/check_format/error/ISO-8859_CRLF_NEL.txt 	not a valid format
candidate: ../tests/data/check_format/error/UTF-8_LF.txt 	<b'i\n'>	not a valid terminator
candidate: ../tests/data/check_format/error/UTF-8_LF_NEL.txt 	<b':\n'>	not a valid terminator


In [4]:
# do a proper file version
# input_file = '../data/DATASETS/DANTE_DIVINA_COMMEDIA/data.txt'
# output_file = '../data/DATASETS/DANTE_DIVINA_COMMEDIA/data.txt2'

def convert_endline(output_file: str, input_file: str, newline='\r\n'):
    with open(input_file, newline=None) as input_stream:
        with open(output_file, 'w', newline='\r\n') as output_stream:
            for line in input_stream:
                output_stream.write(line)

In [5]:
# testme = '../data/DATASETS/SDH/data.txt'
# testme = '../data/DATASETS/DANTE_DIVINA_COMMEDIA/data.txt'
# testme = '../data/DATASETS/DANTE_DIVINA_COMMEDIA/UTF-8_NEL.txt'
# testme = '../data/DATASETS/SDH/UTF-8_LF.txt'
testme = '../tests/data/check_format/error/UTF-8_LF_NEL.txt'
from collections import Counter

# universal line mode is enabled, but newline is returned untranslated
with open(testme, newline='') as input_stream:
    data = input_stream.read()
res = Counter(data)

In [6]:
term_options = ['\n', '\r', '\r\n', '\x85']

for term in term_options:
    print('<{}>: {}'.format(term.encode('utf-8'), res.get(term)))

<b'\n'>: 11
<b'\r'>: None
<b'\r\n'>: None
<b'\xc2\x85'>: 1


In [7]:
for ii, cc in enumerate(data):
    if cc == '\x85':
        print('{}: {}'.format(ii, cc.encode('utf-8')))

305: b'\xc2\x85'


In [8]:
from pprint import PrettyPrinter
pp = PrettyPrinter()
pp.pprint(res)

Counter({' ': 77,
         'i': 42,
         'o': 38,
         'e': 31,
         'a': 30,
         'n': 29,
         'd': 22,
         'l': 18,
         'u': 15,
         't': 15,
         'c': 15,
         's': 14,
         'r': 12,
         '\n': 11,
         'm': 9,
         'p': 8,
         'v': 8,
         'z': 6,
         "'": 6,
         'g': 5,
         ',': 5,
         ':': 4,
         'f': 4,
         '«': 3,
         'h': 3,
         '»': 3,
         'ú': 2,
         '.': 2,
         'ò': 2,
         'S': 1,
         ';': 1,
         'O': 1,
         'M': 1,
         'q': 1,
         '!': 1,
         '\x85': 1,
         'E': 1,
         'P': 1,
         'é': 1,
         '?': 1,
         'N': 1,
         'í': 1,
         'à': 1})


In [11]:
data[295:315].encode('utf-8')

b"'entrare!\xc2\xbb\xc2\x85E 'l duca"