In [1]:
import os
import sys
import re
import random
random.seed(0)

In [2]:
with open(os.path.join(sys.path[0], 'data/mesivta/ChavrutaShulevitz.output.txt')) as f:
    lines = f.read().splitlines()

In [3]:
def clean_text(text):
    #only keep hebrew letters and spaces
    return re.sub(r'[^א-ת |#]', '', text) #Keep the # for splitting the texts

clean_lines = [clean_text(line) for line in lines]

In [4]:
clean_lines[0]

'### כל הזכויות שמורות  לרב יעקב שולביץ   חברותא  ברכות | בלי הערות  פרק ראשון  מאימתי  הקדמה ###'

In [5]:
def create_sections(lines):
    sections = []
    section=[]
    for line in lines:
        #if line starts with ###
        if line.startswith('###') and section!=[]:
            sections.append(section)
            section = []
            section.append(line)
        else:
            if line!='':
                section.append(line)
    sections.append(section)
    return sections


In [6]:
sections = create_sections(clean_lines)


In [7]:
def combine_sentences(sections):
    combined_sections = []
    for section in sections:
        combined_section = ' '.join(section)
        combined_sections.append(combined_section)
    return combined_sections
combined_sentences=combine_sentences(sections)

In [17]:
def split_train_test(sections, percent_train=0.8):
    random.shuffle(sections)
    train = sections[:int(percent_train*len(sections))]
    test = sections[int(percent_train*len(sections)):]
    return {"train":train, "test":test}
dataset=split_train_test(combined_sentences)

In [24]:
def remove_headers_and_white_space(dataset):
    #regex for all letters in between ### and ###
    clean_data=[re.sub(r'###.*?###', '', section) for section in dataset]
    clean_data=[re.sub(r' +', ' ', section) for section in clean_data]
    clean_data = [section.strip() for section in clean_data]
    return clean_data
dataset['train']=remove_headers_and_white_space(dataset['train'])
dataset['test']=remove_headers_and_white_space(dataset['test'])

In [25]:
#write train and test to different files
with open(os.path.join(sys.path[0], 'data/mesivta/train.txt'), 'w') as f:
    for item in dataset['train']:
        f.write("%s\n" % item)
with open(os.path.join(sys.path[0], 'data/mesivta/test.txt'), 'w') as f:
    for item in dataset['test']:
        f.write("%s\n" % item)