In [1]:
import pickle
import functions
import pandas as pd
import numpy as np
from nltk.util import ngrams
from collections import Counter
import string
import warnings
import spacy
import re
import random
from nltk.tokenize import word_tokenize
import nltk

In [68]:
def introduce_token_level_errors_on_sentence(tokens, replace_prob, insert_prob, delete_prob, swap_prob, recase_prob, err_prob, std_dev,
                                             word_vocabulary):
    #tokens_error_calc = tokens.split(' ')
    num_errors = int(np.round(np.random.normal(err_prob, std_dev) * len(tokens)))
    num_errors = min(max(1, num_errors), len(tokens))  # num_errors \in [0; len(tokens)]
    #tokens = split_string_with_punctuation(tokens)
    if num_errors == 0:
        return ' '.join(tokens)
    token_ids_to_modify = np.random.choice(len(tokens), num_errors, replace=False)
    #print(token_ids_to_modify)
    new_sentence = ''
    for token_id in range(len(tokens)):
        if token_id not in token_ids_to_modify:
            if new_sentence:
                new_sentence += ' '
            new_sentence += tokens[token_id]
            continue

        current_token = tokens[token_id]
        #print(current_token)
        operation = np.random.choice(['replace', 'insert', 'delete', 'swap', 'recase'], p=[replace_prob, insert_prob, delete_prob, swap_prob, recase_prob])
        new_token = ''
        if operation == 'replace':
            if not current_token.isalpha():
                extracted_special_characters = extract_punctuation_at_boundary(current_token)
                if extracted_special_characters:
                    proposals = suggest_correction(extracted_special_characters[1])
                
                    if len(proposals) > 0:
                        new_token = extracted_special_characters[0] + np.random.choice(proposals) + extracted_special_characters[2]
                else:
                    new_token = current_token
            else:
                #add my suggestions
                #proposals = aspell_speller.suggest(current_token)[:10]
                #print(current_token)
                proposals = suggest_correction(current_token)
                
                if len(proposals) > 0:
                    new_token = np.random.choice(proposals)  # [np.random.randint(0, len(proposals))]
                #add a different alteration
                else:
                    new_token = current_token
        elif operation == 'insert':
            #print('insert')
            new_token = current_token + ' ' + np.random.choice(word_vocabulary)
        elif operation == 'delete':
            #print('delete')
            if not current_token.isalpha() or current_token in allowed_source_delete_tokens:
                new_token = current_token
            else:
                new_token = ''
        elif operation == 'recase':
            #print('recase')
            if not current_token.isalpha():
                new_token = current_token
            elif current_token.islower():
                new_token = current_token[0].upper() + current_token[1:]
            else:
                # either whole word is upper-case or mixed-case
                if np.random.random() < 1:
                    new_token = current_token.lower()
                else:
                    num_recase = min(len(current_token), max(1, int(np.round(np.random.normal(0.3, 0.4) * len(current_token)))))
                    char_ids_to_recase = np.random.choice(len(current_token), num_recase, replace=False)
                    new_token = ''
                    for char_i, char in enumerate(current_token):
                        if char_i in char_ids_to_recase:
                            if char.isupper():
                                new_token += char.lower()
                            else:
                                new_token += char.upper()
                        else:
                            new_token += char

        elif operation == 'swap':
            #print('swap')
            if token_id == len(tokens) - 1:
                continue

            new_token = tokens[token_id + 1]
            tokens[token_id + 1] = tokens[token_id]

        if new_sentence and new_token:
            new_sentence += ' '
        new_sentence = new_sentence + new_token

    return new_sentence

def get_token_vocabulary(tsv_token_file):
    tokens = []
    with open(tsv_token_file) as reader:
        for line in reader:
            line = line.strip('\n')
            token, freq = line.split('\t')

            if token.isalpha():
                tokens.append(token)

    return tokens



def introduce_char_level_errors_on_sentence(sentence, replace_prob, insert_prob, delete_prob, swap_prob, change_diacritics_prob, err_prob,
                                            std_dev, char_vocabulary):
    sentence = list(sentence)
    num_errors = int(np.round(np.random.normal(err_prob, std_dev) * len(sentence)))
    num_errors = min(max(0, num_errors), len(sentence))  # num_errors \in [0; len(sentence)]
    if num_errors == 0:
        return ''.join(sentence)

    char_ids_to_modify = np.random.choice(len(sentence), num_errors, replace=False)

    new_sentence = ''
    for char_id in range(len(sentence)):
        if char_id not in char_ids_to_modify:
            new_sentence += sentence[char_id]
            continue

        operation = np.random.choice(['replace', 'insert', 'delete', 'swap', 'change_diacritics'], 1,
                                     p=[replace_prob, insert_prob, delete_prob, swap_prob, change_diacritics_prob])

        current_char = sentence[char_id]
        new_char = ''
        if operation == 'replace':
            if current_char.isalpha():
                new_char = np.random.choice(char_vocabulary)
            else:
                new_char = current_char
        elif operation == 'insert':
            new_char = current_char + ' ' + np.random.choice(char_vocabulary)
        elif operation == 'delete':
            if current_char.isalpha():
                new_char = ''
            else:
                new_char = current_char
        elif operation == 'swap':
            if char_id == len(sentence) - 1:
                continue

            new_char = sentence[char_id + 1]
            sentence[char_id + 1] = sentence[char_id]
        elif operation == 'change_diacritics':
            if current_char in czech_diacritizables_chars:
                is_lower = current_char.islower()
                current_char = current_char.lower()
                char_diacr_group = [group for group in czech_diacritics_tuples if current_char in group][0]
                new_char = np.random.choice(char_diacr_group)

                if not is_lower:
                    new_char = new_char.upper()

        new_sentence += new_char

    return new_sentence

def suggest_correction(word):
        options = []
        if word in suggestions_dict:
            options.append(suggestions_dict[word])
            return options[0]
        return options

def remove_commas(text):
    return text.replace(',', '')



In [69]:
def introduce_token_level_errors_on_sentence_bigram(tokens, replace_bi_prob, replace_prob, insert_prob, delete_prob, swap_prob, recase_prob, err_prob, std_dev,
                                             word_vocabulary):
    #tokens_error_calc = tokens.split(' ')
    num_errors = int(np.round(np.random.normal(err_prob, std_dev) * len(tokens)))
    num_errors = min(max(1, num_errors), len(tokens))  # num_errors \in [0; len(tokens)]
    #tokens = split_string_with_punctuation(tokens)
    if num_errors == 0:
        return ' '.join(tokens)
        
    token_ids_to_modify = np.random.choice(len(tokens), num_errors, replace=False)
    #print(token_ids_to_modify)
    new_sentence = ''
    skip_next=False
    for token_id in range(len(tokens)):
        #happens with bigram replacement
        if skip_next == True:
            skip_next = False
            continue
        if token_id not in token_ids_to_modify:
            if new_sentence:
                new_sentence += ' '
            new_sentence += tokens[token_id]
            continue

        current_token = tokens[token_id]
        #print(current_token)
        operation = np.random.choice(['replace_bigram','replace', 'insert', 'delete', 'swap', 'recase'], p=[replace_bi_prob, replace_prob, insert_prob, delete_prob, swap_prob, recase_prob])
        new_token = ''
        if operation =='replace_bigram':
            #add my suggestions
            #proposals = aspell_speller.suggest(current_token)[:10]
            #print(current_token)
            if token_id+1 < len(tokens):
                next_token = tokens[token_id+1]
                if current_token.isalpha() & next_token.isalpha():
                    proposals = suggest_correction(current_token + ' ' + next_token)
                
                    if len(proposals) > 0:
                        new_token = np.random.choice(proposals)
                        #print('old', current_token + ' ' + next_token )
                        #print('new', new_token)
                        skip_next = True
                #add a different alteration
                else:
                    new_token = current_token
            else:
                operation = 'replace'
        elif operation == 'replace':
            if not current_token.isalpha():
                extracted_special_characters = extract_punctuation_at_boundary(current_token)
                if extracted_special_characters:
                    proposals = suggest_correction(extracted_special_characters[1])
                
                    if len(proposals) > 0:
                        new_token = extracted_special_characters[0] + np.random.choice(proposals) + extracted_special_characters[2]
                else:
                    new_token = current_token
            else:
                #add my suggestions
                #proposals = aspell_speller.suggest(current_token)[:10]
                #print(current_token)
                proposals = suggest_correction(current_token)
                
                if len(proposals) > 0:
                    new_token = np.random.choice(proposals)  # [np.random.randint(0, len(proposals))]
                #add a different alteration
                else:
                    new_token = current_token
        elif operation == 'insert':
            new_token = current_token + ' ' + np.random.choice(word_vocabulary)
        elif operation == 'delete':
            if not current_token.isalpha() or current_token in allowed_source_delete_tokens:
                new_token = current_token
            else:
                new_token = ''
        elif operation == 'recase':
            if not current_token.isalpha():
                new_token = current_token
            elif current_token.islower():
                new_token = current_token[0].upper() + current_token[1:]
            else:
                # either whole word is upper-case or mixed-case
                if np.random.random() < 1:
                    new_token = current_token.lower()
                else:
                    num_recase = min(len(current_token), max(1, int(np.round(np.random.normal(0.3, 0.4) * len(current_token)))))
                    char_ids_to_recase = np.random.choice(len(current_token), num_recase, replace=False)
                    new_token = ''
                    for char_i, char in enumerate(current_token):
                        if char_i in char_ids_to_recase:
                            if char.isupper():
                                new_token += char.lower()
                            else:
                                new_token += char.upper()
                        else:
                            new_token += char

        elif operation == 'swap':
            if token_id == len(tokens) - 1:
                continue

            new_token = tokens[token_id + 1]
            tokens[token_id + 1] = tokens[token_id]

        if new_sentence and new_token:
            new_sentence += ' '
        new_sentence = new_sentence + new_token

    return new_sentence

def get_token_vocabulary(tsv_token_file):
    tokens = []
    with open(tsv_token_file) as reader:
        for line in reader:
            line = line.strip('\n')
            token, freq = line.split('\t')

            if token.isalpha():
                tokens.append(token)

    return tokens

In [70]:
def remove_period_and_cap(sentences, remove_period_prob, remove_caps_prob):

    # Find the index of the first period.
    first_period_index = sentences.find('.')
    if first_period_index != -1 and random.random() < remove_period_prob and len(sentences) -1 != first_period_index:
        substring_after_period = sentences[first_period_index + 1:]
        if random.random() < remove_caps_prob:
            #print('no period no caps')
            # Lowercase the first letter of the substring and combine it with the rest.
            #print(substring_after_period)
            if len(substring_after_period) > 1:
                cleaned_string = sentences[:first_period_index] + substring_after_period[0] + substring_after_period[1].lower() + substring_after_period[2:]
            else:
                cleaned_string = sentences[:first_period_index] + substring_after_period[0]

        else:
            # Remove period but keep capitalization
            #print('yes caps')
            cleaned_string = sentences[:first_period_index] + substring_after_period


    else:
        #print('no changes')
        cleaned_string = sentences
    
    return cleaned_string

def remove_qm_and_cap(sentences, remove_period_prob, remove_caps_prob):

    # Find the index of the first period.
    first_qm_index = sentences.find('?')
    if first_qm_index != -1 and random.random() < remove_period_prob and len(sentences) -1 != first_qm_index:
        substring_after_period = sentences[first_qm_index + 1:]
        if random.random() < remove_caps_prob:
            #print('no period no caps')
            # Lowercase the first letter of the substring and combine it with the rest.
            if len(substring_after_period) > 1:
                cleaned_string = sentences[:first_qm_index] + substring_after_period[0] + substring_after_period[1].lower() + substring_after_period[2:]
            else:
                cleaned_string = sentences[:first_qm_index] + substring_after_period[0]
        else:
            # Remove period but keep capitalization
            #print('yes caps')
            cleaned_string = sentences[:first_qm_index] + substring_after_period


    else:
        #print('no changes')
        cleaned_string = sentences
    
    return cleaned_string

def remove_ep_and_cap(sentences, remove_period_prob, remove_caps_prob):

    # Find the index of the first period.
    first_ep_index = sentences.find('!')
    if first_ep_index != -1 and random.random() < remove_period_prob and len(sentences) -1 != first_ep_index:
        substring_after_period = sentences[first_ep_index + 1:]
        if random.random() < remove_caps_prob:
            #print('no period no caps')
            # Lowercase the first letter of the substring and combine it with the rest.
            if len(substring_after_period) > 1:
                cleaned_string = sentences[:first_ep_index] + substring_after_period[0] + substring_after_period[1].lower()+ substring_after_period[2:]
            else:
                cleaned_string = sentences[:first_ep_index] + substring_after_period[0]
        else:
            # Remove period but keep capitalization
            #print('yes caps')
            cleaned_string = sentences[:first_ep_index] + substring_after_period


    else:
        #print('no changes')
        cleaned_string = sentences
    
    return cleaned_string

def remove_comma(sentences, remove_comma_prob):
    # Find the index of the last comma.
    last_comma_index = sentences.rfind(',')
    #if comma is in the text and it is not the last character
    if last_comma_index != -1 and len(sentences) -1 != last_comma_index:
        text_parts = sentences.split(',')
        cleaned_string = []
        for part in text_parts:
            if random.random() < remove_comma_prob:
                cleaned_string.append(part)
            else:
                cleaned_string.append(part+',')
        cleaned_string = "".join(cleaned_string)
    else:
        #print('no changes')
        cleaned_string = sentences
    
    return cleaned_string

def remove_interpunction(sentences, remove_prob, remove_caps_prob):
    #print(1)
    sentences = remove_period_and_cap(sentences, remove_prob, remove_caps_prob)
    #print(2)
    sentences = remove_qm_and_cap(sentences, remove_prob, remove_caps_prob)
    #print(3)
    sentences = remove_ep_and_cap(sentences, remove_prob, remove_caps_prob)
    #print(4)
    sentences = remove_comma(sentences, remove_prob)
    return sentences


In [71]:
def delete_span(input_string, min_span, max_span):
    words = input_string.split()
    i=0
    while True:
        i+= 1
        span_length = random.randint(min_span, max_span)
        max_span_start = len(words) - span_length
        if i > 25:
            print('deleting spans attempt ', i, 'skipping deletion')
            return input_string
        if max_span_start < 0:
            continue
        span_start = random.randint(0, max_span_start)
        span_end = span_start + span_length
    
        if span_end <= len(words):
            break
    span = words[span_start:span_end]
    before = words[0:span_start]
    after = words[span_end:len(words)]
    new_words = []
    new_words.extend(before)
    new_words.extend(after)
    # Reconstruct the string
    result_string = " ".join(new_words)

    return result_string

input_string = "een twee drie vier vijf zes zeven acht negen tien"

result = delete_span(input_string, 1, 4)
print(result)

       

een twee drie vier vijf zes zeven acht


In [72]:
#difference between Preprocessed text and my corrections in benchmark data (target - preprocessed)
# Positive numbers indicate a larger presence of these types of words in the corrected data compared to the preprocessed: 
    #[('punct', 124), ('det', 40), ('ROOT', 40), ('nsubj', 37), ('conj', 19), ('case', 17), ('cc', 14), ('compound:prt', 12), ('cop', 7), ('nsubj:pass', 5), ('acl:relcl', 5), 
    # ('aux:pass', 4), ('obj', 4), ('nummod', 2), ('expl', 2), ('obl', 1), ('nmod:poss', 1), ('fixed', 0), ('expl:pv', 0), ('nmod', 0), ('iobj', -1), ('obl:agent', -1), 
    # ('csubj', -2), ('parataxis', -3), ('mark', -3), ('aux', -3), ('ccomp', -4), ('amod', -6), ('acl', -7), ('flat', -10), ('advcl', -18), ('xcomp', -18), ('appos', -22), 
    # ('advmod', -27), ('dep', -39)]
    #punct is taken care of.
    #aux is an auxiliary verb. Which combined with the nsubj is often skipped
    #nsubj is the nominal subject who performs the action of a verb
    #ROOT main verb of the sentence. 
    #cc coordinating conjunction (en, of, maar)
    #pass passive sentence
    #determiner (de, het een) good to remove because it is often skipped
    #cop Copula links subject to subject complement (is, ben, zijn) in 'ik ben blij'
    #case show connection between noun and preposition (op, naast, onder, boven)

def delete_NER(text, nlp, ner_delete_rate, types=['aux', 'nsubj', 'ROOT', 'det', 'cc', 'nsubj:pass', 'aux:pass', 'cop', 'case']):
    # Process the text with SpaCy
    doc = nlp(text)
    tokens_to_remove = []

    for token in doc:

        if token.dep_ in types:
            #print(token, token.dep_)
            if np.random.normal(ner_delete_rate, 0) > random.random():
                tokens_to_remove.append(token)

    # Create a new sentence by joining the tokens that are not in the removal list
    new_sentence = "".join([' ' + token.text if token.dep_ != 'punct' else token.text for token in doc if token not in tokens_to_remove])

    #print("Original Sentence:", text)
    #print("Modified Sentence:", new_sentence)
    return new_sentence

nlp = spacy.load("nl_core_news_sm")
input_string = 'Meneer is vanochtend vroeg gevallen. Wanneer komt de arts?'
delete_NER(input_string, nlp, 0.3)


' is vanochtend vroeg gevallen. Wanneer komt de?'

In [7]:
def swap_spans(input_string, min_span, max_span):
    # Split the input string into words
    words = input_string.split()
    i=0
    while True:
        i+= 1
        # Randomly select span1_length and span2_length
        span1_length = random.randint(min_span, max_span)
        span2_length = random.randint(min_span, max_span)

        # Calculate the maximum valid starting position for span1
        max_span1_start = len(words) - max(span1_length, span2_length)
        if i > 25:
            print('swapping spans attempt ', i, 'skipping sentence')
            return input_string
        if max_span1_start < 0:
            continue
        # Randomly select span1_start within the valid range
        span1_start = random.randint(0, max_span1_start)

        # Calculate the end positions of the spans
        span1_end = span1_start + span1_length
        span2_start = span1_end
        span2_end = span2_start + span2_length
        
        if span2_end <= len(words):
            break


    # Swap the two spans, accounting for different lengths
    span1 = words[span1_start:span1_end]
    span2 = words[span2_start:span2_end]
    start = words[0:span1_start]
    end = words[span2_end:len(words)]
    #print(start)
    #print(end)
    new_words = []
    new_words.extend(start)
    new_words.extend(span2)
    new_words.extend(span1)
    new_words.extend(end)
    # Reconstruct the string
    result_string = " ".join(new_words)

    return result_string

# Example usage
input_string = "een twee drie vier vijf zes zeven acht negen tien"

result = swap_spans(input_string, 1, 4)
print(result)


vier vijf zes een twee drie zeven acht negen tien


In [73]:
def extract_punctuation_at_boundary(word):
    # Define the regular expression pattern to capture punctuation marks at the beginning or end of the string
    pattern = r"^([.,;()]*)(.*?)([.,;()]*)$"
    
    # Use re.search() to find any occurrence of the pattern at the beginning or end of the string
    match = re.search(pattern, word)
    
    if match:
        # Get the attached characters at the beginning and end
        attached_beginning = match.group(1)
        attached_end = match.group(3)

        # Get the remaining string after removing the attached characters
        remaining_string = match.group(2)
        
        if attached_beginning or attached_end:
            if remaining_string.isalpha():
                return attached_beginning or '', remaining_string, attached_end or ''
        return False
    else:
        return False

# Example usage:
word1 = "hello"
word2 = "world."
word3 = "(example"

result1 = extract_punctuation_at_boundary(word1)
result2 = extract_punctuation_at_boundary(word2)
result3 = extract_punctuation_at_boundary(word3)

#print(result1)  # Output: (None, 'hello', None)
#print(result2)  # Output: ('', 'world', '.')
#print(result3)  # Output: ('(', 'example', None)


In [74]:
def skip_sentence(input_df, skip_sentence_prob):
    # Create an empty DataFrame to store selected rows
    selected_df = pd.DataFrame(columns=input_df.columns)
    # Set the probability of selecting each row (2%)

    # Iterate through the rows of the input DataFrame
    for index, row in input_df.iterrows():
        if random.random() < skip_sentence_prob:
            # Copy the row to the selected DataFrame
            #takes too long but append is getting deprecated soon..
            #selected_df = pd.concat([selected_df, pd.Series(row)], ignore_index=False)
            with warnings.catch_warnings():
                warnings.simplefilter(action='ignore', category=FutureWarning)
                
                selected_df = selected_df.append(row, ignore_index=False)

    # Create a DataFrame of non-selected rows
    non_selected_df = input_df.drop(selected_df.index)

    return selected_df, non_selected_df

# Example usage:

#selected_sentences, non_selected_sentences = skip_sentence(keyword_sentences_df, 0.5)

#print("Selected Sentences:")
#print(len(selected_sentences))

#print("\nNon-Selected Sentences:")
#print(len(non_selected_sentences))


In [75]:
def prepare_2_sentence_df(df, text_column, remove_periods, remove_caps, skip_sentence_prob):
#keyword_text_df = pd.read_csv('generated_keyword3_v2.csv')
    sentences_df = functions.make_sentence_df(df, text_column)
    unexploded_df = sentences_df.groupby(text_column)['sentences'].apply(list).reset_index()

    pairs = []
    for sentences in unexploded_df['sentences']:
        for i in range(0, len(sentences), 2):
            #print(i)
            try:
                pair = " ".join(sentences[i:i + 2])
                pairs.append(pair)
            except:
                break

    # Create a new DataFrame with pairs of sentences
    pairs_df = pd.DataFrame(pairs, columns=['two sentences'])
    pairs_df.rename(columns={'two sentences': 'target'}, inplace=True)
    skipped_df, pairs_df = skip_sentence(pairs_df, skip_sentence_prob)
    skipped_df['source'] = skipped_df['target']
    pairs_df['source'] = pairs_df['target'].apply(lambda x: remove_interpunction(x, remove_periods,remove_caps))

    return skipped_df[['source', 'target']], pairs_df[['source', 'target']]

def read_df_1_sent(filename, text_column):
    text_df = pd.read_csv(filename)#'generated_keyword3_v1.csv')
    sentences_df = functions.make_sentence_df(text_df, text_column)#'Generated')
    sentences_df['source'] = sentences_df['sentences'].copy()
    sentences_df = sentences_df[['source', 'sentences']].reset_index().drop(columns= 'index')
    sentences_df.rename(columns={'sentences': 'target'}, inplace=True)
    return sentences_df

In [76]:
def spans(sentence, span_prob, delete_span_prob, min_delete_span, max_delete_span,min_swap_span, max_swap_span):
    if random.random() < span_prob:
        if random.random() < delete_span_prob:
            return delete_span(sentence, min_delete_span, max_delete_span)
        else:
            return swap_spans(sentence, min_swap_span, max_swap_span)
    else:
        return sentence

In [77]:
allowed_source_delete_tokens = [',', '.', '!', '?']

allowed_chars = ''#', .'
dutch_diacritics = 'éèêëïöüçàûîñäô'
dutch_diacritics_upper = dutch_diacritics.upper()

allowed_chars += string.ascii_lowercase + string.ascii_uppercase + dutch_diacritics + dutch_diacritics_upper
allowed_characters = list(allowed_chars)
print(allowed_characters)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'é', 'è', 'ê', 'ë', 'ï', 'ö', 'ü', 'ç', 'à', 'û', 'î', 'ñ', 'ä', 'ô', 'É', 'È', 'Ê', 'Ë', 'Ï', 'Ö', 'Ü', 'Ç', 'À', 'Û', 'Î', 'Ñ', 'Ä', 'Ô']


In [78]:
sentence = 'Vandaag hebben we bij mevrouw een urineonderzoek gedaan vanwege vermoeden van een blaasontsteking. De resultaten van het bloedonderzoek zijn nog niet binnen, maar we houden haar klachten goed in de gaten. Mevrouw heeft wat last van pijn bij het plassen en moet vaker naar het toilet. We zorgen ervoor dat ze voldoende drinkt en nemen maatregelen om haar comfort te verbeteren. We zullen de uitslag van het bloedonderzoek afwachten voordat we verdere stappen ondernemen.'.split(' ')
with open('spelling_corrected_corpus_nos_suggestions.pickle', 'rb') as pickle_file:
    suggestions_dict = pickle.load(pickle_file) 
all_keys = list(suggestions_dict.keys())
word_vocabulary = all_keys

example = introduce_token_level_errors_on_sentence(list(sentence), replace_prob=0.7, insert_prob=0.05, delete_prob=0.1, swap_prob=0.1, recase_prob=0.05, err_prob=0.15, std_dev=0.0, word_vocabulary=all_keys)
example

'Vandaag hyebben we bij mevrouw een urineonderzoek gedaan vanwege vermoeden van een blaasontgsteking. De resultaten van het bloedonderzoek zijn nog niet binnen, maar we houden haar klachten goed in de gaten. Mevroruw heehft wat last van pijn bij heth plassen en moet vaker naar het toilet. We zorgen ervoor dat voldoende ze drinkt en enmen maatregelen om haar comfort te verbeteren. SWe zullen de uitslag van het bloedonderzoek afwahten voordat vwe verdere stappen ondernemen.'

In [79]:
def introduce_errors_to_df_v1(df, replace_prob, insert_prob, delete_prob, swap_prob, recase_prob, err_prob, std_dev, skip_sentence_prob,
                                             word_vocabulary, replace_prob_char, insert_prob_char, delete_prob_char, swap_prob_char, change_diacritics_prob, 
                                             err_prob_char, std_dev_char, char_vocabulary):
    # Apply the function to the 'text' column
    not_selected_df, df = skip_sentence(df, skip_sentence_prob)
    #return not_selected_df, df
    df['source'] = df['source'].apply(lambda sentence: introduce_token_level_errors_on_sentence(list(sentence.split()), replace_prob=replace_prob, insert_prob=insert_prob, 
                                                                                                    delete_prob=delete_prob, swap_prob=swap_prob, recase_prob=recase_prob, 
                                                                                                    err_prob=err_prob, std_dev=std_dev, 
                                                                                                    word_vocabulary=word_vocabulary))

    df['source'] = df['source'].apply(lambda sentence: introduce_char_level_errors_on_sentence(sentence=sentence, replace_prob=replace_prob_char, insert_prob=insert_prob_char, 
                                                                                                delete_prob=delete_prob_char, swap_prob=swap_prob_char, 
                                                                                                change_diacritics_prob=change_diacritics_prob, err_prob=err_prob_char,
                                                                                                std_dev=std_dev_char, char_vocabulary=char_vocabulary))
    df = pd.concat([df, not_selected_df]).sort_index()
    return df

In [36]:
df1 = read_df_1_sent('generated_keyword3_v1.csv', 'Generated').drop_duplicates(subset='target')
df2 = read_df_1_sent('generated_keyword3_v2.csv', 'Generated').drop_duplicates(subset='target')
df3 = read_df_1_sent('generated_keyword3_v3.csv', 'Generated').drop_duplicates(subset='target')

df4 = pd.concat([df1, df2], ignore_index=True)
df5 = pd.concat([df3, df4], ignore_index=True)
df5.drop_duplicates(subset='target')

shuffled_df = df5.sample(frac=1, random_state=42)
print('This needs to be 100k or higher:', len(shuffled_df))
shuffled_df
#df_75k = shuffled_df.iloc[:75000].reset_index(drop=True)
#df_25k = shuffled_df.tail(25000).reset_index(drop=True)


#df_25k.to_csv('generated_keywords_examples_25k_sample.csv', index=False)
#df_75k.to_csv('generated_keywords_examples_75k_sample.csv', index=False)
#shuffled_df.to_csv('generated_keywords_examples_100k_sample.csv', index=False)

#I want 100k sentences. For the two sentence option I will split 25k sentences. That way they see the same data.

This needs to be 100k or higher: 100548


Unnamed: 0,source,target
25696,Ze is dankbaar voor de zorg en de aandacht die...,Ze is dankbaar voor de zorg en de aandacht die...
24095,Mevrouw heeft ook aangegeven dat ze meer haarv...,Mevrouw heeft ook aangegeven dat ze meer haarv...
10742,We zullen de pijn goed in de gaten houden en i...,We zullen de pijn goed in de gaten houden en i...
41414,We zullen blijven zoeken naar manieren om mevr...,We zullen blijven zoeken naar manieren om mevr...
73957,We zullen haar in de gaten blijven houden en i...,We zullen haar in de gaten blijven houden en i...
...,...,...
6265,Hij gebruikt medicatie voor zijn hartklachten ...,Hij gebruikt medicatie voor zijn hartklachten ...
54886,We hebben haar geadviseerd om even rustig aan ...,We hebben haar geadviseerd om even rustig aan ...
76820,Mevrouw heeft het advies gekregen om extra voc...,Mevrouw heeft het advies gekregen om extra voc...
860,Ik heb een observatierapport opgesteld waarin ...,Ik heb een observatierapport opgesteld waarin ...


In [99]:
df1 = pd.read_csv('generated_keyword3_v1.csv')
df2 = pd.read_csv('generated_keyword3_v2.csv')
df3 = pd.read_csv('generated_keyword3_v3.csv')

combined_dfs = pd.concat([df1, df2], ignore_index=True)
combined_dfs = pd.concat([combined_dfs, df3], ignore_index=True).sample(frac=1, random_state=42)

combined_dfs.tail(int(22837*0.25)+1).to_csv('generated_keywords_examples_25%_100k.csv', index=False)
combined_dfs.head(int(22837*0.75)).to_csv('generated_keywords_examples_75%_100k.csv', index=False)
#x,two_sentences_df = prepare_2_sentence_df(combined_dfs.tail(int(22837*0.25)+1), 'Generated', 0, 0, 0)
#combined_dfs.head(int(22837*0.75))
#one_sentence_df = 


In [40]:
#df4 = pd.concat([df1, df2], ignore_index=True)
#df5 = pd.concat([df3, df4], ignore_index=True)
#df5.drop_duplicates(subset='target')

df_80k = pd.concat([df1, df3], ignore_index=True).drop_duplicates(subset='target')
print(len(df_80k))
#df_80k
df6 = pd.read_csv('generated_keyword3_v2.csv')
empty, df_10k = prepare_2_sentence_df(df6, 'Generated', 0, 0, 0)
print(len(df_10k))

91966
2666


In [114]:
#100k sentences
df_100k_sentences = pd.read_csv('generated_keywords_examples_100k_sample.csv')
#keyword_sentences_df = read_df_1_sent('generated_keyword3_v1.csv', 'Generated')

with open('spelling_corrected_corpus_nos_suggestions.pickle', 'rb') as pickle_file:
    suggestions_dict = pickle.load(pickle_file) 
all_keys = list(suggestions_dict.keys())
word_vocabulary = all_keys

#replace, insert, delete, swap and recase probs need to be 1 together for both token and char.
#error probability per word along with a standard deviation, skip sentence to leave a sentence unaltered to learn text can be correct.
#word vocabulary is the reverse error pattern dataset.
errors_df_v1= introduce_errors_to_df_v1(df_100k_sentences,  replace_prob=0.7, insert_prob=0.05, delete_prob=0.1, swap_prob=0.1, recase_prob=0.05, 
                                                            err_prob=0.15, std_dev=0,skip_sentence_prob=0.02, word_vocabulary=all_keys,
                                                            replace_prob_char=0.25, insert_prob_char=0.25, delete_prob_char=0.25, swap_prob_char=0.25, change_diacritics_prob=0, 
                                                            err_prob_char=0.005, std_dev_char=0, char_vocabulary=allowed_characters)
errors_df_v1.to_csv('errors_df_v1.csv', index=False)
errors_df_v1

Unnamed: 0,source,target
0,Ze is dankbaar voor de zorg en de aandavht die...,Ze is dankbaar voor de zorg en de aandacht die...
1,Mevrouw ook heeft aangegeven dat zj meer haarv...,Mevrouw heeft ook aangegeven dat ze meer haarv...
2,We zullen de pijn goed in de houden gatehn en ...,We zullen de pijn goed in de gaten houden en i...
3,We zullen zoeken blijve nnaar manieren omz mev...,We zullen blijven zoeken naar manieren om mevr...
4,We zullen haar in de gaten blijven houden en i...,We zullen haar in de gaten blijven houden en i...
...,...,...
100543,Hij gebruikt medicatie voor zijn hartklachten ...,Hij gebruikt medicatie voor zijn hartklachten ...
100544,We hebben haar geadviseerd om even rustig aan ...,We hebben haar geadviseerd om even rustig aan ...
100545,Mevrouw heeft het advies geskregen ofm extra v...,Mevrouw heeft het advies gekregen om extra voc...
100546,Ik heb een observatierapport opgesteld waarin ...,Ik heb een observatierapport opgesteld waarin ...


In [105]:
def introduce_errors_to_df_v2(df, replace_prob, insert_prob, delete_prob, swap_prob, recase_prob, err_prob, std_dev, skip_sentence_prob,
                                             word_vocabulary, replace_prob_char, insert_prob_char, delete_prob_char, swap_prob_char, change_diacritics_prob, 
                                             err_prob_char, std_dev_char, char_vocabulary, remove_comma_prob, replace_bigram_prob):
    # Apply the function to the 'text' column
    not_selected_df, df = skip_sentence(df, skip_sentence_prob)
    df['source'] = df['source'].apply(lambda sentence: remove_comma(sentence, remove_comma_prob))
    df['source'] = df['source'].apply(lambda sentence: introduce_token_level_errors_on_sentence_bigram(list(sentence.split()) ,replace_bi_prob=replace_bigram_prob, 
                                                        replace_prob=replace_prob, insert_prob=insert_prob, delete_prob=delete_prob, swap_prob=swap_prob, recase_prob=recase_prob,
                                                        err_prob=err_prob, std_dev=std_dev, word_vocabulary=word_vocabulary))
    df['source'] = df['source'].apply(lambda sentence: introduce_char_level_errors_on_sentence(sentence=sentence, replace_prob=replace_prob_char, insert_prob=insert_prob_char, 
                                                        delete_prob=delete_prob_char, swap_prob=swap_prob_char, change_diacritics_prob=change_diacritics_prob, 
                                                        err_prob=err_prob_char,std_dev=std_dev_char, char_vocabulary=char_vocabulary))
    df = pd.concat([df, not_selected_df]).sort_index()
    return df

In [116]:
#decide remove interpunction and caps probabilities. add replace comma prob, figure out (bigram) replacement odds
#80k df1 20k df2. 
df1 = read_df_1_sent('generated_keywords_examples_75%_100k.csv', 'Generated')
df2 = pd.read_csv('generated_keywords_examples_25%_100k.csv')

remove_interpunction_prob_v2 = 1
remove_caps_after_interpunct_prob_v2 = 0.5
skip_sentence_prob_v2 = 0.02
#get skip and keep rows df1
skipped_df1, applied_df1 = skip_sentence(df1, skip_sentence_prob_v2)
#get the rows to skip and rows with applied interpunction removed from two sentences df
skipped_df2, applied_df2 = prepare_2_sentence_df(df2, 'Generated', remove_interpunction_prob_v2, remove_caps_after_interpunct_prob_v2, skip_sentence_prob_v2)
#combine 
combined_skipped_dfs= pd.concat([skipped_df1,skipped_df2], ignore_index=True).sort_index()
combined_dfs = pd.concat([applied_df1,applied_df2], ignore_index=True)

with open('spelling_corrected_corpus_nos_suggestions.pickle', 'rb') as pickle_file:
    suggestions_dict = pickle.load(pickle_file) 
all_keys = list(suggestions_dict.keys())
word_vocabulary = all_keys

errors_df_v2 = introduce_errors_to_df_v2(combined_dfs, replace_bigram_prob=0.4, remove_comma_prob=0.4,replace_prob=0.3, insert_prob=0.05, delete_prob=0.1, swap_prob=0.1, recase_prob=0.05, 
                                                            err_prob=0.15, std_dev=0,skip_sentence_prob=skip_sentence_prob_v2, word_vocabulary=all_keys,
                                                            replace_prob_char=0.25, insert_prob_char=0.25, delete_prob_char=0.25, swap_prob_char=0.25, change_diacritics_prob=0, 
                                                            err_prob_char=0.005, std_dev_char=0, char_vocabulary=allowed_characters)
errors_df_v2 = pd.concat([errors_df_v2, combined_skipped_dfs], ignore_index=True).reset_index(drop=True)
errors_df_v2.to_csv('errors_df_v2.csv', index=False)
errors_df_v2.head()

Unnamed: 0,source,target
0,Client mevrouw Jansem klaagt over buikpijn en ...,Client mevrouw Jansen klaagt over buikpijn en ...
1,Ze geeft aan dat haar buik werg rommelt koMt e...,Ze geeft aan dat haar buik erg rommelt en dat ...
2,Mevrouwheeft geen koorts En voelt zich verder ...,Mevrouw heeft geen koorts en voelt zich verder...
3,Ze heeft geen andere klachtehn.,Ze heeft geen andere klachten.
4,is Het belangrijk om extra goed op te letten o...,Het is belangrijk om extra goed op te letten o...


In [117]:
#remove the print rebalance odds
def introduce_errors_to_df_v3(df, text_column, replace_prob, insert_prob, delete_prob, swap_prob, recase_prob, err_prob, std_dev, skip_sentence_prob,
                                             word_vocabulary, replace_prob_char, insert_prob_char, delete_prob_char, swap_prob_char, change_diacritics_prob, 
                                             err_prob_char, std_dev_char, char_vocabulary, remove_comma_prob, replace_bigram_prob, span_prob, delete_span_prob,
                                             min_delete_span, max_delete_span,min_swap_span, max_swap_span):
    # Apply the function to the 'text' column
    not_selected_df, df = skip_sentence(df, skip_sentence_prob)
    df['source'] = df[text_column].apply(lambda sentence: remove_comma(sentence, remove_comma_prob))
    #lower delete and span probabilities.
    df['source'] = df['source'].apply(lambda sentence: spans(sentence, span_prob, delete_span_prob, min_delete_span, max_delete_span,min_swap_span, max_swap_span))
    df['source'] = df['source'].apply(lambda sentence: introduce_token_level_errors_on_sentence_bigram(list(sentence.split()) ,replace_bi_prob=replace_bigram_prob, 
                                                        replace_prob=replace_prob, insert_prob=insert_prob, delete_prob=delete_prob, swap_prob=swap_prob, recase_prob=recase_prob,
                                                        err_prob=err_prob, std_dev=std_dev, word_vocabulary=word_vocabulary))

    df['source'] = df['source'].apply(lambda sentence: introduce_char_level_errors_on_sentence(sentence=sentence, replace_prob=replace_prob_char, insert_prob=insert_prob_char, 
                                                        delete_prob=delete_prob_char, swap_prob=swap_prob_char, change_diacritics_prob=change_diacritics_prob, 
                                                        err_prob=err_prob_char,std_dev=std_dev_char, char_vocabulary=char_vocabulary))
    df = pd.concat([df, not_selected_df]).sort_index()
    return df

In [118]:
#Delete span and swap span has been added. 
#do both 50/50, lower rate of deletion. increase swapping further swapping should be good
#span_rate means it happens once to the text.
#how often remove span? based on length of text and error rate. 15% of 10% is 1.5% of words are deleted. 
#how often swap span? 1.5% of words are swapped/3%
#with a rate of 2.5 words on average 0.05 of sentences deleted that is an additional 1/8 word removed per sentence
#sentence is 13 words on average so a word per 104 words is deleted. ~1% to get to 1.5 lets do the rate * 1.5 = 0.15
#0.075 = 70

#80k df1 20k df2. 
df1 = read_df_1_sent('generated_keywords_examples_75%_100k.csv', 'Generated')
df2 = pd.read_csv('generated_keywords_examples_25%_100k.csv')

remove_interpunction_prob_v3 = 1
remove_caps_after_interpunct_prob_v3 = 0.5
skip_sentence_prob_v3 = 0.02
#get skip and keep rows df1
skipped_df1, applied_df1 = skip_sentence(df1, skip_sentence_prob_v3)
#get the rows to skip and rows with applied interpunction removed from two sentences df
skipped_df2, applied_df2 = prepare_2_sentence_df(df2, 'Generated', remove_interpunction_prob_v3, remove_caps_after_interpunct_prob_v3, skip_sentence_prob_v3)
#combine 
combined_skipped_dfs= pd.concat([skipped_df1,skipped_df2], ignore_index=True).sort_index()
combined_dfs = pd.concat([applied_df1,applied_df2], ignore_index=True)

with open('spelling_corrected_corpus_nos_suggestions.pickle', 'rb') as pickle_file:
    suggestions_dict = pickle.load(pickle_file) 
all_keys = list(suggestions_dict.keys())
word_vocabulary = all_keys

errors_df_v3 = introduce_errors_to_df_v3(combined_dfs, text_column='source',  replace_bigram_prob=0.4, remove_comma_prob=0.3,replace_prob=0.3, insert_prob=0.05, delete_prob=0.05, 
                                                            swap_prob=0.15, recase_prob=0.05, err_prob=0.15, std_dev=0,skip_sentence_prob=0.02, word_vocabulary=all_keys,
                                                            replace_prob_char=0.25, insert_prob_char=0.25, delete_prob_char=0.25, swap_prob_char=0.25, change_diacritics_prob=0, 
                                                            err_prob_char=0.005, std_dev_char=0, char_vocabulary=allowed_characters, span_prob=0.1, delete_span_prob=0.5,
                                                            min_delete_span=2, max_delete_span=3, min_swap_span=1, max_swap_span=4)
errors_df_v3 = pd.concat([errors_df_v3, combined_skipped_dfs], ignore_index=True).reset_index(drop=True)
errors_df_v3.to_csv('errors_df_v3.csv', index=False)
errors_df_v3.head()

swapping spans attempt  26 skipping sentence
swapping spans attempt  26 skipping sentence
swapping spans attempt  26 skipping sentence


Unnamed: 0,source,target
0,Client mevrouw Jansen klaagt buikpijn en hemat...,Client mevrouw Jansen klaagt over buikpijn en ...
1,Ze geeft aan datbhaar buik rommelt erg en dat ...,Ze geeft aan dat haar buik erg rommelt en dat ...
2,Mevrouw heeftbgeen koorts en voelt niet ziek.,Mevrouw heeft geen koorts en voelt zich verder...
3,Ze heeft geen andere klachten.,Ze heeft geen andere klachten.
4,Het is vbelangrijk om eztra goed op te letten ...,Het is belangrijk om extra goed op te letten o...


In [119]:
#Delete span prob = 0
#figure out probabilities with delete and swap and ner.
def introduce_errors_to_df_v4(df, replace_prob, insert_prob, delete_prob, swap_prob, recase_prob, err_prob, std_dev, skip_sentence_prob,
                                             word_vocabulary, replace_prob_char, insert_prob_char, delete_prob_char, swap_prob_char, change_diacritics_prob, 
                                             err_prob_char, std_dev_char, char_vocabulary, remove_comma_prob, replace_bigram_prob, span_prob, delete_span_prob,
                                             min_delete_span, max_delete_span,min_swap_span, max_swap_span, ner_delete_rate, types):
    not_selected_df, df = skip_sentence(df, skip_sentence_prob)
    df['source'] = df['source'].apply(lambda sentence: remove_comma(sentence, remove_comma_prob))
    #lower delete and span probabilities.
    df['source'] = df['source'].apply(lambda sentence: spans(sentence, span_prob, delete_span_prob, min_delete_span, max_delete_span,min_swap_span, max_swap_span))
    nlp = spacy.load("nl_core_news_sm")
    df['source'] = df['source'].apply(lambda sentence: delete_NER(sentence, nlp, ner_delete_rate, types))
    df['source'] = df['source'].apply(lambda sentence: introduce_token_level_errors_on_sentence_bigram(list(sentence.split()) ,replace_bi_prob=replace_bigram_prob, 
                                                        replace_prob=replace_prob, insert_prob=insert_prob, delete_prob=delete_prob, swap_prob=swap_prob, recase_prob=recase_prob, 
                                                        err_prob=err_prob, std_dev=std_dev, word_vocabulary=word_vocabulary))
    df['source'] = df['source'].apply(lambda sentence: introduce_char_level_errors_on_sentence(sentence=sentence, replace_prob=replace_prob_char, insert_prob=insert_prob_char, 
                                                        delete_prob=delete_prob_char, swap_prob=swap_prob_char, change_diacritics_prob=change_diacritics_prob, err_prob=err_prob_char,
                                                        std_dev=std_dev_char, char_vocabulary=char_vocabulary))
    df = pd.concat([df, not_selected_df]).sort_index()
    return df

In [120]:
#delete span prob = 0, halve span_rate to 0.05 to maintain same swaps
#balance ner delete and error rate/delete prob
#ner only checks for words which are part of the types list provided. then for each it finds it removes it based on the ner_delete_rate.
#this is less than the classic delete becuase this rolls it for every single word. Hard to balance. try out 0.2. lower delete and increase swap again.
#42% of text are these tokens so we will say we should increase the error rate by 2.5 to make sure we only delete those words and realize similar amount of deletions
#15% 10% -> 1.5% * 2.5 = 3.75%
#0.15 * 0.4 = 0.06% x * 0.444444444 = 0.06 / 0.4444444 = 0.1429
#0.15 * 0.05 = 0.0075 | x * 0.07 = 0.0075 / 0.07 = 0.1071
#adjustment_factor
af = (1 / (1 - 0.1))
df1 = read_df_1_sent('generated_keywords_examples_75%_100k.csv', 'Generated')
df2 = pd.read_csv('generated_keywords_examples_25%_100k.csv')

remove_interpunction_prob_v4 = 1
remove_caps_after_interpunct_prob_v4 = 0.5
skip_sentence_prob_v4 = 0.02
#get skip and keep rows df1
skipped_df1, applied_df1 = skip_sentence(df1, skip_sentence_prob_v4)
#get the rows to skip and rows with applied interpunction removed from two sentences df
skipped_df2, applied_df2 = prepare_2_sentence_df(df2, 'Generated', remove_interpunction_prob_v4, remove_caps_after_interpunct_prob_v4, skip_sentence_prob_v4)
#combine 
combined_skipped_dfs= pd.concat([skipped_df1,skipped_df2], ignore_index=True).sort_index()
combined_dfs = pd.concat([applied_df1,applied_df2], ignore_index=True)

types = ['aux', 'nsubj', 'ROOT', 'det', 'cc', 'nsubj:pass', 'aux:pass', 'cop', 'case']


with open('spelling_corrected_corpus_nos_suggestions.pickle', 'rb') as pickle_file:
    suggestions_dict = pickle.load(pickle_file) 
all_keys = list(suggestions_dict.keys())
word_vocabulary = all_keys

errors_df_v4 = introduce_errors_to_df_v4(combined_dfs, replace_bigram_prob=0.4*af, remove_comma_prob=0.3,replace_prob=0.3*af, insert_prob=0.05*af, delete_prob=0, 
                                                            swap_prob=0.1*af, recase_prob=0.05*af, err_prob=0.15*0.9, std_dev=0,skip_sentence_prob=skip_sentence_prob_v4, word_vocabulary=all_keys,
                                                            replace_prob_char=0.25, insert_prob_char=0.25, delete_prob_char=0.25, swap_prob_char=0.25, change_diacritics_prob=0, 
                                                            err_prob_char=0.005, std_dev_char=0, char_vocabulary=allowed_characters, span_prob=0.05, delete_span_prob=0,
                                                            min_delete_span=2, max_delete_span=3,min_swap_span=1, max_swap_span=4, ner_delete_rate=0.0375, types=types)
errors_df_v4 = pd.concat([errors_df_v4, combined_skipped_dfs], ignore_index=True).reset_index(drop=True)
errors_df_v4.to_csv('errors_df_v4.csv', index=False)
errors_df_v4.head()

swapping spans attempt  26 skipping sentence
swapping spans attempt  26 skipping sentence
swapping spans attempt  26 skipping sentence
swapping spans attempt  26 skipping sentence


Unnamed: 0,source,target
0,Cliebnt mevrouw Jansen klaagt over buikpijn en...,Client mevrouw Jansen klaagt over buikpijn en ...
1,Zev geeft aan dat haar buik erg rommelt dat en...,Ze geeft aan dat haar buik erg rommelt en dat ...
2,Mevrouw heeft geen koorts en voelt zich cverde...,Mevrouw heeft geen koorts en voelt zich verder...
3,Ze Mheeft geen andere klachten.,Ze heeft geen andere klachten.
4,Het is extra goed op te letten op haar voeding...,Het is belangrijk om extra goed op te letten o...


In [106]:
#v5 Try out best found dataset generation on mc4 data
df1 = read_df_1_sent('mc4_75k.csv', 'text')
df2 = pd.read_csv('mc4_25k.csv')

remove_interpunction_prob_v5 = 1
remove_caps_after_interpunct_prob_v5 = 0.5
skip_sentence_prob_v5 = 0.02
skipped_df1, applied_df1 = skip_sentence(df1, skip_sentence_prob_v5)
#get the rows to skip and rows with applied interpunction removed from two sentences df
skipped_df2, applied_df2 = prepare_2_sentence_df(df2, 'text', remove_interpunction_prob_v5, remove_caps_after_interpunct_prob_v5, skip_sentence_prob_v5)
#combine 
combined_skipped_dfs= pd.concat([skipped_df1,skipped_df2], ignore_index=True).sort_index()
combined_dfs = pd.concat([applied_df1,applied_df2], ignore_index=True)

with open('spelling_corrected_corpus_nos_suggestions.pickle', 'rb') as pickle_file:
    suggestions_dict = pickle.load(pickle_file) 
all_keys = list(suggestions_dict.keys())
word_vocabulary = all_keys

errors_df_v5 = introduce_errors_to_df_v2(combined_dfs, replace_bigram_prob=0.4, remove_comma_prob=0.4,replace_prob=0.3, insert_prob=0.05, delete_prob=0.1, swap_prob=0.1, recase_prob=0.05, 
                                                            err_prob=0.15, std_dev=0,skip_sentence_prob=skip_sentence_prob_v5, word_vocabulary=all_keys,
                                                            replace_prob_char=0.25, insert_prob_char=0.25, delete_prob_char=0.25, swap_prob_char=0.25, change_diacritics_prob=0, 
                                                            err_prob_char=0.005, std_dev_char=0, char_vocabulary=allowed_characters)
errors_df_v5 = pd.concat([errors_df_v5, combined_skipped_dfs], ignore_index=True).reset_index(drop=True)
errors_df_v5.to_csv('errors_df_v5.csv', index=False)
errors_df_v5.head()

Unnamed: 0,source,target
0,Japanse bedrijven zijn niet hondstrouw hun lev...,Japanse bedrijven zijn niet alleen hondstrouw ...
1,Alleen is het niet zo makkelijk er een voet tu...,Alleen is het niet zo makkelijk er een voet tu...
2,Met de volgende tips hebt u alvast voor. streepje,Met de volgende tips hebt u alvast een streepj...
3,In draait alles om vertrouwen.,In Japan draait alles om vertrouwen.
4,Neem voldoende tijd om een dilatie op te bouwe...,Neem voldoende tijd om een relatie op te bouwe...


In [2]:
#shuffled_df.to_csv('generated_keywords_examples_100k_sample.csv', index=False)

#I want 100k sentences. For the two sentence option I will split 25k sentences. That way they see the same data.

def count_words(text):
    if type(text) == str:
        tokens = word_tokenize(text)
        #print(tokens)
    else: 
        tokens=['word']
    return len(tokens)

df = pd.read_csv('generated_keywords_examples_100k_sample.csv')
# Apply the function to each cell in the 'text' column and sum the results
total_words = df['source'].apply(count_words).sum()

print(f"Total number of words in GPT generated: {total_words}")

df_mc4 = pd.read_csv('nl_cleaned_mc4_100k_sentences.csv')
print('total sentences mc4: ', df_mc4['num_sentences'][0:3627].sum())
print('75% sentences mc4: ', df_mc4['num_sentences'][0:2730].sum())
print('25% sentences mc4: ', df_mc4['num_sentences'][2730:3627].sum())

df_mc4[0:2730].to_csv('mc4_75k.csv', index=False)
df_mc4[2730:3627].to_csv('mc4_25k.csv', index=False)

# Apply the function to each cell in the 'text' column and sum the results
total_words = df_mc4['text'][:3500].apply(count_words).sum()

print(f"Total number of words in mc4: {total_words}")

Total number of words in GPT generated: 1482494
total sentences mc4:  87176
75% sentences mc4:  65448
25% sentences mc4:  21728
Total number of words in mc4: 1435076


In [4]:
#some sentences too long for the model to handle.
df_v5 = pd.read_csv('E:/Data_exploration/GitHub/paraphraser_code/data/synthetic_error_data/errors_df_v5.csv')
df_v5['num_words'] = df_v5['target'].apply(count_words)
df_v5[df_v5['num_words'] < 51]#[['source', 'target']].to_csv('errors_df_v5.csv', index=False)

Unnamed: 0,source,target,num_words
0,Japanse bedrijven zijn niet hondstrouw hun lev...,Japanse bedrijven zijn niet alleen hondstrouw ...,18
1,Alleen is het niet zo makkelijk er een voet tu...,Alleen is het niet zo makkelijk er een voet tu...,15
2,Met de volgende tips hebt u alvast voor. streepje,Met de volgende tips hebt u alvast een streepj...,11
3,In draait alles om vertrouwen.,In Japan draait alles om vertrouwen.,7
4,Neem voldoende tijd om een dilatie op te bouwe...,Neem voldoende tijd om een relatie op te bouwe...,19
...,...,...,...
67461,In onze analyse controleren we altijd de Alexa...,In onze analyse controleren we altijd de Alexa...,20
67462,Een lage Alexa-rangorde betekent dat de websit...,Een lage Alexa-rangorde betekent dat de websit...,20
67463,Oproep aan mensen uit het Gentse MET AUTO ! 30...,Oproep aan mensen uit het Gentse MET AUTO ! 30...,17
67464,Het maakt de tijd rijp voor een wervelend eerb...,Het maakt de tijd rijp voor een wervelend eerb...,48


In [5]:

df_v5[df['target'].apply(len) < 383]#.to_csv('errors_df_v5.csv', index=False)#.max()

  df_v5[df['target'].apply(len) < 383]#.to_csv('errors_df_v7.csv', index=False)#.max()


Unnamed: 0,source,target,num_words
0,Japanse bedrijven zijn niet hondstrouw hun lev...,Japanse bedrijven zijn niet alleen hondstrouw ...,18
1,Alleen is het niet zo makkelijk er een voet tu...,Alleen is het niet zo makkelijk er een voet tu...,15
2,Met de volgende tips hebt u alvast voor. streepje,Met de volgende tips hebt u alvast een streepj...,11
3,In draait alles om vertrouwen.,In Japan draait alles om vertrouwen.,7
4,Neem voldoende tijd om een dilatie op te bouwe...,Neem voldoende tijd om een relatie op te bouwe...,19
...,...,...,...
67461,In onze analyse controleren we altijd de Alexa...,In onze analyse controleren we altijd de Alexa...,20
67462,Een lage Alexa-rangorde betekent dat de websit...,Een lage Alexa-rangorde betekent dat de websit...,20
67463,Oproep aan mensen uit het Gentse MET AUTO ! 30...,Oproep aan mensen uit het Gentse MET AUTO ! 30...,17
67464,Het maakt de tijd rijp voor een wervelend eerb...,Het maakt de tijd rijp voor een wervelend eerb...,48


In [123]:
#outdated
def split_string_with_punctuation(text):
    # Define the regular expression pattern to split the string
    pattern = r"\b\w+\b|[.,;:()\[\]{}'\"“”‘’]"

    # Find all occurrences of the pattern in the text
    tokens = re.findall(pattern, text)

    return tokens


split_string_with_punctuation('ik: wil. : (zwemmen) in. bacardi lemon.')

['ik',
 ':',
 'wil',
 '.',
 ':',
 '(',
 'zwemmen',
 ')',
 'in',
 '.',
 'bacardi',
 'lemon',
 '.']

In [9]:
all_keys = list(suggestions_dict.keys())

In [14]:
keys_with_space = [key for key in all_keys if ' ' in key]

# Get the number of keys containing a space
num_keys_with_space = len(keys_with_space)

print(num_keys_with_space)

300523


In [17]:
keys_without_space = [key for key in all_keys if ' ' not in key]
print(len(keys_without_space))

62022


In [65]:
def count_bigrams_in_text(bigrams_list, text):
    words = text.split()
    bigrams = zip(words, words[1:])
    print(bigrams)
    bigram_counter = Counter(bigrams)
    bigram_counts = {bigram: count for bigram, count in bigram_counter.items() if bigram in bigrams_list}
    return bigram_counts

def count_bigrams_in_text(bigrams_list, text):
    words = text.split()
    bigrams = list(ngrams(words, 2))
    result_list = [f"{bigram[0]} {bigram[1]}" if not bigram[1].endswith('.') else f"{bigram[0]} {bigram[1]}" for bigram in bigrams]
    #print(result_list)
    bigram_counter = Counter(result_list)
    bigram_counts = Counter()
    for bigram in result_list:
        if bigram in bigrams_list:
            bigram_counts[bigram] = bigram_counter[bigram]
    
    return bigram_counts


# Accumulate bigram counts for each row in the DataFrame
all_bigram_counts = Counter()
all_unigram_counts = Counter()
total_words = 0
for text in keyword_df_5k['Generated']:
    bigram_counts = count_bigrams_in_text(keys_with_space, text)
    unigram_counts = count_unigrams_in_text(keys_without_space, text)
    all_bigram_counts.update(bigram_counts)
    all_unigram_counts.update(unigram_counts)
    total_words += len(text.split())

# Print the accumulated bigram counts
print(all_bigram_counts)
print(all_unigram_counts)


Counter({'We hebben': 1679, 'last van': 1581, 'om de': 1579, 'dat ze': 1473, 'de gaten': 1338, 'heb haar': 1173, 'heb ik': 991, 'van haar': 978, 'Mevrouw heeft': 944, 'met de': 939, 'in haar': 908, 'en haar': 896, 'de pijn': 877, 'geadviseerd om': 831, 'heeft vandaag': 831, 'goed in': 753, 'aan dat': 700, 'met het': 689, 'hebben haar': 688, 'op haar': 650, 'van een': 642, 'Ze heeft': 609, 'om te': 589, 'gaf aan': 548, 'met haar': 510, 'pijn te': 499, 'houden en': 478, 'heeft mevrouw': 477, 'dat de': 476, 'geholpen met': 460, 'en heeft': 439, 'van mevrouw': 426, 'ik haar': 425, 'bij het': 421, 'Mevrouw had': 412, 'had vandaag': 394, 'voelde zich': 393, 'moeite met': 377, 'pijn in': 374, 'voelt zich': 370, 'en een': 367, 'heeft ze': 365, 'besloten om': 362, 'tijdens het': 361, 'en indien': 350, 'dat het': 350, 'ze zich': 330, 'dat haar': 316, 'de komende': 311, 'gegeven om': 309, 'haar geholpen': 306, 'en dat': 302, 'aangegeven dat': 301, 'zorgen dat': 296, 'zullen de': 291, 'en het': 28

In [68]:
bi = sum(all_bigram_counts.values())
uni = sum(all_unigram_counts.values())
print('amount of bigrams present in data', bi)
print('amount of unigrams present in data',uni)
print('total words:',total_words)
print('unigram percentage present', (uni)/total_words)
print('bigram percentage present', (bi)/total_words)

amount of bigrams present in data 100054
amount of unigrams present in data 271390
total words: 300262
unigram percentage present 0.9038439762607323
bigram percentage present 0.3332223191745875
