In [1]:
import pandas as pd
import random
import string
from datasets import load_dataset
import datasets
import numpy as np

In [2]:
def change_case(sentence):
    """Flip the case of one randomly chosen alphabetic character.

    Args:
        sentence: Text to corrupt.

    Returns:
        A copy of ``sentence`` with exactly one letter case-swapped. The input
        is returned unchanged when it contains no alphabetic character, which
        includes the empty string.
    """
    # Sample only among letter positions so the call never fails on empty
    # input and never lands on a digit/space/punctuation mark (a silent no-op).
    letter_positions = [pos for pos, char in enumerate(sentence) if char.isalpha()]
    if not letter_positions:
        return sentence

    char_index = random.choice(letter_positions)
    return (sentence[:char_index]
            + sentence[char_index].swapcase()
            + sentence[char_index + 1:])

In [3]:
def tr2en(sentence):
    """Replace Turkish-specific letters with their closest ASCII letters.

    Imitates text typed without Turkish diacritics, e.g. "mürşit" becomes
    "mursit". Lowercase and uppercase letters are both mapped, so "Şimdi"
    becomes "Simdi".

    Args:
        sentence: Text to convert.

    Returns:
        ``sentence`` with ı/ü/ö/ş/ç/ğ (and their uppercase forms) replaced by
        i/u/o/s/c/g (and I/U/O/S/C/G). All other characters are left as-is.
    """
    # One-to-one character table; the two strings are aligned by position.
    turkish_to_ascii = str.maketrans("ıüöşçğİÜÖŞÇĞ", "iuoscgIUOSCG")
    return sentence.translate(turkish_to_ascii)

In [4]:
# Spot-check: the Turkish-specific lowercase letters in the sample sentence
# should come back as plain ASCII letters.
tr2en("Hayatta en hakiki mürşit ilimdir, fendir.")

'Hayatta en hakiki mursit ilimdir, fendir.'

In [5]:
def change_case_upper_to_lower(sentence):
    """Lowercase a single, randomly selected uppercase character.

    Args:
        sentence: Text to corrupt.

    Returns:
        ``sentence`` with one of its uppercase characters lowercased, or the
        unchanged input when it has no uppercase character.
    """
    capitals = [pos for pos, symbol in enumerate(sentence) if symbol.isupper()]
    if capitals:
        target = random.choice(capitals)
        lowered = sentence[target].lower()
        return sentence[:target] + lowered + sentence[target + 1:]
    return sentence

In [6]:
def add_letter(sentence):
    """Insert one random ASCII letter at a random position.

    Args:
        sentence: Text to corrupt. May be empty.

    Returns:
        ``sentence`` with one extra character drawn from
        ``string.ascii_letters``. The letter can land anywhere, including
        before the first and after the last character.
    """
    # A string of length n has n + 1 insertion slots (0..n). Using n as the
    # upper bound allows appending at the end and keeps the call valid for
    # an empty string, where the only slot is 0.
    char_index = random.randint(0, len(sentence))
    random_letter = random.choice(string.ascii_letters)
    return sentence[:char_index] + random_letter + sentence[char_index:]

In [7]:
def remove_letter(sentence):
    """Delete one randomly chosen character.

    Args:
        sentence: Text to corrupt.

    Returns:
        ``sentence`` minus one character. Inputs of length 0 or 1 are returned
        unchanged, so the result is never shorter than one character for
        non-empty input.
    """
    if len(sentence) <= 1:
        return sentence

    drop_at = random.randint(0, len(sentence) - 1)
    remaining = list(sentence)
    del remaining[drop_at]
    return ''.join(remaining)

In [8]:
def swap_adjacent_letters(sentence):
    """Transpose one randomly chosen pair of neighbouring characters.

    Args:
        sentence: Text to corrupt.

    Returns:
        ``sentence`` with two adjacent characters exchanged. Inputs shorter
        than two characters are returned unchanged.
    """
    if len(sentence) < 2:
        return sentence

    # `left` is the first character of the pair, so it stops one short of the end.
    left = random.randint(0, len(sentence) - 2)
    chars = list(sentence)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return ''.join(chars)

In [9]:
def substitute_letter(sentence):
    """Overwrite one randomly chosen character with a random ASCII letter.

    Args:
        sentence: Text to corrupt.

    Returns:
        ``sentence`` with the character at one position replaced by a letter
        drawn from ``string.ascii_letters``. An empty input is returned
        unchanged. The replacement may equal the original character.
    """
    if len(sentence) == 0:
        return sentence

    position = random.randint(0, len(sentence) - 1)
    replacement = random.choice(string.ascii_letters)
    chars = list(sentence)
    chars[position] = replacement
    return ''.join(chars)

In [10]:
# Spot-check on a sample sentence. The position and replacement letter are
# drawn at random, so the displayed result may differ between runs.
substitute_letter("Hayatta en hakiki mürşit ilimdir, fendir.")

'Hayatta en hakiki mirşit ilimdir, fendir.'

In [12]:
def introduce_errors(sentence):
    """Corrupt a sentence with several random, typo-like edits.

    Between 2 and ``max(len(sentence) // 20, 3)`` error functions are drawn
    (with replacement, weighted by ``probs``) and applied one after another.
    With probability of roughly 0.3, ``tr2en`` is applied as a final step.

    All randomness comes from the standard ``random`` module, so a single
    ``random.seed(...)`` call makes the result reproducible.

    Args:
        sentence: Clean text to corrupt.

    Returns:
        The corrupted text. An empty ``sentence`` is returned unchanged.
    """
    # An empty sentence has nothing to corrupt; return it as-is instead of
    # handing it to the error functions.
    if not sentence:
        return sentence

    error_functions = [change_case, change_case_upper_to_lower, add_letter,
                       remove_letter, swap_adjacent_letters, substitute_letter]
    # Sampling weights, aligned position-by-position with `error_functions`.
    probs = [0.18, 0.18, 0.09, 0.19, 0.18, 0.18]

    # Longer sentences may receive more errors (one per ~20 characters),
    # with an upper bound of at least 3.
    num_errors = random.randint(2, max(len(sentence) // 20, 3))

    # Weighted draw with replacement from the same generator used by the
    # error functions themselves.
    chosen_functions = random.choices(error_functions, weights=probs, k=num_errors)

    if random.random() > 0.7:
        chosen_functions.append(tr2en)

    new_sentence = sentence
    for error_function in chosen_functions:
        new_sentence = error_function(new_sentence)
    return new_sentence

In [13]:
# Load the datasets (train splits only).
# `df` is the sentence-pair corpus; despite its name it is a `datasets.Dataset`,
# not a pandas DataFrame. `data_seq2type` is the token-labelled corpus.
# NOTE(review): both are shuffled with the same seed, and rows of `df` are later
# selected with indexes computed from `data_seq2type`. That presumably relies on
# the two datasets holding the same sentences in the same order — confirm.
df = load_dataset("mcemilg/GECTurk-generation", split="train").shuffle(seed=44)
data_seq2type = load_dataset("GGLab/GECTurk", split="train").shuffle(seed=44)

data_seq2type

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 96919
})

In [14]:
# Group row indexes by error type; key 0 collects the rows that contain no errors
from collections import defaultdict

def extract_stats(dataset):
    """Index the rows of a token-labelled dataset by the error types they contain.

    Args:
        dataset: Iterable of rows; each row is indexed with ``"labels"`` to get
            its per-token type ids, where ``0`` means "no error".

    Returns:
        A plain dict mapping each type id to the set of row positions in which
        it occurs. Key ``0`` holds only the rows whose labels are all zero.
    """
    rows_by_type = defaultdict(set)
    for position, example in enumerate(dataset):
        error_types = [label for label in example["labels"] if label != 0]
        if not error_types:
            # No non-zero label anywhere in the row: it is error-free.
            rows_by_type[0].add(position)
            continue
        for label in error_types:
            rows_by_type[label].add(position)

    return dict(rows_by_type)

# Index the train split by error type; key 0 holds the rows without any error.
train_stats_dct = extract_stats(data_seq2type)

In [15]:
def get_stats(dct, slice_, print_=False):
    """Summarise how many rows fall under each type.

    Args:
        dct: Mapping from type id to a sized collection of row indexes.
        slice_: Label of the data slice; printed as a header line.
        print_: When true, also print one line per type with its row count.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per type id (in sorted
        order) holding the number of rows of that type.
    """
    print("Slice:",slice_)
    ordered = sorted(dct.items())

    if print_:
        for type_, indexes in ordered:
            print("type:", type_, "num_samples:", len(indexes))

    counts = {type_: [len(indexes)] for type_, indexes in ordered}
    return pd.DataFrame(counts)

# Display the per-type row counts for the train split.
get_stats(train_stats_dct, "Train").head()

Slice: Train


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,48746,8592,72,66,330,7261,21875,1251,976,197,...,286,322,396,3,5110,38,2525,1828,583,193


In [16]:
import numpy as np
import random as rd
# Seed both the stdlib and the NumPy generators so that every random draw made
# by later cells is reproducible from a fresh kernel.
random.seed(0)
np.random.seed(0)

# Row indexes of the error-free sentences, in shuffled order.
no_error = list(train_stats_dct[0])
random.shuffle(no_error)

In [17]:
# The first 2500 shuffled error-free row indexes are the ones that get
# synthetic spelling errors.
data_to_disrupt = no_error[:2500]

In [18]:
# Pick the chosen rows from the sentence-pair dataset.
# NOTE(review): the indexes were computed on `data_seq2type`, so this assumes
# `df` holds the same sentences in the same row order — confirm.
data_filtered = df.select(data_to_disrupt)
data_filtered

Dataset({
    features: ['source', 'target'],
    num_rows: 2500
})

In [19]:
# Convert the selected rows to a pandas DataFrame for the corruption step.
df_ = data_filtered.to_pandas()

In [20]:
# Build (corrupted, clean) pairs: each clean `target` sentence is passed
# through `introduce_errors` to produce its noisy `source` counterpart.
df_errors = pd.DataFrame(
    {
        'source': df_['target'].apply(introduce_errors),
        'target': df_['target'],
    }
)

# Persist the pairs without the pandas index column.
df_errors.to_csv('spelling_errors.csv', index=False)

In [21]:
# Take the next slice of shuffled error-free indexes (positions 2500-9499,
# disjoint from the rows that were corrupted) and export them unchanged.
data_no_error = no_error[2500:9500]
data_errorless = df.select(data_no_error)
df_errorless = data_errorless.to_pandas()
# Persist without the pandas index column.
df_errorless.to_csv('no_errors.csv', index=False)