In [None]:
#!pip install --upgrade datasets

## Extract Stats

In [29]:
import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset

In [30]:
# Both train and test
data_seq2type = load_dataset("GGLab/GECTurk")

In [31]:
data_seq2type

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 96919
    })
    dev: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 20769
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 20769
    })
    movie_reviews: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 300
    })
})

In [32]:
from collections import defaultdict

def extract_stats(dataset):
    """Group row positions of ``dataset`` by the label ids they contain.

    Each row must expose a ``"labels"`` sequence. A row's position is recorded
    under every non-zero label occurring in it; a row whose labels are all
    zero (or empty) is recorded under key ``0`` instead.

    Returns a plain ``dict`` mapping label id -> set of row positions, with
    keys in order of first appearance.
    """
    rows_by_label = defaultdict(set)
    for position, sample in enumerate(dataset):
        # dict.fromkeys de-duplicates while keeping first-seen order, so new
        # keys are created in the same order as a plain scan of the labels.
        nonzero_labels = dict.fromkeys(
            label for label in sample["labels"] if label != 0
        )
        if not nonzero_labels:
            rows_by_label[0].add(position)
            continue
        for label in nonzero_labels:
            rows_by_label[label].add(position)

    return dict(rows_by_label)

In [33]:
train_stats_dct = extract_stats(data_seq2type["train"])
dev_stats_dct = extract_stats(data_seq2type["dev"])

In [34]:
def get_stats(dct, slice_, print_=False):
    """Summarise how many samples each label type has in one data slice.

    dct: mapping of label type -> collection of row indexes.
    slice_: name of the slice; it is always echoed to stdout.
    print_: when true, additionally print one line per label type.

    Returns a single-row DataFrame with one column per label type, in sorted
    order, holding the number of indexes for that type.
    """
    print("Slice:",slice_)
    ordered = sorted(dct.items())

    if print_:
        for label, rows in ordered:
            print("type:", label, "num_samples:", len(rows))

    return pd.DataFrame({label: [len(rows)] for label, rows in ordered})


In [35]:
train_slice = get_stats(train_stats_dct, "Train")
train_slice.head()

Slice: Train


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,48746,8592,72,66,330,7261,21875,1251,976,197,...,286,322,396,3,5110,38,2525,1828,583,193


In [36]:
dev_slice = get_stats(dev_stats_dct, "Dev")
dev_slice.head()

Slice: Dev


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,20,21,22,23,24,25
0,10506,1814,16,17,73,1545,4637,268,205,48,...,51,67,75,72,1088,7,570,413,98,41


In [37]:
concated = pd.concat([train_slice,dev_slice],axis=0)
concated.index = ["train", "dev"]
concated.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
train,48746,8592,72,66,330,7261,21875,1251,976,197,...,286,322,396,3.0,5110,38,2525,1828,583,193
dev,10506,1814,16,17,73,1545,4637,268,205,48,...,67,75,72,,1088,7,570,413,98,41


In [38]:
import seaborn as sns
import matplotlib.pyplot as plt
num_samples_total = (concated.describe().loc["mean",:]*3)

In [39]:
num_samples_total.median()

1051.5

In [40]:
num_samples_total

0     88878.0
1     15609.0
2       132.0
3       124.5
4       604.5
5     13209.0
6     39768.0
7      2278.5
8      1771.5
9       367.5
10      492.0
11     6652.5
12    14640.0
13     1959.0
14     1081.5
15      418.5
16      529.5
17      595.5
18      702.0
19        9.0
20     9297.0
21       67.5
22     4642.5
23     3361.5
24     1021.5
25      351.0
Name: mean, dtype: float64

In [41]:
train_stats_dct

{12: {0,
  32772,
  32775,
  32794,
  32797,
  65578,
  65585,
  65590,
  68,
  69,
  32836,
  65607,
  32841,
  65611,
  65617,
  82,
  32864,
  98,
  32866,
  32874,
  112,
  115,
  65652,
  32897,
  135,
  32906,
  32909,
  143,
  147,
  65683,
  168,
  172,
  174,
  32943,
  65710,
  182,
  65719,
  32954,
  188,
  191,
  194,
  32975,
  208,
  65751,
  65753,
  218,
  65754,
  220,
  32989,
  222,
  32990,
  65764,
  33000,
  240,
  241,
  242,
  65782,
  247,
  65783,
  33029,
  273,
  283,
  33055,
  288,
  33057,
  65829,
  294,
  65835,
  33070,
  33073,
  33080,
  33083,
  65854,
  33093,
  65861,
  330,
  65873,
  65876,
  33114,
  347,
  351,
  356,
  65901,
  33135,
  369,
  65915,
  381,
  65936,
  33171,
  65941,
  33177,
  33181,
  415,
  65952,
  65953,
  419,
  33188,
  65958,
  65959,
  33192,
  65962,
  33195,
  65968,
  33201,
  435,
  65971,
  437,
  65977,
  33211,
  33216,
  65984,
  33228,
  33241,
  66010,
  476,
  33245,
  33247,
  66016,
  33253,
  66021,
  

In [42]:
# Sampling plan: every error type is assigned to one class, and the class
# decides how many training rows are kept for that type.

# Oversampled to twice the number of rows available.
class_0 = [21]

# Sampled to exactly 250 rows each:
# fewer than 250 available -> oversample (repeat rows) up to 250;
# 250 or more available    -> keep a random 250.
class_1 = [2, 3, 5, 9, 19, 25]

# Sampled to exactly 500 rows each:
# fewer than 500 available -> oversample up to 500;
# 500 or more available    -> keep a random 500.
class_2 = [4, 10, 12, 13, 14, 15, 16, 17, 18, 20]

# Sampled to exactly 1000 rows each:
# fewer than 1000 available -> oversample up to 1000;
# 1000 or more available    -> keep a random 1000.
class_3 = [1, 6, 7, 8, 11, 22, 23]

In [46]:
# Quota table indexed by error type id (26 slots, ids 0-25); types that are
# in none of the three sized classes keep a quota of 0.
a = [0]*26
for members, quota in ((class_1, 250), (class_2, 500), (class_3, 1000)):
    for i in members:
        a[i] = quota
# NOTE(review): 135 is a hand-picked quota for type 21 — confirm its origin.
a[21] = 135
print(a)

[0, 1000, 250, 250, 500, 250, 1000, 1000, 1000, 250, 500, 1000, 500, 500, 500, 500, 500, 500, 500, 250, 500, 135, 1000, 1000, 0, 250]


In [43]:
import random as rd
def augment_indexes(idxes, target, rng=None):
    """Return exactly ``target`` indexes drawn from ``idxes``.

    The indexes are shuffled once. If there are more than ``target`` of them,
    the first ``target`` of the shuffled order are kept (a random subset); if
    there are fewer, the shuffled sequence is repeated (oversampling) until
    ``target`` entries are available.

    Args:
        idxes: iterable of row indexes (e.g. a set).
        target: number of indexes to return; values <= 0 yield an empty list.
        rng: optional ``random.Random`` instance used for the shuffle.
            Defaults to the module-level generator ``rd``; pass
            ``rd.Random(seed)`` to make the selection reproducible.

    Returns:
        A list of length ``target`` (empty when ``target`` <= 0).

    Raises:
        ValueError: if ``idxes`` is empty while ``target`` is positive, since
            there is nothing to sample from.
    """
    pool = list(idxes)
    if not pool:
        # Previously this surfaced as an opaque ZeroDivisionError.
        if target > 0:
            raise ValueError(
                f"cannot draw {target} indexes from an empty collection"
            )
        return []

    generator = rd if rng is None else rng
    generator.shuffle(pool)

    # One extra repetition guarantees at least `target` entries before slicing.
    repeats = (target // len(pool)) + 1
    return (pool * repeats)[:target]

In [44]:
spell_errors = load_dataset('csv', data_files={'train': "spelling_errors.csv"})
spell_errors

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 2500
    })
})

In [18]:
import numpy as np

# Maps error type id -> list of selected train row indexes (after resampling).
train_data_ = dict()

# Target sample size per sampling class; class 0 has no fixed size because
# its types are doubled instead.
count = {0: "double", 1: 250, 2: 500, 3: 1000}

# Fixed-size classes keyed by their id in `count`.
sized_classes = {1: class_1, 2: class_2, 3: class_3}

for type_, idxes in train_stats_dct.items():
    if type_ in class_0:
        train_data_[type_] = augment_indexes(idxes, len(idxes)*2)
    elif type_ == 24 or type_ == 0:
        # Deliberately not sampled here.
        # NOTE(review): presumably error-free rows (0) and type 24 are covered
        # by the separate no_errors.csv / spelling_errors.csv files — confirm.
        continue
    else:
        matching = [cls_id for cls_id, members in sized_classes.items() if type_ in members]
        if not matching:
            raise ValueError(
                f"error type {type_} is not assigned to any sampling class"
            )
        if len(matching) > 1:
            raise ValueError(
                f"error type {type_} is assigned to several sampling classes: {matching}"
            )
        curr_class = matching[0]
        train_data_[type_] = augment_indexes(idxes, count[curr_class])


In [19]:
# Total number of selected rows across all error types.
total = sum(len(picked) for picked in train_data_.values())

print(total)

13576


In [20]:
# Flatten the per-type selections into a single index list, keeping the order
# of train_data_ and the order inside each selection.
all_error_idxes = [row_idx for picked in train_data_.values() for row_idx in picked]

len(all_error_idxes)

13576

In [21]:
dataset = load_dataset("mcemilg/GECTurk-generation")
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 96919
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 20769
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 20769
    })
})

In [22]:
data_filtered_train = dataset["train"].select(all_error_idxes)
data_filtered_train

Dataset({
    features: ['source', 'target'],
    num_rows: 13576
})

In [23]:
import pandas as pd
no_errors = pd.read_csv("no_errors.csv")
spell_err = pd.read_csv("spelling_errors.csv")

In [24]:
spell_err.head()

Unnamed: 0,source,target
0,Yaman celiskiye bakin.,Yaman çelişkiye bakın.
1,Sadece devLetimizin gayretleriyle bu sorununRü...,Sadece devletimizin gayretleriyle bu sorunun ü...
2,FederasyoJ bu kıyımın yaşanmamaıs adına İtalya...,Federasyon bu kıyımın yaşanmaması adına İtalya...
3,Ama biz yalnız djğilizf,Ama biz yalnız değiliz.
4,''Türk politiacılarla ne zaman bir araya gelse...,''Türk politikacılarla ne zaman bir araya gels...


In [25]:
data_filtered_td = data_filtered_train.to_pandas()
data_filtered_td.head()

Unnamed: 0,source,target
0,"Hemen ""ispatlamıyan şerefsizdir, alçaktır, nam...","Hemen ""ispatlamayan şerefsizdir, alçaktır, nam..."
1,İki yıl içinde 150 bin devlet memuru işten atı...,İki yıl içinde 150 bin devlet memuru işten atı...
2,"Önlenmesi mümkün olan, yok olma tehlikesine ra...","Önlenmesi mümkün olan, yok olma tehlikesine ra..."
3,Ara da başka şeylerde söyliyen popçuya indirge...,Arada başka şeyler de söyleyen popçuya indirge...
4,Bir dönem (ki o dönem çokda uzak değil) çocukl...,Bir dönem (ki o dönem çok da uzak değil) çocuk...


In [26]:
combined_df = pd.concat([data_filtered_td, spell_err, no_errors], ignore_index=True)
combined_df.head()

Unnamed: 0,source,target
0,"Hemen ""ispatlamıyan şerefsizdir, alçaktır, nam...","Hemen ""ispatlamayan şerefsizdir, alçaktır, nam..."
1,İki yıl içinde 150 bin devlet memuru işten atı...,İki yıl içinde 150 bin devlet memuru işten atı...
2,"Önlenmesi mümkün olan, yok olma tehlikesine ra...","Önlenmesi mümkün olan, yok olma tehlikesine ra..."
3,Ara da başka şeylerde söyliyen popçuya indirge...,Arada başka şeyler de söyleyen popçuya indirge...
4,Bir dönem (ki o dönem çokda uzak değil) çocukl...,Bir dönem (ki o dönem çok da uzak değil) çocuk...


In [27]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23076 entries, 0 to 23075
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  23076 non-null  object
 1   target  23076 non-null  object
dtypes: object(2)
memory usage: 360.7+ KB


In [28]:
combined_df.to_csv("train_data_ready.csv", index=False)