In [1]:
# uncomment to install or upgrade nlpaeg in the kernel's own environment
# %pip install nlpaeg --upgrade

In [2]:
# import libraries
import os
import nlpaeg
import pandas as pd
from nlpaeg import error_generator as eg

In [3]:
# display the installed nlpaeg version; the recorded run below used 0.0.6
nlpaeg.__version__

'0.0.6'

In [4]:
# directory that holds the input data (relative to the current working directory)
data_dir = os.path.join(os.getcwd(), "data")

# filename of the CSV with sentences without errors
train_data = "nlpaeg_pubmed_data_min.csv"
train_data_file_path = os.path.join(data_dir, train_data)

# upper bound on the number of sentences to keep, and the seed that makes
# the random subset identical on every Restart & Run All
sample_size = 10000
sample_seed = 42

# create the generator; define the following parameters
# depending on the dataset
g = eg.ErrorGenerator()

# set source data
g.source_data = pd.read_csv(train_data_file_path)

# comment below to select entire dataset
# n is capped at the number of rows because DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement)
g.source_data = g.source_data.sample(
    n=min(sample_size, len(g.source_data)),
    random_state=sample_seed,
)

# set sentence column name
# default: sentences
g.sentence_column = "sentences"

# preview the first rows of the sampled source data
g.source_data.head()

Unnamed: 0,sentences,type
76475,"Due to socio-demographic trends, worldwide hom...",Abstract
9534,These data indicate that (a) the expression an...,Abstract
26082,The prevalence of unintended pregnancy was 10....,Abstract
5812,We attribute the observed changes in protein a...,Abstract
6359,"Existing problems, limitations, and future tre...",Abstract


In [5]:

# n-gram order = longest n-gram length to work with
#   2 -> unigrams + bigrams
#   3 -> unigrams + bigrams + trigrams (default)
#   4 -> unigrams + bigrams + trigrams + quadgrams
# the maximum order is 5
g.ngram_order = 4

# predefined column name for each n-gram length (1 up to 5)
g.ngram_cols = dict(
    enumerate(
        ("unigrams", "bigrams", "trigrams", "quadgrams", "pentgrams"),
        start=1,
    )
)

# number of sentences in the source data
g.total_samples = len(g.source_data)

# how many of the most frequent n-grams to consider, per n-gram length
# NOTE: each count is a fixed fraction of the number of sentences
# (total_samples), not of the number of distinct n-grams
# e.g. with 1000 sentences -> top 300 unigrams and top 200 bigrams
g.n_ngrams = {
    order: int(g.total_samples * share)
    for order, share in (
        (1, 0.3),
        (2, 0.2),
        (3, 0.15),
        (4, 0.1),
        (5, 0.05),
    )
}


# proportion of n-gram matches to modify, per n-gram length
# (used for sampling the n-gram changes)
# longer n-grams produce fewer candidate changes, so a larger share is kept:
# e.g. with 10 unigram, 7 bigram and 3 trigram candidate changes we want
# nearly all trigram changes, most bigram changes and fewer unigram changes
g.ngram_weights = dict(
    [
        (0, 1),     # 100% of no grams
        (1, 0.4),   # 40% of unigram changes
        (2, 0.6),   # 60% of bigram changes
        (3, 0.8),   # 80% of trigram changes
        (4, 0.95),  # 95% of quadgram changes
        (5, 1),     # 100% of pentgram changes
    ]
)

# probability distribution of the artificial errors
# keyword -> type of error, value -> share of that error type
# (the shares add up to 1.0)
g.error_distribution = dict(
    dictionary_replacement_phrase_order_change=0.2,
    verb_form_change_insert_determiner=0.1,
    verb_form_change_phrase_order_change=0.1,
    insert_determiner_verb_form_change=0.1,
    phrase_order_change=0.1,
    duplication=0.1,
    split_words=0.1,
    remove_words=0.1,
    insert_determiner=0.05,
    punctuations=0.04,
    punctuation_braces=0.01,
)

# separate distribution for unigrams, because not all errors are
# applicable to a single word (the shares again add up to 1.0)
g.error_distribution_unigram = dict(
    verb_form_change_insert_determiner=0.25,
    insert_determiner_verb_form_change=0.15,
    insert_determiner=0.15,
    duplication=0.15,
    split_words=0.1,
    remove_words=0.1,
    punctuations=0.05,
    spelling_errors=0.03,
    punctuation_braces=0.02,
)

In [6]:
# call the method to create error data
# in the recorded run the result has the columns
# index, ngram, sentences, replace, error, replacement
# NOTE(review): "replace" appears to hold the matched tokens and "replacement"
# the tokens after the error was applied - confirm against the nlpaeg docs
aeg_df = g.get_aeg_data()

# preview 5 random rows (unseeded, so the rows shown change on every run)
aeg_df.sample(5)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

start:  0  | batchsize:  10000
sent_ngrams:  10000


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.01s/it]

replacements:  26563
sampled_replacements:  15779





Unnamed: 0,index,ngram,sentences,replace,error,replacement
3652,1763,1,CONCLUSION: There is quite a high prevalence o...,[prevalence],remove_words,[]
11903,6470,3,The clinical data and biochemical indicators w...,"[regression, analysis, was]",insert_determiner_verb_form_change,"[regression, from, analysis, were]"
15703,3389,4,Conclusions The results of the present study s...,"[of, the, effects, of]",remove_words,"[of, effects, of]"
12005,7191,3,"In vivo, the composite scaffold increased the ...","[increased, the, expression]",verb_form_change_phrase_order_change,"[increasing, expression, the]"
6717,9912,2,Taking Fengxi Road's Bashan tunnel section of ...,"[the, overall]",insert_determiner_verb_form_change,"[am, the, overall]"
