In [1]:
import os

# Hugging Face access tokens are read from the environment so that no secret is
# stored in (or leaked through) the notebook file. Export HF_WRITE_TOKEN /
# HF_READ_TOKEN before starting the kernel; each name is None when unset.
# Any token that was previously pasted into this cell should be revoked.
WRITE_TOKEN = os.environ.get("HF_WRITE_TOKEN")
READ_TOKEN = os.environ.get("HF_READ_TOKEN")

In [2]:
# Run number; it is interpolated into the output model name further down.
version = 5

# Datasets

In [None]:
# %pip install datasets

In [5]:
from datasets import load_dataset

# Sampling settings shared by both Wikipedia editions.
SHUFFLE_SEED = 42
SHUFFLE_BUFFER_SIZE = 2000
SAMPLE_SIZE = 1500


def _stream_wikipedia_sample(config_name):
    """Open one Wikipedia dump in streaming mode and take a shuffled sample.

    Args:
        config_name: configuration of the "wikipedia" dataset, e.g.
            "20220301.simple" or "20220301.en".

    Returns:
        A ``(streamed, sample)`` pair: the streaming dataset as returned by
        ``load_dataset`` and the first ``SAMPLE_SIZE`` examples of its
        buffer-shuffled 'train' split.
    """
    streamed = load_dataset("wikipedia", config_name, streaming=True)
    # NOTE(review): a streaming shuffle only mixes examples inside a buffer of
    # SHUFFLE_BUFFER_SIZE items, so the sample presumably comes from the start
    # of the dump rather than from the whole corpus -- confirm this is intended.
    shuffled = streamed['train'].shuffle(seed=SHUFFLE_SEED, buffer_size=SHUFFLE_BUFFER_SIZE)
    return streamed, shuffled.take(SAMPLE_SIZE)


simple_dataset, simple_dataset_short = _stream_wikipedia_sample("20220301.simple")
complex_dataset, complex_dataset_short = _stream_wikipedia_sample("20220301.en")


Dataset Preprocessing

In [6]:
# store
from functools import partial
from datasets import Dataset

def gen_from_iterable_dataset(iterable_ds):
    """Yield every example of ``iterable_ds``, one at a time and in order.

    Wrapping the iterable in a plain generator function lets it be handed to
    ``Dataset.from_generator`` (via ``functools.partial``) in the cell below.
    """
    for example in iterable_ds:
        yield example

# Materialize each streamed sample into a regular Dataset, reusing the
# feature schema of the stream it came from.
simple_dataset_cache = Dataset.from_generator(
    partial(gen_from_iterable_dataset, simple_dataset_short),
    features=simple_dataset_short.features,
)
complex_dataset_cache = Dataset.from_generator(
    partial(gen_from_iterable_dataset, complex_dataset_short),
    features=complex_dataset_short.features,
)

In [7]:
# Sanity check: both materialized datasets should list the same columns and row count.
print(simple_dataset_cache, complex_dataset_cache)

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 1500
}) Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 1500
})


In [8]:
from datasets import ClassLabel

def add_class(dataset, label: str, names=("simple", "complex")):
    """Return ``dataset`` with a constant ``"label"`` column typed as ClassLabel.

    Args:
        dataset: object providing ``__len__``, ``add_column`` and
            ``cast_column`` (a ``datasets.Dataset`` in this notebook).
        label: class name written to every row; must be one of ``names``.
        names: ordered class names; the position of a name is its integer id.
            Defaults to the two classes used throughout this notebook.

    Returns:
        The dataset produced by ``cast_column`` with the new ``"label"`` column.

    Raises:
        ValueError: if ``label`` is not one of ``names``.
    """
    names = list(names)
    # Fail early with a clear message instead of failing inside the cast.
    if label not in names:
        raise ValueError(f"Unknown label {label!r}; expected one of {names}")

    # One identical string per row, then convert the string column to ClassLabel.
    label_column = [label] * len(dataset)
    dataset = dataset.add_column("label", label_column)
    return dataset.cast_column("label", ClassLabel(names=names))


In [9]:
# Tag each cached dataset with the class it represents.
simple_dataset_prep, complex_dataset_prep = (
    add_class(cached, label=class_name)
    for cached, class_name in (
        (simple_dataset_cache, 'simple'),
        (complex_dataset_cache, 'complex'),
    )
)

In [10]:
# Inspect the schema; 'label' should now be a ClassLabel feature.
simple_dataset_prep.features

{'id': Value(dtype='string', id=None),
 'url': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['simple', 'complex'], id=None)}

In [13]:
from datasets import concatenate_datasets

# Both halves must expose identical columns before they are stacked row-wise.
assert simple_dataset_prep.column_names == complex_dataset_prep.column_names
# Simple-English rows first, then regular-English rows.
dataset_cc = concatenate_datasets([simple_dataset_prep, complex_dataset_prep])
dataset_cc

Dataset({
    features: ['id', 'url', 'title', 'text', 'label'],
    num_rows: 3000
})

In [14]:
# 80/20 train/test split. A fixed seed makes the split reproducible across
# kernel restarts, and stratifying on the ClassLabel 'label' column keeps the
# simple/complex ratio the same in both splits.
dataset_split = dataset_cc.train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column="label",
)
dataset_split

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text', 'label'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text', 'label'],
        num_rows: 600
    })
})

# Preprocessing

Tokenization

In [15]:
from transformers import AutoTokenizer

model_id = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The previous `tokenizer.pad_token = tokenizer.eos_token` line is a GPT-style
# idiom: this BERT-style tokenizer has no EOS token, so it only cleared the pad
# token before the next line restored it. '[PAD]' already ships with the
# checkpoint (re-adding it reported 0 new tokens), so only add it if missing.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [16]:
def encode(examples):
    """Tokenize the 'text' field of a batch of examples.

    Inputs are truncated and padded to the tokenizer's maximum length, using
    the module-level ``tokenizer``; its output is returned unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Tokenize both splits in batches and drop the raw columns, leaving the
# tokenizer outputs plus 'label'.
dataset = dataset_split.map(
    encode,
    batched=True,
    remove_columns=["text", "title", "id", "url"],
)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Metrics

In [20]:
# %pip install evaluate

In [21]:
import evaluate
# Combined accuracy + F1 metric.
# NOTE(review): `metric` is reassigned in the next cell, so when the cells run
# in order this object is not the one compute_metrics ends up using.
metric=evaluate.combine(["accuracy", "f1"])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [22]:
from datasets import load_metric
import numpy as np

# `datasets.load_metric` is deprecated (it emits the warning recorded below).
# The `evaluate` package imported in the previous cell loads the same
# GLUE/MRPC metric, which reports accuracy and F1.
metric = evaluate.load('glue', 'mrpc')

  metric = load_metric('glue', 'mrpc')


In [23]:
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    ``eval_pred`` unpacks into (per-class scores, reference labels). The index
    of the highest score in each row becomes the predicted class id, and the
    module-level ``metric`` compares those ids with the references.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [24]:
# Output name: last path component of the checkpoint id, a task suffix and the run version.
model_name = f"{model_id.split('/')[-1]}-simpleEng-classifier-{version}"


# Training

In [19]:
from transformers import DistilBertForSequenceClassification , TrainingArguments, Trainer

# Load the pretrained checkpoint with a 2-label classification head.
# NOTE(review): the Trainer further down is built with `model_init` rather than
# this `model` object -- confirm whether this instance is still needed.
model = DistilBertForSequenceClassification.from_pretrained(model_id, num_labels=2, use_safetensors=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# %pip install accelerate==0.20.3

Hyperparameter Search

In [21]:
# %pip install optuna

In [22]:
# During hyperparameter search, the Trainer will run several trainings, so it needs to have the model defined via a function (so it can be reinitialized at each new run) instead of just having it passed.
# Same value as computed in the Metrics section; recomputed here so this
# section can be run on its own.
model_name = f"{model_id.split('/')[-1]}-simpleEng-classifier-{version}"

def model_init():
    """Build a fresh 2-label DistilBERT classifier from ``model_id``.

    Passed to the Trainer as ``model_init`` so that a new model can be created
    for each training run.
    """
    fresh_model = DistilBertForSequenceClassification.from_pretrained(
        model_id,
        num_labels=2,
        use_safetensors=True,
    )
    return fresh_model


In [23]:
# Use the hyperparameter search with less data

# Keep one shard out of five from each split for the search.
train_dataset_small, eval_dataset_small = (
    dataset[split_name].shard(index=1, num_shards=5)
    for split_name in ("train", "test")
)


In [24]:
training_args = TrainingArguments(
    output_dir=model_name,
    # Starting values; the search log below shows learning rate, epoch count,
    # seed and batch size being chosen per trial.
    learning_rate=2e-6,
    num_train_epochs=5,
    weight_decay=0.01,
    auto_find_batch_size=True,
    # Evaluate and checkpoint once per epoch so the best checkpoint
    # (by accuracy) can be reloaded when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    args=training_args,
    model_init=model_init,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset_small,
    eval_dataset=eval_dataset_small,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Run 10 trials and keep the one with the highest objective. In the log below
# the objective equals accuracy + F1 of a trial's final epoch
# (e.g. trial 0: 0.883333 + 0.887097 = 1.7704...).
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")


[I 2023-12-01 19:09:49,692] A new study created in memory with name: no-name-b777107b-0968-4d24-a56a-61d6f01a6e11
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.654244,0.791667,0.793388
2,No log,0.577727,0.85,0.861538
3,No log,0.490586,0.883333,0.887097
4,No log,0.455011,0.883333,0.887097


[I 2023-12-01 19:11:57,860] Trial 0 finished with value: 1.7704301075268818 and parameters: {'learning_rate': 6.123831776024863e-06, 'num_train_epochs': 4, 'seed': 7, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.7704301075268818.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.148084,0.958333,0.956522
2,No log,0.085301,0.975,0.973451


[I 2023-12-01 19:13:02,150] Trial 1 finished with value: 1.9484513274336281 and parameters: {'learning_rate': 7.340686494156609e-05, 'num_train_epochs': 2, 'seed': 11, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 1.9484513274336281.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.528619,0.833333,0.848485
2,No log,0.229512,0.925,0.92437
3,No log,0.141525,0.958333,0.956522
4,No log,0.132751,0.958333,0.956522
5,0.298200,0.151017,0.958333,0.956522


[I 2023-12-01 19:16:03,857] Trial 2 finished with value: 1.914855072463768 and parameters: {'learning_rate': 5.6376480094961724e-06, 'num_train_epochs': 5, 'seed': 30, 'per_device_train_batch_size': 4}. Best is trial 1 with value: 1.9484513274336281.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.415333,0.9,0.886792
2,No log,0.101712,0.966667,0.966102
3,No log,0.11627,0.95,0.947368


[I 2023-12-01 19:17:45,499] Trial 3 finished with value: 1.8973684210526316 and parameters: {'learning_rate': 3.127573389648857e-05, 'num_train_epochs': 3, 'seed': 15, 'per_device_train_batch_size': 4}. Best is trial 1 with value: 1.9484513274336281.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.693349,0.525,0.641509


[I 2023-12-01 19:18:16,023] Trial 4 finished with value: 1.1665094339622641 and parameters: {'learning_rate': 2.4612088725911827e-06, 'num_train_epochs': 1, 'seed': 17, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 1.9484513274336281.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.549242,0.8,0.826087


[I 2023-12-01 19:18:46,134] Trial 5 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.192544,0.933333,0.925926
2,No log,0.141129,0.966667,0.964286
3,No log,0.146191,0.95,0.948276


[I 2023-12-01 19:20:22,241] Trial 6 finished with value: 1.8982758620689655 and parameters: {'learning_rate': 1.4967436267265592e-05, 'num_train_epochs': 3, 'seed': 10, 'per_device_train_batch_size': 4}. Best is trial 1 with value: 1.9484513274336281.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.685901,0.55,0.674699


[I 2023-12-01 19:20:54,490] Trial 7 finished with value: 1.2246987951807229 and parameters: {'learning_rate': 5.297557634490061e-06, 'num_train_epochs': 1, 'seed': 6, 'per_device_train_batch_size': 64}. Best is trial 1 with value: 1.9484513274336281.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.675088,0.591667,0.695652
2,No log,0.654126,0.65,0.723684


[I 2023-12-01 19:21:53,212] Trial 8 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.674491,0.491667,0.655367


[I 2023-12-01 19:22:20,533] Trial 9 pruned. 


In [26]:
# Show the winning trial: its id, objective value and hyperparameters.
best_run

BestRun(run_id='1', objective=1.9484513274336281, hyperparameters={'learning_rate': 7.340686494156609e-05, 'num_train_epochs': 2, 'seed': 11, 'per_device_train_batch_size': 16}, run_summary=None)

Model Training with Selected Hyperparameters

In [30]:
# Copy the winning trial's values (learning_rate, num_train_epochs, seed and
# per_device_train_batch_size in the BestRun shown above) onto the live args.
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

# Switch from the 1/5 shards used during the search to the full splits.
trainer.train_dataset = dataset["train"]
trainer.eval_dataset = dataset["test"]
# Same value that TrainingArguments was created with.
trainer.args.metric_for_best_model="accuracy"
# NOTE(review): this replaces the num_train_epochs just copied by the loop
# above (2 in the recorded best run) -- confirm that 10 is intended.
trainer.args.num_train_epochs=10

In [31]:
# Final training run on the full training split with the settings applied above.
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.108254,0.965,0.966184
2,No log,0.141993,0.96,0.959322
3,No log,0.124663,0.971667,0.972536
4,0.141800,0.106171,0.971667,0.971901
5,0.141800,0.102164,0.98,0.980645
6,0.141800,0.104482,0.98,0.980198
7,0.004800,0.099349,0.98,0.980263
8,0.004800,0.100056,0.981667,0.981997
9,0.004800,0.10204,0.981667,0.981997
10,0.000100,0.10271,0.981667,0.981997


TrainOutput(global_step=1500, training_loss=0.048897468308607736, metrics={'train_runtime': 1367.4147, 'train_samples_per_second': 17.551, 'train_steps_per_second': 1.097, 'total_flos': 3179217567744000.0, 'train_loss': 0.048897468308607736, 'epoch': 10.0})

In [32]:
# Evaluate the trained model on trainer.eval_dataset (the full test split set above).
trainer.evaluate()

{'eval_loss': 0.1000562384724617,
 'eval_accuracy': 0.9816666666666667,
 'eval_f1': 0.9819967266775778,
 'eval_runtime': 10.2362,
 'eval_samples_per_second': 58.616,
 'eval_steps_per_second': 7.327,
 'epoch': 10.0}

In [31]:
# Mark the directory name of the final model. The guard keeps the cell
# idempotent: re-running it no longer produces '..._best_best'.
if not model_name.endswith('_best'):
    model_name = model_name + '_best'

In [34]:
# Write the trained model to the directory named by model_name.
trainer.save_model(model_name)

Testing

In [37]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Wrap all of the necessary components.
class SimpleEnglishClassifier:
    """Bundle of the fine-tuned tokenizer, model and label names for inference.

    Everything is a class attribute, so the tokenizer and model are loaded
    once, when this class statement executes, from the directory named by the
    module-level ``model_name``.
    """
    PATH = model_name
    model_name = PATH.split('/')[-1]

    # Use the first GPU when one is available; otherwise fall back to the CPU
    # instead of failing on machines without CUDA.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # `use_safetensors` is a model-loading option, so it is not passed here.
    tokenizer = DistilBertTokenizer.from_pretrained(PATH)
    # Only register '[PAD]' when the saved tokenizer does not define it.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    model = DistilBertForSequenceClassification.from_pretrained(PATH, use_safetensors=True).to(device)

    # Position in this list is the class id (same order as the training ClassLabel).
    labels = ["simple", "complex"]

    SIMPLE = labels.index('simple')
    COMPLEX = labels.index('complex')

task = SimpleEnglishClassifier()

Test paragraphs from Wikipedia

In [48]:
# Sample paragraphs to classify; they are tokenized in the inference cell further down.
texts = [
    """The English word "bear" comes from Old English bera and belongs to a family of names for the bear in Germanic languages, such as Swedish björn, also used as a first name. This form is conventionally said to be related to a Proto-Indo-European word for "brown", so that "bear" would mean "the brown one".[1][2] However, Ringe notes that while this etymology is semantically plausible, a word meaning "brown" of this form cannot be found in Proto-Indo-European. He suggests instead that "bear" is from the Proto-Indo-European word *ǵʰwḗr- ~ *ǵʰwér "wild animal". This terminology for the animal originated as a taboo avoidance term: proto-Germanic tribes replaced their original word for bear—arkto—with this euphemistic expression out of fear that speaking the animal's true name might cause it to appear. According to author Ralph Keyes, this is the oldest known euphemism.""",
    """
    The English word "bear" comes from Old English bera and belongs to a family of names for the bear in Germanic languages, such as Swedish björn, also used as a first name. This form is conventionally said to be related to a Proto-Indo-European word for "brown", so that "bear" would mean "the brown one".[1][2] However, Ringe notes that while this etymology is semantically plausible, a word meaning "brown" of this form cannot be found in Proto-Indo-European. He suggests instead that "bear" is from the Proto-Indo-European word *ǵʰwḗr- ~ *ǵʰwér "wild animal".[3] This terminology for the animal originated as a taboo avoidance term: proto-Germanic tribes replaced their original word for bear—arkto—with this euphemistic expression out of fear that speaking the animal's true name might cause it to appear.[4][5] According to author Ralph Keyes, this is the oldest known euphemism.[6]

Bear taxon names such as Arctoidea and Helarctos come from the ancient Greek ἄρκτος (arktos), meaning bear,[7] as do the names "arctic" and "antarctic", via the name of the constellation Ursa Major, the "Great Bear", prominent in the northern sky.[8]

Bear taxon names such as Ursidae and Ursus come from Latin Ursus/Ursa, he-bear/she-bear.[8] The female first name "Ursula", originally derived from a Christian saint's name, means "little she-bear" (diminutive of Latin ursa). In Switzerland, the male first name "Urs" is especially popular, while the name of the canton and city of Bern is by legend derived from Bär, German for bear. The Germanic name Bernard (including Bernhardt and similar forms) means "bear-brave", "bear-hardy", or "bold bear".[9][10] The Old English name Beowulf is a kenning, "bee-wolf", for bear, in turn meaning a brave warrior.[11]
""",
    """
    The popularity of tillage as an agricultural technique in early modern times had to do with theories about plant biology proposed by European thinkers. In 1731, English writer Jethro Tull published the book "Horse-Hoeing Husbandry: An Essay on the Principles of Vegetation and Tillage," which argued that soil needed to be pulverized into fine powder for plants to make use of it. Tull believed that, since water, air, and heat were clearly not the primary substance of a plant, plants were made of earth, and thus had to consume very small pieces of earth as food. Tull wrote that each subsequent tillage of the soil would increase its fertility, and that it was impossible to till the soil too much.[1] However, scientific observation has shown that the opposite is true; tillage causes soil to lose structural qualities that allow plant roots, water, and nutrients to penetrate it, accelerates soil loss by erosion, and results in soil compaction. [2]

The steel plow allowed farming in the American Midwest, where tough prairie grasses and rocks caused trouble. Soon after 1900, the farm tractor was introduced, which made modern large-scale agriculture possible. However, the destruction of the prairie grasses and tillage of the fertile topsoil of the American Midwest caused the Dust Bowl, in which the soil was blown away and stirred up into dust storms that blackened the sky. This prompted re-consideration of tillage techniques,[3] but in the United States as of 2019, 3 trillion pounds of soil were estimated to be lost due to erosion while adoption of improved techniques for controlling erosion are still not widespread.[4] In the mid 1930s Frank and Herbert Petty of Doncaster, Victoria, Australia developed the Petty Plough. This steerable plough could be pulled by either two horses or a tractor and the disc wheels could be steered in unison, or separately allowing the operator to plough the center of rows as well as between and around orchard trees.
""",
    'These pages help people learn English. You may use pages from this Wiktionary to make pages in a different language. But you have to translate it to your own language yourself.',
    f'Farming started thousands of years ago, but no one knows for sure how old it is.[1] The development of farming gave rise to the Neolithic Revolution as people gave up nomadic hunting and became settlers in cities. Farming and domestication probably started in the Fertile Crescent (the Nile Valley, the Levant and Mesopotamia).[2] The area called Fertile Crescent is now in the countries of Iraq, Syria, Turkey, Jordan, Lebanon, Israel, and Egypt. Wheat and barley are some of the first crops people grew.\nCotton was domesticated in Peru by 4200 BC.[3]\nLivestock including horses, cattle, sheep, and goats were taken to the Americas, from the Old World. The first of those horses, came with the Spanish conquistadors[4] (or soldiers and explorers) in the 1490s. Moving those cattle, sheep, goats and horses, were part of the Columbian Exchange.\nPeople probably started agriculture by planting a few crops, but still gathered many foods from the wild. People may have started farming because the weather and soil began to change. Farming can feed many more people than hunter-gatherers can feed on the same amount of land.\nThis allowed the human population to grow to such large numbers as there are today.',
    """
    The Neolithic revolution was the first agricultural revolution. It was a gradual change from nomadic hunting and gathering communities to agriculture and settlement.[1] It changed the way of life of the communities which made the change. It occurred in different prehistoric human societies at different times. Many societies changed 9–7 thousand years ago

The term refers to the general time period over which these developments took place. It also applies to the developments in social organization and technology. These included the adoption of early agriculture techniques, crop cultivation, and the domestication of animals.[2]

The Neolithic revolution led to people living in permanent or semi-permanent settlements. Because of this fewer people led a nomadic lifestyle. To be able to know who the crops grown belonged to, the concept of land ownership was developed. The natural environment was changed, population sizes grew, and people ate more vegetable and cereal foods in their diet. Hierarchies developed in society. Grain was stored, and could be traded. Surplus production from good crop yields helped societies survive bad years.
"""
    ]


In [None]:
# Expected classes for the six-paragraph `texts` list defined above, in the same order.
true_labels = ['COMPLEX','COMPLEX','COMPLEX','SIMPLE','SIMPLE', 'SIMPLE']

In [62]:
# Shorter two-excerpt sample. This rebinds `texts`, replacing the longer list defined earlier.
texts = [
    """However, Ringe notes that while this etymology is semantically plausible, a word meaning "brown" of this form cannot be found in Proto-Indo-European. He suggests instead that "bear" is from the Proto-Indo-European word *ǵʰwḗr- ~ *ǵʰwér "wild animal". """,
    """The Neolithic revolution was the first agricultural revolution. It was a gradual change from nomadic hunting and gathering communities to agriculture and settlement.[1] It changed the way of life of the communities which made the change."""
    ]


In [63]:
# Expected classes for the two excerpts above, in the same order.
true_labels = ['COMPLEX','SIMPLE']

In [64]:
import torch

# Tokenize the batch into PyTorch tensors on the model's device.
# truncation=True mirrors the training-time encoding and keeps long paragraphs
# within the model's maximum input length.
inputs = task.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(task.device)

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    outputs = task.model(**inputs)

# From logits we can extract the most likely class for each text and its readable label.
predictions = [task.labels[i] for i in outputs.logits.argmax(axis=1).tolist()]

# Print '*' for a correct prediction, otherwise a line describing the mismatch.
for true, text, logits, pred in zip(true_labels, texts, outputs.logits, predictions):
    score = logits.to('cpu').detach().numpy()
    print('*' if true == pred.upper() else f'x -- predicted {pred.upper()} instead of {true} -- {score} -- {text[:50]}[...]')

*
*
