In [7]:
import evaluate
import json
import librosa
import pytesseract
import random
import soundfile

import numpy as np
import tensorflow as tf

from datasets import load_dataset
from huggingface_hub import notebook_login
from pytesseract import Output
from transformers import pipeline
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import create_optimizer, AdamWeightDecay, TFAutoModelForSeq2SeqLM
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
from transformers.pipelines.pt_utils import KeyDataset

[Link to Hugging Face Dataset with Channel Messages](https://huggingface.co/datasets/pszemraj/midjourney-messages-cleaned)

```
data_files = { "train" : [
    'train-00000-of-00003.parquet'
    , 'train-00001-of-00003.parquet'
    , 'train-00002-of-00003.parquet'
]}
```

In [3]:
# Load the "train" split from the "deduped" data directory of the cleaned
# Midjourney messages dataset.
# verification_mode="no_checks" turns off the loader's verification checks;
# it was set to work around NonMatchingSplitsSizesError, see
# https://discuss.huggingface.co/t/nonmatchingsplitssizeserror/30033/4
# Variant without data_dir / verification_mode, left disabled:
# dataset = load_dataset("pszemraj/midjourney-messages-cleaned", split="train")
deduped_dataset = load_dataset(
    "pszemraj/midjourney-messages-cleaned",
    data_dir="deduped",
    split="train",
    verification_mode="no_checks",
)


In [4]:
# Display the dataset summary (feature names and row count).
deduped_dataset

Dataset({
    features: ['id', 'channel_id', 'text'],
    num_rows: 14828769
})

In [8]:
# Peek at one arbitrary record. A locally seeded generator is used so the same
# row is shown on every run, without touching the global `random` state.
random.Random(42).choice(deduped_dataset)

{'id': '1090949770530521099',
 'channel_id': '995431305066065950',
 'text': "A surreal photograph of a married couple floating on their backs in a lake at night, surrounded by a bed of shimmering flowers floating on the surface of the water. The couple's eyes are closed, and their bodies are limp, as if they were suspended in a state of deep sleep or death. The composition is both eerie and tranquil, with the couple's bodies forming a diagonal line across the frame, their faces illuminated by the soft glow of the moon and stars above. The flowers, in contrast, are vibrant and colorful, creating a striking visual contrast against the dark water. The photograph was taken with a Canon EOS 5D Mark IV camera and a 24-70mm f/2.8L II USM lens, set to a shutter speed of 15 seconds, ISO 1600, and an aperture of f/4. The long exposure time creates a dreamy effect, blurring the movement of the water and adding to the surreal quality of the image. The stillness of the couple, combined with the flo

In [9]:
# Sort the whole dataset by "channel_id" and show the first row of the result.
# Slicing with [:1] returns a dict mapping each column to a one-element list.
deduped_dataset.sort("channel_id")[:1]

{'id': ['1105917785852362823'],
 'channel_id': ['1008571023047798794'],
 'text': ['einen gutschein mit den Angaben von und an und Betrag']}

In [10]:
# Bind the deduplicated dataset to the name `messages`; this is a second
# reference to the same object, not a copy.
messages = deduped_dataset

In [11]:
# Split from `deduped_dataset` rather than from `messages` itself so this cell
# can be re-run: after the first run `messages` holds the split result, which
# cannot be split again. The fixed seed makes the 80/20 shuffle-split
# reproducible across kernel restarts.
messages = deduped_dataset.train_test_split(test_size=0.2, seed=42)

In [12]:
# Show the first record of the "train" split as a sanity check.
messages["train"][0]

{'id': '1092432530688462918',
 'channel_id': '989268300473192561',
 'text': 'ultra realistic portrait, full length, woman in her forties answering the door to receive takeaway delivery, film grain, chiaroscuro, golden hour'}

In [13]:
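# Reference for the fine-tuning steps below (Hugging Face Transformers v4.15.0, custom datasets guide):
# https://huggingface.co/docs/transformers/v4.15.0/en/custom_datasets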

In [14]:
# Load the tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry; it is handed to the
            module-level ``tokenizer`` unchanged.

    Returns:
        Whatever ``tokenizer`` returns for that text, with truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [18]:
# Tokenize every split of `messages`, passing examples to
# preprocess_function in batches.
tokenized_msg = messages.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/11863015 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Build the padding collator around the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the "distilbert-base-uncased" checkpoint with a two-label
# classification head.
# There must be no comma after the closing parenthesis: a trailing comma there
# would bind `model` to a one-element tuple instead of the model object.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for the fine-tuning run; outputs are written to ./results.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

# NOTE(review): the dataset repr earlier in this notebook lists only the
# 'id', 'channel_id' and 'text' columns; a sequence-classification run
# presumably also needs a label column -- confirm before training.
trainer = Trainer(
    model=model,
    args=training_args,
    # Train and evaluate on the splits tokenized in this notebook
    # (`tokenized_msg`); `tokenized_imdb` is not defined in the cells shown.
    train_dataset=tokenized_msg["train"],
    eval_dataset=tokenized_msg["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start the fine-tuning loop.
trainer.train()