In [None]:
# Environment setup (shell commands run from the notebook): installs the Hugging Face
# libraries at pinned versions plus the audio/text helper packages and git-lfs.
# NOTE(review): torchaudio, librosa, jiwer, hazm, pydub and pythainlp are not pinned, so a
# fresh run may resolve to different versions than the ones that produced the saved outputs.
# NOTE(review): pydub and pythainlp are not imported in the visible cells - confirm they are needed.
!pip install datasets==2.1.0
!pip install transformers==4.18.0
!pip install torchaudio
!pip install librosa
!pip install jiwer
!apt install git-lfs
!pip install hazm
!pip install pydub
!pip install pythainlp
!pip install huggingface-hub==0.6.0

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following packages were automatically installed and are no longer required:
  libnvidia-common-460 nsight-compute-2020.2.0
Use 'apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 42 not upgraded.


In [None]:
import os
import re
import pandas as pd
import torch
import os
import random
import numpy as np
from datasets import load_dataset, load_metric
import hazm
import string
import json
from transformers.trainer_utils import get_last_checkpoint
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2FeatureExtractor, Wav2Vec2Config, TrainingArguments, Trainer
import librosa
import torchaudio
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass

# Shared hazm normalizer instance; normalizer_main() calls its normalize() on every sentence.
_normalizer = hazm.Normalizer()

# Characters stripped from the transcripts. normalizer_main() joins these entries, unescaped,
# into a single regex character class "[...]".
# NOTE(review): because nothing is escaped, the consecutive entries "!", "-", ";" form the
# range "!-;" inside that character class, which matches more characters than are listed here.
# Several entries are repeated; repetition has no effect inside a character class.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?", 
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š',
#     "ء",
]

# For Farsi transcripts: also strip ASCII lowercase letters and ASCII digits.
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)

# Substring -> replacement table applied by multiple_replace() inside normalizer_main().
# Keys are matched literally; values are inserted verbatim (including their spaces).
chars_to_mapping = {
    # Letter variants and presentation forms folded onto a single Persian letter.
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
    'آ': 'ا',

    # NOTE(review): "ها" is matched anywhere in a word, not only as a suffix, so it also
    # splits words that merely contain it - confirm this is intended.
    "ها": "  ها", "ئ": "ی",
    "۱۴ام": "۱۴ ام",

    # Latin letters are replaced by Persian-script strings (presumably their spoken names).
    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    # Zero-width / directional marks and the BOM become a plain space.
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}

# The ASCII letters and digits were already appended to chars_to_ignore right after that list
# was defined; the second, identical append that used to follow this dict was redundant
# (it only duplicated entries) and has been removed.

def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each key of ``chars_to_mapping`` in ``text`` by its value.

    All keys are matched literally in a single left-to-right pass, so a replacement is never
    re-scanned for further matches.

    Args:
        text: Input value; it is converted with ``str()`` before matching.
        chars_to_mapping: Dict mapping literal substrings to their replacements.

    Returns:
        The converted string. With an empty mapping the text is returned unchanged.
    """
    text = str(text)
    if not chars_to_mapping:
        # An empty alternation would match the empty string everywhere and the lookup of
        # '' in the mapping would raise KeyError.
        return text

    # Longest keys first: regex alternation takes the first alternative that matches, so a
    # short key listed before a longer key it prefixes would otherwise shadow the longer one.
    ordered_keys = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, ordered_keys))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)

def remove_special_characters(text, chars_to_ignore_regex):
    """Delete every match of ``chars_to_ignore_regex`` from ``text``.

    The remaining text is lower-cased and returned with one trailing space appended.
    """
    stripped = re.compile(chars_to_ignore_regex).sub('', text)
    return f"{stripped.lower()} "

# Persian ("fa") Common Voice: keep the first 3000 training rows and the first 1500
# validation rows. The test split stays disabled.
common_voice_train = load_dataset("common_voice", "fa", split="train").select(range(3000))
common_voice_dev = load_dataset("common_voice", "fa", split="validation").select(range(1500))
# common_voice_test = load_dataset("common_voice", "fa", split="test").select(range(1500))

################################################################################################################
# Sub-Section 5: Read, preprocess and extract vocab from sentences.

def show_random_elements(dataset, num_examples=10):
    """Print a table with ``num_examples`` distinct, randomly chosen rows of ``dataset``.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of row indices that
            returns something ``pandas.DataFrame`` accepts (e.g. a dict of columns).
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), 'Can\'t pick more elements than there are in the dataset.'

    # random.sample draws without replacement, replacing the manual retry-on-duplicate loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    # Print the whole frame: df.head() capped the output at 5 rows regardless of num_examples.
    print(df)

def normalizer_main(batch, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    """Normalize ``batch['sentence']`` in place and return the example.

    Steps: lower-case and strip, run the hazm normalizer, apply ``chars_to_mapping`` via
    ``multiple_replace``, delete everything matched by the ``chars_to_ignore`` character
    class, collapse runs of spaces, then fold 'آ' to 'ا' and 'ئ' to 'ی'. Integer tokens are
    spelled out when a ``words`` converter is available.

    Args:
        batch: A single example (dict-like) with a ``'sentence'`` string.
        chars_to_ignore: Characters joined, unescaped, into a regex character class.
        chars_to_mapping: Literal substring -> replacement table.

    Returns:
        The example with ``'sentence'`` replaced by the normalized text plus one trailing
        space, or ``None`` when nothing is left after normalization.
        NOTE(review): presumably ``Dataset.map`` keeps the original example when ``None`` is
        returned, so such rows would stay un-normalized - verify.
    """
    chars_to_ignore_regex = f"""[{''.join(chars_to_ignore)}]"""

    text = batch['sentence'].lower().strip()
    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(' +', ' ', text)
    # Not redundant with the mapping: some mapping *values* (the replacements for the Latin
    # letters "i" and "r") themselves contain 'آ', so it has to be folded after the mapping.
    text = text.replace('آ', 'ا')
    text = text.replace('ئ', 'ی')
    # Disabled experiments: text.replace('ظ', 'ط') and text.replace('ض', 'ص')

    # `words` (a number -> words converter) is neither defined nor imported in this notebook.
    # Previously the resulting NameError was swallowed by a bare `except`, which then appended
    # the already-converted int and made ' '.join() fail with TypeError. Look the converter up
    # optionally and otherwise keep the token as text.
    number_to_words = globals().get('words')

    _text = []
    for word in text.split():
        spelled = word
        if callable(number_to_words):
            try:
                spelled = str(number_to_words(int(word)))
            except ValueError:
                # Not an integer token: keep it unchanged.
                spelled = word
        _text.append(spelled)

    text = ' '.join(_text).strip()

    if not text:
        return None

    batch['sentence'] = text + ' '

    return batch

def extract_all_chars(batch):
    """Collect the distinct characters of one batch of sentences.

    The sentences are joined with single spaces; the result holds the joined text and the
    list of its unique characters, each wrapped in a one-element list.
    """
    joined = ' '.join(batch['sentence'])
    unique_chars = [*set(joined)]
    return dict(vocab=[unique_chars], all_text=[joined])

# Show the structure (features / row count) of the train and dev subsets.
print(common_voice_train)
print(common_voice_dev)
#print(common_voice_test)

# Print a random sample of training rows, without the 'path' column.
show_random_elements(common_voice_train.remove_columns(['path']), num_examples=20)

# First sentence of each split BEFORE normalization.
print(common_voice_train[0]['sentence'])
print(common_voice_dev[0]['sentence'])
#print(common_voice_test[0]['sentence'])

# Normalize every sentence. The datasets are reassigned under the same names, so re-running
# this cell normalizes already-normalized text.
common_voice_train = common_voice_train.map(normalizer_main, fn_kwargs={'chars_to_ignore': chars_to_ignore, 'chars_to_mapping': chars_to_mapping})
common_voice_dev = common_voice_dev.map(normalizer_main, fn_kwargs={'chars_to_ignore': chars_to_ignore, 'chars_to_mapping': chars_to_mapping})
#common_voice_test = common_voice_test.map(normalizer_main, fn_kwargs={'chars_to_ignore': chars_to_ignore, 'chars_to_mapping': chars_to_mapping})

# First sentence of each split AFTER normalization.
print(common_voice_train[0]['sentence'])
print(common_voice_dev[0]['sentence'])
#print(common_voice_test[0]['sentence'])

# Collect the characters of the normalized sentences, four sentences per batch.
# Each split now drops its own columns (the dev map used to pass the train column list).
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=4, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_dev = common_voice_dev.map(extract_all_chars, batched=True, batch_size=4, keep_in_memory=True, remove_columns=common_voice_dev.column_names)
#vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=4, keep_in_memory=True, remove_columns=common_voice_test.column_names)

# NOTE(review): with batch_size=4 every batch yields its own vocab entry, and index [0] reads
# only the FIRST batch of each split, so characters that first occur later are missing from
# the vocabulary and are tokenized as <unk>. Taking the union over all batches would fix that,
# but it also changes the vocabulary size: the model section must then take its vocab_size
# from the tokenizer (it currently replaces the loaded config by a default-sized one), so
# both places have to be changed together.
vocab_list = list(sorted(set(vocab_train['vocab'][0]) | set(vocab_dev['vocab'][0])))
# The space is represented by the '|' word delimiter; '\u0307' is dropped as well.
vocab_list = [vocab for vocab in vocab_list if vocab not in [' ', '\u0307']]
print(len(vocab_list))
print(vocab_list)
# (The block above used to be repeated verbatim a second time; the duplicate was removed.)

# Special tokens take the lowest ids, followed by the sorted characters.
special_vocab = ['<pad>', '<s>', '</s>', '<unk>', '|']
vocab_dict = {v: k for k, v in enumerate(special_vocab + vocab_list)}
print(len(vocab_dict))
print(vocab_dict)

with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

################################################################################################################
# Sub-Section 6: Loading methods and variables needed for Wav2Vec2 to run.

# Character-level CTC tokenizer backed by the vocab.json written above.
tokenizer = Wav2Vec2CTCTokenizer(
    'vocab.json',
    bos_token='<s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    word_delimiter_token='|',
    do_lower_case=False,
    max_length=32,
)

# Feature extractor for single-channel 16 kHz input, normalizing and returning attention masks.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# The processor bundles both so audio and transcripts go through one object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Round-trip a sample sentence through the tokenizer as a quick sanity check.
text = 'از مهمونداری کنار بکشم'
print(' '.join(tokenizer.tokenize(text)))
print(tokenizer.decode(tokenizer.encode(text)))

# Print the vocabulary size only when get_vocab() and len() agree on it.
if len(processor.tokenizer) == len(processor.tokenizer.get_vocab()):
    print(len(processor.tokenizer))

################################################################################################################
# Sub-Section 7: Getting to know the audio and preprocessing it a little bit. At the end, preparing the data for training.

# Sampling rate (Hz) that speech_file_to_array_fn() resamples every clip to.
target_sampling_rate = 16_000

# Inclusive duration window (seconds) applied by filter_by_max_duration().
min_duration_in_seconds = 5.0
max_duration_in_seconds = 10.0

def speech_file_to_array_fn(batch):
    """Load one audio file, resample it to ``target_sampling_rate`` and attach it to the example.

    Args:
        batch: A single example with a ``'path'`` to the audio file and a ``'sentence'``.

    Returns:
        The same example with ``'speech'`` (resampled samples), ``'sampling_rate'``,
        ``'duration_in_seconds'`` and ``'target_text'`` (copy of ``'sentence'``) added.
    """
    speech_array, sampling_rate = torchaudio.load(batch['path'])
    # Down-mix multi-channel audio: squeeze() leaves a (channels, samples) array untouched,
    # so len() below would count channels instead of samples.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0)
    speech_array = speech_array.squeeze().numpy()
    # Pass the rates by keyword: recent librosa releases no longer accept them positionally.
    speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=target_sampling_rate)

    batch['speech'] = speech_array
    batch['sampling_rate'] = target_sampling_rate
    batch['duration_in_seconds'] = len(batch['speech']) / target_sampling_rate
    batch['target_text'] = batch['sentence']
    return batch

def filter_by_max_duration(batch):
    """Tell whether the example's duration lies inside the configured window.

    Both bounds (``min_duration_in_seconds`` and ``max_duration_in_seconds``) are inclusive.
    """
    duration = batch['duration_in_seconds']
    return duration >= min_duration_in_seconds and duration <= max_duration_in_seconds

# check that all files have the correct sampling rate
def prepare_dataset(batch):
    """Turn a batch of raw speech and transcripts into model inputs and label ids.

    Asserts that the whole batch shares one sampling rate, then stores the processor's
    ``input_values`` for the speech and the tokenized ``target_text`` as ``labels``.
    """
    distinct_rates = set(batch['sampling_rate'])
    assert (
        len(distinct_rates) == 1), f'Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}.'

    batch_rate = batch['sampling_rate'][0]
    batch['input_values'] = processor(batch['speech'], sampling_rate=batch_rate).input_values

    # Inside this context the processor tokenizes text instead of extracting audio features.
    with processor.as_target_processor():
        encoded_targets = processor(batch['target_text'])
    batch['labels'] = encoded_targets.input_ids

    return batch

# Decode and resample every clip; the original columns are dropped so that only the fields
# added by speech_file_to_array_fn() remain.
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names)
common_voice_dev = common_voice_dev.map(speech_file_to_array_fn, remove_columns=common_voice_dev.column_names)
#common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)

print(common_voice_train[0]['sampling_rate'])
#print(common_voice_test[0]['sampling_rate'])

#print(f'Split sizes [BEFORE]: {len(common_voice_train)} train and {len(common_voice_test)} validation.')

# Only the training split is restricted to the configured duration window; the dev split is
# used unfiltered.
_common_voice_train = common_voice_train.filter(filter_by_max_duration)
_common_voice_dev = common_voice_dev
#_common_voice_test = common_voice_test
#_common_voice_test = common_voice_test.filter(filter_by_max_duration)

#print(f'Split sizes [AFTER]: {len(_common_voice_train)} train and {len(_common_voice_test)} validation.')

# Convert speech and text into 'input_values' and 'labels', four examples per batch.
_common_voice_train = _common_voice_train.map(prepare_dataset, remove_columns=_common_voice_train.column_names, batch_size=4, batched=True)
_common_voice_dev = _common_voice_dev.map(prepare_dataset, remove_columns=_common_voice_dev.column_names, batch_size=4, batched=True)
#_common_voice_test = _common_voice_test.map(prepare_dataset, remove_columns=_common_voice_test.column_names, batch_size=4, batched=True)

# Expose the two model columns as torch tensors.
_common_voice_train.set_format(type='torch', columns=['input_values', 'labels'])
_common_voice_dev.set_format(type='torch', columns=['input_values', 'labels'])
#_common_voice_test.set_format(type='torch', columns=['input_values', 'labels'])

###############################################################################################################
# Sub-Section 8: Actual training


@dataclass
class DataCollatorCTCWithPadding:
    """Collate examples into one padded batch for CTC training.

    Audio inputs and label ids are padded separately through ``processor.pad`` (each with
    its own ``max_length`` / ``pad_to_multiple_of`` setting). Label positions that are
    padding according to the attention mask are set to -100.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split each example into its audio part and its label part.
        audio_inputs = []
        label_inputs = []
        for feature in features:
            audio_inputs.append({'input_values': feature['input_values']})
            label_inputs.append({'input_ids': feature['labels']})

        batch = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )

        # Inside this context the processor pads with the tokenizer instead.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_inputs,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors='pt',
            )

        padding_positions = padded_labels.attention_mask.ne(1)
        batch['labels'] = padded_labels['input_ids'].masked_fill(padding_positions, -100)
        return batch


def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass and print a few examples.

    Args:
        pred: Object with ``predictions`` (logits) and ``label_ids``; presumably the
            ``EvalPrediction`` the Trainer passes in. ``label_ids`` is modified in place:
            every -100 entry is replaced by the tokenizer's pad token id.

    Returns:
        ``{'wer': <value returned by wer_metric.compute>}``.
    """
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # -100 marks padded label positions; map them back to the pad token before decoding.
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)

    # Labels are decoded without merging repeated tokens.
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    if isinstance(label_str, list):
        # Predictions are shown per example only when they line up with the references;
        # otherwise the whole prediction object is printed next to each reference.
        aligned = isinstance(pred_str, list) and len(pred_str) == len(label_str)
        # Never ask for more samples than exist: random.sample raises ValueError otherwise.
        sample_size = min(3, len(label_str))
        for index in random.sample(range(len(label_str)), sample_size):
            print(f'reference: {label_str[index]}')
            print(f'predicted: {pred_str[index] if aligned else pred_str}')

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer}

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

wer_metric = load_metric('wer')

# Stand-alone small configuration, kept only so the name stays defined. It must NOT be
# assigned to the pretrained model below: its sizes do not describe the loaded checkpoint
# and every value not listed here would fall back to the library defaults.
configuration = Wav2Vec2Config(hidden_size=256, num_hidden_layers=6, num_attention_heads=6, intermediate_size=1024)

model_args ={}

print(len(processor.tokenizer.get_vocab()))

# Load the pretrained XLSR-53 checkpoint with a CTC head sized to the tokenizer vocabulary.
# NOTE: no_repeat_ngram_size is a text-generation setting stored on the config; it does not
# attach an n-gram language model to the CTC output.
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-large-xlsr-53', attention_dropout=0.1, hidden_dropout=0.1, feat_proj_dropout=0.0,
                                       mask_time_prob=0.05, layerdrop=0.1, gradient_checkpointing=True, ctc_loss_reduction='mean',
                                       ctc_zero_infinity=True, bos_token_id=processor.tokenizer.bos_token_id, eos_token_id=processor.tokenizer.eos_token_id,
                                       pad_token_id=processor.tokenizer.pad_token_id, vocab_size=len(processor.tokenizer.get_vocab()),
                                       no_repeat_ngram_size=5)

# `model.config = configuration` was removed here. It replaced the configuration that
# from_pretrained() had just built, discarding the keyword overrides above (CTC loss
# reduction, zero-infinity, pad/bos/eos ids, vocab size, dropout values) and making the
# saved config.json describe a different architecture than the saved weights.

# Keep the convolutional feature extractor frozen during fine-tuning.
model.freeze_feature_extractor()

torch.cuda.empty_cache()

# NOTE(review): requires a CUDA device; this fails on a CPU-only runtime.
model.to(torch.device('cuda'))

# No checkpoints are written during training (save_strategy='no'); evaluation runs every
# N steps using the library's default step interval.
training_args = TrainingArguments(output_dir='content/', group_by_length=True, per_device_train_batch_size=4, per_device_eval_batch_size=4, save_strategy='no',
                                  gradient_accumulation_steps=2, evaluation_strategy='steps', num_train_epochs=30, learning_rate=1e-4, no_cuda=False, fp16=True,)
                                  #load_best_model_at_end=True)

trainer = Trainer(model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=_common_voice_train, eval_dataset=_common_voice_dev, tokenizer=processor.feature_extractor)

train_result = trainer.train()

# Training metrics, annotated with the number of training examples actually used.
metrics = train_result.metrics
max_train_samples = len(_common_voice_train)
metrics['train_samples'] = min(max_train_samples, len(_common_voice_train))

trainer.save_model()

trainer.log_metrics('train', metrics)
trainer.save_metrics('train', metrics)
trainer.save_state()

# Validation metrics come from a real evaluation pass on the dev split. Previously the
# *training* metrics dict was logged and saved a second time under the name 'validation'.
# The split name is kept so the results file name does not change.
validation_metrics = trainer.evaluate()
validation_metrics['eval_samples'] = len(_common_voice_dev)

trainer.log_metrics('validation', validation_metrics)
trainer.save_metrics('validation', validation_metrics)
trainer.save_state()


Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/fa (download: 8.27 GiB, generated: 9.32 GiB, post-processed: Unknown size, total: 17.60 GiB) to /root/.cache/huggingface/datasets/common_voice/fa/6.1.0/7cd6a2cd99f885b3ec1205a6aee65d9b8c7b36a2c0f482fa4a1dde3d29860f21...


Downloading data:   0%|          | 0.00/8.88G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7593 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5213 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5213 [00:00<?, ? examples/s]

Generating other split:   0%|          | 0/22510 [00:00<?, ? examples/s]

Generating validated split:   0%|          | 0/251659 [00:00<?, ? examples/s]

Generating invalidated split:   0%|          | 0/11698 [00:00<?, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/common_voice/fa/6.1.0/7cd6a2cd99f885b3ec1205a6aee65d9b8c7b36a2c0f482fa4a1dde3d29860f21. Subsequent calls will reuse this data.


Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/fa/6.1.0/7cd6a2cd99f885b3ec1205a6aee65d9b8c7b36a2c0f482fa4a1dde3d29860f21)


Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 3000
})
Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 1500
})
                                           client_id  \
0  ae235f5a988dda0c995943c747350ca81b6e722dbb542a...   
1  2543e37ab831f1c42bcdc2accd4beda86dea3931e95bbc...   
2  044b8efb173916cbab4abf8f613ebc93fb239802eb378e...   
3  e9e634d59e19fc4ad448edf60d47852386bf76cb279a31...   
4  97fd7042981bf10de2f3afcd559c7675a5452c145c38b5...   

                                               audio  \
0  {'path': 'cv-corpus-6.1-2020-12-11/fa/clips/co...   
1  {'path': 'cv-corpus-6.1-2020-12-11/fa/clips/co...   
2  {'path': 'cv-corpus-6.1-2020-12-11/fa/clips/co...   
3  {'path': 'cv-corpus-6.1-2020-12-11/fa/clips/co...   
4  {'path': 'cv-corpus-6.1-2020-12-11/fa/clips/co...   

  0%|          | 0/3000 [00:00<?, ?ex/s]

  0%|          | 0/1500 [00:00<?, ?ex/s]

زود باش بگو همه رو دارن میکشن الانه که برسن صدای ضربه های محکم به در خانه به گوش میرسد 
حله حوله انگلیسی مزخرف 


  0%|          | 0/750 [00:00<?, ?ba/s]

  0%|          | 0/375 [00:00<?, ?ba/s]

27
['ا', 'ب', 'ت', 'ج', 'ح', 'خ', 'د', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ع', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ک', 'گ', 'ی']
27
['ا', 'ب', 'ت', 'ج', 'ح', 'خ', 'د', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ع', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ک', 'گ', 'ی']
32
{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'ا': 5, 'ب': 6, 'ت': 7, 'ج': 8, 'ح': 9, 'خ': 10, 'د': 11, 'ر': 12, 'ز': 13, 'س': 14, 'ش': 15, 'ص': 16, 'ض': 17, 'ط': 18, 'ع': 19, 'ف': 20, 'ق': 21, 'ل': 22, 'م': 23, 'ن': 24, 'ه': 25, 'و': 26, 'پ': 27, 'چ': 28, 'ک': 29, 'گ': 30, 'ی': 31}
ا ز | م ه م و ن د ا ر ی | ک ن ا ر | ب ک ش م
از مهمونداری کنار بکشم
32


  0%|          | 0/3000 [00:00<?, ?ex/s]

  0%|          | 0/1500 [00:00<?, ?ex/s]

16000


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/80 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/375 [00:00<?, ?ba/s]

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

32


Downloading:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.bias', 'quantizer.weight_proj.weight', 'project_q.bias', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to u

Step,Training Loss,Validation Loss,Wer
500,815.1506,464.237671,1.0
1000,513.5724,210.712418,0.909396


***** Running Evaluation *****
  Num examples = 1500
  Batch size = 4


reference: ازاده برمی خیزد به داخل خانه می رود
predicted: 
reference: گزارش پیشگامان کمک کنه
predicted: 
reference: نه یه وسیله سرگرمی که
predicted: 


***** Running Evaluation *****
  Num examples = 1500
  Batch size = 4


reference: من اهمیت نمیدهم اگر شما در ج هان بی به بی باشید ج هان بی به سی یا ج هان سی به سی یا ج هان ای به زد باشید
predicted: من امیت نمیدم اکر شما در جان بیب بی بیاشید  شان بیب سی یا شان سیبی سی یا شان   اوب زد باشید
reference: اون موضوع صحبت می کنیم
predicted: اون مزو و سوبت می کنی
reference: تحقیر نکردن یک فرد همانند احترام گ<unk>اشتن به ان ها نیست
predicted: تیری نکردن یک فر مانندتران بزاش ب ان ا نیست




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to content/
Configuration saved in content/config.json
Model weights saved in content/pytorch_model.bin
Feature extractor saved in content/preprocessor_config.json


***** train metrics *****
  epoch                    =         30.0
  total_flos               = 1737736916GF
  train_loss               =     602.6877
  train_runtime            =   0:25:01.91
  train_samples            =          317
  train_samples_per_second =        6.332
  train_steps_per_second   =        0.799
***** validation metrics *****
  epoch                    =         30.0
  total_flos               = 1737736916GF
  train_loss               =     602.6877
  train_runtime            =   0:25:01.91
  train_samples            =          317
  train_samples_per_second =        6.332
  train_steps_per_second   =        0.799


In [None]:
# Archive the Trainer output directory and download it through the Colab file helper.
# NOTE(review): the absolute /content/... paths and google.colab make this cell Colab-only.
# NOTE(review): the archive name mentions "5gram", but no n-gram language model is built in
# the visible cells - consider renaming.
!zip -r /content/sd_5gram.zip /content/content/

from google.colab import files

files.download('/content/sd_5gram.zip')

updating: content/content/ (stored 0%)
updating: content/content/config.json (deflated 65%)
updating: content/content/train_results.json (deflated 42%)
updating: content/content/all_results.json (deflated 42%)
updating: content/content/runs/ (stored 0%)
updating: content/content/runs/May15_07-03-46_b52195cec91b/ (stored 0%)
updating: content/content/runs/May15_07-03-46_b52195cec91b/1652598227.2551959/ (stored 0%)
updating: content/content/runs/May15_07-03-46_b52195cec91b/1652598227.2551959/events.out.tfevents.1652598227.b52195cec91b.73.1 (deflated 62%)
updating: content/content/runs/May15_07-03-46_b52195cec91b/events.out.tfevents.1652598227.b52195cec91b.73.0 (deflated 60%)
updating: content/content/validation_results.json (deflated 42%)
updating: content/content/trainer_state.json (deflated 65%)
updating: content/content/training_args.bin (deflated 49%)
updating: content/content/pytorch_model.bin (deflated 8%)
updating: content/content/preprocessor_config.json (deflated 35%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>