In [None]:
# Huggingface Transformers Quicktour
# https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb
# https://huggingface.co/learn/nlp-course/chapter1/4?fw=pt
# https://huggingface.co/docs/transformers/en/task_summary
# https://huggingface.co/docs/transformers/notebooks
# https://www.youtube.com/watch?v=bCz4OMemCcA&ab_channel=UmarJamil

In [None]:
! pip install transformers
! pip install transformers datasets

In [None]:
!pip install torch
!pip install tensorflow

In [None]:
from transformers import pipeline

In [None]:
# Test sentiment analysis.
# The checkpoint is named explicitly instead of relying on the pipeline's
# implicit default, so the results stay reproducible if that default changes.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Two alternative input batches; only samples2 is classified below.
samples1 = ["We are very happy to show you the Transformers library.", "We hope you don't hate it."]
samples2 = ["I find transformers interesting but hard to understand.", "I would kind of venture to watch the solar eclipse."]

# One dict per input sentence, each with a 'label' and a 'score' entry.
results = classifier(samples2)

In [None]:
# Show the raw list first, then each prediction followed by its label and
# score, every value on its own line.
print(results)
for item in results:
    print(item, item['label'], item['score'], sep="\n")

[{'label': 'NEGATIVE', 'score': 0.9949429631233215}, {'label': 'POSITIVE', 'score': 0.9747183322906494}]
{'label': 'NEGATIVE', 'score': 0.9949429631233215}
NEGATIVE
0.9949429631233215
{'label': 'POSITIVE', 'score': 0.9747183322906494}
POSITIVE
0.9747183322906494


In [None]:
# One summary line per prediction, with the score rounded to four decimals.
for result in results:
    label, score = result['label'], round(result['score'], 4)
    print(f"label: {label}, with score: {score}")

label: NEGATIVE, with score: 0.9949
label: POSITIVE, with score: 0.9747


In [None]:
# Try out speech recognition: build an ASR pipeline from a named checkpoint.
import torch
from transformers import pipeline

asr_checkpoint = "facebook/wav2vec2-base-960h"
speech_recognizer = pipeline(
    task="automatic-speech-recognition",
    model=asr_checkpoint,
)

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset, Audio

# PolyAI/minds14 is loaded through a dataset builder script. `datasets` asks
# for an explicit opt-in via trust_remote_code=True and announces that the
# flag becomes mandatory for this dataset in its next major release.
# Only enable it for dataset repositories you trust: it runs their code.
dataset = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
    trust_remote_code=True,
)

# Cast the audio column to the sampling rate reported by the recognizer's
# feature extractor, so the clips match what the model expects.
target_rate = speech_recognizer.feature_extractor.sampling_rate
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_rate))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Transcribe the first four audio entries in one call and print only the text.
first_clips = dataset[:4]["audio"]
result = speech_recognizer(first_clips)
texts = [entry["text"] for entry in result]
print(texts)

['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT']


In [None]:
# Load a pretrained sequence-classification model together with its tokenizer.
# Both come from the same checkpoint name and are saved to disk in a later cell.

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
samples = [
    "We are very happy to show you the Transformers library.",
    "We hope you don't hate it.",
]

# Tokenizer options shared by both encodings below.
encode_options = dict(padding=True, truncation=True, max_length=512)

# The same batch encoded twice: once requesting "pt" tensors, once "tf" tensors.
pt_batch = tokenizer(samples, return_tensors="pt", **encode_options)
tf_batch = tokenizer(samples, return_tensors="tf", **encode_options)

In [None]:
import torch
from torch import nn

# Inference only: run the forward pass under torch.no_grad() so autograd does
# not record a graph for it. Without this the printed probabilities carry a
# grad_fn, i.e. gradient bookkeeping is kept alive for no purpose.
with torch.no_grad():
    pt_outputs = pt_model(**pt_batch)

# Normalize the logits over the last dimension to get per-class probabilities.
pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

tensor([[0.0022, 0.0019, 0.0131, 0.2332, 0.7496],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Change this based on your setup.
root = '/content/drive/MyDrive/Colab/ML/'
# Directory (under root) where the model files are written.
modelpath = f"{root}models/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Save the tokenizer and then the model into the same directory.
pt_save_directory = modelpath
for artifact in (tokenizer, pt_model):
    artifact.save_pretrained(pt_save_directory)