# **Example of using transformers and torch libraries with English and German models**

In [178]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import pprint
import os
import subprocess

# Classify a few English sentences with a DistilBERT checkpoint fine-tuned on
# SST-2 and print the predicted label and score for each sentence.
model_name = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Selects how the pipeline is built:
#   True  -> hand over the model and tokenizer objects loaded above
#   False -> hand over only the model name
use_explicit_model = True

if use_explicit_model:
    print("With explicit model and tokenizer")
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
else:
    print("Classifier and tokenizer from model name")
    classifier = pipeline("sentiment-analysis", model=model_name)

sentences = ["Happy days with Colab, we can use GPUs.",
             "Our Intel CPU/GPU combo is not compatible with CUDA",
             "She drives a green car"]
results = classifier(sentences)
n_dash = 120
print("-" * n_dash)
print(f"Classification results, using {model_name} are:\n")
# results holds one dict per input sentence, in the same order as `sentences`
for sentence, result in zip(sentences, results):
    print(f"{sentence}: sentiment is {result['label']}, probability is {result['score']}")
print("-" * n_dash)


With explicit model and tokenizer
------------------------------------------------------------------------------------------------------------------------
Classification results, using distilbert/distilbert-base-uncased-finetuned-sst-2-english are:

Happy days with Colab, we can use GPUs.: sentiment is POSITIVE, probability is 0.9986945986747742
Our Intel CPU/GPU combo is not compatible with CUDA: sentiment is NEGATIVE, probability is 0.9997064471244812
She drives a green car: sentiment is POSITIVE, probability is 0.9920748472213745
------------------------------------------------------------------------------------------------------------------------


In [179]:
# Inspect how the tokenizer splits one sentence into tokens and vocabulary ids
eg_sentence = sentences[0]
tokens = tokenizer.tokenize(eg_sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

words = eg_sentence.split()
words_n, tokens_n = len(words), len(tokens)
separator = "-" * n_dash

print("A look at tokens with a single example sentence")
print(separator)
print(f'Example sentence "{eg_sentence}" has {words_n} words, {tokens_n} tokens\n')
# one line per token: position, token text, vocabulary id
for position, (token, token_id) in enumerate(zip(tokens, token_ids)):
    print(f"{position}: {token} {token_id}")
print(separator)

print()
print("Regenerate tokens")
print(separator)
# Calling the tokenizer directly yields input ids plus an attention mask
regen_ids = tokenizer(eg_sentence)
regen_input_ids = regen_ids['input_ids']
input_ids_n = len(regen_input_ids)
print(f"Regenerated sentence is {input_ids_n} tokens and is associated with an attention mask - it's all you need!\n")
print('Regenerated IDs', regen_input_ids)
print('Attention mask', regen_ids['attention_mask'])
# ids present in the direct call's output but absent from the tokenize() ids
known_ids = set(token_ids)
diffs = [candidate for candidate in regen_input_ids if candidate not in known_ids]
print("\nDifferences are the start and stop tokens:")
for extra_id in diffs:
    print(extra_id)
print(separator)


A look at tokens with a single example sentence
------------------------------------------------------------------------------------------------------------------------
Example sentence "Happy days with Colab, we can use GPUs." has 8 words, 12 tokens

0: happy 3407
1: days 2420
2: with 2007
3: cola 15270
4: ##b 2497
5: , 1010
6: we 2057
7: can 2064
8: use 2224
9: gp 14246
10: ##us 2271
11: . 1012
------------------------------------------------------------------------------------------------------------------------

Regenerate tokens
------------------------------------------------------------------------------------------------------------------------
Regenerated sentence is 14 tokens and is associated with an attention mask - it's all you need!

Regenerated IDs [101, 3407, 2420, 2007, 15270, 2497, 1010, 2057, 2064, 2224, 14246, 2271, 1012, 102]
Attention mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Differences are the start and stop tokens:
101
102
-------------------------------

In [180]:
# Tokenize the whole list of sentences in one call, the way texts would be
# prepared for training (X_train is the customary name for the list of texts).
X_train = sentences
batch = tokenizer(
    X_train,
    padding=True,         # pad shorter sentences so all rows share one length
    truncation=True,      # cut anything longer than max_length
    max_length=512,
    return_tensors="pt",  # return PyTorch tensors
)
print(batch)

{'input_ids': tensor([[  101,  3407,  2420,  2007, 15270,  2497,  1010,  2057,  2064,  2224,
         14246,  2271,  1012,   102,     0],
        [  101,  2256, 13420, 17368,  1013, 14246,  2226, 25025,  2003,  2025,
         11892,  2007, 12731,  2850,   102],
        [  101,  2016,  9297,  1037,  2665,  2482,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [181]:
# Look at the top level of the tokenizer output

# the batch behaves like a mapping; list its keys
print("Keys: \n", *batch.keys())

# each key maps to a tensor; show the tensor shapes
print("Keys and values: \n")
for name, tensor in batch.items():
    print(name, tensor.shape)


Keys: 
 input_ids attention_mask
Keys and values: 

input_ids torch.Size([3, 15])
attention_mask torch.Size([3, 15])


In [182]:

# Call the model directly (no pipeline) and turn its raw logits into labels
print(batch)

# Set to True to also pass gold labels, in which case the output carries a loss
with_labels = False

# inference only, so gradient tracking is switched off
with torch.no_grad():
    if with_labels:
        # The labels tensor needs one class id per sentence in the batch;
        # a two-element tensor does not match the three sentences.
        gold_labels = torch.tensor([1, 0, 1])
        assert gold_labels.shape[0] == batch["input_ids"].shape[0], "need one label per sentence"
        outs = model(**batch, labels=gold_labels)
    else:
        # loss is None
        outs = model(**batch)

    print("Output object:\n")
    pprint.pprint(outs)
    print()

    # softmax over the class dimension turns logits into probabilities
    preds = F.softmax(outs.logits, dim=1)
    print("Predictions tensor:", preds, "\n")

    # the predicted class id is the index of the largest probability
    labels = torch.argmax(preds, dim=1)
    print("Labels tensor:",labels, "\n")

    # map class ids to their names via the model config
    labels_words = [model.config.id2label[label_id] for label_id in labels.tolist()]
    print("Labels as words:",labels_words, "\n")



{'input_ids': tensor([[  101,  3407,  2420,  2007, 15270,  2497,  1010,  2057,  2064,  2224,
         14246,  2271,  1012,   102,     0],
        [  101,  2256, 13420, 17368,  1013, 14246,  2226, 25025,  2003,  2025,
         11892,  2007, 12731,  2850,   102],
        [  101,  2016,  9297,  1037,  2665,  2482,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
Output object:

SequenceClassifierOutput(loss=None,
                         logits=tensor([[-3.2223,  3.4176],
        [ 4.5285, -3.6047],
        [-2.3882,  2.4416]]),
                         hidden_states=None,
                         attentions=None)

Predictions tensor: tensor([[1.3054e-03, 9.9869e-01],
        [9.9971e-01, 2.9354e-04],
        [7.9251e-03, 9.9207e-01]]) 

Labels tensor: tensor([1, 0, 1]) 

Labels as

In [183]:
# recursive list, no dot file dirs
def list_files_and_directories(path):
    """Print every file and non-hidden directory below ``path``, recursively.

    Each entry is printed as ``path`` joined with the entry name. Regular
    files are always printed, dot files included. Directories whose name
    starts with ``.`` are neither printed nor descended into. Entries are
    visited in sorted name order so repeated runs print the same listing
    (``os.listdir`` on its own returns names in arbitrary order).

    Args:
        path: Directory to list.

    Returns:
        None; the listing is written to standard output.
    """
    for item in sorted(os.listdir(path)):
        item_path = os.path.join(path, item)
        if os.path.isfile(item_path):
            print(item_path)
        elif os.path.isdir(item_path) and not item.startswith('.'):
            print(item_path)
            list_files_and_directories(item_path)

# List the current working directory, recursing into non-hidden subdirectories
list_files_and_directories('.')


./saved_model
./saved_model/tokenizer.json
./saved_model/model.safetensors
./saved_model/special_tokens_map.json
./saved_model/vocab.txt
./saved_model/config.json
./saved_model/tokenizer_config.json
./sample_data
./sample_data/anscombe.json
./sample_data/README.md
./sample_data/california_housing_train.csv
./sample_data/mnist_train_small.csv
./sample_data/california_housing_test.csv
./sample_data/mnist_test.csv


In [184]:
# Write tokenizer and model to a local directory, show what landed on disk,
# then load both back from that directory.
save_dir = 'saved_model'
for artifact in (tokenizer, model):
    artifact.save_pretrained(save_dir)

# long recursive listing with human-readable sizes
ls_process = subprocess.run(['ls', '-Rlh', save_dir], stdout=subprocess.PIPE, check=True)
output = ls_process.stdout
print(output.decode('utf-8'))

# reload from disk, replacing the in-memory tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)


saved_model:
total 257M
-rw-r--r-- 1 root root  774 Dec 12 15:32 config.json
-rw-r--r-- 1 root root 256M Dec 12 15:32 model.safetensors
-rw-r--r-- 1 root root  125 Dec 12 15:32 special_tokens_map.json
-rw-r--r-- 1 root root 1.3K Dec 12 15:32 tokenizer_config.json
-rw-r--r-- 1 root root 695K Dec 12 15:32 tokenizer.json
-rw-r--r-- 1 root root 227K Dec 12 15:32 vocab.txt



In [185]:
# German sentiment model: load it, classify a few sentences, print the results
model_name_de = 'oliverguhr/german-sentiment-bert'
tokenizer_de = AutoTokenizer.from_pretrained(model_name_de)
model_de = AutoModelForSequenceClassification.from_pretrained(model_name_de)

saetze = [
    "Wo ist mein Handy?",
    "Ich bin ein Freiburger",
    "Was soll der Scheiß?",
    "Ich bin glücklich",
    "Sie fährt ein grünes Auto",
    "Sie fährt ein klimafreundliches Auto",
]

# build the pipeline from the already-loaded model and tokenizer objects
print("With explicit model and tokenizer")
classifier = pipeline("sentiment-analysis", model=model_de, tokenizer=tokenizer_de)
results = classifier(saetze)
n_dash = 120
divider = "-" * n_dash
print(divider)
print(f"Classification results, using {model_name_de} are:\n")
# one result dict per sentence, in input order
for satz, ergebnis in zip(saetze, results):
    print(f"{satz}: sentiment is {ergebnis['label']}, probability is {ergebnis['score']}")
print(divider)

print("Note: Interesting, driving a green car and even a climate friendly car is neutral in German! It's strongly positive in English.")


With explicit model and tokenizer
------------------------------------------------------------------------------------------------------------------------
Classification results, using oliverguhr/german-sentiment-bert are:

Wo ist mein Handy?: sentiment is negative, probability is 0.5165525078773499
Ich bin ein Freiburger: sentiment is negative, probability is 0.9228449463844299
Was soll der Scheiß?: sentiment is negative, probability is 0.9580993056297302
Ich bin glücklich: sentiment is positive, probability is 0.9781695604324341
Sie fährt ein grünes Auto: sentiment is neutral, probability is 0.9964893460273743
Sie fährt ein klimafreundliches Auto: sentiment is neutral, probability is 0.9915153980255127
------------------------------------------------------------------------------------------------------------------------
Note: Interesting, driving a green car and even a climate friendly car is neutral in German! It's strongly positive in English.
