# **Example of using transformers and torch libraries with English and German models**

In [147]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import pprint
import os
import subprocess

# classify some sentences and print results
model_name = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

if (True):
  # explicit
  print("With explicit model and tokenizer")
  classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
else:
  # just pass the model name
  print("Classifier and tokeizer from model name")
  classifier = pipeline("sentiment-analysis", model=model_name)

sentences = ["Happy days with Colab, we can use GPUs",
             "Our Intel CPU/GPU combo is not compatible with CUDA",
             "She drives a green car"]
results = classifier(sentences)
n_dash = 120
print("-" * n_dash)
print(f"Classification results, using {model_name} are:\n")
for i, r in enumerate(results):
    print(f"{sentences[i]}: sentiment is {r['label']}, probability is {r['score']}")
print("-" * n_dash)


With explicit model and tokenizer
------------------------------------------------------------------------------------------------------------------------
Classification results, using distilbert/distilbert-base-uncased-finetuned-sst-2-english are:

Happy days with Colab, we can use GPUs: sentiment is POSITIVE, probability is 0.9993423819541931
Our Intel CPU/GPU combo is not compatible with CUDA: sentiment is NEGATIVE, probability is 0.9997064471244812
She drives a green car: sentiment is POSITIVE, probability is 0.9920748472213745
------------------------------------------------------------------------------------------------------------------------


In [148]:
# see tokens and ids
eg_sentence = sentences[0]
tokens = tokenizer.tokenize(eg_sentence)
#print(tokens)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
#print(token_ids)

words = eg_sentence.split()
words_n = len(words)
tokens_n = len(tokens)
print("A look at tokens with example")
print("-" * n_dash)
print(f"Example sentence \"{eg_sentence}\" has {words_n} words, {tokens_n} tokens\n")
for i, t in enumerate(tokens):
  print(f"{i}: {tokens[i]} {token_ids[i]}")
print("-" * n_dash)

print()
print("Regenerate tokens")
print("-" * n_dash)
regen_ids = tokenizer(eg_sentence)
input_ids_n = len(regen_ids['input_ids'])
print(f"Regenerated sentence is {input_ids_n} tokens and is associated with an attention mask - it's all you need!\n")
print('Regenerated IDs',regen_ids['input_ids'])
print('Attention mask',regen_ids['attention_mask'])
diffs = [x for x in regen_ids['input_ids'] if x not in token_ids]
print(f"\nDifferences are the start and stop tokens:")
for d in diffs:
  print(d)
print("-" * n_dash)


A look at tokens with example
------------------------------------------------------------------------------------------------------------------------
Example sentence "Happy days with Colab, we can use GPUs" has 8 words, 11 tokens

0: happy 3407
1: days 2420
2: with 2007
3: cola 15270
4: ##b 2497
5: , 1010
6: we 2057
7: can 2064
8: use 2224
9: gp 14246
10: ##us 2271
------------------------------------------------------------------------------------------------------------------------

Regenerate tokens
------------------------------------------------------------------------------------------------------------------------
Regenerated sentence is 13 tokens and is associated with an attention mask - it's all you need!

Regenerated IDs [101, 3407, 2420, 2007, 15270, 2497, 1010, 2057, 2064, 2224, 14246, 2271, 102]
Attention mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Differences are the start and stop tokens:
101
102
----------------------------------------------------------------------

In [149]:
# example of prepping sentences for training, X_train is common name for list of texts
X_train = sentences
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch)

{'input_ids': tensor([[  101,  3407,  2420,  2007, 15270,  2497,  1010,  2057,  2064,  2224,
         14246,  2271,   102,     0,     0],
        [  101,  2256, 13420, 17368,  1013, 14246,  2226, 25025,  2003,  2025,
         11892,  2007, 12731,  2850,   102],
        [  101,  2016,  9297,  1037,  2665,  2482,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [150]:
# unpack first level

# gives keys
print("Keys: \n", *batch)

# keys and values
print("Keys and values: \n")
for k, v in batch.items():
    print(k, v.shape)


Keys: 
 input_ids attention_mask
Keys and values: 

input_ids torch.Size([3, 15])
attention_mask torch.Size([3, 15])


In [151]:
# going deeper with structures
with torch.no_grad():
    if (True):
      # with labels, we see the loss
      outs = model(**batch, labels=torch.tensor([1, 0]))
    if (False):
      # loss is None
      outs = model(**batch)
    print("Output object:\n")
    pprint.pprint(outs)
    print()

    # softmax on logits tensor
    preds = F.softmax(outs.logits, dim=1)
    print("Predictions tensor:", preds, "\n")

    # labels via argmax on predictions
    labels = torch.argmax(preds, dim=1)
    print("Labels tensor:",labels, "\n")

    labels_words = [model.config.id2label[label_id] for label_id in labels.tolist()]
    print("Labels as words:",labels_words, "\n")

ValueError: Expected input batch_size (3) to match target batch_size (2).

In [None]:
# recursive list, no dot file dirs
def list_files_and_directories(path):
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        if os.path.isfile(item_path):
            print(item_path)
        elif os.path.isdir(item_path) and not item.startswith('.'):
            print(item_path)
            list_files_and_directories(item_path)

list_files_and_directories('.')


In [None]:
# saving
save_dir = 'saved_model'
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)
# list_files_and_directories(save_dir)

# get a long listing with size
output = subprocess.check_output(['ls', '-Rlh',save_dir])
print(output.decode('utf-8'))

# re-read
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)


In [None]:
# german
model_name_de = 'oliverguhr/german-sentiment-bert'
tokenizer_de = AutoTokenizer.from_pretrained(model_name_de)
model_de = AutoModelForSequenceClassification.from_pretrained(model_name_de)

saetze = ["Wo ist mein Handy?", "Ich bin ein Freiburger",
         "Was soll der Scheiß?","Ich bin glücklich",
         "Sie fährt ein grünes Auto",
         "Sie fährt ein klimafreundliches Auto"]

# explicit
print("With explicit model and tokenizer")
classifier = pipeline("sentiment-analysis", model=model_de, tokenizer=tokenizer_de)
results = classifier(saetze)
#print(results)
n_dash = 120
print("-" * n_dash)
print(f"Classification results, using {model_name_de} are:\n")
for i, r in enumerate(results):
  print(f"{saetze[i]}: sentiment is {r['label']}, probability is {r['score']}")
print("-" * n_dash)

print("Note: Interesting, driving a green car and even a climate friendly car is neutral in German! It's strongly positive in English.")
