In [4]:
import torch
# Check whether PyTorch can see a CUDA GPU; the recorded output is False, so this run is CPU-only.
torch.cuda.is_available()

False

# Things you can do with Hugging Face

## Loading and running a pretrained, open-source model

You can load and run many open-source models almost immediately.
We use GPT-2 for demonstration.

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the GPT-2 tokenizer and causal-LM weights by checkpoint name.
# GPT-2 is used because the author's local machine cannot handle anything bigger.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [7]:
# Put the model in evaluation mode so its Dropout layers (p=0.1 in the printout below) are disabled.
# NOTE(review): from_pretrained() is documented to return the model already in eval mode, so this
# call is a cheap, explicit safeguard rather than a required step — confirm for your transformers version.
model.eval()
# model.to(device='cuda') # move to GPU for faster inference, if you have one

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
text_prompt = "Hello everyone! "  # note the trailing space: it becomes its own token (id 220)
# return_tensors="pt" makes the tokenizer return PyTorch tensors
tokenizer_out = tokenizer(text_prompt, return_tensors="pt")
# the result holds `input_ids` and an `attention_mask`, each of shape (1, 4) for this prompt
tokenizer_out

{'input_ids': tensor([[15496,  2506,     0,   220]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [9]:
# our prompt "Hello everyone! " (including its trailing space) is tokenized into 4 tokens
# we can look at what they are individually
# NOTE: 'Ġ' is how GPT-2's byte-level BPE vocabulary renders a space, so 'Ġeveryone' is
# " everyone" and the lone 'Ġ' is the prompt's trailing space (it is not a start-of-token marker)

tokenizer.convert_ids_to_tokens(tokenizer_out['input_ids'][0])

['Hello', 'Ġeveryone', '!', 'Ġ']

In [11]:
# Inference only: torch.no_grad() skips building the autograd graph, saving memory and time.
with torch.no_grad():
    model_outs = model(**tokenizer_out)  # pass the tokenized prompt (input_ids + attention_mask) to the model
# logits have shape (batch, sequence_length, vocab_size) = (1, 4, 50257):
# one score per vocabulary entry for each of the 4 prompt positions (50257 is the vocabulary size)
model_outs.logits.shape

torch.Size([1, 4, 50257])

## Pipelines

Let us say you just want something that generates text, without the pain of tokenizing the prompt and calling the model by hand.

In [12]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline,
# so a prompt string can be turned into generated text with a single call.
pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

In [14]:
from pprint import pprint

In [15]:
# Sampling is random: seed PyTorch's RNG first so that re-running this cell on the same
# setup reproduces the same five continuations instead of a fresh set every time.
torch.manual_seed(0)
pprint(pipe(text_prompt, max_length=50, num_return_sequences=5, do_sample=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello everyone! \xa0For those not familiar though, I '
                    'write this article through the lens of one of my favorite '
                    'games; the Legend of Zelda. A couple weeks ago my friend '
                    'and I did an interview with someone who had really '
                    'enjoyed playing the franchise'},
 {'generated_text': 'Hello everyone! \xa0Just finished my first run of the '
                    'book, I think that this is one of my favorites because '
                    "it's basically the first time I can describe it. I "
                    "believe I've made a lot more progress through the "
                    'book. \xa0'},
 {'generated_text': 'Hello everyone! \xa0And here is a special one for the '
                    'lucky players (who will get a special one for the first '
                    'time.) \xa0That means you can see the original footage of '
                    'some of our original shows, or if you al

In [16]:
# Alternatively, don't sample: with do_sample=False the output no longer depends on random draws.
# The recorded output below shows the downside — the continuation gets stuck repeating one phrase.

pprint(pipe(text_prompt, max_length=50, do_sample=False))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello everyone! \xa0I'm going to be doing a lot of work "
                    "on this project, so I'm going to be doing a lot of work "
                    "on this project, so I'm going to be doing a lot of work "
                    'on this project, so'}]


## Using and Loading Open-source Datasets

Many research papers release their datasets on Hugging Face.

It is very easy to quickly load them up and run experiments with them.

In [1]:
from datasets import load_dataset

# Load the "imdb" dataset by name; the result is a DatasetDict keyed by split name.
dataset = load_dataset("imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Display the available splits together with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [25]:
# Row counts of the train and test splits, printed space-separated on one line.
print(*(len(dataset[split]) for split in ('train', 'test')))

25000 25000


In [27]:
train_dataset = dataset['train']
# Indexing a split with an integer returns one example as a dict with 'text' and 'label' keys.
train_dataset[2]

{'text': "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />",
 'label': 0}

## Custom Hugging Face dataset

Creating a dataset through the Hugging Face `datasets` API also gives you caching.



In [20]:
# NOTE(review): "data.py" is presumably a dataset loading script in the working directory — confirm.
custom_data = load_dataset("data.py")

Generating train split: 7008 examples [00:00, 69891.94 examples/s]

train.txt


Generating train split: 50000 examples [00:00, 68619.57 examples/s]
Generating validation split: 10000 examples [00:00, 72365.87 examples/s]


val.txt


Generating test split: 0 examples [00:00, ? examples/s]

test.txt


Generating test split: 10000 examples [00:00, 61222.58 examples/s]


In [22]:
# First example of the test split: a dict with integer fields 'op1', 'op2' and 'result'
# (here result == op1 + op2; presumably true of every record — verify against data.py).
custom_data['test'][0]

{'op1': 4260, 'op2': 1053, 'result': 5313}