In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 30.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [6]:
import torch.nn as nn
import transformers
from torch.utils.data import DataLoader, Dataset, random_split, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np
import random 
from google.colab import drive
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

import transformers
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments

In [7]:
# empty cache to clear space for training
import torch
torch.cuda.empty_cache()
torch.cuda.is_available()

True

In [8]:
# mount drive
drive.mount('/gdrive')
drive_root = '/gdrive/My Drive/'

Mounted at /gdrive


In [9]:
%cd ..
%cd gdrive/MyDrive/Final Project/data

/
/gdrive/.shortcut-targets-by-id/18oPzXa_o2Y_k8Bcbz3FNVsqteJ0Pt29Y/Final Project/data


In [10]:
# refer to elon_bot_lstm.ipynb 
init_tweets = pd.read_csv('cleaned_elon.csv')
init_tweets = init_tweets['Tweet']


In [11]:
# ampersand bug
tweets = []
for tweet in init_tweets:
  tweets.append(tweet.replace('&', '&'))

tweets = pd.Series(tweets)

In [12]:
# define batch size and load in pretrained tokenizer
BATCH_SIZE=4
# adding bos, eos and pad token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') 


Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
class TorchDataset(Dataset):

  def __init__(self, tweets, tokenizer, max_length):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_mask = []

    for tweet in tweets:
      # tokenizing on a word level
      # padded tokens are defaulted w/ attention 0
      encodings = tokenizer('<|startoftext|>'+ tweet + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings['input_ids']))
      self.attn_mask.append(torch.tensor(encodings['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return {'input_ids': self.input_ids[idx], 'attn_mask': self.attn_mask[idx]}

In [14]:

# train test split
dataset = TorchDataset(tweets, tokenizer, max_length=300)

TRAIN_SIZE = int(0.85 * len(dataset))
VAL_SIZE = len(dataset) - TRAIN_SIZE

train_ds, val_ds = random_split(dataset, [TRAIN_SIZE, VAL_SIZE])

In [15]:
# take look at one output
tokenizer.decode((list(train_ds))[0]['input_ids'])


'<|startoftext|> 3 mins to launch<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <

In [16]:

# DataLoader similar to tf dataset
# train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE) 
# for train, shuffle and batch randomly
train_dataloader = DataLoader(train_ds, 
                              sampler = RandomSampler(train_ds), 
                              batch_size = BATCH_SIZE)

# for validation can just batch sequentially.
val_dataloader = DataLoader(val_ds,
            sampler = SequentialSampler(val_ds),
            batch_size = BATCH_SIZE)
     

In [17]:
from transformers import GPT2Tokenizer, TFGPT2Model
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# load pretrained model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# added bos_token and eos_token to embeddings
# need to resize otherwise tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda")
model.cuda()

# reproducability
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
     

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [18]:
warmup_steps = 100
epochs = 3
# optmize model paramters with AdamW
optimizer = AdamW(model.parameters(), lr = 0.001)



In [19]:

total_steps = len(train_dataloader) * epochs

# change learning rate as throughout training loop
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [20]:
def train_epoch(model, dataloader, optimizer):
  sample_every = 200
  total_train_loss = 0
  model.train()
  for step, batch in enumerate(dataloader):

    b_input_ids = batch['input_ids'].to(device)
    b_labels = batch['input_ids'].to(device)
    b_masks = batch['attn_mask'].to(device)

    # zero gradients after each batch 
    model.zero_grad()        

    outputs = model(  b_input_ids,
                      labels=b_labels, 
                      attention_mask = b_masks,
                      token_type_ids=None
                    )

    loss = outputs[0]  
    batch_loss = loss.item()
    total_train_loss += batch_loss

    # generate sentence evey n sample steps
    if step % sample_every == 0 and not step == 0:

        model.eval()

        sample_outputs = model.generate(
                                bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=50, 
                                max_length = 200,
                                top_p=0.95, 
                                num_return_sequences=1
                            )
        for i, sample_output in enumerate(sample_outputs):
              print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
        
        model.train()

    # calc loss gradients
    loss.backward()
    # update
    optimizer.step()
    scheduler.step()

  # average loss
  return total_train_loss / len(dataloader)      

In [21]:
def eval_epoch(model, dataloader):
  model.eval()
  val_loss = 0

  # evaluate data for one epoch
  for batch in dataloader:
      
      b_input_ids = batch['input_ids'].to(device)
      b_labels = batch['input_ids'].to(device)
      b_masks = batch['attn_mask'].to(device)
      
      # disable gradient calculation for evaluation
      with torch.no_grad():        

          outputs  = model(b_input_ids, attention_mask = b_masks, labels=b_labels)
          loss = outputs[0]  
          
      batch_loss = loss.item()
      val_loss += batch_loss        

  return val_loss / len(dataloader)

In [22]:

# train and generate every 200 steps
model = model.to(device)

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    train_loss = train_epoch(model, train_dataloader, optimizer)

    print("")
    print("  Average training loss: {0:.2f}".format(train_loss))

    val_loss = eval_epoch(model, val_dataloader)
    print("  Validation Loss: {0:.2f}".format(val_loss))
     




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  invitesThe Tesla Model S review is a great way to show that the Model S is worth the price


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  CCBtw, I'm not a fan of the Tesla. Id not even remotely agree with the Tesla.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  barbarAm in the room, surrounded by people who are hanging giant underwear. Am reading a great biography of Ben Franklin. Highly recommended.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: ramerModel X review by. Totally agree with criticisms &amp; conclusions. Corrections to the car software will be posted to Autopilot website at 2pm.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  NeuroThe Tesla Model S unveil is just a little too long to be worth seeing, but there is some epic explosion footage.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  hardGreat progress on the Model S by itself. Model S achieves 0 to 60 mph via 0 to 60 mph and tied for 0 to 1 sec 0 sec 0 sec 0 sec 0 sec 0 sec 0 sec 0 sec 0 0 sec 0 sec 0 sec 0 sec 0 sec 0 sec 0 sec 0 sec 0 sec 0 0 0 sec 0 sec 0 to 60 mph via 0 to 1/4 sec 0 to 60 mph.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  enthusBtw, price is actually only about 5% more than Sats


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  BryantJust Read the Instructions


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  AATesla Model S rated by as best car in world as best car in Model S


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  exceptionsFalcon 9 will launch from Cape Canaveral at Cape Canaveral on Tuesday.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  discountsWorth noting that Muller's research confirming climate change was funded by Koch is funded by Koch. Fate loves irony.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: ________________________________________________________________If something is physically possible, not only is someone doing it, but there is also an award show.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  adequSpaceX will try out this weekend. Maybe reality is just series of nested simulations all the way down...

  Average training loss: 0.23
  Validation Loss: 0.13



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  martialModel S review by. Totally agree with criticisms. Corrections coming soon.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  StudioRocket flight 10 to 744m, hover &amp; return to pad (close shot from hexacopter drone)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: SepIf you support Tesla in Texas &amp; encourage friends to do so!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  AveryGood article on the interplanetary transport system on Gizmodo


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  WynTesla Model S Consumer Reports customer satisfaction survey highest and highest satisfaction score


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  KennTesla Solar Roof V3 launch tomorrow. More than a few hours.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  incentivesI should clarify that is an independent website. We don't have any control from Autopilot.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  ChurchThe President just called to say congrats. Caller ID was blocked, so at first I thought it was a telemarketer :)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  CurrentThe exec conf room at Tesla used to be called Denali, but I decided to move a few letters around. Seemed more apt :)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  microJust wanted to say thanks to all for being the world's coolest customer. Looking forward to delivering the goods!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: yuJust got word that the cumulative miles of the worldwide Tesla fleet passed half a billion!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  pathThe most entertaining outcome is the most likely – the most likely outcome is the most likely – the most likely – the most probable – the most likely outcome is the most likely – the most likely – the most likely – the most probable – the most likely outcome


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  laughterTesla owner drives through a flooded tunnel &amp; out the other side

  Average training loss: 0.08
  Validation Loss: 0.09



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  ATTJust want to thank Stu G for Know it meant a lot to you. Will take good care.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  engageThe rumor that I'm building a spaceship to get back to my home planet Mars is totally untrue


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  objectedIf you live in Texas and are pro Tesla, please vote in the opinion poll!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  unanimTesla Model S driven 285 miles from Vegas to the beach in LA on a single charge by


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: noneJust wrote a blog piece about Tesla stores


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: uggetsJust wanted to say thanks to everyone for testifying yesterday in Austin. Will hang around after press conf to thank people in person.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: lishesJust completed the rocket rollout review at SpaceX HQ in California. Almost time to launch. Pucker factor increasing...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  modeledThe Model X unveiling will be webcast live on at 8pm on Feb 9th. Most cars are pretty blah. This is not.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  TeamTesla Supercharger capacity will double by end of next year. Expect to be within range of 95% to 100% to 100% of population in all active markets.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 123Tesla Model S Consumer Reports customer satisfaction survey highest of any car on road at 99/100


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: AllTesla Model S Consumer Reports customer satisfaction survey highest of any car on road at 99/100


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  cansThe Boring Company will compete to fund, build &amp; operate a high-speed Loop connecting Chicago O’Hare Airport to downtown


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: yrFalcon 9-Reusable with (now retired) Grasshopper test rig in background

  Average training loss: 0.04
  Validation Loss: 0.08


In [23]:
# print out text
model.eval()
# start string of start token
prompt = "<|startoftext|>"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=100, 
                                max_length = 300,
                                num_return_sequences=10,
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0: Twitter is reposting whatcould possibly go wrong with this pic


1: Am getting lots of questions about the big Supercharger announcement. Aiming to do that the week after next.


2: Dragon flies over the SpaceX GrasshopperProject at our central Texas test site. It will jump soon.


3: Falcon flew perfectly!! Dragon in orbit, comm locked and solar arrays active!! Feels like a giant weight just came off my back :)


4: In reality, 97% of scientists agree that we face serious human generated climate change


5: The Boring Company has done an amazing job of starting a whole product line of DIY watchtowers. You get bricks &amp; a picture.


6: Tesla Supercharger network now energized from New York to LA, both coast + Texas! Approx 80% of US population covered.


7: Btw, don't mention the pyramids. Stacking stone blocks is not evidence of an advanced civilization.


8: The Spice. Must. Flow.


9: RUD = Rapid Unscheduled Disassembly :)




In [24]:
model.save_pretrained("gpt2_finetuned")