# Installation

In [1]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96

# Imports

In [2]:
import tensorflow as tf
import numpy as np
import io
import os
import pandas as pd
import re
import string
import time
from numpy import random
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from google.colab import drive
from google.colab import files
from datasets import load_dataset
from transformers import create_optimizer,DataCollatorForSeq2Seq,DataCollatorForLanguageModeling,BlenderbotTokenizerFast,BlenderbotSmallTokenizerFast,TFBlenderbotForConditionalGeneration

In [3]:
MAX_LENGTH=256

# Dataset Preparation

In [4]:
#kaggle datasets download -d drmatters/joe-rogan

In [5]:
rm -r /root/.kaggle

rm: cannot remove '/root/.kaggle': No such file or directory


In [6]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d christianlillelund/joe-rogan-experience-1169-elon-musk
!unzip "/content/joe-rogan-experience-1169-elon-musk.zip" -d "/content/dataset/"

Downloading joe-rogan-experience-1169-elon-musk.zip to /content
  0% 0.00/59.1k [00:00<?, ?B/s]
100% 59.1k/59.1k [00:00<00:00, 10.0MB/s]
Archive:  /content/joe-rogan-experience-1169-elon-musk.zip
  inflating: /content/dataset/joe-rogan-experience-1169-elon-musk.csv  


In [7]:
filepath="/content/dataset/joe-rogan-experience-1169-elon-musk.csv"
dataset = load_dataset('csv', data_files=filepath)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-823fed6617544644/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-823fed6617544644/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Timestamp', 'Speaker', 'Text'],
        num_rows: 1831
    })
})

In [9]:
dataset['train'][0]

{'Timestamp': '[00:00:00]',
 'Speaker': 'Joe Rogan',
 'Text': 'Ah, ha, ha, ha. Four, three, two, one, boom. Thank you. Thanks for doing this, man. Really appreciate it.'}

In [10]:
model_id="facebook/blenderbot-400M-distill"
tokenizer = BlenderbotTokenizerFast.from_pretrained(model_id,truncation_side="left")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

In [11]:
data_array=[]
NUM_SAMPLES=3

In [12]:
tokenizer.pad_token=tokenizer.eos_token

In [13]:
for i in range(NUM_SAMPLES,len(dataset['train'])):
  discussion=""
  bot_output=tokenizer.bos_token+dataset['train'][i]["Text"]+tokenizer.eos_token
  for j in reversed(range(i-NUM_SAMPLES,i)):
    discussion=tokenizer.bos_token+dataset['train'][j]["Text"]+tokenizer.eos_token+discussion
    data_array.append([discussion,bot_output])

In [14]:
pd.DataFrame(data_array,columns=["discussion","bot_output"]).to_csv('discussion.csv')

In [15]:
import pandas as pd

df = pd.read_csv('/content/discussion.csv')

In [16]:
df

Unnamed: 0.1,Unnamed: 0,discussion,bot_output
0,0,<s>It's very good to meet you.</s>,<s>Nice to meet you too.</s>
1,1,<s>You're welcome.</s><s>It's very good to mee...,<s>Nice to meet you too.</s>
2,2,"<s>Ah, ha, ha, ha. Four, three, two, one, boom...",<s>Nice to meet you too.</s>
3,3,<s>Nice to meet you too.</s>,<s>And thanks for not lighting this place on f...
4,4,<s>It's very good to meet you.</s><s>Nice to m...,<s>And thanks for not lighting this place on f...
...,...,...,...
5479,5479,<s>You're welcome.</s><s>All you assholes out ...,"<s>All right, thank you.</s>"
5480,5480,"<s>I believe it's true too. So, thank you.</s>...","<s>All right, thank you.</s>"
5481,5481,"<s>All right, thank you.</s>","<s>Good night, everybody. END OF TRANSCRIPTAut..."
5482,5482,"<s>All you assholes out there, be nice. Be nic...","<s>Good night, everybody. END OF TRANSCRIPTAut..."


In [17]:
filepath="/content/discussion.csv"
dataset = load_dataset('csv', data_files=filepath)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-475dbad59e8a0819/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-475dbad59e8a0819/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'discussion', 'bot_output'],
        num_rows: 5484
    })
})

In [19]:
def preprocess_function(example):
  return tokenizer(
    example['discussion'],
    text_target=example['bot_output'],
    padding='max_length',
    max_length=512,
    truncation=True,)

In [20]:
tokenized_dataset=dataset.map(
    preprocess_function,remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/5484 [00:00<?, ? examples/s]

In [21]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5484
    })
})

In [22]:
model = TFBlenderbotForConditionalGeneration.from_pretrained(model_id)

Downloading tf_model.h5:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBlenderbotForConditionalGeneration.

Some layers of TFBlenderbotForConditionalGeneration were not initialized from the model checkpoint at facebook/blenderbot-400M-distill and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model, return_tensors="tf")

In [24]:
tf_train_dataset=tokenized_dataset["train"].to_tf_dataset(
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
)

You're using a BlenderbotTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [25]:
for i in tf_train_dataset.take(1):
  print(i)

{'input_ids': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[   1, 1856, 4931, ...,    2,    2,    2],
       [   1, 3276,   21, ...,    2,    2,    2],
       [   1, 5418,  277, ...,    2,    2,    2],
       [   1, 1683,   19, ...,    2,    2,    2]])>, 'attention_mask': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[   1, 1167,   21, ...,    2,    2,    2],
       [   1,  691,  281, ...,    2,    2,    2],
       [   1, 1445,   21, ...,    2,    2,    2],
       [   1,  880,   21, ...,    2,    2,    2]])>}


In [26]:
tf_train_dataset

<_PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}>

In [27]:
def replacements(a):
  for i in [1,2]:
    condition = tf.equal(a, i)
    case_true = -100*tf.ones_like(a)
    
    case_false = a
    a=tf.where(condition, case_true, case_false)
  return a

In [28]:
a=tf.constant([[1,3,234,445,2,2,2],
               [1,3445,234,34,23,2,2]])
replacements(a)

<tf.Tensor: shape=(2, 7), dtype=int32, numpy=
array([[-100,    3,  234,  445, -100, -100, -100],
       [-100, 3445,  234,   34,   23, -100, -100]], dtype=int32)>

In [29]:
def prepare_labels(inputs):
  return {'input_ids':inputs['input_ids'],
          'attention_mask':inputs['attention_mask'],
          'labels':replacements(inputs['labels'])}

In [30]:
train_dataset=tf_train_dataset.map(prepare_labels)

In [31]:
for i in train_dataset.take(1):
  print(i)

{'input_ids': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[   1, 1397,  319, ...,    2,    2,    2],
       [   1, 1216,  324, ...,    2,    2,    2],
       [   1,  880,   21, ...,    2,    2,    2],
       [   1,  691, 1330, ...,    2,    2,    2]])>, 'attention_mask': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[-100,  397,  709, ..., -100, -100, -100],
       [-100,  553,  396, ..., -100, -100, -100],
       [-100, 1720,  351, ..., -100, -100, -100],
       [-100, 1167,   21, ..., -100, -100, -100]])>}


# Modeling

In [32]:
#model = TFBlenderbotForConditionalGeneration.from_pretrained(model_id)
model.summary()

Model: "tf_blenderbot_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBlenderbotMainLaye  multiple                 364802560 
 r)                                                              
                                                                 
 final_logits_bias (BiasLaye  multiple                 8008      
 r)                                                              
                                                                 
Total params: 364,810,568
Trainable params: 364,802,560
Non-trainable params: 8,008
_________________________________________________________________


In [33]:
num_train_steps=len(tf_train_dataset)
optimizer, schedule = create_optimizer(
  init_lr=6e-5,
  num_warmup_steps=1_000,
  num_train_steps=num_train_steps,
)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [34]:
history=model.fit(train_dataset, epochs=1)



In [38]:
model.save_weights('my_model_weights.h5')

In [39]:
model.load_weights('my_model_weights.h5')

In [51]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Save the model weights to a file on Google Drive
model.save_weights('/content/drive/MyDrive/my_model_weights.h5')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Testing

In [40]:
input_text=tokenizer.bos_token+"Hello Elon glad to have you on my podcast."+tokenizer.eos_token+tokenizer.bos_token+"Thanks for Having me."+tokenizer.eos_token+tokenizer.bos_token+"i heard you are building robots. Tell me more about them."+tokenizer.eos_token+tokenizer.bos_token+"Well... Currently working on a robot which can do all house chores for you "+tokenizer.eos_token+tokenizer.bos_token+"Can this robot be used in Mars?"+tokenizer.eos_token

In [41]:
print(input_text)

<s>Hello Elon glad to have you on my podcast.</s><s>Thanks for Having me.</s><s>i heard you are building robots. Tell me more about them.</s><s>Well... Currently working on a robot which can do all house chores for you </s><s>Can this robot be used in Mars?</s>


In [42]:
history=tokenizer(input_text, return_tensors="tf")

In [46]:
MAX_NEW_TOKENS=20

In [47]:
init_time=time.time()
output=model.generate(**history,max_new_tokens=MAX_NEW_TOKENS,do_sample=True,top_p=0.9)

print(tokenizer.decode(output[0]))
print(time.time()-init_time)

<s> I'm not sure. I don't think so. I think it's probably not. I</s>
17.40380883216858


In [48]:
init_time=time.time()
output_temp = model.generate(**history,max_new_tokens=MAX_NEW_TOKENS,do_sample=True,temperature=1.0, top_k=0)
print(tokenizer.decode(output_temp[0]))
print(time.time()-init_time)

<s> I don't know. Do you want to test it out? Let me know. I'll</s>
9.830459833145142


In [None]:
init_time=time.time()
output_temp = model.generate(**history,max_new_tokens=MAX_NEW_TOKENS, do_sample=True,temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))
print(time.time()-init_time)

In [None]:
init_time=time.time()
output_temp = model.generate(**history,max_new_tokens=MAX_NEW_TOKENS,do_sample=True,temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))
print(time.time()-init_time)

In [None]:
init_time=time.time()
output_topk = model.generate(**history,max_new_tokens=MAX_NEW_TOKENS,do_sample=True,top_k=50)
print(tokenizer.decode(output_topk[0]))
print(time.time()-init_time)

In [50]:
init_time=time.time()
output_topk = model.generate(**history,max_new_tokens=MAX_NEW_TOKENS,do_sample=True,temperature=2.0,top_k=50)
print(tokenizer.decode(output_topk[0]))
print(time.time()-init_time)

<s> Not necessarily. It is probably going to work better with something like Earth. I suggest looking at</s>
10.481997728347778


## Chat

In [58]:
MAX_LENGTH=50
chat_input=""#"<s> A discussion between myself and Elon Musk who thinks his robots can get to mars</s>"

for step in range(10):
  my_text=input(">> Host:")
  new_user_input_ids = tokenizer.encode(
      tokenizer.bos_token+my_text+tokenizer.eos_token,return_tensors='tf')
  if step>0:
    chat_input=chat_input+tokenizer.bos_token+chat_history+tokenizer.eos_token+tokenizer.bos_token+my_text+tokenizer.eos_token
    bot_input_ids = tokenizer.encode(chat_input,return_tensors='tf')
    
  else:
    chat_input=tokenizer.bos_token+my_text+tokenizer.eos_token
    bot_input_ids = tokenizer.encode(chat_input,return_tensors='tf')

  chat_history_ids = model.generate(
      bot_input_ids,max_length=MAX_LENGTH,
      do_sample=True,
      top_p=0.9)
  
  chat_history=tokenizer.decode(chat_history_ids[0],skip_special_tokens=True,)
   
  print(">> Elon Musk: {}".format(tokenizer.decode(chat_history_ids[0], skip_special_tokens=True)))

>> Host:hello elon
>> Elon Musk:  Hello. How are you doing? Do you like music? I love it. I listen to it all the time.
>> Host:i am fine , yes , i like music too
>> Elon Musk:  What kind of music do you like? Rock, Roll, jazz, electronic?
>> Host:i like rock music
>> Elon Musk:  I like rock music too. What's your favorite band? Mine is Led Zeppelin.
>> Host:my favorite band is led zeppelin too
>> Elon Musk:  Led Zeppelin rocks. They're my favorite too. Do you like any other bands?
>> Host:no, i heard you building robots , tell me about those 
>> Elon Musk:  I love Zeppelin. They're one of my favorite bands of all time. What's your favorite song by them?


KeyboardInterrupt: ignored