Install the Hugging Face transformers and SentencePiece packages, then import the modules used below

In [None]:
!pip install transformers
!pip install SentencePiece
import cProfile
import time
from transformers import AutoTokenizer
from transformers import OpenAIGPTTokenizer
from transformers import XLMRobertaTokenizer

Specify the tokenizer models to be used

In [None]:
# Load the three pretrained tokenizers that the rest of the notebook benchmarks.
# NOTE(review): the BERT tokenizer is obtained through AutoTokenizer while the other two use
# explicitly named tokenizer classes; the concrete implementation returned may differ in kind
# — confirm the three are comparable before comparing timings.

# BERT is using wordpiece tokenization method
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased")
# openai-gpt is using BPE tokenization.
tokenizer_openai = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
# XLM-RoBERTa is using SentencePiece tokenization and supports Unicode
tokenizer_xlmroberta = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

Specify the paths to the two text files and initialize empty lists to store their lines

In [None]:
# Input locations: one text file per data set, read line by line in the next cell.
file_path, file_path2 = 'prompts.txt', 'glassdoor-reviews.txt'

# One independent, initially empty list of lines per input file.
text_lines, text_lines2 = [], []


Open the two text files, read their contents line by line, and store the stripped lines in the lists

In [None]:
# Read both input files, keeping one whitespace-stripped string per line.
# The lists are rebound (not appended to) so that re-running this cell does not
# accumulate duplicate copies of every line.
with open(file_path, 'r', encoding='utf-8') as file:
    text_lines = [line.strip() for line in file]

with open(file_path2, 'r', encoding='utf-8') as file:
    text_lines2 = [line.strip() for line in file]

Tokenizing batched data with the BERT tokenization model and getting the response time with cProfile

In [None]:
# Profile batched BERT encoding of every line in text_lines.
pr = cProfile.Profile()
pr.enable()
try:
    tokenized_sentences_bert = tokenizer_bert.batch_encode_plus(text_lines, add_special_tokens=True)
finally:
    # Stop the profiler even if tokenization raises, so it is not left running in the kernel.
    pr.disable()

# Printed only after profiling has stopped so console I/O is not counted in the encoding stats.
print(tokenized_sentences_bert["input_ids"])
print("Printing stats for encoding batched data")
print("------------------------------------------")
pr.print_stats()

Tokenizing unbatched data with the BERT tokenization model and getting the response time with cProfile

In [None]:
# Profile BERT encoding when each line of text_lines is tokenized with its own call.
# NOTE(review): truncation=True is passed here but not to batch_encode_plus in the batched
# BERT profiling cell, so the two profiles may not cover identical work — confirm intended.
pr = cProfile.Profile()
pr.enable()
try:
    for element in text_lines:
        encoding_unbatched = tokenizer_bert.encode_plus(element, add_special_tokens=True, truncation=True)
finally:
    # Stop the profiler even if tokenization raises, so it is not left running in the kernel.
    pr.disable()

print("Printing stats for encoding unbatched data")
print("------------------------------------------")
pr.print_stats()

Store the token IDs (BERT tokenization method) in a list

In [None]:
# Shallow copy of the per-sentence token-id sequences produced by the batched BERT encoding.
ids_bert = [*tokenized_sentences_bert["input_ids"]]


Detokenizing batched data with the BERT tokenization model and getting the response time with cProfile

In [None]:
# Profile batched BERT decoding of all token-id sequences in ids_bert.
pr = cProfile.Profile()
pr.enable()
try:
    detokenized_sentences_bert = tokenizer_bert.batch_decode(ids_bert)
finally:
    # Stop the profiler even if decoding raises, so it is not left running in the kernel.
    pr.disable()

# Printed only after profiling has stopped so console I/O is not counted in the decoding stats.
print(detokenized_sentences_bert)
print("Printing stats for decoding batched data")
print("------------------------------------------")
pr.print_stats()

Detokenizing unbatched data with the BERT tokenization model and getting the response time with cProfile

In [None]:
# Profile BERT decoding when each token-id sequence in ids_bert is decoded with its own call.
# NOTE(review): skip_special_tokens=True is passed here but not to batch_decode in the batched
# BERT profiling cell, so the two runs do not produce identical text — confirm intended.
decoded_unbatched_bert = []
pr = cProfile.Profile()
pr.enable()
try:
    for element in ids_bert:
        detokenized_sentence_bert = tokenizer_bert.decode(element, skip_special_tokens=True)
        decoded_unbatched_bert.append(detokenized_sentence_bert)
finally:
    # Stop the profiler even if decoding raises, so it is not left running in the kernel.
    pr.disable()

# Results are collected during profiling and printed afterwards so that console I/O
# is not counted in the decoding stats; the printed output is unchanged.
for sentence in decoded_unbatched_bert:
    print(sentence)
print("Printing stats for decoding unbatched data")
print("------------------------------------------")
pr.print_stats()

Tokenizing batched data with the openai-gpt tokenization model and getting the response time with cProfile

In [None]:
# Profile batched openai-gpt encoding of every line in text_lines.
pr = cProfile.Profile()
pr.enable()
try:
    tokenized_sentences_openai = tokenizer_openai.batch_encode_plus(text_lines, add_special_tokens=True)
finally:
    # Stop the profiler even if tokenization raises, so it is not left running in the kernel.
    pr.disable()

print("Printing stats for encoding batched data")
print("------------------------------------------")
pr.print_stats()

Store the token IDs (openai-gpt tokenization method) in a list

In [None]:
# Shallow copy of the per-sentence token-id sequences produced by the batched openai-gpt encoding.
ids_openai = [*tokenized_sentences_openai["input_ids"]]

Tokenizing unbatched data with the openai-gpt tokenization model and getting the response time with cProfile

In [None]:
# Profile openai-gpt encoding when each line of text_lines is tokenized with its own call.
# NOTE(review): truncation=True is passed here but not to batch_encode_plus in the batched
# openai-gpt profiling cell, so the two profiles may not cover identical work — confirm intended.
pr = cProfile.Profile()
pr.enable()
try:
    for element in text_lines:
        encoding_unbatched = tokenizer_openai.encode_plus(element, add_special_tokens=True, truncation=True)
finally:
    # Stop the profiler even if tokenization raises, so it is not left running in the kernel.
    pr.disable()

print("Printing stats for encoding unbatched data")
print("------------------------------------------")
pr.print_stats()

Detokenizing batched data with the openai tokenization model and getting the response time with cProfile

In [None]:
# Profile batched openai-gpt decoding of all token-id sequences in ids_openai.
pr = cProfile.Profile()
pr.enable()
try:
    detokenized_sentences_openai = tokenizer_openai.batch_decode(ids_openai)
finally:
    # Stop the profiler even if decoding raises, so it is not left running in the kernel.
    pr.disable()

# Printed only after profiling has stopped so console I/O is not counted in the decoding stats.
print(detokenized_sentences_openai)
print("Printing stats for decoding batched data")
print("------------------------------------------")
pr.print_stats()

Detokenizing unbatched data with the openai tokenization model and getting the response time with cProfile

In [None]:
# Profile openai-gpt decoding when each token-id sequence in ids_openai is decoded with its own call.
# NOTE(review): skip_special_tokens=True is passed here but not to batch_decode in the batched
# openai-gpt profiling cell, so the two runs may not produce identical text — confirm intended.
decoded_unbatched_openai = []
pr = cProfile.Profile()
pr.enable()
try:
    for element in ids_openai:
        detokenized_sentence_openai = tokenizer_openai.decode(element, skip_special_tokens=True)
        decoded_unbatched_openai.append(detokenized_sentence_openai)
finally:
    # Stop the profiler even if decoding raises, so it is not left running in the kernel.
    pr.disable()

# Results are collected during profiling and printed afterwards so that console I/O
# is not counted in the decoding stats; the printed output is unchanged.
for sentence in decoded_unbatched_openai:
    print(sentence)
print("Printing stats for decoding unbatched data")
print("------------------------------------------")
pr.print_stats()

Tokenizing batched data with the XLM-RoBERTa tokenization model and getting the response time with cProfile

In [None]:
# Profile batched XLM-RoBERTa encoding of every line in text_lines.
pr = cProfile.Profile()
pr.enable()
try:
    tokenized_sentences_xlmroberta = tokenizer_xlmroberta.batch_encode_plus(text_lines, add_special_tokens=True)
finally:
    # Stop the profiler even if tokenization raises, so it is not left running in the kernel.
    pr.disable()

# Printed only after profiling has stopped so console I/O is not counted in the encoding stats.
print(tokenized_sentences_xlmroberta["input_ids"])
print("Printing stats for encoding batched data")
print("------------------------------------------")
pr.print_stats()

Tokenizing unbatched data with the XLM-RoBERTa tokenization model and getting the response time with cProfile

In [None]:
# Profile XLM-RoBERTa encoding when each line of text_lines is tokenized with its own call.
# NOTE(review): truncation=True is passed here but not to batch_encode_plus in the batched
# XLM-RoBERTa profiling cell, so the two profiles may not cover identical work — confirm intended.
pr = cProfile.Profile()
pr.enable()
try:
    for element in text_lines:
        encoding_unbatched = tokenizer_xlmroberta.encode_plus(element, add_special_tokens=True, truncation=True)
finally:
    # Stop the profiler even if tokenization raises, so it is not left running in the kernel.
    pr.disable()

print("Printing stats for encoding unbatched data")
print("------------------------------------------")
pr.print_stats()

Store the token IDs (XLM-RoBERTa tokenization method) in a list

In [None]:
# Shallow copy of the per-sentence token-id sequences produced by the batched XLM-RoBERTa encoding.
ids_xlmroberta = [*tokenized_sentences_xlmroberta["input_ids"]]

Detokenizing batched data with the XLM-RoBERTa tokenization model and getting the response time with cProfile

In [None]:
# Profile batched XLM-RoBERTa decoding of all token-id sequences in ids_xlmroberta.
pr = cProfile.Profile()
pr.enable()
try:
    detokenized_sentences_xlmroberta = tokenizer_xlmroberta.batch_decode(ids_xlmroberta)
finally:
    # Stop the profiler even if decoding raises, so it is not left running in the kernel.
    pr.disable()

# Printed only after profiling has stopped so console I/O is not counted in the decoding stats.
print(detokenized_sentences_xlmroberta)
print("Printing stats for decoding batched data")
print("------------------------------------------")
pr.print_stats()

Detokenizing unbatched data with the XLM-RoBERTa tokenization model and getting the response time with cProfile

In [None]:
# Profile XLM-RoBERTa decoding when each token-id sequence in ids_xlmroberta is decoded with its own call.
# NOTE(review): skip_special_tokens=True is passed here but not to batch_decode in the batched
# XLM-RoBERTa profiling cell, so the two runs do not produce identical text — confirm intended.
decoded_unbatched_xlmroberta = []
pr = cProfile.Profile()
pr.enable()
try:
    for element in ids_xlmroberta:
        # Singular name: the per-sentence result must not overwrite the list that the
        # batched cell stores in detokenized_sentences_xlmroberta.
        detokenized_sentence_xlmroberta = tokenizer_xlmroberta.decode(element, skip_special_tokens=True)
        decoded_unbatched_xlmroberta.append(detokenized_sentence_xlmroberta)
finally:
    # Stop the profiler even if decoding raises, so it is not left running in the kernel.
    pr.disable()

# Results are collected during profiling and printed afterwards so that console I/O
# is not counted in the decoding stats; the printed output is unchanged.
for sentence in decoded_unbatched_xlmroberta:
    print(sentence)
print("Printing stats for decoding unbatched data")
print("------------------------------------------")
pr.print_stats()

Use time module to get the response time of tokenizing batched data using the BERT tokenization model

In [None]:
# Wall-clock timing of batched BERT encoding.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
tokenized_sentences = tokenizer_bert.batch_encode_plus(text_lines, add_special_tokens=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for encoding batched data with BERT tokenization model: {response_time_encode} seconds")

Use time module to get the response time of tokenizing unbatched data using the BERT tokenization model

In [None]:
# Wall-clock timing of BERT encoding with one encode_plus call per line.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
for element in text_lines:
    encoding_unbatched = tokenizer_bert.encode_plus(element, add_special_tokens=True, truncation=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for encoding unbatched data with BERT tokenization model: {response_time_encode} seconds")

Use time module to get the response time of tokenizing batched data using the openAI tokenization model

In [None]:
# Wall-clock timing of batched openai-gpt encoding.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
tokenized_sentences_openai = tokenizer_openai.batch_encode_plus(text_lines, add_special_tokens=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for encoding batched data with openAI tokenization model: {response_time_encode} seconds")

Use time module to get the response time of tokenizing unbatched data using the openAI tokenization model

In [None]:
# Wall-clock timing of openai-gpt encoding with one encode_plus call per line.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
for element in text_lines:
    encoding_unbatched = tokenizer_openai.encode_plus(element, add_special_tokens=True, truncation=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for encoding unbatched data with openAI tokenization model: {response_time_encode} seconds")

Use time module to get the response time of tokenizing batched data using the XLMRoberta tokenization model

In [None]:
# Wall-clock timing of batched XLM-RoBERTa encoding.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
tokenized_sentences_xlmroberta = tokenizer_xlmroberta.batch_encode_plus(text_lines, add_special_tokens=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for encoding batched data with XLMRoberta tokenization model: {response_time_encode} seconds")

Use time module to get the response time of tokenizing unbatched data using the XLMRoberta tokenization model

In [None]:
# Wall-clock timing of XLM-RoBERTa encoding with one encode_plus call per line.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
for element in text_lines:
    encoding_unbatched = tokenizer_xlmroberta.encode_plus(element, add_special_tokens=True, truncation=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for encoding unbatched data with XLMRoberta tokenization model: {response_time_encode} seconds")

Use time module to get the response time of detokenizing batched data using the BERT tokenization model

In [None]:
# Wall-clock timing of batched BERT decoding of ids_bert.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
detokenized_sentences_bert = tokenizer_bert.batch_decode(ids_bert)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for detokenizing batched data with BERT tokenization model: {response_time_encode} seconds")

Use time module to get the response time of detokenizing unbatched data using the BERT tokenization model

In [None]:
# Wall-clock timing of BERT decoding with one decode call per token-id sequence.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
for element in ids_bert:
    detokenized_sentence_bert = tokenizer_bert.decode(element, skip_special_tokens=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for detokenizing unbatched data with BERT tokenization model: {response_time_encode} seconds")

Use time module to get the response time of detokenizing batched data using the openAI tokenization model

In [None]:
# Wall-clock timing of batched openai-gpt decoding of ids_openai.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
detokenized_sentences_openai = tokenizer_openai.batch_decode(ids_openai)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for detokenizing batched data with openAI tokenization model: {response_time_encode} seconds")

Use time module to get the response time of detokenizing unbatched data using the openAI tokenization model

In [None]:
# Wall-clock timing of openai-gpt decoding with one decode call per token-id sequence.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
for element in ids_openai:
    detokenized_sentence_openai = tokenizer_openai.decode(element, skip_special_tokens=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
print(f"Response time for detokenizing unbatched data with openAI tokenization model: {response_time_encode} seconds")

Use time module to get the response time of detokenizing batched data using the XLMRoberta tokenization model

In [None]:
# Wall-clock timing of batched XLM-RoBERTa decoding of ids_xlmroberta.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
detokenized_sentences_xlmroberta = tokenizer_xlmroberta.batch_decode(ids_xlmroberta)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
# The message names what is actually measured here: batched decoding with the XLM-RoBERTa tokenizer.
print(f"Response time for detokenizing batched data with XLMRoberta tokenization model: {response_time_encode} seconds")

Use time module to get the response time of detokenizing unbatched data using the XLMRoberta tokenization model

In [None]:
# Wall-clock timing of XLM-RoBERTa decoding with one decode call per token-id sequence.
# perf_counter() is used instead of time() because it is the clock intended for
# measuring short durations.
start_time_encode = time.perf_counter()
for element in ids_xlmroberta:
    # Singular name: the per-sentence result must not overwrite the list that the
    # batched cells store in detokenized_sentences_xlmroberta.
    detokenized_sentence_xlmroberta = tokenizer_xlmroberta.decode(element, skip_special_tokens=True)
end_time_encode = time.perf_counter()

response_time_encode = end_time_encode - start_time_encode
# The message names what is actually measured here: the XLM-RoBERTa tokenizer, not openAI.
print(f"Response time for detokenizing unbatched data with XLMRoberta tokenization model: {response_time_encode} seconds")