In [None]:
!pip install transformers
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTNeoForCausalLM
sns.set_style('whitegrid')
from sklearn.utils import shuffle
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

from transformers import Trainer,TrainingArguments

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.5 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uni

In [None]:
# torch.cuda.get_device_name(0)
import csv

In [None]:
# Load the quotes CSV for interactive inspection.
# Assumes Google Drive is already mounted at /content/drive (no mount call
# appears in this notebook). load_dataset() below reads the same file again.
df = pd.read_csv('/content/drive/MyDrive/Dataset/final_data.csv', engine='python')

In [None]:
# Preview the first rows; the columns used later are `Quote` and `cat_lab`.
df.head()

Unnamed: 0.1,Unnamed: 0,Quote,cat_lab
0,0,"If my favorite three letters are X, Z, and Q, ...",3
1,1,S is for SCARY! Fear is driven out by action! ...,1
2,2,What's the whole point of being pretty on the ...,0
3,3,Let me never fall into the vulgar mistake of d...,1
4,4,Love and compassion are the mother and father ...,1


In [None]:
# Spot-check a single quote by row position (7069 looks like an arbitrary sample index).
df['Quote'].iloc[7069]

'The referees made a questionable call. Still, I was ready with an answer and picked up on the first ring. That ring was an engagement ring, and I said yes.'

In [None]:
class QuoteDataset(Dataset):
    """Pre-tokenized (category, quote) prompts for causal-LM fine-tuning.

    Each pair is rendered as
    ``<startoftext>Category: {category}\\nQuote: {quote}<endoftext>`` and passed
    to ``tokenizer``; the resulting ids and attention mask are stored as tensors.

    Args:
        category_list: iterable of category labels, aligned with ``quote_list``.
        quote_list: iterable of quote texts.
        tokenizer: callable that, given one string, returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: forwarded to the tokenizer as the truncation/padding length.
    """

    def __init__(self, category_list, quote_list, tokenizer, max_length):
        # `labels` is created but never filled by this class.
        self.input_ids, self.attn_masks, self.labels = [], [], []

        for pair in zip(category_list, quote_list):
            encoded = tokenizer(self._render(*pair),
                                truncation=True,
                                max_length=max_length,
                                padding="max_length")
            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))

    @staticmethod
    def _render(category, quote):
        """Build the training prompt text for one example."""
        return f"<startoftext>Category: {category}\nQuote: {quote}<endoftext>"

    def __len__(self):
        """Number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]


In [None]:
def load_dataset(tokenizer,
                 csv_path='/content/drive/MyDrive/Dataset/final_data.csv',
                 max_length=120,
                 test_size=0.05,
                 random_state=1):
    """Read the quotes CSV and build train/test ``QuoteDataset`` objects.

    Args:
        tokenizer: tokenizer handed to ``QuoteDataset`` to encode each example.
        csv_path: CSV file with a ``cat_lab`` (category label) column and a
            ``Quote`` (text) column.
        max_length: token length passed through to ``QuoteDataset``.
        test_size: fraction of rows held out for the test split.
        random_state: seed for the shuffled split, for reproducibility.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    quotes_df = pd.read_csv(csv_path, engine='python')

    # Categories and quotes are split together so each quote keeps its label.
    train_categories, test_categories, train_quotes, test_quotes = train_test_split(
        quotes_df['cat_lab'].tolist(),
        quotes_df['Quote'].tolist(),
        shuffle=True,
        test_size=test_size,
        random_state=random_state)

    train_dataset = QuoteDataset(train_categories, train_quotes, tokenizer,
                                 max_length=max_length)
    test_dataset = QuoteDataset(test_categories, test_quotes, tokenizer,
                                max_length=max_length)

    return train_dataset, test_dataset

In [None]:
# Model and tokenizer setup

torch.manual_seed(42)
model_name = 'EleutherAI/gpt-neo-125M'

# The special-token strings must match, character for character, the markers
# QuoteDataset writes around every example ("<startoftext>" ... "<endoftext>").
# A missing ">" in eos_token means the end marker is never encoded as one token
# and a stray ">" leaks into decoded samples.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          bos_token="<startoftext>",
                                          eos_token="<endoftext>",
                                          pad_token='<pad>')

model = GPTNeoForCausalLM.from_pretrained(model_name).cuda()
# Grow the embedding matrix to cover the special tokens added above.
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50260, 768)

In [None]:
# Build the tokenized train and held-out datasets from the CSV.
train_dataset, test_dataset = load_dataset(tokenizer)


In [None]:
# Hyper-parameters for the Hugging Face Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir='results',
    logging_dir='logs',
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=2,
    warmup_steps=100,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy='epoch',
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
def collate_quotes(batch):
    """Turn a list of ``(input_ids, attention_mask)`` pairs into a model batch.

    Labels are a copy of ``input_ids`` in which every padded position
    (attention mask 0) is replaced by -100, the label index the Hugging Face
    causal-LM loss ignores. Without this the loss is also computed over the
    ``<pad>`` filler that follows each quote.

    Args:
        batch: list of ``(input_ids, attention_mask)`` tensor pairs of equal length.

    Returns:
        Dict with stacked ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """
    input_ids = torch.stack([example[0] for example in batch])
    attention_mask = torch.stack([example[1] for example in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}


quote_trainer = Trainer(model=model,
                        args=training_args,
                        train_dataset=train_dataset,
                        eval_dataset=test_dataset,
                        data_collator=collate_quotes)

# `trainer` keeps its original binding: the object returned by .train(),
# not the Trainer instance itself.
trainer = quote_trainer.train()

***** Running training *****
  Num examples = 15741
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3936


Epoch,Training Loss,Validation Loss
1,0.953,0.939793
2,0.8277,0.92932


***** Running Evaluation *****
  Num examples = 829
  Batch size = 8
Saving model checkpoint to results/checkpoint-1968
Configuration saved in results/checkpoint-1968/config.json
Model weights saved in results/checkpoint-1968/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 829
  Batch size = 8
Saving model checkpoint to results/checkpoint-3936
Configuration saved in results/checkpoint-3936/config.json
Model weights saved in results/checkpoint-3936/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-3936 (score: 0.9293197989463806).


In [None]:
# Sampling from the fine-tuned model
# Category ids used in the training data:
#   0 = humor, 1 = inspirational, 2 = life, 3 = love
category = 3  # change and re-run this cell to sample a different category

_ = model.eval()
prompt = tokenizer(f"<startoftext>Category: {category}\nQuote:", return_tensors="pt")
generated = prompt.input_ids.cuda()

# `max_length` is the keyword generate() understands; the original `max_len`
# is not a generate() argument, so the default length applied and quotes came
# out cut off. Passing the tokenizer's own eos/pad ids lets each sample stop at
# the end-of-quote marker instead of the base model's default id.
sample_output = model.generate(generated,
                               attention_mask=prompt.attention_mask.cuda(),
                               do_sample=True,
                               max_length=512,
                               temperature=0.9,
                               top_p=0.9,
                               top_k=50,
                               num_return_sequences=10,
                               eos_token_id=tokenizer.eos_token_id,
                               pad_token_id=tokenizer.pad_token_id)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# sample_output

In [None]:
# Inspirational (category 1)
# NOTE(review): this loop only decodes whatever `sample_output` currently holds.
# The generation cell above is set up for category 3, so it has to be switched
# to category 1 and re-run before this cell for the printed quotes to match.
for x in sample_output:
  output = tokenizer.decode(x,skip_special_tokens=True)
  print(output)

Category: 1
Quote: I think that the way to get out of a dark place is
Category: 1
Quote: Love is like a flower, blooming like a leaf.
Category: 1
Quote: I don't know if I would die alone... But it is
Category: 1
Quote: Love makes you stronger and more steadfast, stronger as a spirit,
Category: 1
Quote: Every man, every woman, is a woman...>
Category: 1
Quote: You can always be what you are because you think you are.
Category: 1
Quote: Life is a matter of letting go, the decision to let go
Category: 1
Quote: No man's life is worth living; it's worth waiting for
Category: 1
Quote: I’m a writer and I’m not.
Category: 1
Quote: The people who have to go to this place of worship are their


In [None]:
# Love (category 3)
# NOTE(review): decodes the current contents of `sample_output`; this matches
# the generation cell above only while that cell is set to category 3.
for x in sample_output:
  output = tokenizer.decode(x,skip_special_tokens=True)
  print(output)

Category: 3
Quote: He who dares to be loved is a liar.>
Category: 3
Quote: I had a dream about you. You were a ghost. You
Category: 3
Quote: If you were going to make love to me, I'd rather
Category: 3
Quote: Love is like a butterfly. It is the most beautiful thing in
Category: 3
Quote: When a man is in love with another, it is not because
Category: 3
Quote: We were never a part of your life. But as you get
Category: 3
Quote: If I ever had a child, I’d probably have
Category: 3
Quote: I am as happy as you are. You have no idea how
Category: 3
Quote: The best thing to do in life is to love yourself.
Category: 3
Quote: Love is a form of communication that allows you to communicate more deeply


In [None]:
# Life (category 2)
# NOTE(review): this loop only decodes whatever `sample_output` currently holds.
# The generation cell above is set up for category 3, so it has to be switched
# to category 2 and re-run before this cell for the printed quotes to match.
for x in sample_output:
  output = tokenizer.decode(x,skip_special_tokens=True)
  print(output)
  

Category: 2
Quote: To find your destiny, you must know where it is.
Category: 2
Quote: What we are and where we are is what we're made of
Category: 2
Quote: You've got to be brave enough to let your life go and
Category: 2
Quote: You can never be too busy to be too busy.>
Category: 2
Quote: I don't want to be a part of your life, but
Category: 2
Quote: We must love the future and the past, and the present,
Category: 2
Quote: I can't accept people who don't get it. I can
Category: 2
Quote: When we were young, we thought we had all the same power
Category: 2
Quote: All you need is one thing and one person.>
Category: 2
Quote: The love of life is a universal love.>


In [None]:
# Humor (category 0)
# NOTE(review): this loop only decodes whatever `sample_output` currently holds.
# The generation cell above is set up for category 3, so it has to be switched
# to category 0 and re-run before this cell for the printed quotes to match.
for x in sample_output:
  output = tokenizer.decode(x,skip_special_tokens=True)
  print(output)

Category: 0
Quote: I think, if people are going to try to define what they
Category: 0
Quote: Do you know how long I’ve been in love?"
Category: 0
Quote: If we had a man to turn to, he would be the
Category: 0
Quote: If you can't find me, I will not be able to
Category: 0
Quote: I think I can go through a life of a thousand or fifty
Category: 0
Quote: I don't like to be taken advantage of. I don't
Category: 0
Quote: I love the irony of life.>
Category: 0
Quote: I am a true believer, but not a true believer enough to
Category: 0
Quote: I like it when you say things like 'You can't know
Category: 0
Quote: You know the kind of guy who won't let a knife get


In [None]:
from google.colab import files

In [None]:
# NOTE(review): the training log in this notebook only reports checkpoint-1968
# and checkpoint-3936 being saved, so this checkpoint-2624 path looks stale. It
# also names a checkpoint directory rather than a single file. Confirm before
# relying on this cell.
files.download('/content/results/checkpoint-2624')

In [None]:
# Archive the checkpoint-3936 directory into a single zip file.
!zip -r /content/results/checkpoint-3936.zip /content/results/checkpoint-3936

  adding: content/results/checkpoint-3936/ (stored 0%)
  adding: content/results/checkpoint-3936/optimizer.pt (deflated 9%)
  adding: content/results/checkpoint-3936/scheduler.pt (deflated 50%)
  adding: content/results/checkpoint-3936/config.json (deflated 57%)
  adding: content/results/checkpoint-3936/rng_state.pth (deflated 27%)
  adding: content/results/checkpoint-3936/trainer_state.json (deflated 86%)
  adding: content/results/checkpoint-3936/pytorch_model.bin (deflated 16%)
  adding: content/results/checkpoint-3936/training_args.bin (deflated 49%)


In [None]:
# Download the archive produced by the zip cell above. That cell writes
# checkpoint-3936.zip; no checkpoint-2624.zip is created in this notebook.
files.download("/content/results/checkpoint-3936.zip")

In [None]:
# Keep a copy of the archive on Google Drive (assumes Drive is mounted at /content/drive).
!cp /content/results/checkpoint-3936.zip /content/drive/MyDrive

In [None]:
pip install happytransformer

Collecting happytransformer
  Downloading happytransformer-2.3.1-py3-none-any.whl (44 kB)
[?25l[K     |███████▎                        | 10 kB 24.2 MB/s eta 0:00:01[K     |██████████████▋                 | 20 kB 23.5 MB/s eta 0:00:01[K     |██████████████████████          | 30 kB 16.5 MB/s eta 0:00:01[K     |█████████████████████████████▎  | 40 kB 14.5 MB/s eta 0:00:01[K     |████████████████████████████████| 44 kB 2.1 MB/s 
Collecting datasets>=1.6.0
  Downloading datasets-1.14.0-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 6.8 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.7 MB/s 
Collecting transformers>=4.4.0
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 30.5 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x

In [None]:
from happytransformer import HappyGeneration, GENTrainArgs

In [None]:
# Second experiment: load the base GPT-Neo 125M checkpoint through Happy Transformer's generation wrapper.
gpt_neo = HappyGeneration("GPT-Neo", "EleutherAI/gpt-neo-125M") 

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

10/25/2021 14:18:02 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [None]:
# Training settings for the Happy Transformer run: one epoch, learning rate 2e-05, batch size 2.
train_args = GENTrainArgs(num_train_epochs=1, learning_rate=2e-05, batch_size=2)

In [None]:
# Fine-tune on the plain-text training file.
# NOTE(review): the recorded run of this cell stops with a RuntimeError whose
# traceback is not preserved in the saved output, so the cause is unknown —
# TODO reproduce. The model fine-tuned earlier in this notebook was moved to
# the GPU and is never released, so GPU memory is one thing to check.
gpt_neo.train("/content/drive/MyDrive/Dataset/train.txt", args=train_args)

10/25/2021 14:18:34 - INFO - happytransformer.happy_transformer -   Preprocessing dataset...


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-8abe49253d6396d6/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-8abe49253d6396d6/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

10/25/2021 14:18:42 - INFO - happytransformer.happy_transformer -   Training...
***** Running training *****
  Num examples = 296
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 148


RuntimeError: ignored