In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
%cd /content/drive/MyDrive/Transformers/Summarization/

/content/drive/MyDrive/Transformers/Summarization


In [10]:
!pip install transformers
import transformers
import pandas as pd
import re
import torch
torch_device = "cuda" if torch.cuda.is_available() else "cpu"



In [4]:
!ls

crypto_news_parsed_2013-2017_train.csv	OutputDir
Logs					TextSummarization.ipynb


In [5]:
trainData = pd.read_csv("crypto_news_parsed_2013-2017_train.csv")

In [6]:
trainData = trainData[['text','title']]

In [7]:
# check NaN
print("TrainData",trainData.isna().sum(),sep="\n")

TrainData
text     3
title    0
dtype: int64


In [8]:
# we will drop that rows
trainData.dropna(axis=0,how='any',inplace=True)

In [9]:
# check NaN
print("TrainData",trainData.isna().sum(),sep="\n")

TrainData
text     0
title    0
dtype: int64


In [10]:
print(trainData.shape)

(28066, 2)


In [11]:
train = trainData[:5000]
test = trainData[5000:6000]

In [12]:
test.reset_index(drop=True,inplace=True)
train.reset_index(drop=True,inplace=True)

In [13]:
def CleanData(text):
  new_text = " ".join(text.split(". "))
  return new_text

In [14]:
train['text'] = train['text'].apply(lambda text : CleanData(text=text))
test['text'] = test['text'].apply(lambda text : CleanData(text=text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


**DATA PREPARATION**

In [15]:
# start Model
from transformers import BartForConditionalGeneration,BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

In [16]:
def PrepareData(text_lis,label_text):
  input_ids = []
  attention_mask = []
  labels = []


  for text,ltext in zip(text_lis,label_text):
    encoding = tokenizer.encode_plus(text, 
                          add_special_tokens=True,padding="max_length", 
                          truncation=True,max_length=512, 
                          is_split_into_words=False,
                          return_tensors="pt",
                          return_attention_mask=True)
    with tokenizer.as_target_tokenizer():
      lencoding = tokenizer.encode_plus(ltext, 
                          add_special_tokens=True,padding="max_length", 
                          truncation=True,max_length=128, 
                          is_split_into_words=False,
                          return_tensors="pt",
                          return_attention_mask=True)
      
    input_ids.append(encoding['input_ids'])
    attention_mask.append(encoding['attention_mask'])
    labels.append(lencoding['input_ids'])


  input_ids = torch.cat(input_ids,dim=0)
  attention_mask = torch.cat(attention_mask,dim=0)
  labels = torch.cat(labels,dim=0)

  TrainDict = {
      "input_ids":input_ids,
      "attention_mask":attention_mask,
      "labels":labels
      }
  return TrainDict

In [17]:
TrainDict = PrepareData(train['text'],train['title'])
TestDict = PrepareData(test['text'],test['title'])

In [18]:
class CustomDataSet(torch.utils.data.Dataset):
  def __init__(self,TrainDict):
    self.TrainDict = TrainDict

  def __len__(self):
    return len(self.TrainDict['input_ids'])

  def __getitem__(self,index):
    self.input_ids = self.TrainDict['input_ids'][index]
    self.attention_mask = self.TrainDict['attention_mask'][index]
    self.labels = self.TrainDict['labels'][index]

    dic = {
        "input_ids": self.input_ids,
        "attention_mask": self.attention_mask,
        "labels": self.labels
    }

    return dic

In [19]:
TrainDataSet = CustomDataSet(TrainDict)
TestDataSet = CustomDataSet(TestDict)

In [20]:
from torch.utils.data import DataLoader

In [21]:
trainLoader = DataLoader(TrainDataSet)
testLoader = DataLoader(TestDataSet)

**MODEL BUILDING**

In [22]:
from transformers import BartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [23]:
Model = BartForConditionalGeneration.from_pretrained("facebook/bart-large").to(torch_device)
Model.train()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
   

In [24]:
args = Seq2SeqTrainingArguments(output_dir = "OutputDir/",
                                num_train_epochs=2.0,
                                do_train=True, do_eval=True,
                                evaluation_strategy = 'epoch', 
                                per_device_train_batch_size=1,
                                per_device_eval_batch_size=1,
                                learning_rate=5e-05, 
                                logging_dir="Logs/",
                                logging_strategy='epoch',
                                save_strategy='epoch'
                    )

In [25]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}


In [26]:
trainer = Seq2SeqTrainer(
    Model,
    args,
    train_dataset=TrainDataSet,
    eval_dataset=TestDataSet)

In [6]:
# function ClickConnect(){
# console.log("Working"); 
# document.querySelector("colab-toolbar-button#connect").click() 
# }
# setInterval(ClickConnect,60000)

In [28]:
torch.cuda.empty_cache()

In [29]:
trainer.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 10000


Epoch,Training Loss,Validation Loss
1,0.4334,0.271747
2,0.1985,0.241063


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 1
Saving model checkpoint to OutputDir/checkpoint-5000
Configuration saved in OutputDir/checkpoint-5000/config.json
Model weights saved in OutputDir/checkpoint-5000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 1
Saving model checkpoint to OutputDir/checkpoint-10000
Configuration saved in OutputDir/checkpoint-10000/config.json
Model weights saved in OutputDir/checkpoint-10000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10000, training_loss=0.3159576171875, metrics={'train_runtime': 4885.3265, 'train_samples_per_second': 2.047, 'train_steps_per_second': 2.047, 'total_flos': 1.248127352832e+16, 'train_loss': 0.3159576171875, 'epoch': 2.0})

In [30]:
tokenizer.save_pretrained("TokenizerDir/")

tokenizer config file saved in TokenizerDir/tokenizer_config.json
Special tokens file saved in TokenizerDir/special_tokens_map.json


('TokenizerDir/tokenizer_config.json',
 'TokenizerDir/special_tokens_map.json',
 'TokenizerDir/vocab.json',
 'TokenizerDir/merges.txt',
 'TokenizerDir/added_tokens.json')

In [7]:
!ls

OutputDir  Quantized.pt  TextSummarization.ipynb  TokenizerDir


**PREDICTION PIPELINE**

In [2]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
from transformers import BartForConditionalGeneration,BartTokenizer
Model = BartForConditionalGeneration.from_pretrained("OutputDir/checkpoint-10000").to(device)

In [3]:
from transformers import BartTokenizer
tokenizer = BartTokenizer.from_pretrained("TokenizerDir/")

In [4]:
text = "Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the run_clm.py script."

input_ids = []
attention_mask = []

encoding = tokenizer.encode_plus(text, 
                          add_special_tokens=True,padding="max_length", 
                          truncation=True,max_length=512, 
                          is_split_into_words=False,
                          return_tensors="pt",
                          return_attention_mask=True)

input_ids.append(encoding['input_ids'])
attention_mask.append(encoding['attention_mask'])


input_ids = torch.cat(input_ids,dim=0)
attention_mask = torch.cat(attention_mask,dim=0)

In [19]:
data = {
    "input_ids":input_ids.to("cpu"),
    "attention_mask":attention_mask.to("cpu")
}

In [8]:
res = Model.generate(**data,
                max_length=512, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True)



To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [9]:
res

tensor([[    2,     0, 38593, 25016, 22205,  7192,  1058,     2]],
       device='cuda:0')

In [10]:
preds = [tokenizer.decode(res[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)]
# target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in labels]

In [11]:
print(preds,sep="\n")

['Causal Language Model training']


In [50]:
# print(preds,target,sep="\n")

In [12]:
context = [tokenizer.decode(i,skip_special_tokens=True, clean_up_tokenization_spaces=True) for i in input_ids]

In [13]:
print(context,preds,sep="\n")

['Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the run_clm.py script.']
['Causal Language Model training']


**MODEL DYNAMIC QUANTIZATION**


In [23]:
# ALWAYS QUANTIZED ON THE CPU BACKEND... 
quantize_model  = torch.quantization.quantize_dynamic(
    Model.to("cpu"),  # the original model # 
    {torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.float16
    )

In [43]:
# Always Prefered
# torch.save(quantize_model,"QuantizedFloat16.pt")
Quantized =  torch.load("QuantizedFloat16.pt")

In [None]:
# quantize_model.save_pretrained("QuantizedModel/")

In [None]:
# tokenizer.save_pretrained("QuantizedModel/")

In [None]:
# tokenizer = BartTokenizer.from_pretrained("QuantizedModel/")

In [44]:
res = Quantized.generate(**data,
                max_length=128, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True)

In [45]:
preds = [tokenizer.decode(res[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)]

In [46]:
preds[0]

'Causal Language Model training'

In [84]:
%load_ext tensorboard

In [85]:
!ls

OutputDir  Quantized.pt  TextSummarization.ipynb  TokenizerDir


In [None]:
%tensorboard --logdir Logs