In [196]:
import os
import re
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torch.utils.tensorboard import SummaryWriter
from tensorboardX import SummaryWriter
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm.notebook import tqdm
from scipy.special import softmax
# All computation runs on the CPU; the model and the encoded tensors are moved
# to this device with `.to(device)` further down.
device = torch.device('cpu')

In [None]:
!pip install transformers

### Importing the dataset

In [174]:
# Alternative corpus, currently disabled:
# cnn_news = pd.read_csv('cnn_news.csv')

# Raw news-summary table; kept around as `data0` because later cells read from it.
data0 = pd.read_csv('news_summary.csv', encoding='iso-8859-1')

# Working frame: drop the metadata columns and rename so that `text` is the
# full article (`ctext`) and `summary` is the short text (`text`).
data = (
    data0
    .drop(columns=['author', 'date', 'headlines', 'read_more'])
    .rename(columns={'text': 'summary', 'ctext': 'text'})
)

In [176]:
# Temporary: restrict the working frame to its first 1000 rows.
data = data.head(1000)

In [177]:
# Inputs are the full articles, targets are their summaries.
X = data.text
y = data.summary

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Initializing the model and training

In [178]:
# Pretrained 't5-small' tokenizer and sequence-to-sequence model.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=3e-5)


In [179]:
#Training the model (1 loop for now)
# One (article, summary) pair is processed per optimizer step (batch size 1).
pad_id = tokenizer.pad_token_id
for epoch in range(1):
    model.train()
    for X_i, y_i in tqdm(zip(X_train, y_train), total=len(X_train)):
        # str(...) guards against non-string cells (e.g. NaN) in the CSV.
        X_i = tokenizer.encode(str(X_i), return_tensors='pt', max_length=512, pad_to_max_length=True).to(device)
        y_i = tokenizer.encode(str(y_i), return_tensors='pt', max_length=512, pad_to_max_length=True).to(device)

        # Both sequences are padded to 512 tokens. Tell the encoder which
        # positions are real tokens so it does not attend to padding.
        attention_mask = (X_i != pad_id).long()
        # Exclude padded target positions from the loss (-100 is the ignore
        # index of the cross-entropy loss); otherwise the loss is dominated
        # by predicting padding.
        y_i = y_i.masked_fill(y_i == pad_id, -100)

        loss = model(input_ids=X_i, attention_mask=attention_mask, lm_labels=y_i)[0]
        loss.backward()
        optimizer.step()
        # The optimizer holds every model parameter, so this clears all grads;
        # a separate model.zero_grad() is not needed.
        optimizer.zero_grad()

    # Leave the model in inference mode (disables dropout) after each epoch.
    model.eval()


HBox(children=(FloatProgress(value=0.0, max=800.0), HTML(value='')))




### Saving and loading the model

In [180]:
# NOTE: despite its name, `directory` is the path of the checkpoint *file*.
# torch.save writes a single file, and the loading cell reads this same path
# with torch.load. Creating a directory at this path (as was done before)
# makes torch.save fail because it cannot open a directory for writing.
directory = 't5-finetuned_2'
if os.path.isdir(directory):
    # Clean up an empty directory left behind by an earlier run. os.rmdir
    # refuses to delete a non-empty directory, so nothing of value is lost.
    os.rmdir(directory)
torch.save(model, directory)


In [181]:
# Reload the complete pickled model object from the checkpoint file.
# map_location places the weights on the configured device regardless of the
# device they were saved from.
# NOTE(security): torch.load unpickles the file -- only load checkpoints you trust.
model = torch.load('t5-finetuned_2', map_location=device)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

### Generating a summary for a sample text

In [195]:
def smry_gen(text, max_input_length=512, max_summary_length=200, num_beams=4):
    """Generate a summary of ``text`` with the notebook-level ``model``.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    text : object
        Text to summarise. It is passed through ``str()`` first, the same way
        the training loop treats its inputs, so a non-string cell value
        (e.g. NaN from the CSV) does not reach the tokenizer as a float.
    max_input_length : int, default 512
        ``max_length`` passed to ``tokenizer.encode`` for the input text.
    max_summary_length : int, default 200
        ``max_length`` passed to ``model.generate``.
    num_beams : int, default 4
        Number of beams used for beam search.

    Returns
    -------
    str
        The first generated sequence, decoded with special tokens removed.
    """
    text_token = tokenizer.encode(
        str(text), return_tensors='pt', max_length=max_input_length
    ).to(device)
    # generate() returns a batch of sequences; only one input is given, so take index 0.
    smry = model.generate(
        text_token,
        max_length=max_summary_length,
        num_beams=num_beams,
        no_repeat_ngram_size=3,  # never repeat the same 3-gram in the output
    )[0]
    return tokenizer.decode(smry, skip_special_tokens=True)
    

In [215]:
# Row 1001 of the raw frame lies outside the first 1000 rows used for
# training/testing, so the model has not seen it.
sample_text = data0['ctext'][1001]

print(f'The sample text: \n{sample_text}')
print('=' * 100 + '\n')
print(f'The generated summary: \n{smry_gen(sample_text)}')
print('=' * 100 + '\n')
print(f"The original highlight/summary of the text: \n{data0['text'][1001]}")


The sample text: 
Over 300 commissioners of the Income Tax department have been transferred recently by the government, a major reshuffle seen as an attempt to fine tune the working of the department.The Central Board of Direct Taxes (CBDT), which frames policy for the tax department, has recently brought out two lists ? first transferring 80 and subsequently 245 commissioner rank officials across the country.Just few days before this, on May 31, the CBDT also transferred over 50 chief commissioners of the department across the country.A senior department official said these three lists in quick intervals can be billed as the largest scale of transfers done in the department in the last few years.?The 245 commissioners transfer list published recently is the largest amongst the three lists. Such huge numbers were never heard of, even as the lists have come during the usual transfers season due every year. The aim is to ensure better work productivity and transparency in the department 