In [1]:
!pip install evaluate




In [1]:
import re
import nltk
from nltk.corpus import stopwords
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline  
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset, DatasetDict
from evaluate import load


In [2]:
train_df = pd.read_csv(r"C:\Users\Rishabh\Desktop\Samsum\samsum-train.csv")
test_df = pd.read_csv(r"C:\Users\Rishabh\Desktop\Samsum\samsum-test.csv")
val_df = pd.read_csv(r"C:\Users\Rishabh\Desktop\Samsum\samsum-validation.csv")

In [3]:
train_df.dropna(axis = 0, inplace = True)

In [4]:
model_checkpoint = 't5-small'

In [5]:
def cleanse(text):
    clean = re.compile('<.*?>')
    clean = re.sub(clean, '', text)

    clean = '\n'.join([line for line in clean.split('\n') if not re.match('.*:\s*$', line)])

    return clean

In [6]:
train_df['dialogue'] = train_df['dialogue'].apply(cleanse)
val_df['dialogue'] = val_df['dialogue'].apply(cleanse)
test_df['dialogue'] = test_df['dialogue'].apply(cleanse)

In [7]:
train_df['summary'] = train_df['summary'].apply(cleanse)
val_df['summary'] = val_df['summary'].apply(cleanse)
test_df['summary'] = test_df['summary'].apply(cleanse)

In [8]:
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

In [9]:
dataset_dict = {
    'train' : train_dataset,
    'test': test_dataset,
    'validation': val_dataset
}

data = DatasetDict(dataset_dict)

In [10]:
for _ in range(5):
    idx = random.randint(0, 2000)
    sample = data["train"][idx]
    
    print("")
    print("="*100)
    print(sample["dialogue"])
    print("-"*100)
    print(sample["summary"])
    print("")


Dean: How are tou Laura?
Dean: *you
Laura: fine thanks, what's up with you? :)
Dean: just studying, studying
Dean: and then some studying :D
Laura: sounds exciting :P
Dean: not too much
Laura: :D
Dean: which is why I wanted to ask if you's like to have dinner with me tonight?
Laura: That's sweet thank you
Dean: buuut? :D
Laura: I promised my mum I would help her tonight
Dean: oh
Laura: tomorrow?
Dean: great!
Laura: :)
----------------------------------------------------------------------------------------------------
Dean is studying a lot so he wanted to invite Laura for dinner tonight. Laura promised she would help her mum tonight. Laura and Dean will have dinner tomorrow. 


Jimmy: How are you?
Greg: I'm wearing a cast :D
Jimmy: seriously? fuuck
Jimmy: do they even let you in when you're that drunk?
Greg: they do and they hydrate you so you're not hungover the next day :D
Jimmy: you fucker
Greg: :D:D:D
-----------------------------------------------------------

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
tokenizer('Hello This is Hakim..')

{'input_ids': [8774, 100, 19, 1626, 19754, 5, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
prefix = 'summarize:'

In [14]:
max_dialogue_length = 1024
max_summary_length = 128

In [15]:

def preprocess_function(samples):
    inputs = [prefix + doc for doc in samples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_dialogue_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(samples["summary"], max_length=max_summary_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [16]:
preprocess_function(data['train'][0])

{'input_ids': [[21603, 10, 188, 1], [21603, 10, 51, 1], [21603, 10, 9, 1], [21603, 10, 29, 1], [21603, 10, 26, 1], [21603, 10, 9, 1], [21603, 10, 10, 1], [21603, 10, 1], [21603, 10, 196, 1], [21603, 10, 1], [21603, 10, 115, 1], [21603, 10, 9, 1], [21603, 10, 157, 1], [21603, 10, 15, 1], [21603, 10, 26, 1], [21603, 10, 1], [21603, 10, 1], [21603, 10, 75, 1], [21603, 10, 32, 1], [21603, 10, 32, 1], [21603, 10, 157, 1], [21603, 10, 23, 1], [21603, 10, 15, 1], [21603, 10, 7, 1], [21603, 10, 5, 1], [21603, 10, 1], [21603, 10, 308, 1], [21603, 10, 32, 1], [21603, 10, 1], [21603, 10, 63, 1], [21603, 10, 32, 1], [21603, 10, 76, 1], [21603, 10, 1], [21603, 10, 210, 1], [21603, 10, 9, 1], [21603, 10, 29, 1], [21603, 10, 17, 1], [21603, 10, 1], [21603, 10, 7, 1], [21603, 10, 32, 1], [21603, 10, 51, 1], [21603, 10, 15, 1], [21603, 10, 58, 1], [21603, 10, 1], [21603, 10, 1], [21603, 10, 683, 1], [21603, 10, 15, 1], [21603, 10, 52, 1], [21603, 10, 52, 1], [21603, 10, 63, 1], [21603, 10, 10, 1], [216

In [17]:
final_data = data.map(preprocess_function, batched =True,
                     remove_columns=['id', 'dialogue', 'summary'])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [19]:
batch_size = 1
epochs = 4
model_name = f"{model_checkpoint}-transcript-summarizer"
args = Seq2SeqTrainingArguments(
    model_name,                                                        
    evaluation_strategy="epoch",                                       
    learning_rate=2e-5,                            
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01, 
    save_total_limit=3,                      
    # Save the model only 3 times
    num_train_epochs=epochs,                                           
    predict_with_generate=True,                                        
    fp16=False,                                                         
    push_to_hub=False                                                  
)

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [24]:
!pip install rouge_score



In [21]:
metric = load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key : value * 100 for key, value in result.items()}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v,4) for k,v in result.items()}

In [22]:
trainer  = Seq2SeqTrainer(model,
                         args,
                         train_dataset=final_data['train'],
                         eval_dataset=final_data['validation'],
                         tokenizer= tokenizer,
                          compute_metrics= compute_metrics
                         )

In [23]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rishabh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
trainer.train()

***** Running training *****
  Num examples = 2000
  Num Epochs = 4
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 8000


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.5468,0.459514,36.8378,15.5057,31.0153,34.1198,14.9095
2,0.4969,0.439746,38.0298,16.2199,32.1913,35.4955,15.9108
3,0.4919,0.430752,38.1414,16.2867,32.389,35.491,15.324
4,0.4717,0.428678,38.1536,16.2777,32.2962,35.4995,15.5611


Saving model checkpoint to t5-small-transcript-summarizer\checkpoint-500
Configuration saved in t5-small-transcript-summarizer\checkpoint-500\config.json
Model weights saved in t5-small-transcript-summarizer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in t5-small-transcript-summarizer\checkpoint-500\tokenizer_config.json
Special tokens file saved in t5-small-transcript-summarizer\checkpoint-500\special_tokens_map.json
Copy vocab file to t5-small-transcript-summarizer\checkpoint-500\spiece.model
Deleting older checkpoint [t5-small-transcript-summarizer\checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to t5-small-transcript-summarizer\checkpoint-1000
Configuration saved in t5-small-transcript-summarizer\checkpoint-1000\config.json
Model weights saved in t5-small-transcript-summarizer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in t5-small-transcript-summarizer\checkpoint-1000\tokenizer_config.json
Special tokens file saved in t5-sma

TrainOutput(global_step=8000, training_loss=0.5627898540496826, metrics={'train_runtime': 25115.0168, 'train_samples_per_second': 0.319, 'train_steps_per_second': 0.319, 'total_flos': 321910474801152.0, 'train_loss': 0.5627898540496826, 'epoch': 4.0})

In [25]:
results = trainer.evaluate(final_data['test'])

***** Running Evaluation *****
  Num examples = 819
  Batch size = 1


In [26]:
for k,v in results.items():
    print(f"{k:{30}}{round(v,3):{20}}")

eval_loss                                    0.425
eval_rouge1                                 38.491
eval_rouge2                                 15.264
eval_rougeL                                 31.897
eval_rougeLsum                              35.336
eval_gen_len                                15.858
eval_runtime                              1474.313
eval_samples_per_second                      0.556
eval_steps_per_second                        0.556
epoch                                          4.0


In [27]:
summarizer = pipeline('summarization', model = model, tokenizer = tokenizer)

In [28]:
for _ in range(5):
    idx = random.randint(0, 818)
    
    dialogue = data['test'][idx]['dialogue']
    summary = data['test'][idx]['summary']
    
    predicted = summarizer(dialogue)
    
    print("="*100)
    print(f"Dialogue:\n{dialogue}")
    print("-"*100)
    print(f"summary:\n{summary}")
    print("-"*100)
    print(f"Predicted:\n{predicted[0]['summary_text']}")

Your max_length is set to 200, but you input_length is only 127. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)


Dialogue:
Richie: Pogba
Clay: Pogboom
Richie: what a s strike yoh!
Clay: was off the seat the moment he chopped the ball back to his right foot
Richie: me too dude
Clay: hope his form lasts
Richie: This season he's more mature
Clay: Yeah, Jose has his trust in him
Richie: everyone does
Clay: yeah, he really deserved to score after his first 60 minutes
Richie: reward
Clay: yeah man
Richie: cool then 
Clay: cool
----------------------------------------------------------------------------------------------------
summary:
Richie and Clay saw a very good football game, with one football player chopping the ball back to his foot, which was particularly exciting. Jose has trust in that player. 
----------------------------------------------------------------------------------------------------
Predicted:
Pogboom Richie was off the seat when he chopped the ball back to his right foot. Jose has his trust in him.
Dialogue:
Lincoln: Heeyyy ;* whats up
Fatima: I talked to Jenson, he’s not too happ

In [31]:
def generate_summary(dialogue):
    inputs = tokenizer.encode("summarize: " + dialogue, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=128, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [32]:
dialogue = '''
Alice: Bob, have you seen my keys? I can’t find them anywhere.
Bob: No, I haven’t. Did you check your bag?
Alice: Yes, I did. They’re not there. I’m supposed to leave in ten minutes.
Bob: Let’s retrace your steps. Where did you go after coming home?
Alice: I came in, put my coat on the chair, and went to the kitchen to make some tea.
Bob: Let’s start there. Maybe you left them on the counter.
Alice: I don’t remember taking them out of my bag in the kitchen, but it’s worth a look.
Bob: Here they are! They were next to the kettle. You must have put them down when you made your tea.
Alice: Oh, thank goodness! Thank you, Bob. I don’t know what I’d do without you.
Bob: Anytime, Alice. Now go, or you’ll be late!'''
summary = generate_summary(dialogue)
print("Generated Summary:", summary)

Generated Summary: Alice hasn't seen her keys, but she's supposed to leave them in 10 minutes. Bob left them on the counter. Bob doesn't remember taking them out of his bag.
