## Installing libraries

In [1]:
! pip install pandas scikit-learn transformers accelerate



### importing libraries


# Summarization using AutoTokenizer and AutoModelForSeq2SeqLM

#### 1.For this Extractive summarization we are using Auto tokenizer and AutoModelForSeq2SeqLM classes and using t5-small as the model (Text-To-Text Transfer Trnsformer)
#### 2.It accepts 60 million parameters while the base one accepts 660million parameters
#### 3.More over it lightweight

In [2]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Here we are reading dataset and converting it into Huggingface dataset format

In [3]:
dataset=pd.read_csv('Dataset/train data.csv')
dataset=Dataset.from_pandas(dataset)

## Summarization using 't5-small' model

In [4]:
# Load a pre-trained model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### we know for the models they won't accept strings as trainig inputs so we are converting the train dataset into numarables (Pre-processing)

In [5]:

# Preprocess the dataset
def preprocess_function(examples):
    inputs = [doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=150, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

## original Data will look like this

In [6]:
dataset[0]

{'id': '92c514c913c0bdfe25341af9fd72b29db544099b',
 'article': "Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xa0'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crow

### After Pre-processing the data

In [16]:
dataexample=preprocess_function(dataset[0])
for key,value in dataexample.items():
    print(f'{key}:{value[:10]}')

input_ids:[[262, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## converting the original dataset to tokenized_datset

In [6]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 1000/1000 [00:02<00:00, 469.47 examples/s]


## Fine-tuning the model / Training the model

In [23]:

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    push_to_hub=False,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)


trainer.train()

  4%|▍         | 10/250 [03:28<1:23:19, 20.83s/it]
  3%|▎         | 28/1000 [01:05<38:55,  2.40s/it]

## saving the model for Later usages

In [None]:
# model.save_pretrained("./fine-tuned-model")
# tokenizer.save_pretrained("./fine-tuned-model")

('./fine-tuned-model\\tokenizer_config.json',
 './fine-tuned-model\\special_tokens_map.json',
 './fine-tuned-model\\tokenizer.json')

## importing the Custom trained model

In [7]:

model_name_or_path = "./fine-tuned-model"
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM
local_tokenizer=AutoTokenizer.from_pretrained(model_name_or_path)
local_model=AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)


### Lets See an Example

In [8]:

article = dataset['article'][0]
article

"Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xa0'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for sp

### Summarize using pipeline

In [9]:
from transformers import pipeline
summarize=pipeline('summarization',model=local_model,tokenizer=local_tokenizer)

In [10]:
summarize(dataset['article'][0])[0]['summary_text']

"a consumer advisory group set up by the Department of Transportation said that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans . 'in a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative ."

## summarize using tokenizer and model

### converting article to tokens

In [11]:
inputs = local_tokenizer(article, return_tensors="pt", max_length=512, truncation=True)
inputs

{'input_ids': tensor([[ 6381,  4944,   149,  6112,  6116,  2385,    12,    36,   652,  2755,
            11,  2755,    58,   438,  3094,  2302,    13,   151,   838,    12,
             8, 22902,     6,   128,  2273,    33,   822,    53,     3,    99,
           578,   224,  7614,    91,  6112,     7,    19,     3,  3131,  9234,
            44,  1020,     5,   328,   497,    24,     8, 18508,    53,   628,
            30, 15726,  3767,    15,     7,    19,    59,   163, 14209,     3,
            18,    34,    31,     7,     3,  3131,    69,   533,    11,  1455,
            16,  5129,     5,  1537,   145,     3,     7,  4960,   115,  7428,
           147,     8,  2939,   880,     6, 18508,    53,   628,    30,  6112,
             7,     3,  3131,    69,   533,    11,  1455,    16,  5129,    58,
           100,   471,     6,     3,     9,   412,     5,   134,  3733, 18599,
           563,   356,    95,    57,     8,  1775,    13, 14630,   243,    44,
             3,     9,   452,  3507,  

### we are giving the Above tokens as input to generate Sumaary

In [12]:

n=100
summary_ids = local_model.generate(inputs["input_ids"], max_length=n)

#prints encoded summary
print(summary_ids)


tensor([[    0,     0,     3,     9,  3733, 18599,   563,   356,    95,    57,
             8,  1775,    13, 14630,   243,    44,     3,     9,   452,  3507,
            24,   298,     8,   789,    19,  1095,    12,   356,  2443,    21,
          3127,  7070,    30,  6112,     7,     6,    34,   744,    31,    17,
         28713,     3,     9,  2559,   866,    13,   628,    21,  6917,     3,
             5,     3,    31,  1570,     3,     9,   296,   213,  3127,    43,
            72,  2166,    12,   628,    11,   542,   145,  6917,     6,    31,
           243,     8,  2952,     3,     5,     1]])


### decoding the output tokens 

In [13]:
summary = local_tokenizer.decode(summary_ids[0],skip_special_tokens=True)

print("Summary:", summary)

Summary: a consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said the panel.


## Evaluating the model Performance using ROUGE(Recall-Oriented Understudy for Gisting Evaluation)

In [23]:
! pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
   ---------------------------------------- 0.0/133.7 kB ? eta -:--:--
   --- ------------------------------------ 10.2/133.7 kB ? eta -:--:--
   -------- ------------------------------ 30.7/133.7 kB 330.3 kB/s eta 0:00:01
   ----------- --------------------------- 41.0/133.7 kB 281.8 kB/s eta 0:00:01
   ----------- -----------------------

In [14]:
from datasets import load_metric

def generate_summary(data):
    summary=[]
    for dat in data['article']:
        
        summary.append(summarize(dat,min_length=30,max_length=90)[0]['summary_text'])
        

    
    data['predicted_summary']=summary
    
    return data



In [15]:
results=Dataset.from_dict(dataset[0:10])

In [16]:
results=results.map(generate_summary,batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1086 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 10/10 [00:40<00:00,  4.05s/ examples]


In [17]:
results.to_pandas().head()

Unnamed: 0,id,article,highlights,predicted_summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,a consumer advisory group set up by the Depart...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar, 17, climbed into enclosure fence ..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Dougie Freedman is set to sign a new two-year ...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,a number of top european clubs are interested ...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...","the former Olympian and reality tv star, 65, w..."


In [93]:

# Loading ROUGE metric
rouge = load_metric("rouge")


references = results["highlights"]
predictions = results["predicted_summary"]

rouge_output = rouge.compute(predictions=predictions, references=references)

for key in rouge_output:
    print(f"{key}: {rouge_output[key].high.fmeasure:.4f}")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


rouge1: 0.5435
rouge2: 0.2868
rougeL: 0.4135
rougeLsum: 0.4918
