## Environment Setup




In [1]:
!pip install datasets sacrebleu rouge_score py7zr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.3 M

## Module imports

In [2]:
from google.colab import drive

from transformers import pipeline
from datasets import load_from_disk

## Utility functions

In [3]:
def show_samples(dataset, num_samples=5, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['title']}'")
        print(f"'>> Text: {example['text']}'")

In [4]:
def get_samples(dataset, num_samples=5, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    return sample

## Azma-AI/bart-large-text-summarizer



**About**
- Model obtained by Fine Tuning 'facebook/bart-large-xsum' using AMI Meeting Corpus, SAMSUM Dataset, DIALOGSUM Dataset, XSUM Dataset!

**Resource**
- https://huggingface.co/Azma-AI/bart-large-text-summarizer

In [5]:
model_checkpoint = "Azma-AI/bart-large-text-summarizer"

In [6]:
summarizer = pipeline("summarization", model=model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

## McAuley-Lab/Amazon-Reviews-2023 Dataset from Local Drive

### Unzip the local disk dataset

In [7]:
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!unzip /content/drive/MyDrive/appliances_english_reviews.zip

Archive:  /content/drive/MyDrive/appliances_english_reviews.zip
   creating: appliances_english_reviews/
   creating: appliances_english_reviews/test/
  inflating: appliances_english_reviews/test/dataset_info.json  
  inflating: appliances_english_reviews/test/state.json  
  inflating: appliances_english_reviews/test/data-00000-of-00001.arrow  
   creating: appliances_english_reviews/validation/
  inflating: appliances_english_reviews/validation/dataset_info.json  
  inflating: appliances_english_reviews/validation/state.json  
  inflating: appliances_english_reviews/validation/data-00000-of-00001.arrow  
   creating: appliances_english_reviews/train/
  inflating: appliances_english_reviews/train/dataset_info.json  
  inflating: appliances_english_reviews/train/state.json  
  inflating: appliances_english_reviews/train/data-00000-of-00001.arrow  
  inflating: appliances_english_reviews/dataset_dict.json  


### Loading split dataset

In [9]:
split_dataset = load_from_disk('appliances_english_reviews')

In [10]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 393558
    })
    validation: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 131186
    })
    test: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 131187
    })
})

In [11]:
show_samples(split_dataset)


'>> Title: Stopped working in less than 24 hours'
'>> Text: Stopped working in less than 24 hours.'

'>> Title: SAVED MONEY WITH NOT HAVING TO BUY A NEW kUERIG'
'>> Text: EASY PEASY'

'>> Title: I like a hard plastic case'
'>> Text: I like a hard plastic case. I find that they provide the best phone protection.This one has a more rugged, manly look. I'll look like a stud when I pull that baby out of my golf bag and consult my golf app for a blistering 250-yard 3-wood shot into a tight par 5 green.'

'>> Title: It goes on so easy i don't know why i didn't get it sooner'
'>> Text: I needed this for a long long time! It goes on so easy i don't know why i didn't get it sooner! the only thing i wish the instructions was with it but i looked it up on youtube and didn't have any problems what so ever! It also came on time just like it said!'

'>> Title: Plastic taste won't go away'
'>> Text: Could not get the nasty plastic taste out of the line. We ran at least 10 gallons everyday for 2 week

## Prediction analysis of the finetuned model

In [12]:
# Getting the same random samples

samples = get_samples(split_dataset)
samples

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 5
})

In [13]:
# Making sure we are getting the same samples as above

for i in range(5):
  print(samples[i])

{'rating': 1.0, 'title': 'Stopped working in less than 24 hours', 'text': 'Stopped working in less than 24 hours.', 'images': [], 'asin': 'B0B6BXH89M', 'parent_asin': 'B0B6BXH89M', 'user_id': 'AG7D4K2T5Z5V4YQGBB3NGBBHXZXA', 'timestamp': 1688844327387, 'helpful_vote': 0, 'verified_purchase': False}
{'rating': 5.0, 'title': 'SAVED MONEY WITH NOT HAVING TO BUY A NEW kUERIG', 'text': 'EASY PEASY', 'images': [], 'asin': 'B01A4JYDFY', 'parent_asin': 'B01A4JYDFY', 'user_id': 'AF2JVYCH6HNSQP5LXWFAJNIOYFZA', 'timestamp': 1628795871271, 'helpful_vote': 0, 'verified_purchase': True}
{'rating': 5.0, 'title': 'I like a hard plastic case', 'text': "I like a hard plastic case. I find that they provide the best phone protection.This one has a more rugged, manly look. I'll look like a stud when I pull that baby out of my golf bag and consult my golf app for a blistering 250-yard 3-wood shot into a tight par 5 green.", 'images': [], 'asin': 'B014J4BCP4', 'parent_asin': 'B014J4BCP4', 'user_id': 'AGWQOIXL

In [14]:
for i in range(5):
  sample_length = len(samples[i]["text"].split())
  min_length = int(round(sample_length/1.5,0))
  max_length = int(round(sample_length/1.2,0))

  print(sample_length, min_length, max_length)
  print("="*100)

7 5 6
2 1 2
54 36 45
56 37 47
57 38 48


In [15]:
for i in range(5):
  sample_length = len(samples[i]["text"].split())
  min_length = int(round(sample_length/1.5,0))
  max_length = int(round(sample_length/1.2,0))

  print("actual_title :", samples[i]["title"])
  print(summarizer(samples[i]["text"], min_length=min_length, max_length=max_length))
  print("="*100)

actual_title : Stopped working in less than 24 hours
[{'summary_text': 'St'}]
actual_title : SAVED MONEY WITH NOT HAVING TO BUY A NEW kUERIG
[{'summary_text': ''}]
actual_title : I like a hard plastic case
[{'summary_text': "This is a hard plastic case. I like them because they provide the best phone protection. I'll look like a stud when I pull that baby out of my golf bag."}]
actual_title : It goes on so easy i don't know why i didn't get it sooner
[{'summary_text': "I really like this product. It's easy to use and comes on time. I didn't have any problems following the instructions as I looked it up on youtube."}]
actual_title : Plastic taste won't go away
[{'summary_text': 'There was a bad taste in the water. I replaced the bad wire with one fron Home Depot and the bad taste was gone in 20 minutes. I threw the other one away.'}]


**Observation**
- The model is doing abstractive summarization and it is doing better than facebook/bart-large-cnn but still it is not good enough for our usecase.