## Environment Setup




In [1]:
!pip install datasets sacrebleu rouge_score py7zr -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/542.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/

## Module imports

In [2]:
from google.colab import files

In [3]:
from transformers import pipeline

from datasets import load_dataset, DatasetDict

## Utility functions

In [4]:
# Function to display random samples

def show_samples_full(dataset, num_samples=5, seed=42):
    sample = dataset["full"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['title']}'")
        print(f"'>> Text: {example['text']}'")

In [5]:
# Function for filtering DatasetDict
# We will consider only those titles where there are atleast 5 words (more specifically splits)

def filter_appliances(example):
    return (
        len(example["title"].split()) >= 5
    )

In [6]:
def show_samples(dataset, num_samples=5, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['title']}'")
        print(f"'>> Text: {example['text']}'")

In [7]:
def get_samples(dataset, num_samples=5, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    return sample

## facebook/bart-large-cnn Model



**Resource**
- https://huggingface.co/facebook/bart-large-cnn
- https://techblog.geekyants.com/text-summarization-using-facebook-bart-large-cnn

In [8]:
# facebook/bart-large-cnn finetuned model on cnn_dailymail data

model_checkpoint = "facebook/bart-large-cnn"

In [9]:
# Complete facebook/bart-large-cnn summarizer pipeline

summarizer = pipeline("summarization", model=model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## McAuley-Lab/Amazon-Reviews-2023 Dataset

In [17]:
# Loading the Appliances data (Not the complete data)

appliances_english_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", trust_remote_code=True)

### Data analysis

In [18]:
appliances_english_reviews

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 2128605
    })
})

In [20]:
# Looking at the first example

appliances_english_reviews['full'][0]

{'rating': 5.0,
 'title': 'Work great',
 'text': 'work great. use a new one every month',
 'images': [],
 'asin': 'B01N0TQ0OH',
 'parent_asin': 'B01N0TQ0OH',
 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
 'timestamp': 1519317108692,
 'helpful_vote': 0,
 'verified_purchase': True}

In [21]:
# Looking at first 5 examples

appliances_english_reviews['full'][:5]

{'rating': [5.0, 5.0, 5.0, 5.0, 5.0],
 'title': ['Work great',
  'excellent product',
  'Happy customer!',
  'Amazing value',
  'Dryer parts'],
 'text': ['work great. use a new one every month',
  'Little on the thin side',
  'Quick delivery, fixed the issue!',
  "I wasn't sure whether these were worth it or not, given the cost compared to the original branded filters.<br /><br />I can happily report that these are a great value and work every bit as good as the original. If you are on the fence worrying whether these are worth it- I can assure you they are.",
  'Easy to install got the product expected to receive'],
 'images': [[], [], [], [], []],
 'asin': ['B01N0TQ0OH',
  'B07DD2DMXB',
  'B082W3Z9YK',
  'B078W2BJY8',
  'B08C9LPCQV'],
 'parent_asin': ['B01N0TQ0OH',
  'B07DD37QPZ',
  'B082W3Z9YK',
  'B078W2BJY8',
  'B08C9LPCQV'],
 'user_id': ['AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AHWWLSPCJMALVHDDVSUGICL6RUCA',
  'AHZIJGKEWRTAEOZ673G5B3SNXEGQ',
  'AFGUPTDFAWOHHL4LZDV27ERDNOYQ',
  'AELFJFA

In [22]:
# Looking at the title and text only, of the first five examples

for i in range(5):
  print("Title:", appliances_english_reviews['full'][i]["title"])
  print("Text:", appliances_english_reviews['full'][i]["text"])
  print("="*50)

Title: Work great
Text: work great. use a new one every month
Title: excellent product
Text: Little on the thin side
Title: Happy customer!
Text: Quick delivery, fixed the issue!
Title: Amazing value
Text: I wasn't sure whether these were worth it or not, given the cost compared to the original branded filters.<br /><br />I can happily report that these are a great value and work every bit as good as the original. If you are on the fence worrying whether these are worth it- I can assure you they are.
Title: Dryer parts
Text: Easy to install got the product expected to receive


In [23]:
show_samples_full(appliances_english_reviews)


'>> Title: What changed????'
'>> Text: I’ve used these filters in the past with the only issue being leaks.<br /><br />This newest batch makes my water taste like garden hose water. Almost worse then the sink!<br /><br />What changed!?'

'>> Title: Five Stars'
'>> Text: These Work fine- no issues at all.'

'>> Title: Five Stars'
'>> Text: Just right! Fits my Kenmore 400 series washer.'

'>> Title: Works great for Vicks vaporizers'
'>> Text: Fits great  with vicks vaporizer rinses out well every week after daily use'

'>> Title: Don’t buy beware..'
'>> Text: I asked for a refund as it did not fit my cooktop and I never heard from the seller!!'


In [24]:
# Changing the format of the DatasetDict into pandas dataframe

appliances_english_reviews.set_format("pandas")

In [26]:
appliances_english_reviews

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 2128605
    })
})

In [27]:
# [:] Selecting the rows
# We need all the rows

appliances_english_reviews_df = appliances_english_reviews["full"][:]

In [29]:
appliances_english_reviews_df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Work great,work great. use a new one every month,[],B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1519317108692,0,True
1,5.0,excellent product,Little on the thin side,[],B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,1664746863446,0,True
2,5.0,Happy customer!,"Quick delivery, fixed the issue!",[],B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,1607225435363,0,True
3,5.0,Amazing value,I wasn't sure whether these were worth it or n...,[],B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,1534104184306,0,True
4,5.0,Dryer parts,Easy to install got the product expected to re...,[],B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,1620176603754,0,True


In [30]:
appliances_english_reviews_df.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')

In [31]:
# We only need the title and text columns
# We are listing the other columns to delete

columns_to_drop = set(appliances_english_reviews_df.columns) - set(["title", "text"])
columns_to_drop

{'asin',
 'helpful_vote',
 'images',
 'parent_asin',
 'rating',
 'timestamp',
 'user_id',
 'verified_purchase'}

In [32]:
# Deleting the columns

appliances_english_reviews_df.drop(columns = columns_to_drop, inplace=True)
appliances_english_reviews_df

Unnamed: 0,title,text
0,Work great,work great. use a new one every month
1,excellent product,Little on the thin side
2,Happy customer!,"Quick delivery, fixed the issue!"
3,Amazing value,I wasn't sure whether these were worth it or n...
4,Dryer parts,Easy to install got the product expected to re...
...,...,...
2128600,Accurate description,As described
2128601,Not compatible with Nespresso U Machine,I have tried multiple times with different reu...
2128602,Works with Sears Kenmore model 36275585891,Exact fit for Sears Kenmore model 36275585891....
2128603,Perfect little ice maker!,Love this!! It doesn’t keep the ice cold but t...


In [33]:
# Exporting the title and text data as csv

appliances_english_reviews_df.to_csv("./appliances_english_reviews_df.csv")

**Notes**
- The most popular products in the Amazon dataset are about household items, clothing, and wireless electronics.
- The Amazon theme, though, focuses on summarizing book reviews — after all, this is what the company was founded on.
- We can see two product categories that fit the bill (book and digital_ebook_purchase), but we will use *Appliances* becasue we want to start working with a small dataset.

In [34]:
# Resetting the format back to DatasetDict

appliances_english_reviews.reset_format()
appliances_english_reviews

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 2128605
    })
})

In [35]:
# Filtering the Dataset
# Considering the records with the titles having atleast 5 words

appliances_english_reviews = appliances_english_reviews.filter(filter_appliances)

Filter:   0%|          | 0/2128605 [00:00<?, ? examples/s]

In [36]:
appliances_english_reviews

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 655931
    })
})

In [None]:
# Reduction in records

2128605 - 655931

1472674

In [37]:
show_samples_full(appliances_english_reviews)


'>> Title: Works great.  Note on the shaft cut.'
'>> Text: Item was shipped fast.  With help from video on YT, was able to replace old auger with ease.<br />***Please Note***<br />This new part has a shaft with a shorter machined stop on each side compared to the old part.  The new part does not need to be modified.  The blades fit on the auger just fine and once put together ice is dispensed as expected.  Just note the last two spacers will not sit flush against the shaft.  This does not affect fit nor function of the new part.'

'>> Title: ... was purchased for my adult son who does not like to iron shirts'
'>> Text: This item was purchased for my adult son who does not like to iron shirts. He is extremely pleased with it, it is easy to use and allows him to press several shirts at a time without having to refill with water.'

'>> Title: The filter is working fine but it has only been about one month'
'>> Text: The filter is working fine but it has only been about one month. That's 

### Train validation test split

In [38]:
# Split train and test sets

train_test_split = appliances_english_reviews["full"].train_test_split(test_size=0.2)

train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [39]:
train_dataset

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 524744
})

In [40]:
test_dataset

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 131187
})

In [41]:
# Further split the training set into training and validation sets

train_val_split = train_dataset.train_test_split(test_size=0.25)

train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

In [42]:
# Combine the splits into a DatasetDict

split_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [43]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 393558
    })
    validation: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 131186
    })
    test: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 131187
    })
})

In [44]:
# Export the splitted dataset

split_dataset.save_to_disk("appliances_english_reviews")

Saving the dataset (0/1 shards):   0%|          | 0/393558 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/131186 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/131187 [00:00<?, ? examples/s]

In [45]:
show_samples(split_dataset)


'>> Title: Stopped working in less than 24 hours'
'>> Text: Stopped working in less than 24 hours.'

'>> Title: SAVED MONEY WITH NOT HAVING TO BUY A NEW kUERIG'
'>> Text: EASY PEASY'

'>> Title: I like a hard plastic case'
'>> Text: I like a hard plastic case. I find that they provide the best phone protection.This one has a more rugged, manly look. I'll look like a stud when I pull that baby out of my golf bag and consult my golf app for a blistering 250-yard 3-wood shot into a tight par 5 green.'

'>> Title: It goes on so easy i don't know why i didn't get it sooner'
'>> Text: I needed this for a long long time! It goes on so easy i don't know why i didn't get it sooner! the only thing i wish the instructions was with it but i looked it up on youtube and didn't have any problems what so ever! It also came on time just like it said!'

'>> Title: Plastic taste won't go away'
'>> Text: Could not get the nasty plastic taste out of the line. We ran at least 10 gallons everyday for 2 week

### Zip and Download

In [46]:
# Zipping the csv dataset

!zip -r appliances_english_reviews_df.zip appliances_english_reviews_df.csv

  adding: appliances_english_reviews_df.csv (deflated 63%)


In [47]:
# Zipping the DatasetDict dataset

!zip -r appliances_english_reviews.zip appliances_english_reviews

  adding: appliances_english_reviews/ (stored 0%)
  adding: appliances_english_reviews/test/ (stored 0%)
  adding: appliances_english_reviews/test/dataset_info.json (deflated 67%)
  adding: appliances_english_reviews/test/state.json (deflated 38%)
  adding: appliances_english_reviews/test/data-00000-of-00001.arrow (deflated 61%)
  adding: appliances_english_reviews/validation/ (stored 0%)
  adding: appliances_english_reviews/validation/dataset_info.json (deflated 67%)
  adding: appliances_english_reviews/validation/state.json (deflated 38%)
  adding: appliances_english_reviews/validation/data-00000-of-00001.arrow (deflated 61%)
  adding: appliances_english_reviews/train/ (stored 0%)
  adding: appliances_english_reviews/train/dataset_info.json (deflated 67%)
  adding: appliances_english_reviews/train/state.json (deflated 38%)
  adding: appliances_english_reviews/train/data-00000-of-00001.arrow (deflated 61%)
  adding: appliances_english_reviews/dataset_dict.json (deflated 5%)


In [48]:
# Downloading the zipped csv file

files.download("appliances_english_reviews_df.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
# Downloading the zipped DatasetDict file

files.download("appliances_english_reviews.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Prediction analysis of the cnn finetuned model

In [50]:
# Displaying some random samples

show_samples(split_dataset)


'>> Title: Stopped working in less than 24 hours'
'>> Text: Stopped working in less than 24 hours.'

'>> Title: SAVED MONEY WITH NOT HAVING TO BUY A NEW kUERIG'
'>> Text: EASY PEASY'

'>> Title: I like a hard plastic case'
'>> Text: I like a hard plastic case. I find that they provide the best phone protection.This one has a more rugged, manly look. I'll look like a stud when I pull that baby out of my golf bag and consult my golf app for a blistering 250-yard 3-wood shot into a tight par 5 green.'

'>> Title: It goes on so easy i don't know why i didn't get it sooner'
'>> Text: I needed this for a long long time! It goes on so easy i don't know why i didn't get it sooner! the only thing i wish the instructions was with it but i looked it up on youtube and didn't have any problems what so ever! It also came on time just like it said!'

'>> Title: Plastic taste won't go away'
'>> Text: Could not get the nasty plastic taste out of the line. We ran at least 10 gallons everyday for 2 week

In [51]:
# Getting the same random samples

samples = get_samples(split_dataset)
samples

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 5
})

In [52]:
# Making sure we are getting the same samples as above

for i in range(5):
  print(samples[i])

{'rating': 1.0, 'title': 'Stopped working in less than 24 hours', 'text': 'Stopped working in less than 24 hours.', 'images': [], 'asin': 'B0B6BXH89M', 'parent_asin': 'B0B6BXH89M', 'user_id': 'AG7D4K2T5Z5V4YQGBB3NGBBHXZXA', 'timestamp': 1688844327387, 'helpful_vote': 0, 'verified_purchase': False}
{'rating': 5.0, 'title': 'SAVED MONEY WITH NOT HAVING TO BUY A NEW kUERIG', 'text': 'EASY PEASY', 'images': [], 'asin': 'B01A4JYDFY', 'parent_asin': 'B01A4JYDFY', 'user_id': 'AF2JVYCH6HNSQP5LXWFAJNIOYFZA', 'timestamp': 1628795871271, 'helpful_vote': 0, 'verified_purchase': True}
{'rating': 5.0, 'title': 'I like a hard plastic case', 'text': "I like a hard plastic case. I find that they provide the best phone protection.This one has a more rugged, manly look. I'll look like a stud when I pull that baby out of my golf bag and consult my golf app for a blistering 250-yard 3-wood shot into a tight par 5 green.", 'images': [], 'asin': 'B014J4BCP4', 'parent_asin': 'B014J4BCP4', 'user_id': 'AGWQOIXL

In [53]:
for i in range(5):
  sample_length = len(samples[i]["text"].split())
  min_length = int(round(sample_length/1.5,0))
  max_length = int(round(sample_length/1.2,0))

  print(sample_length, min_length, max_length)
  print("="*100)

7 5 6
2 1 2
54 36 45
56 37 47
57 38 48


In [54]:
for i in range(5):
  sample_length = len(samples[i]["text"].split())
  min_length = int(round(sample_length/1.5,0))
  max_length = int(round(sample_length/1.2,0))

  print("actual_title :", samples[i]["title"])
  print(summarizer(samples[i]["text"], min_length=min_length, max_length=max_length))
  print("="*100)

actual_title : Stopped working in less than 24 hours
[{'summary_text': 'Stopped working'}]
actual_title : SAVED MONEY WITH NOT HAVING TO BUY A NEW kUERIG
[{'summary_text': ''}]
actual_title : I like a hard plastic case
[{'summary_text': "This one has a more rugged, manly look. I like a hard plastic case. I find that they provide the best phone protection. I'll look like a stud when I pull that baby out of my"}]
actual_title : It goes on so easy i don't know why i didn't get it sooner
[{'summary_text': "I needed this for a long long time! It goes on so easy i don't know why i didn't get it sooner! the only thing i wish the instructions was with it but i looked it up on youtube and"}]
actual_title : Plastic taste won't go away
[{'summary_text': 'We ran at least 10 gallons everyday for 2 weeks. Still was there. Worse wire ever. Replaced it with one fron Home Depot and the taste was gone in 20 mins. I threw this one away.'}]


**Observation**
- Mostly the model is doing extractive summarization not abstractive summarization which is the major requirement for our usecase.