## Environment Setup




In [3]:
!pip install datasets sacrebleu rouge_score py7zr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m20.

## Module imports

In [47]:
from google.colab import files

In [4]:
from transformers import pipeline, AutoTokenizer

from datasets import load_dataset, DatasetDict

## Utility functions

In [39]:
# Function to display random samples

def show_samples_full(dataset, num_samples=5, seed=42):
    sample = dataset["full"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['title']}'")
        print(f"'>> Text: {example['text']}'")

In [25]:
# Function for filtering DatasetDict
# We will consider only those titles where there are atleast 5 words (more specifically splits)

def filter_appliances(example):
    return (
        len(example["title"].split()) >= 5
    )

In [58]:
def show_samples(dataset, num_samples=5, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['title']}'")
        print(f"'>> Text: {example['text']}'")

In [59]:
def get_samples(dataset, num_samples=5, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    return sample

## facebook/bart-large-cnn Model



**Resource**
- https://techblog.geekyants.com/text-summarization-using-facebook-bart-large-cnn

In [5]:
# facebook/bart-large-cnn finetuned model on cnn_dailymail data

model_checkpoint = "facebook/bart-large-cnn"

In [6]:
# Complete facebook/bart-large-cnn summarizer pipeline

summarizer = pipeline("summarization", model=model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
# facebook/bart-large-cnn tokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



## McAuley-Lab/Amazon-Reviews-2023 Dataset

In [8]:
# Loading the Appliances data (Not the complete data)

english_dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/929M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

### Data analysis

In [9]:
english_dataset

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 2128605
    })
})

In [10]:
# Looking at the first example

english_dataset['full'][0]

{'rating': 5.0,
 'title': 'Work great',
 'text': 'work great. use a new one every month',
 'images': [],
 'asin': 'B01N0TQ0OH',
 'parent_asin': 'B01N0TQ0OH',
 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
 'timestamp': 1519317108692,
 'helpful_vote': 0,
 'verified_purchase': True}

In [11]:
# Looking at first 5 examples

english_dataset['full'][:5]

{'rating': [5.0, 5.0, 5.0, 5.0, 5.0],
 'title': ['Work great',
  'excellent product',
  'Happy customer!',
  'Amazing value',
  'Dryer parts'],
 'text': ['work great. use a new one every month',
  'Little on the thin side',
  'Quick delivery, fixed the issue!',
  "I wasn't sure whether these were worth it or not, given the cost compared to the original branded filters.<br /><br />I can happily report that these are a great value and work every bit as good as the original. If you are on the fence worrying whether these are worth it- I can assure you they are.",
  'Easy to install got the product expected to receive'],
 'images': [[], [], [], [], []],
 'asin': ['B01N0TQ0OH',
  'B07DD2DMXB',
  'B082W3Z9YK',
  'B078W2BJY8',
  'B08C9LPCQV'],
 'parent_asin': ['B01N0TQ0OH',
  'B07DD37QPZ',
  'B082W3Z9YK',
  'B078W2BJY8',
  'B08C9LPCQV'],
 'user_id': ['AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AHWWLSPCJMALVHDDVSUGICL6RUCA',
  'AHZIJGKEWRTAEOZ673G5B3SNXEGQ',
  'AFGUPTDFAWOHHL4LZDV27ERDNOYQ',
  'AELFJFA

In [12]:
# Looking at the title and text only, of the first five examples

for i in range(5):
  print("Title:", english_dataset['full'][i]["title"])
  print("Text:", english_dataset['full'][i]["text"])
  print("="*50)

Title: Work great
Text: work great. use a new one every month
Title: excellent product
Text: Little on the thin side
Title: Happy customer!
Text: Quick delivery, fixed the issue!
Title: Amazing value
Text: I wasn't sure whether these were worth it or not, given the cost compared to the original branded filters.<br /><br />I can happily report that these are a great value and work every bit as good as the original. If you are on the fence worrying whether these are worth it- I can assure you they are.
Title: Dryer parts
Text: Easy to install got the product expected to receive


In [40]:
show_samples_full(english_dataset)


'>> Title: What changed????'
'>> Text: I’ve used these filters in the past with the only issue being leaks.<br /><br />This newest batch makes my water taste like garden hose water. Almost worse then the sink!<br /><br />What changed!?'

'>> Title: Five Stars'
'>> Text: These Work fine- no issues at all.'

'>> Title: Five Stars'
'>> Text: Just right! Fits my Kenmore 400 series washer.'

'>> Title: Works great for Vicks vaporizers'
'>> Text: Fits great  with vicks vaporizer rinses out well every week after daily use'

'>> Title: Don’t buy beware..'
'>> Text: I asked for a refund as it did not fit my cooktop and I never heard from the seller!!'


In [16]:
# Changing the format of the DatasetDict into pandas dataframe

english_dataset.set_format("pandas")

In [17]:
english_dataset

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 2128605
    })
})

In [18]:
# [:] Selecting the rows
# We need all the rows

english_df = english_dataset["full"][:]

In [19]:
english_df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Work great,work great. use a new one every month,[],B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1519317108692,0,True
1,5.0,excellent product,Little on the thin side,[],B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,1664746863446,0,True
2,5.0,Happy customer!,"Quick delivery, fixed the issue!",[],B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,1607225435363,0,True
3,5.0,Amazing value,I wasn't sure whether these were worth it or n...,[],B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,1534104184306,0,True
4,5.0,Dryer parts,Easy to install got the product expected to re...,[],B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,1620176603754,0,True


In [None]:
english_df.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')

In [20]:
# We only need the title and text columns
# We are listing the other columns to delete

columns_to_drop = set(english_df.columns) - set(["title", "text"])
columns_to_drop

{'asin',
 'helpful_vote',
 'images',
 'parent_asin',
 'rating',
 'timestamp',
 'user_id',
 'verified_purchase'}

In [21]:
# Deleting the columns

english_df.drop(columns = columns_to_drop, inplace=True)
english_df

Unnamed: 0,title,text
0,Work great,work great. use a new one every month
1,excellent product,Little on the thin side
2,Happy customer!,"Quick delivery, fixed the issue!"
3,Amazing value,I wasn't sure whether these were worth it or n...
4,Dryer parts,Easy to install got the product expected to re...
...,...,...
2128600,Accurate description,As described
2128601,Not compatible with Nespresso U Machine,I have tried multiple times with different reu...
2128602,Works with Sears Kenmore model 36275585891,Exact fit for Sears Kenmore model 36275585891....
2128603,Perfect little ice maker!,Love this!! It doesn’t keep the ice cold but t...


In [22]:
# Exporting the title and text data as csv

english_df.to_csv("./english_df.csv")

**Notes**
- The most popular products in the Amazon dataset are about household items, clothing, and wireless electronics.
- The Amazon theme, though, focuses on summarizing book reviews — after all, this is what the company was founded on.
- We can see two product categories that fit the bill (book and digital_ebook_purchase), but we will use *Appliances* becasue we want to start working with a small dataset.

In [23]:
# Resetting the format back to DatasetDict

english_dataset.reset_format()
english_dataset

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 2128605
    })
})

In [26]:
# Filtering the Dataset
# Considering the records with the titles having atleast 5 words

english_appliances = english_dataset.filter(filter_appliances)

Filter:   0%|          | 0/2128605 [00:00<?, ? examples/s]

In [42]:
english_appliances

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 655931
    })
})

In [28]:
# Reduction in records

2128605 - 655931

1472674

In [44]:
show_samples_full(english_appliances)


'>> Title: Works great.  Note on the shaft cut.'
'>> Text: Item was shipped fast.  With help from video on YT, was able to replace old auger with ease.<br />***Please Note***<br />This new part has a shaft with a shorter machined stop on each side compared to the old part.  The new part does not need to be modified.  The blades fit on the auger just fine and once put together ice is dispensed as expected.  Just note the last two spacers will not sit flush against the shaft.  This does not affect fit nor function of the new part.'

'>> Title: ... was purchased for my adult son who does not like to iron shirts'
'>> Text: This item was purchased for my adult son who does not like to iron shirts. He is extremely pleased with it, it is easy to use and allows him to press several shirts at a time without having to refill with water.'

'>> Title: The filter is working fine but it has only been about one month'
'>> Text: The filter is working fine but it has only been about one month. That's 

### Train validation test split

In [30]:
# Split train and test sets

train_test_split = english_appliances["full"].train_test_split(test_size=0.2)

train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [31]:
train_dataset

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 524744
})

In [32]:
test_dataset

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 131187
})

In [33]:
# Further split the training set into training and validation sets

train_val_split = train_dataset.train_test_split(test_size=0.25)

train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

In [34]:
# Combine the splits into a DatasetDict

split_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [35]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 393558
    })
    validation: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 131186
    })
    test: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 131187
    })
})

In [36]:
# Export the splitted dataset

split_dataset.save_to_disk("appliances")

Saving the dataset (0/1 shards):   0%|          | 0/393558 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/131186 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/131187 [00:00<?, ? examples/s]

In [45]:
show_samples(split_dataset)


'>> Title: Inexpensive, reliable, so far. Does the job.'
'>> Text: Setup was very much like any smart WIFI, device, took about 10 minutes.  Uses same app as Treatlife smart devices.  Have only had the device for a few days, but it seems accurate enough. I needed a means to monitor temperature and humidity in my unheated crawl space.  For $15 this seems to be an excellent product. Time will tell regarding reliability.<br /><br />App is pretty basic, but OK. Bigger and brighter font would help with readability.'

'>> Title: Missing clampassembly from original installer'
'>> Text: Worked to hold cooktop in place.'

'>> Title: Not working! No ice disappointed'
'>> Text: [[VIDEOID:ccf060572b17c2a64216df423d0dd8dc]] Only worked 1 time where it filled the ice bucket. Then it never worked again, says it needs water all the time no matter the amount of water in it. Big disappointment wanted it to be awesome.'

'>> Title: Works flawlessly and in the middle of nowhere!'
'>> Text: We purchased th

### Zip and Download

In [None]:
# Zipping the csv dataset

!zip -r english_df.zip /content/english_df.csv

  adding: content/english_df.csv (deflated 63%)


In [46]:
# Zipping the DatasetDict dataset

!zip -r appliances.zip /content/appliances

  adding: content/appliances/ (stored 0%)
  adding: content/appliances/train/ (stored 0%)
  adding: content/appliances/train/data-00000-of-00001.arrow (deflated 61%)
  adding: content/appliances/train/state.json (deflated 37%)
  adding: content/appliances/train/dataset_info.json (deflated 67%)
  adding: content/appliances/dataset_dict.json (deflated 5%)
  adding: content/appliances/test/ (stored 0%)
  adding: content/appliances/test/data-00000-of-00001.arrow (deflated 61%)
  adding: content/appliances/test/state.json (deflated 38%)
  adding: content/appliances/test/dataset_info.json (deflated 67%)
  adding: content/appliances/validation/ (stored 0%)
  adding: content/appliances/validation/data-00000-of-00001.arrow (deflated 61%)
  adding: content/appliances/validation/state.json (deflated 38%)
  adding: content/appliances/validation/dataset_info.json (deflated 67%)


In [50]:
# Downloading the zipped csv file

files.download("/content/english_df.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [48]:
# Downloading the zipped DatasetDict file

files.download("/content/appliances.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Prediction analysis of the cnn finetuned model

In [60]:
# Displaying some random samples

show_samples(split_dataset)


'>> Title: Make sure too add plenty of water and you good.'
'>> Text: Yes I love this washer easy too use and carry around'

'>> Title: Works well if installed well'
'>> Text: You have to be careful to set the right distance during installation (trial and error before permanent installation), but once set it works as expected - at least on this specimen.'

'>> Title: The shipping box was fine but the duct connectors were all smashed and bent'
'>> Text: I paid for the connectors and they were all smashed and bent I disliked that alot'

'>> Title: Pays for itself in no time'
'>> Text: Surprised by the plastic bottom as my old one had mesh there but it doesn’t seem to affect flow and is easier to clean'

'>> Title: Seems like some new fridges now have louder condensers to help ...'
'>> Text: Seems like some new fridges now have louder condensers to help save energy?@$#&!!  I bought a new 2016 model LG, which I had to send back because it kept us all up at night, vibrating/etc. Then got a

In [56]:
# Getting the same random samples

samples = get_samples(split_dataset)
samples

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 5
})

In [57]:
# Making sure we are getting the same samples as above

for i in range(5):
  print(samples[i])

{'rating': 5.0, 'title': 'Make sure too add plenty of water and you good.', 'text': 'Yes I love this washer easy too use and carry around', 'images': [{'attachment_type': 'IMAGE', 'large_image_url': 'https://images-na.ssl-images-amazon.com/images/I/A1xi9rHZj-L._SL1600_.jpg', 'medium_image_url': 'https://images-na.ssl-images-amazon.com/images/I/A1xi9rHZj-L._SL800_.jpg', 'small_image_url': 'https://images-na.ssl-images-amazon.com/images/I/A1xi9rHZj-L._SL256_.jpg'}], 'asin': 'B089YD4RBV', 'parent_asin': 'B089YD4RBV', 'user_id': 'AFK6KNDPYEMUKV3IGHOAXIOJP45A', 'timestamp': 1616765992118, 'helpful_vote': 2, 'verified_purchase': True}
{'rating': 5.0, 'title': 'Works well if installed well', 'text': 'You have to be careful to set the right distance during installation (trial and error before permanent installation), but once set it works as expected - at least on this specimen.', 'images': [], 'asin': 'B0813GNDB8', 'parent_asin': 'B08CW52TWQ', 'user_id': 'AGZR52SKS24QMWEF357FRH47KRMA', 'times

In [62]:
for i in range(5):
  sample_length = len(samples[i]["text"].split())
  min_length = int(round(sample_length/1.5,0))
  max_length = int(round(sample_length/1.2,0))

  print(sample_length, min_length, max_length)
  print("="*100)

11 7 9
31 21 26
16 11 13
24 16 20
115 77 96


In [65]:
for i in range(5):
  sample_length = len(samples[i]["text"].split())
  min_length = int(round(sample_length/1.5,0))
  max_length = int(round(sample_length/1.2,0))

  print("actual_title :", samples[i]["title"])
  print(summarizer(samples[i]["text"], min_length=min_length, max_length=max_length))
  print("="*100)

actual_title : Make sure too add plenty of water and you good.
[{'summary_text': 'I love this washer easy'}]
actual_title : Works well if installed well
[{'summary_text': 'You have to be careful to set the right distance during installation. Once set it works as expected - at least on'}]
actual_title : The shipping box was fine but the duct connectors were all smashed and bent
[{'summary_text': 'I paid for the connectors and they were all smashed'}]
actual_title : Pays for itself in no time
[{'summary_text': '. Surprised by the plastic bottom as my old one had mesh there'}]
actual_title : Seems like some new fridges now have louder condensers to help ...
[{'summary_text': "These feet solved MOST of the problem. Expensive but they are high quality and very dense. Easy to put on with someone strong enough to tip over the fridge. Not a one person install. The fridges weren't that noisy in the kitchen...but the floor vibration was insane in other rooms... kind of a nightmare issue to solve

**Observation**
- Mostly the model is doing extractive summarization not abstractive summarization which is the major requirement for our usecase.