# Install and import required libraries/packaages

In [3]:
import re
import random
import pandas as pd
import numpy as np
from math import ceil
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.optim as optim

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


# Get data from csv file stored in Google drive

In [5]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
data = pd.read_csv("/content/drive/MyDrive/text_summarizer/data/reviews.csv")

In [7]:
df = data[['Text', 'Summary']].reset_index(drop=True)
df.Summary = df.Summary.astype('str')

In [8]:
df_train = df[:30000]
df_test = df[30000:]

In [9]:
average_review_length = ceil(df_train.Text.apply(lambda x: len(x.split())).mean())
print(f'Avg. review length: {average_review_length}')
average_summary_length = ceil(df_train.Summary.astype('str').apply(lambda x: len(x.split())).mean())
print(f'Avg. summary length: {average_summary_length}')

Avg. review length: 80
Avg. summary length: 5


Since the average length of review is 80, a reasonable estimate of maximum length can 120.

In [10]:
MAX_LENGTH = 120

# Get the pre-trained GPT2 model tokens and initialize the model

In [11]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model = model.to(device)

(…)ingface.co/gpt2/resolve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

(…)gingface.co/gpt2/resolve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

(…)gingface.co/gpt2/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)face.co/gpt2/resolve/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

(…)gpt2/resolve/main/generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The DataSet and DataLoader classes can be used to create a dataset object that would automate the batching of dataset for training purposes and provide an easy way for interfacing between model and data.


# Prepare the dataset

In [12]:
class ReviewDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            tokenized = self.tokenizer.encode(review + self.eos, max_length=self.max_len, truncation=True)
            padded = self.pad_truncate(tokenized)
            self.result.append(torch.tensor(padded))

    def __len__(self):
        return len(self.result)

    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, name):
        name_length = len(name) - len(tokenizer.encode(" TL;DR "))
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        else:
            result = name
        return result

In [13]:
reviews = list(df_train.Text + ' TL;DR ' + df_train.Summary)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
dataset = ReviewDataset(tokenizer=tokenizer, reviews=reviews, max_len=MAX_LENGTH)
dataloader = DataLoader(dataset, batch_size=16)

# Train the model on train data

In [14]:
def train(model, optimizer, dl, epochs):
    for epoch in range(epochs):
        for idx, batch in enumerate(dl):
             with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 50 == 0:
                    print("loss: %f, %d"%(loss, idx))

In [15]:
train(model=model, optimizer=optimizer, dl=dataloader, epochs=1)

loss: 8.437123, 0
loss: 1.819293, 50
loss: 2.257750, 100
loss: 1.626482, 150
loss: 1.787626, 200
loss: 2.545513, 250
loss: 2.333845, 300
loss: 1.779812, 350
loss: 1.810717, 400
loss: 2.443677, 450
loss: 1.912886, 500
loss: 2.259478, 550
loss: 2.828206, 600
loss: 2.143399, 650
loss: 2.132315, 700
loss: 2.168471, 750
loss: 2.031986, 800
loss: 1.375688, 850
loss: 2.200886, 900
loss: 2.359701, 950
loss: 2.464784, 1000
loss: 2.389952, 1050
loss: 2.754581, 1100
loss: 2.258098, 1150
loss: 2.332687, 1200
loss: 1.803594, 1250
loss: 1.488941, 1300
loss: 1.847700, 1350
loss: 2.179464, 1400
loss: 1.957935, 1450
loss: 1.799400, 1500
loss: 2.505725, 1550
loss: 2.221826, 1600
loss: 1.554770, 1650
loss: 2.081868, 1700
loss: 1.890948, 1750
loss: 2.173512, 1800
loss: 2.364714, 1850


In [16]:
def topk(probs):
  n = 9
  probs = torch.softmax(probs, dim= -1)
  tokensProb, topIx = torch.topk(probs, k=n)
  tokensProb = tokensProb / torch.sum(tokensProb)
  tokensProb = tokensProb.cpu().detach().numpy()
  choice = np.random.choice(n, 1, p = tokensProb)
  tokenId = topIx[choice][0]
  return int(tokenId)

In [17]:
def model_infer(model, tokenizer, review, max_length=15):
  review_encoded = tokenizer.encode(review)
  result = review_encoded
  initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

  with torch.set_grad_enabled(False):
    output = model(initial_input)
    logits = output.logits[0,-1]
    result.append(topk(logits))
    for _ in range(max_length):
      input = torch.tensor(result).unsqueeze(0).to(device)
      output = model(input)
      logits = output.logits[0,-1]
      res_id = topk(logits)
      if res_id == tokenizer.eos_token_id:
        return tokenizer.decode(result)
      else:
        result.append(res_id)
  return tokenizer.decode(result)

# Compare the results between predicted and actual summaries

In [33]:
reviews = (list(df_test.Text), list(df_test.Summary))
for review, summ in zip(reviews[0][:10], reviews[1][:10]):
    print(review)
    summary = model_infer(model, tokenizer, review + " TL;DR ").split(" TL;DR ")[1].strip()
    print(f'Summary: {summary}\nActual Summary: {summ}\n')

I've taken the gluten-free baking classes from the CIA and, although you're taught to make good gluten-free bread, this is much better.  Simply because it is so quick & easy.  You don't have to worry about the bread rising.<br /><br />Make sure you use a mixer.  If you try to mix it by hand, the bread won't rise properly.  Additionally, the pizza dough is great with diced olives, homemade roasted red peppers, basil & garlic - all in the dough.<br /><br />Finally, I'm also allergic to corn, soy, beans, nuts, and a ton of other stuff. So, unless I make it myself, this is my only choice (other than some of the Namaste products).
Summary: Great tasting
Actual Summary: Awesome!  Strongly recommend.

This bread mix is the absolutley best replacement for wheat flour bread we have found.  It smells yummy baking and the taste and consistancy makes a delicious sandwich.  It also makes great french toast.  This works great in a bread machine, which makes it super easy to make.  Every other bread 