<a href="https://colab.research.google.com/github/pszemraj/vid2cleantxt/blob/master/colab_notebooks/summarization_example_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# vid2cleantxt - summarization example

Purpose: an example notebook to illustrate use-case of .txt transcriptions after vid2cleantext is completed. 

A decent resource for further reading is [here](https://chetanambi.medium.com/generate-summaries-using-googles-pegasus-library-772633a161c2).

Author: Peter Szemraj |  [vid2cleantxt github repo](https://github.com/pszemraj/vid2cleantxt)

---


In [1]:
%%capture

!pip install -U torch
!pip install -U transformers
!pip install -U sentencepiece
!pip install -U tqdm

from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch, sentencepiece
from tqdm.auto import tqdm

In [2]:
# check GPU allocation

!nvidia-smi

Tue Jul 13 11:38:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# huggingface.co example - pegasus

From documentation [here](https://huggingface.co/transformers/model_doc/pegasus.html)

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

src_text = [
    """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
]

model_name = "google/pegasus-xsum"  # this example uses the "xsum" model
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(
    device
)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

del batch, translated, model, tokenizer  # changing below

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [4]:
import pprint as pp

print("Summarized version with pegasus-xsum is:\n")
pp.pprint(tgt_text)

Summarized version with pegasus-xsum is:

["California's largest electricity provider has turned off power to hundreds "
 'of thousands of customers.']


# vid2cleantxt example

- upload a .zip or other compressed archive of .txt files
- each txt file summarized
- can download a new zip of summarized text

In [5]:
import os, shutil
from google.colab import files

# upload zip file. in the example file, the .zip contains the text files in:
# example_JFK_speech/TEST_folder_edition/v2clntxt_transcriptions/NSC + SBD
in_archive = files.upload()
the_wd = os.getcwd()
src_dir = os.path.join(the_wd, "input_txt_files")  # put the files here
os.makedirs(src_dir, exist_ok=True)
shutil.unpack_archive(list(in_archive.keys())[0], src_dir)

Saving jfk_text_examples.zip to jfk_text_examples.zip


In [6]:
from natsort import natsorted
from os import listdir
from os.path import join, basename, isfile

# read txt files from unpacked archive. They should be in top-level
req_ext = ".txt"
appr_files = [
    join(src_dir, f)
    for f in listdir(src_dir)
    if isfile(join(src_dir, f)) and f.endswith(req_ext)
]
print("preview of {} loaded text files \n".format(len(appr_files)))
n_preview = 3
pp.pprint(appr_files[:n_preview], compact=True, indent=5)

preview of 7 loaded text files 

[    '/content/input_txt_files/FIN_gpu_president_kennedy_speech_on_the_space_effort_apart_t_script_05435.txt',
     '/content/input_txt_files/FIN_gpu_president_kennedy_speech_on_the_space_effort_apart_t_script_05540.txt',
     '/content/input_txt_files/FIN_jfk_rice_moons_pee_ct_script_06082.txt']


In [7]:
out_dir = os.path.join(the_wd, "summarized_txt_files")  # put the files here
os.makedirs(out_dir, exist_ok=True)

# load the pegasus large model (typically better than xsum above)
model_name = "google/pegasus-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [8]:
summarizations = []
summarization_percentages = []

for tfile in tqdm(appr_files, total=len(appr_files), desc="summarizing input files"):

    torch.cuda.empty_cache()  # so GPU doesn't crash
    with open(tfile, mode="r", encoding="utf-8", errors="ignore") as src_f:
        this_text = src_f.read()

    batch = tokenizer(
        this_text, truncation=True, padding="longest", return_tensors="pt"
    ).to(device)
    translated = model.generate(**batch, no_repeat_ngram_size=2)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    summarizations.append(tgt_text)

    this_outname = "SUM_" + basename(tfile)
    with open(
        join(out_dir, this_outname), mode="w", encoding="utf-8", errors="ignore"
    ) as out_f:
        out_f.writelines(tgt_text)
    # metric comp.
    summarization_percentages.append(round(100 * len(tgt_text) / len(this_text), 4))
    del batch, translated, tgt_text  # so GPU doesn't crash

print(
    "Files were summarized down to the following percentages:\n",
    summarization_percentages,
)  # by ratio of characters in text

summarizing input files:   0%|          | 0/7 [00:00<?, ?it/s]

Files were summarized down to the following percentages:
 [0.0418, 0.0409, 0.0458, 0.0445, 0.0397, 0.0392, 0.0052]


In [9]:
# display some results
import random

n_preview = 5
print("printout of {} of the summary files \n".format(n_preview))
pp.pprint(random.sample(summarizations, k=n_preview), indent=5)

printout of 5 of the summary files 

[    [    'Your city of history with its main spacecraft center will become '
          'the heart of a large scientific and engineering community during '
          'the next five years the national aerobatic and space administration '
          'expects to double the number of scientists and engineers in this '
          'area to increase its outlets for salaries and expenses to sixty '
          'million dollars a year to invest some two hundred million to.'],
     [    'First watch carrying all the equipment needed for guidance '
          'propulsion control communications food and survival on an untried '
          'mission to an unknown celestial body and then return it safely to '
          'earth re entering the atmosphere at speeds of over twenty five '
          'thousand miles per hour causing heat about half that on the '
          'temperature of the sun almost as hot as it is here to day and do '
          'all this.'],
     [    'Ans

In [10]:
download_output_files = True  # @param {type:"boolean"}

In [11]:
# create zip archive and download
from datetime import datetime

zip_dir = join(the_wd, "zipped_outputs")
os.makedirs(zip_dir, exist_ok=True)
summary_header = "summary_file_archive_" + datetime.now().strftime("%d%m%Y")
shutil.make_archive(join(zip_dir, summary_header), "zip", out_dir)

if download_output_files:
    files.download(join(zip_dir, summary_header + ".zip"))
    print("downloaded files - ", datetime.now())
else:
    print("download_output_files is set to: ", download_output_files)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

downloaded files -  2021-07-13 11:41:27.877957
