In [1]:
import json
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Start the Spark session through Spark NLP; `spark` is used in later cells
# to create DataFrames.
spark = sparknlp.start()

In [2]:
# Define pipeline: raw text -> document annotations -> T5 summaries

# Reads the "text" column and emits it as the "documents" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("documents")
)

# `pretrained` is called on the class rather than on a freshly constructed
# instance: the loader returns the model itself, so building a throwaway
# `T5Transformer()` first is unnecessary.
t5 = (
    T5Transformer.pretrained("t5_small", "en")
    .setTask("summarize:")      # task prefix passed to the model
    .setMaxOutputLength(100)    # cap on the length of the generated output
    .setInputCols(["documents"])
    .setOutputCol("summaries")
)

# Two-stage pipeline: assemble documents, then summarize them.
summarizer_pp = Pipeline(stages=[document_assembler, t5])

t5_small download started this may take some time.
Approximate size to download 139 MB
[OK!]


In [3]:
# Fit the pipeline and wrap it for direct use on Python strings

# One-row placeholder frame with an empty "text" value; it is only used as
# the input to `fit` so that a PipelineModel is produced.
empty_df = spark.createDataFrame([[""]]).toDF("text")

pipeline_model = summarizer_pp.fit(empty_df)

# LightPipeline wrapper around the fitted model; later cells call
# `fullAnnotate` on it with a plain string.
sum_lmodel = LightPipeline(pipeline_model)

In [4]:
# Sample text for summarization.
# The value starts with two newlines and ends with one.
example_txt = (
    "\n"
    "\n"
    "I am fascinated by technology. "
    "I think that technology can solve many societal challenges. "
    "We should invest more in sustainable technologies."
    "\n"
)

In [5]:
# Check output (text summary)

# `fullAnnotate` returns a list; this cell uses its first element, which is
# indexed by output column name.
annotations = sum_lmodel.fullAnnotate(example_txt)
res = annotations[0]

# First annotation in the "summaries" column; `.result` is the summary text.
first_summary = res["summaries"][0]
print("Summary:", first_summary.result)

Summary: I think that technology can solve many societal challenges . we should invest more in sustainable technologies .
