In [1]:
# Install PySpark and Spark NLP
!pip install spark-nlp==5.2.3 pyspark==3.3.1



In [2]:
# Record the Python version for reproducibility.
# NOTE(review): `!python` runs the `python` found on the shell PATH, which is
# presumably the same interpreter as the kernel — confirm.
!python --version

Python 3.10.12


In [3]:
from pyspark.ml import Pipeline
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.annotator import DocumentCharacterTextSplitter


In [5]:
# Start the Spark session through Spark NLP's helper
spark = sparknlp.start()

# Sample input: three unpunctuated sentences separated by single spaces.
# Adjacent string literals are concatenated into one string by Python.
text_data = (
    "This is a sample text that will be chunked "
    "Another piece of text to demonstrate chunking "
    "My friend was an enthusiastic musician, being himself not only"
)

# One row, one column named "text"
textDF = spark.createDataFrame(data=[(text_data,)], schema=["text"])

# Show the row without truncating the cell contents
textDF.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                   |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|This is a sample text that will be chunked Another piece of text to demonstrate chunking My friend was an enthusiastic musician, being himself not only|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+



In [14]:
# Stage 1: turn the raw "text" column into Spark NLP document annotations.
# The output column is named explicitly so the splitter's input column below
# does not silently depend on the assembler's default column name.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Stage 2: split each document into overlapping chunks.
#   - split on literal single spaces (the pattern is not treated as a regex)
#   - chunk size 100 with an overlap of 10 between consecutive chunks
#   - explodeSplits=True emits one DataFrame row per chunk
textSplitter = DocumentCharacterTextSplitter() \
    .setInputCols(["document"]) \
    .setOutputCol("splits") \
    .setSplitPatterns([" "]) \
    .setChunkSize(100) \
    .setChunkOverlap(10) \
    .setPatternsAreRegex(False) \
    .setExplodeSplits(True)

# Assemble the two stages and apply them to the current `textDF`
pipeline = Pipeline().setStages([documentAssembler, textSplitter])
result = pipeline.fit(textDF).transform(textDF)

# Preview the first 8 rows: chunk text, begin/end offsets of the first
# annotation in each row, and the difference between those offsets
result.selectExpr(
      "splits.result",
      "splits[0].begin",
      "splits[0].end",
      "splits[0].end - splits[0].begin as length") \
    .show(8, truncate = 80)

+--------------------------------------------------------------------------------+---------------+-------------+------+
|                                                                          result|splits[0].begin|splits[0].end|length|
+--------------------------------------------------------------------------------+---------------+-------------+------+
|[<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html lang="E...|              0|           91|    91|
|[name="format-detection" content="telephone=no"/>\n<meta name="DC.language" c...|             92|          185|    93|
|[name="DC.title" content="EUR-Lex - 32003A0204(07) - EN">\n<meta name="DC.sub...|            186|          285|    99|
|[at work and elsewhere, Nuclear common market, Health and safety, Germany, nu...|            283|          379|    96|
|[station, radioactive effluent, nuclear safety, radioactive pollution, ">\n<m...|            371|          449|    78|
|[">\n<meta name="DC.description" conten

In [7]:
# `result` is the transformed DataFrame; "splits.result" is its column holding,
# per row, an array with the chunk text(s). Bring those rows to the driver and
# flatten the arrays into one list of strings.
rows = result.select("splits.result").collect()
result_list = [chunk for row in rows for chunk in row[0]]

# Show every chunk
print(result_list)

['This is a', 'is a sample', 'sample text', 'text that will', 'will be', 'be chunked', 'Another piece', 'piece of text', 'text to', 'to demonstrate', 'chunking My', 'My friend was', 'was an', 'enthusiastic', 'musician,', 'being himself', 'not only']


## Test on HTML data

In [12]:
# Raw HTML of a EUR-Lex page, used as a harder input for the text splitter.
# The value is written as adjacent string literals inside parentheses (Python
# concatenates them into one string), because a double-quoted literal cannot
# span several physical lines.
text_data = (
    "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"><html lang=\"EN\">\n<head><meta name=\"format-detection\" content=\"telephone=no\"/>\n<meta name=\"DC.language\" content=\"EN\">\n<meta name=\"DC.title\" content=\"EUR-Lex - 32003A0204(07) - EN\">\n<meta name=\"DC.subject\" content=\"Safety at work and elsewhere, Nuclear common market, Health and safety, Germany, nuclear power station, radioactive effluent, nuclear safety, radioactive pollution, \">\n<meta name=\"DC.description\" content=\"Commission opinion of 29 January 2003 concerning the plan for the disposal of radioactive waste resulting from modifications at the site of the Gundremmingen II Nuclear Power Station KRB II in the Federal Republic of Germany, in accordance with Article 37 of the Euratom Treaty  \">\n<meta name=\"DC.type\" http-equiv=\"Content-Type\" content=\"text/html; charset=UNICODE-1-1-UTF-8\">\n<meta name=\"DC.source\" content=\"Official Journal C 026 , 04/02/2003 P. 0013 - 0013; \">\n<meta name=\"DC.publisher\" content=\"OPOCE\">\n<meta name=\"DC.identifier\" scheme=\"URI\" content=\"http://europa.eu.int/eur-lex/lex/LexUriServ/LexUriServ.do?uri=CELEX:32003A0204(07):EN:HTML\">\n<script type=\"text/javascript\" src=\"/eurlex-frontoffice/ruxitagentjs_ICA2NVfgjqrux_10259230221142207.js\" data-dtconfig=\"app=47d4c64c3b67ec69|agentId=99c01a79b0e1f1ac|featureHash=ICA2NVfgjqrux|vcv=2|rdnt=1|uxrgce=1|bp=3|cuc=m097nmfl|mel=100000|mb=null|dpvc=1|iub=null|ssv=4|lastModification=1698244402819|tp=500,50,0,1|agentUri=/eurlex-frontoffice/ruxitagentjs_ICA2NVfgjqrux_10259230221142207.js|reportUrl=/eurlex-frontoffice/rb_39a3e95b-5423-482c-879b-99ef235dffeb|rid=RID_-1519899484|rpid=-2099257570|domain=europa.eu\"></script><style type=\"text/css\" media=\"all\">  @import url(./../../../../css/generic.css); </style>\n<link rel=\"stylesheet\" type=\"text/css\" media=\"print\" href=\"./../../../../css/generic-print.css\">\n<title>EUR-Lex - 32003A0204(07) - EN</title>\n<link "
    "rel=\"canonical\" href=\"https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX%3A32003A0204%2807%29\"/>\n</head>\n<body>\n<div id=\"banner\">\n<a name=\"top\"></a>\n<div class=\"bglang\">\n<p class=\"bglang\">\n<a class=\"langue\" href=\"../../../editorial/legal_notice.htm\" accesskey=\"8\"><b>Avis juridique important</b></a>\n<br>\n</p>\n</div>\n<div class=\"bgtool\">\n<em class=\"none\">|</em>\n</div>\n</div>\n<a name=\"top\"></a>\n<h1>32003A0204(07)</h1>\n<p>\n<strong>Commission opinion of 29 January 2003 concerning the plan for the disposal of radioactive waste resulting from modifications at the site of the Gundremmingen II Nuclear Power Station KRB II in the Federal Republic of Germany, in accordance with Article 37 of the Euratom Treaty  </strong>\n<br>\n<em>\n<br>Official Journal C 026 , 04/02/2003 P. 0013 - 0013<br> </em>\n</p>\n<br>\n<div id=\"TexteOnly\">\n<p>\n<TXT_TE>\n<p>Commission opinion</p><p>of 29 January 2003</p><p>concerning the plan for the disposal of radioactive waste resulting from modifications at the site of the Gundremmingen II Nuclear Power Station KRB II in the Federal Republic of Germany, in accordance with Article 37 of the Euratom Treaty</p><p>(2003/C 26/09)</p><p></p><p>(Only the German text is authentic)</p><p>On 6 August 2002 the European Commission received from the Government of the Federal Republic of Germany, in accordance with Article 37 of the Euratom Treaty, general data relating to the plan for the disposal of radioactive waste resulting from modifications at the site of the Gundremmingen II Nuclear Power Station KRB II.</p><p>On the basis of these data, the Commission has considered that the plan concerned modifications to an existing plan on which an opinion had already been given. "
    "The Commission has further taken into consideration that the intermediate storage facility for irradiated fuel resulting from those modifications is designed to operate up to 40 years and could remain in operation after decommissioning and dismantling of the existing plant. Following consultation with the group of experts, the Commission has drawn up the following opinion:</p><p>(a) the planned modifications require no changes to the existing authorised limits for gaseous and liquid discharges;</p><p>(b) the planned modifications have no consequences in relation to the solid radioactive waste arising from the operation of the existing plant;</p><p>(c) the planned modifications have no consequences in relation to the unplanned discharges of radioactive substances, which may follow an accident of the type and magnitude considered in the general data of the existing plan.</p><p>In conclusion, the Commission is of the opinion that the implementation of the plan for the disposal of radioactive waste in whatever form resulting from modifications at the site of the Gundremmingen II Nuclear Power Station KRB II, located in the Federal Republic of Germany, both in normal operation and in the event of an accident of the type and magnitude considered in the general data, is not liable to result in radioactive contamination, significant from the point of view of health, of the water, soil or airspace of another Member State.</p><p></p><p> </p>\n</TXT_TE>\n</p>\n</div>\n</body>\n</html>\n"
)

In [15]:
# Wrap the HTML document in a single-row DataFrame with one "text" column
textDF = spark.createDataFrame([(text_data,)], ["text"])

# Preview only: the document is several thousand characters long, so cap the
# displayed width instead of printing the whole page inside one table cell
textDF.show(truncate=80)

# Apply the same assembler and splitter stages to the HTML input
pipeline = Pipeline().setStages([documentAssembler, textSplitter])
result = pipeline.fit(textDF).transform(textDF)

# Preview the first 8 rows: chunk text, begin/end offsets of the first
# annotation in each row, and the difference between those offsets
result.selectExpr(
      "splits.result",
      "splits[0].begin",
      "splits[0].end",
      "splits[0].end - splits[0].begin as length") \
    .show(8, truncate = 80)

# `result` is the transformed DataFrame; flatten the per-row arrays in its
# "splits.result" column into a single Python list of chunk strings
result_list = result.select("splits.result").rdd.flatMap(lambda x: x[0]).collect()

# Print the resulting list
print(result_list)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------