In [1]:
import sparknlp
from pyspark.sql import SparkSession

In [20]:
spark = SparkSession.builder \
    .appName("Spark NLP") \
    .master("local[4]") \
    .config("spark.driver.memory", "8G") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2") \
    .getOrCreate()

In [21]:
# df = spark.read.csv("data.csv", header = True)

In [22]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType

# Define the schema based on the expected column names and types
schema = StructType([
    StructField("tweet_id", StringType(), True),
    StructField("text", StringType(), True),
    StructField("username", StringType(), True),
    StructField("fullname", StringType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("replies", IntegerType(), True),
    StructField("retweets", IntegerType(), True),
    StructField("likes", IntegerType(), True),
])

# Read the CSV file with the specified schema
df = spark.read.option("header", True).schema(schema).csv("data.csv")

In [23]:
df.printSchema()
df.show(5, truncate=30)

root
 |-- tweet_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- username: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- replies: integer (nullable = true)
 |-- retweets: integer (nullable = true)
 |-- likes: integer (nullable = true)



+-----------+------------------------------+----------------+-----------------------+-------------------+-------+--------+-----+
|   tweet_id|                          text|        username|               fullname|          timestamp|replies|retweets|likes|
+-----------+------------------------------+----------------+-----------------------+-------------------+-------+--------+-----+
|1.72657E+18|The Director of @PressHouse...|@mcaruanagalizia|Matthew Caruana Galizia|2023-11-20 12:54:00|      1|      23|   31|
|1.72657E+18|Smaller nations, focused on...|     @CSDR_India|                   CSDR|2023-11-20 12:52:00|      1|       0|    0|
|1.72657E+18|Ankara backed Egypt\u2019s ...|     @ragipsoylu|       Rag\u0131p Soylu|2023-11-20 12:48:00|      4|       5|   21|
|1.72657E+18|Pakistan\u2019s policy on P...|       @dawn_com|               Dawn.com|2023-11-20 12:42:00|      1|       0|    1|
|1.72656E+18|#Palestine\U0001f1f5\U0001f...|      @IFJGlobal|                    IFJ|2023-11-20 1

In [24]:
from sparknlp.base import DocumentAssembler, Finisher, TokenAssembler
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, SentenceDetector,Stemmer, Lemmatizer

In [25]:
# Document Assembler
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")\
    .setCleanupMode("shrink")

In [26]:
df_doc= documentAssembler.transform(df)
# df_doc.printSchema()

In [27]:
#df_doc.select("document.result", "document.begin", "document.end").show(5, truncate=30)

In [28]:
#df_doc.select("document.result").take(1)

In [29]:
  # Tokenizer
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

In [30]:
sentenceDetector= SentenceDetector()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

In [31]:
# Normalizer
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized") \
    .setLowercase(True)\
    .setCleanupPatterns(["[^\w\d\s]"]) #remove punctuations, keep alphanumeric chars.

In [32]:
#Stopwords

stopwordsCleaner =StopWordsCleaner()\
    .setInputCols(["token"])\
    .setOutputCol("cleaned_tokens")\
    .setCaseSensitive(True)

In [33]:
#TokenAssembler
tokenAssembler= TokenAssembler()\
    .setInputCols(["sentence", "cleaned_tokens"])\
    .setOutputCol("assembled")

In [34]:
#Stemming
stemmer= Stemmer()\
    .setInputCols(["token"])\
    .setOutputCol("stem")

In [35]:
#lemmatization
lemmatizer= Lemmatizer()\
    .setInputCols(["token"])\
    .setOutputCol("lemma")\
    .setDictionary("AntBNC_lemmas_ver_001.txt",
    value_delimiter="\t", key_delimiter="->")

In [36]:

# You can add Lemmatizer, Stemmer, and Stop Words Cleaner here as needed.

# Finisher: converts tokens to human-readable output
finisher = Finisher() \
    .setInputCols(["normalized"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

In [37]:
# Define the pipeline
from sparknlp.base import Pipeline

nlpPipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    sentenceDetector,
    normalizer,
    stopwordsCleaner,
    tokenAssembler,
    stemmer,
    lemmatizer
])

In [39]:
model = nlpPipeline.fit(df)

Py4JJavaError: An error occurred while calling o241.getParam.
: java.util.NoSuchElementException: Param customBoundsStrategy does not exist.
	at org.apache.spark.ml.param.Params.$anonfun$getParam$2(params.scala:705)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.ml.param.Params.getParam(params.scala:705)
	at org.apache.spark.ml.param.Params.getParam$(params.scala:703)
	at org.apache.spark.ml.PipelineStage.getParam(Pipeline.scala:41)
	at sun.reflect.GeneratedMethodAccessor17.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)


In [32]:
result = model.transform(df)

SyntaxError: invalid syntax (2254586533.py, line 1)

In [31]:
# Apply the pipeline to the DataFrame

result= model.transform(df)

Py4JJavaError: An error occurred while calling o53.getParam.
: java.util.NoSuchElementException: Param customBoundsStrategy does not exist.
	at org.apache.spark.ml.param.Params.$anonfun$getParam$2(params.scala:705)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.ml.param.Params.getParam(params.scala:705)
	at org.apache.spark.ml.param.Params.getParam$(params.scala:703)
	at org.apache.spark.ml.PipelineStage.getParam(Pipeline.scala:41)
	at sun.reflect.GeneratedMethodAccessor16.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)


In [26]:
from pyspark.sql import functions as F

In [32]:
result.select("token.result" ,"normalized.result").show(5, truncate=30)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `normalized`.`result` cannot be resolved. Did you mean one of the following? [`replies`, `images`, `likes`, `retweets`, `fullname`].;
'Project [token#383.result AS result#573, 'normalized.result]
+- Project [tweet_avatar#17, tweet_id#18, url#19, query#20, text#21, username#22, fullname#23, timestamp#24, replies#25, retweets#26, quotes#27, images#28, likes#29, tweet_links#30, tweet_mentions#31, tweet_hashtags#32, in_reply_to#33, document#359, UDF(array(document#359)) AS token#383]
   +- Project [tweet_avatar#17, tweet_id#18, url#19, query#20, text#21, username#22, fullname#23, timestamp#24, replies#25, retweets#26, quotes#27, images#28, likes#29, tweet_links#30, tweet_mentions#31, tweet_hashtags#32, in_reply_to#33, UDF(text#21) AS document#359]
      +- Relation [tweet_avatar#17,tweet_id#18,url#19,query#20,text#21,username#22,fullname#23,timestamp#24,replies#25,retweets#26,quotes#27,images#28,likes#29,tweet_links#30,tweet_mentions#31,tweet_hashtags#32,in_reply_to#33] csv


In [23]:
result.withColumn("tmp", F.explode("assembled"))\
    .select("tmp.*").select("begin", "end", "result",
                            "metadata.sentence").show(5,truncate=30)

NameError: name 'result' is not defined