In [2]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.4

--2023-11-21 04:10:50--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2023-11-21 04:10:50--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’


2023-11-21 04:10:50 (50.1 MB/s) - written to stdout [1191/1191]

Installing PySpark 3.2.3 and Spark NLP 5.1.4
setup Colab for PySpark 3.2.3 and Spark NLP 5

In [3]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.4")\
    .getOrCreate()

In [4]:
import sparknlp

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  5.1.4
Apache Spark version:  3.2.3


In [5]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip

--2023-11-21 04:14:21--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [ <=>                ] 198.65K  1.01MB/s    in 0.2s    

2023-11-21 04:14:22 (1.01 MB/s) - ‘smsspamcollection.zip’ saved [203415]



In [6]:
!unzip -o smsspamcollection.zip

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [7]:
sms = spark.read.csv("file:/content/SMSSpamCollection", header=False,inferSchema=True, sep='\t').toDF("label", "text")

In [8]:
id2label = {0: "ham", 1: "spam"}
label2id = {'ham': 0, 'spam': 1}

In [9]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

# `pandas_udf` is the annotator that transforms a custom function
# into a udf, so we can call this function inside `select`.
@pandas_udf('integer')
def replace_labels_with_ids(labels: pd.Series) -> pd.Series:
  return labels.apply(lambda x: label2id[x])

sms_id_labels = sms.select(replace_labels_with_ids(sms.label).alias('label'), sms.text)
sms_id_labels.toPandas().head(10)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [10]:
(train, test) = sms_id_labels.persist().randomSplit([0.8, 0.2])

In [13]:
from pyspark.sql.types import ArrayType, FloatType, IntegerType
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np

# Define a UDF to convert logits to a prediction
def argmax(logits):
    return int(np.argmax(logits))
argmax_udf = udf(argmax, IntegerType())

In [14]:
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.functions import col

In [28]:
test.show(5)

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0| says that he's q...|
|    0|"Happy valentines...|
|    0|"Its Ur luck to L...|
|    0|"The world suffer...|
|    0|'An Amazing Quote...|
+-----+--------------------+
only showing top 5 rows



In [15]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Initialize Spark NLP components
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

bert_embeddings = BertEmbeddings.pretrained("bert_base_uncased", "en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings")

sentence_embeddings = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classifier = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("class") \
    .setLabelColumn("label") \
    .setMaxEpochs(5) \
    .setBatchSize(8)

# Define the pipeline
pipeline = Pipeline().setStages([
    document_assembler,
    tokenizer,
    bert_embeddings,
    sentence_embeddings,
    classifier
])

# Fit the pipeline to the training data
model = pipeline.fit(train)

# Predict on the test data
predictions = model.transform(test)

bert_base_uncased download started this may take some time.
Approximate size to download 392.5 MB
[OK!]


In [26]:
predictions.show(5)

+----------+-----+
|prediction|label|
+----------+-----+
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
|         0|  0.0|
+----------+-----+
only showing top 5 rows



In [30]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Ensure the prediction column is of type double
predictions = predictions.withColumn("prediction", col("prediction").cast(DoubleType()))

# Ensure the label column is also of type double
predictions = predictions.withColumn("label", col("label").cast(DoubleType()))

# Select the prediction and true label
predictions = predictions.select(col("prediction"), col("label"))

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8682242990654205
