# Solr Hacking (2)

From chatgpt: https://chatgpt.com/c/1cd2c89f-5a32-4ae1-a4f0-ca5aaf54a6a6

In [1]:
import sparknlp
from pyspark.sql import SparkSession
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, Tokenizer, BertSentenceEmbeddings
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType

In [2]:
# Start SparkSession
spark = SparkSession.builder \
    .appName("Spark NLP") \
    .master("local[*]") \
    .getOrCreate()

# Sample DataFrame with text
data = [
    (1, "This is the first sentence."),
    (2, "Here is another sentence.")
]

df = spark.createDataFrame(data, ["id", "text"])

# Document Assembler
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Sentence Detector
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

# Tokenizer
tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

# Sentence Embeddings
sentence_embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_768") \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingLayer("mean")

# Pipeline
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    sentence_embeddings
])

24/08/27 20:59:25 WARN Utils: Your hostname, minti9 resolves to a loopback address: 127.0.1.1; using 192.168.1.101 instead (on interface enp4s0)
24/08/27 20:59:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/27 20:59:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/27 20:59:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


TypeError: 'JavaPackage' object is not callable

In [None]:
# Fit and transform the pipeline
model = pipeline.fit(df)
result = model.transform(df)

In [3]:
# UDF to convert embeddings to array of floats
def extract_embeddings(embeddings):
    return [float(x) for x in embeddings[0].embeddings]

# Register UDF
extract_embeddings_udf = udf(extract_embeddings, ArrayType(FloatType()))

# Apply UDF to get the embeddings in the required format
result_with_embeddings = result.withColumn(
    "sentence_embeddings", extract_embeddings_udf(col("sentence_embeddings"))
)

# Select relevant columns and rename
final_df = result_with_embeddings.select(
    col("sentence.result").alias("sentence"),
    "sentence_embeddings"
)

# Show the final DataFrame
final_df.show(truncate=False)

# Saving to Solr
final_df.write.format("solr") \
    .option("zkhost", "your_zookeeper_host") \
    .option("collection", "your_solr_collection_name") \
    .save()

NameError: name 'pipeline' is not defined