In [1]:
from pyspark.sql import SparkSession, Row, functions as F
from pyspark.sql.types import *


In [2]:
spark = (
    SparkSession
    .builder
    .config("spark.mongodb.write.connection.uri", "mongodb://127.0.0.1/telematics_playgroud")
    .getOrCreate()
)

In [3]:
data_schema = ArrayType(
    StructType([
        StructField("c", StringType(), True),
        StructField("p", DoubleType(), True),
        StructField("s", StringType(), True),
        StructField("t", LongType(), True),
        StructField("v", DoubleType(), True)
    ])
)

In [4]:
main_schema = StructType([
    StructField("data", data_schema, True),
    StructField("type", StringType(), True)
])

In [5]:
data_list = [
    Row(data=[
        Row(c=None, p=36532.12890625, s='BINANCE:BTCUSDT', t=1700370088463, v=0.01640000008046627),
        Row(c=None, p=36532.12890625, s='BINANCE:BTCUSDT', t=1700370088463, v=0.002369999885559082)
    ], type='trade')
]

In [6]:
df = spark.createDataFrame(data_list, schema=main_schema)

In [7]:
df.show()

+--------------------+-----+
|                data| type|
+--------------------+-----+
|[{NULL, 36532.128...|trade|
+--------------------+-----+



-----

In [8]:
parsed_stream = df\
    .selectExpr("explode (data) as data")\
    .selectExpr(
        "data.c as TradeConditions", 
        "data.p as LastPrice", 
        "data.s as Symbol", 
        "data.t as UNIXTimestamp", 
        "data.v as Volume"
    )

In [9]:
parsed_stream.show()

+---------------+--------------+---------------+-------------+--------------------+
|TradeConditions|     LastPrice|         Symbol|UNIXTimestamp|              Volume|
+---------------+--------------+---------------+-------------+--------------------+
|           NULL|36532.12890625|BINANCE:BTCUSDT|1700370088463| 0.01640000008046627|
|           NULL|36532.12890625|BINANCE:BTCUSDT|1700370088463|0.002369999885559082|
+---------------+--------------+---------------+-------------+--------------------+



In [10]:
parsed_stream = parsed_stream.withColumn("Timestamp", F.current_timestamp())

In [11]:
parsed_stream.show()

+---------------+--------------+---------------+-------------+--------------------+--------------------+
|TradeConditions|     LastPrice|         Symbol|UNIXTimestamp|              Volume|           Timestamp|
+---------------+--------------+---------------+-------------+--------------------+--------------------+
|           NULL|36532.12890625|BINANCE:BTCUSDT|1700370088463| 0.01640000008046627|2023-11-20 14:01:...|
|           NULL|36532.12890625|BINANCE:BTCUSDT|1700370088463|0.002369999885559082|2023-11-20 14:01:...|
+---------------+--------------+---------------+-------------+--------------------+--------------------+



In [14]:
parsed_stream.write\
        .format("mongodb")\
        .option("uri", "mongodb://34.27.126.27:27017")\
        .mode("append")\
        .option("database", "telematics_playgroud")\
        .option("collection", "tickers_test")\
        .save()

Py4JJavaError: An error occurred while calling o66.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: mongodb. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:724)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:863)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:257)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:248)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: mongodb.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:588)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:521)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 16 more
