In [18]:
import os
import logging

from pyflink.common import WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors.kafka import KafkaSource, KafkaOffsetsInitializer
from pyflink.datastream.formats.json import JsonRowDeserializationSchema
from pyflink.common import Types, Row
from pyflink.datastream.connectors.jdbc import JdbcSink, JdbcConnectionOptions, JdbcExecutionOptions
from pyflink.datastream.connectors.kafka import KafkaTopicPartition
import json
from datetime import datetime

RUNTIME_ENV = os.getenv("RUNTIME_ENV", "local")
BOOTSTRAP_SERVERS = os.getenv("BOOTSTRAP_SERVERS", "localhost:9092")

In [19]:
env = StreamExecutionEnvironment.get_execution_environment()
env.set_runtime_mode(RuntimeExecutionMode.STREAMING)

JavaObject id=o365

In [20]:
jar_files = ["flink-sql-connector-kafka-3.2.0-1.18.jar", "postgresql-42.6.0.jar", "flink-connector-jdbc-3.1.2-1.18.jar"]

In [21]:
CURRENT_DIR = os.getcwd()

In [22]:
departure_type = Types.ROW_NAMED(['delay', 'time', 'uncertainty'], [Types.INT(), Types.INT(), Types.INT()])
arrival_type = Types.ROW_NAMED(['delay', 'time', 'uncertainty'], [Types.INT(), Types.INT(), Types.INT()])
stop_time_update_type = Types.ROW_NAMED(['stop_sequence', 'arrival', 'departure', 'stop_id', 'schedule_relationship'], [Types.INT(), arrival_type, departure_type, Types.STRING(), Types.INT()])
vehicle_type_info = Types.ROW_NAMED(['id', 'label', 'license_plate'], [Types.STRING(), Types.STRING(), Types.STRING()])
trip_update_trip_info = Types.ROW_NAMED(['trip_id', 'start_time', 'start_date', 'schedule_relationship', 'route_id', 'direction_id'], [Types.STRING(), Types.STRING(), Types.STRING(), Types.INT(), Types.STRING(), Types.INT()])
trip_update_type = Types.ROW_NAMED(['trip', 'stop_time_update', 'vehicle', 'timestamp', 'delay'],[trip_update_trip_info, Types.OBJECT_ARRAY(stop_time_update_type), Types.STRING(), Types.STRING(), Types.INT()])
trip_info_type = Types.ROW_NAMED(['id', 'is_deleted', 'trip_update', 'vehicle', 'alert'], [Types.STRING(), Types.BOOLEAN(), trip_update_type, Types.STRING(), Types.STRING()])
json_format = JsonRowDeserializationSchema.builder().type_info(trip_info_type).build()

In [23]:
jar_paths = tuple(
            [f"file://{os.path.join(CURRENT_DIR, 'Downloads', name)}" for name in jar_files]
        )
logging.info(f"adding local jars - {', '.join(jar_files)}")
env.add_jars(*jar_paths)

In [24]:
partition_set = {
    KafkaTopicPartition("transitStream", 0)
}

flink_simple_json_source = (
        KafkaSource.builder()
        .set_bootstrap_servers(BOOTSTRAP_SERVERS)
        .set_group_id("flink.testertransit")
        .set_starting_offsets(KafkaOffsetsInitializer.earliest())
        .set_value_only_deserializer(
            json_format
        )
        .set_partitions(partition_set)
        .build()
    )

In [25]:
flink_stream = env.from_source(
        flink_simple_json_source, WatermarkStrategy.no_watermarks(), "what is this"
    )

In [26]:
def extract_trip_info(row):
    trip_id = row.id
    start_time = datetime.strptime(row.trip_update.trip.start_time, "%H:%M:%S")
    start_date = datetime.strptime(row.trip_update.trip.start_date, "%Y%m%d")
    for stops in row.trip_update.stop_time_update:
        stop_seq = stops.stop_sequence
        if stops.departure is not None:
            if stops.departure.delay > 0:
                delay_in_seconds = stops.departure.delay
                status = "delayed"
            elif stops.departure.delay < 0:
                delay_in_seconds = abs(stops.departure.delay)
                status = "early"
            else:
                delay_in_seconds = stops.departure.delay
                status = "running on time"
            stop_time = datetime.fromtimestamp(stops.departure.time)
        else:
            if stops.arrival.delay > 0:
                delay_in_seconds = stops.arrival.delay
                status = "delayed"
            elif stops.arrival.delay < 0:
                delay_in_seconds = abs(stops.arrival.delay)
                status = "early"
            else:
                delay_in_seconds = stops.arrival.delay
                status = "running on time"
            stop_time = datetime.fromtimestamp(stops.arrival.time)
        yield Row(trip_id, start_time, start_date, stop_seq, stop_time, delay_in_seconds, status)

In [27]:
row_type_info = Types.ROW([Types.STRING(), Types.SQL_TIME(), Types.SQL_DATE(), Types.INT(),Types.SQL_TIMESTAMP(), Types.INT(), Types.STRING()])

flink_stream.flat_map(extract_trip_info, output_type = row_type_info).add_sink(
    JdbcSink.sink(
        """INSERT INTO public.trip_updates (trip_id, start_time, start_date, stop_seq, stop_time, delay_in_sec, status)
            VALUES (?, ?, ?, ?, ?, ?, ?) 
            ON CONFLICT (trip_id, start_date, stop_seq, u) 
            DO UPDATE SET 
            trip_id = EXCLUDED.trip_id,
            start_time = EXCLUDED.start_time,
            start_date = EXCLUDED.start_date,
            stop_seq = EXCLUDED.stop_seq,
            stop_time = EXCLUDED.stop_time,
            delay_in_sec = EXCLUDED.delay_in_sec,
            status = EXCLUDED.status
        """,
        row_type_info,
        JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
            .with_url('jdbc:postgresql://localhost:5432/transitstreamtest')
            .with_driver_name('org.postgresql.Driver')
            .with_user_name('root')
            .with_password('root')
            .build(),
        JdbcExecutionOptions.builder()
            .with_batch_interval_ms(1000)
            .with_batch_size(200)
            .with_max_retries(2)
            .build()
)
)


<pyflink.datastream.data_stream.DataStreamSink at 0x12e683970>

In [28]:
env.execute("")

Py4JJavaError: An error occurred while calling o354.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
	at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
	at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:141)
	at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:646)
	at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510)
	at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2179)
	at org.apache.flink.runtime.rpc.pekko.PekkoInvocationHandler.lambda$invokeRpc$1(PekkoInvocationHandler.java:268)
	at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:863)
	at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:841)
	at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510)
	at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2179)
	at org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1287)
	at org.apache.flink.runtime.concurrent.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
	at org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
	at org.apache.flink.runtime.concurrent.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
	at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:863)
	at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:841)
	at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:510)
	at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2179)
	at org.apache.flink.runtime.concurrent.pekko.ScalaFutureUtils$1.onComplete(ScalaFutureUtils.java:47)
	at org.apache.pekko.dispatch.OnComplete.internal(Future.scala:310)
	at org.apache.pekko.dispatch.OnComplete.internal(Future.scala:307)
	at org.apache.pekko.dispatch.japi$CallbackBridge.apply(Future.scala:234)
	at org.apache.pekko.dispatch.japi$CallbackBridge.apply(Future.scala:231)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at org.apache.flink.runtime.concurrent.pekko.ScalaFutureUtils$DirectExecutionContext.execute(ScalaFutureUtils.java:65)
	at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:72)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:288)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:288)
	at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:288)
	at org.apache.pekko.pattern.PromiseActorRef.$bang(AskSupport.scala:629)
	at org.apache.pekko.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:34)
	at org.apache.pekko.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:33)
	at scala.concurrent.Future.$anonfun$andThen$1(Future.scala:536)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at org.apache.pekko.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:73)
	at org.apache.pekko.dispatch.BatchingExecutor$BlockableBatch.$anonfun$run$1(BatchingExecutor.scala:110)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:85)
	at org.apache.pekko.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:110)
	at org.apache.pekko.dispatch.TaskInvocation.run(AbstractDispatcher.scala:59)
	at org.apache.pekko.dispatch.ForkJoinExecutorConfigurator$PekkoForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:57)
	at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:507)
	at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1491)
	at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:2073)
	at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:2035)
	at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:187)
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
	at org.apache.flink.runtime.executiongraph.failover.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:180)
	at org.apache.flink.runtime.executiongraph.failover.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:107)
	at org.apache.flink.runtime.scheduler.DefaultScheduler.recordTaskFailure(DefaultScheduler.java:277)
	at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:268)
	at org.apache.flink.runtime.scheduler.DefaultScheduler.onTaskFailed(DefaultScheduler.java:261)
	at org.apache.flink.runtime.scheduler.SchedulerBase.onTaskExecutionStateUpdate(SchedulerBase.java:787)
	at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:764)
	at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:83)
	at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:488)
	at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRpcInvocation$1(PekkoRpcActor.java:309)
	at org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:83)
	at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcInvocation(PekkoRpcActor.java:307)
	at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:222)
	at org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:85)
	at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:168)
	at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33)
	at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29)
	at scala.PartialFunction.applyOrElse(PartialFunction.scala:127)
	at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126)
	at org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29)
	at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175)
	at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176)
	at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176)
	at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547)
	at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545)
	at org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229)
	at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590)
	at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557)
	at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:280)
	at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:241)
	at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:253)
	... 5 more
Caused by: org.apache.flink.runtime.taskmanager.AsynchronousException: Caught exception while processing timer.
	at org.apache.flink.streaming.runtime.tasks.StreamTask$StreamTaskAsyncExceptionHandler.handleAsyncException(StreamTask.java:1642)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.handleAsyncException(StreamTask.java:1617)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.invokeProcessingTimeCallback(StreamTask.java:1771)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.lambda$null$25(StreamTask.java:1760)
	at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.runThrowing(StreamTaskActionExecutor.java:50)
	at org.apache.flink.streaming.runtime.tasks.mailbox.Mail.run(Mail.java:90)
	at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMail(MailboxProcessor.java:398)
	at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:367)
	at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.processMail(MailboxProcessor.java:352)
	at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:229)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:909)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:858)
	at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(Task.java:958)
	at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:937)
	at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:751)
	at org.apache.flink.runtime.taskmanager.Task.run(Task.java:566)
	at java.base/java.lang.Thread.run(Thread.java:1570)
Caused by: TimerException{org.apache.flink.streaming.runtime.tasks.ExceptionInChainedOperatorException: Could not forward element to next operator}
	... 15 more
Caused by: org.apache.flink.streaming.runtime.tasks.ExceptionInChainedOperatorException: Could not forward element to next operator
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:92)
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:50)
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:29)
	at org.apache.flink.streaming.api.operators.TimestampedCollector.collect(TimestampedCollector.java:52)
	at org.apache.flink.streaming.api.operators.python.process.collector.RunnerOutputCollector.collect(RunnerOutputCollector.java:52)
	at org.apache.flink.streaming.api.operators.python.process.AbstractExternalOneInputPythonFunctionOperator.emitResult(AbstractExternalOneInputPythonFunctionOperator.java:133)
	at org.apache.flink.streaming.api.operators.python.process.AbstractExternalPythonFunctionOperator.emitResults(AbstractExternalPythonFunctionOperator.java:142)
	at org.apache.flink.streaming.api.operators.python.process.AbstractExternalPythonFunctionOperator.invokeFinishBundle(AbstractExternalPythonFunctionOperator.java:101)
	at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.checkInvokeFinishBundleByTime(AbstractPythonFunctionOperator.java:300)
	at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.lambda$open$0(AbstractPythonFunctionOperator.java:118)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.invokeProcessingTimeCallback(StreamTask.java:1769)
	... 14 more
Caused by: java.io.IOException: Writing records to JDBC failed.
	at org.apache.flink.connector.jdbc.internal.JdbcOutputFormat.writeRecord(JdbcOutputFormat.java:198)
	at org.apache.flink.connector.jdbc.internal.GenericJdbcSinkFunction.invoke(GenericJdbcSinkFunction.java:57)
	at org.apache.flink.streaming.api.operators.StreamSink.processElement(StreamSink.java:54)
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:75)
	... 24 more
Caused by: java.io.IOException: java.sql.BatchUpdateException: Batch entry 0 INSERT INTO public.trip_updates (trip_id, start_time, start_date, stop_seq, stop_time, delay_in_sec, status)
            VALUES ('1181513', '16:02:00-06', '2024-07-17 -05', 56, '2024-07-17 16:47:54-05', 0, 'running on time') 
            ON CONFLICT (trip_id, start_date, stop_seq, u) 
            DO UPDATE SET 
            trip_id = EXCLUDED.trip_id,
            start_time = EXCLUDED.start_time,
            start_date = EXCLUDED.start_date,
            stop_seq = EXCLUDED.stop_seq,
            stop_time = EXCLUDED.stop_time,
            delay_in_sec = EXCLUDED.delay_in_sec,
            status = EXCLUDED.status
         was aborted: ERROR: column "u" does not exist
  Position: 183  Call getNextException to see other errors in the batch.
	at org.apache.flink.connector.jdbc.internal.JdbcOutputFormat.flush(JdbcOutputFormat.java:222)
	at org.apache.flink.connector.jdbc.internal.JdbcOutputFormat.writeRecord(JdbcOutputFormat.java:195)
	... 27 more
Caused by: java.sql.BatchUpdateException: Batch entry 0 INSERT INTO public.trip_updates (trip_id, start_time, start_date, stop_seq, stop_time, delay_in_sec, status)
            VALUES ('1181513', '16:02:00-06', '2024-07-17 -05', 56, '2024-07-17 16:47:54-05', 0, 'running on time') 
            ON CONFLICT (trip_id, start_date, stop_seq, u) 
            DO UPDATE SET 
            trip_id = EXCLUDED.trip_id,
            start_time = EXCLUDED.start_time,
            start_date = EXCLUDED.start_date,
            stop_seq = EXCLUDED.stop_seq,
            stop_time = EXCLUDED.stop_time,
            delay_in_sec = EXCLUDED.delay_in_sec,
            status = EXCLUDED.status
         was aborted: ERROR: column "u" does not exist
  Position: 183  Call getNextException to see other errors in the batch.
	at org.postgresql.jdbc.BatchResultHandler.handleCompletion(BatchResultHandler.java:186)
	at org.postgresql.jdbc.PgStatement.internalExecuteBatch(PgStatement.java:881)
	at org.postgresql.jdbc.PgStatement.executeBatch(PgStatement.java:919)
	at org.postgresql.jdbc.PgPreparedStatement.executeBatch(PgPreparedStatement.java:1685)
	at org.apache.flink.connector.jdbc.internal.executor.SimpleBatchStatementExecutor.executeBatch(SimpleBatchStatementExecutor.java:73)
	at org.apache.flink.connector.jdbc.internal.JdbcOutputFormat.attemptFlush(JdbcOutputFormat.java:246)
	at org.apache.flink.connector.jdbc.internal.JdbcOutputFormat.flush(JdbcOutputFormat.java:216)
	... 28 more
Caused by: org.postgresql.util.PSQLException: ERROR: column "u" does not exist
  Position: 183
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2713)
	at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2401)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:368)
	at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:327)
	at org.postgresql.jdbc.PgStatement.internalExecuteBatch(PgStatement.java:877)
	... 33 more


In [None]:
env.close()