# TCP Weblog Server
This notebook simulates a log producer that uses a TCP connection to send the web server logs to a connecting client.

## Let's reuse the `WebLog` definition used in the batch approach

In [ ]:
import java.sql.Timestamp
/**
 * One web server log record.
 *
 * NOTE(review): field meanings are inferred from their names only (requesting
 * host, request time, request line, HTTP status code, response size in bytes)
 * and should be confirmed against the dataset description.
 *
 * @param host       the record's `host` value
 * @param timestamp  the record's `timestamp` value
 * @param request    the record's `request` value
 * @param http_reply the record's `http_reply` value
 * @param bytes      the record's `bytes` value
 */
case class WebLog(
  host: String,
  timestamp: Timestamp,
  request: String,
  http_reply: Int,
  bytes: Long
)

import java.sql.Timestamp
defined class WebLog


In [ ]:
// Notebook list widgets used as live monitors by the server code in this notebook:
// `connectionWidget` receives server/connection lifecycle messages and
// `dataWidget` receives one entry per record sent to a client.
// NOTE(review): the numeric argument to `ul` is presumably the number of
// entries the list keeps visible — confirm against the notebook widget API.
val connectionWidget = ul(5)
val dataWidget = ul(20)

connectionWidget: notebook.front.widgets.HtmlList = <HtmlList widget>
dataWidget: notebook.front.widgets.HtmlList = <HtmlList widget>


## A Simple TCP server implementation

In [ ]:
// Simple multithreaded server
import java.net._
import java.io._
import java.sql.Timestamp
import scala.concurrent.Future
import scala.annotation.tailrec
import scala.collection.JavaConverters._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import scala.concurrent.ExecutionContext.Implicits.global

/**
 * Simulated web-log producer: listens on a TCP port and streams the given
 * web logs, one JSON document per line, to every client that connects.
 *
 * Timestamps are shifted so that the earliest record in `data` maps to the
 * moment the client's stream is prepared.
 *
 * Progress is reported through the notebook-level `connectionWidget` and
 * `dataWidget`.
 *
 * @param sparkSession session whose implicits provide the encoders used here
 * @param port         TCP port to listen on
 * @param data         the web logs to replay to each client
 */
class SocketHandler(sparkSession: SparkSession, port: Int, data: Dataset[WebLog]) {
  /** Pause between two consecutive records sent to a client, in milliseconds. */
  val LogDelay = 500 // millis

  /** Longest time a single `accept()` blocks before `active` is re-checked, in milliseconds. */
  private val AcceptTimeout = 1000 // millis

  /** True while the server should keep accepting connections and streaming data. */
  @volatile var active = false

  /** Non-blocking start: runs the accept loop on a new background thread. */
  def start(): Unit = {
    active = true
    new Thread() {
      override def run(): Unit = {
        connectionWidget.append("Server starting...")
        // Report the stop even when the accept loop ends with an exception.
        try acceptConnections()
        finally connectionWidget.append("Server stopped")
      }
    }.start()
  }

  /**
   * Requests shutdown. The accept loop and every client stream observe the
   * flag and wind down on their own; this call does not wait for them.
   */
  def stop(): Unit = {
    active = false
  }

  /**
   * Blocking accept loop. Binds the listening socket exactly once, hands each
   * accepted connection to [[serve]], and releases the port when the loop ends.
   */
  final def acceptConnections(): Unit = {
    val server = new ServerSocket(port)
    try {
      // Bounded accept() so that stop() takes effect without a new client connecting.
      server.setSoTimeout(AcceptTimeout)
      acceptLoop(server)
    } finally {
      server.close()
    }
  }

  /** Waits for one connection (or the accept timeout) and repeats while `active`. */
  @tailrec
  private def acceptLoop(server: ServerSocket): Unit = {
    val connection =
      try Some(server.accept())
      catch { case _: SocketTimeoutException => None }
    connection.foreach { socket =>
      connectionWidget.append("Accepting connection from: " + socket)
      serve(socket)
    }
    if (active) acceptLoop(server)
  }

  /**
   * Serves one client on its own thread (1-thread per connection model, for
   * example purposes). Returns immediately; the socket is closed by the
   * serving thread once the stream ends.
   *
   * @param socket the accepted client connection
   */
  def serve(socket: Socket): Unit = {
    new Thread() {
      override def run(): Unit = streamTo(socket)
    }.start()
  }

  /**
   * Streams the time-shifted logs to `socket` until the data runs out, the
   * server is stopped, or the client goes away. Always closes the socket.
   */
  private def streamTo(socket: Socket): Unit = {
    import sparkSession.implicits._
    try {
      connectionWidget.append(s"Starting data stream for: [${socket.getInetAddress()}]")
      // The aggregate is expected to be null when `data` has no rows; then nothing is shifted (or sent).
      val earliest = Option(data.select(min($"timestamp")).as[Timestamp].first)
      val offset = earliest.fold(0L)(ts => System.currentTimeMillis - ts.getTime)
      val records = data
        .map(weblog => weblog.copy(timestamp = new Timestamp(weblog.timestamp.getTime + offset)))
        .toJSON
        .toLocalIterator
        .asScala
      val out = new PrintStream(socket.getOutputStream())
      try {
        var clientConnected = true
        while (clientConnected && active && records.hasNext) {
          val record = records.next()
          out.println(record)
          dataWidget.append(s"[${socket.getInetAddress()}] sending: ${record.take(40)}...")
          // PrintStream never throws on write; checkError() flushes and reports a broken connection.
          clientConnected = !out.checkError()
          if (clientConnected) Thread.sleep(LogDelay)
        }
      } finally {
        out.close()
      }
    } finally {
      socket.close()
    }
  }
}


import java.net._
import java.io._
import java.sql.Timestamp
import scala.concurrent.Future
import scala.annotation.tailrec
import scala.collection.JavaConverters._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.concurrent.ExecutionContext.Implicits.global
defined class SocketHandler


## We want to reuse the NASA weblog dataset with a Back-to-the-Future twist.
We are going to bring the timestamps to our current time.

In [ ]:
// Directory containing the unpacked dataset files (per its name, the NASA
// July 1995 web logs). Update this path to match your environment.
val logsDirectory = "/tmp/data/nasa_dataset_july_1995"
// Read the files in that directory as JSON into an untyped DataFrame.
val rawLogs = sparkSession.read.json(logsDirectory)

logsDirectory: String = /tmp/data/nasa_dataset_july_1995
rawLogs: org.apache.spark.sql.DataFrame = [bytes: bigint, host: string ... 3 more fields]


In [ ]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType
// Re-type `http_reply` as an integer so the rows line up with `WebLog.http_reply: Int`.
val preparedLogs = rawLogs.withColumn(
  "http_reply",
  col("http_reply").cast(IntegerType)
)
// Typed view of the prepared rows.
val weblogs = preparedLogs.as[WebLog]

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType
preparedLogs: org.apache.spark.sql.DataFrame = [bytes: bigint, host: string ... 3 more fields]
weblogs: org.apache.spark.sql.Dataset[WebLog] = [bytes: bigint, host: string ... 3 more fields]


In [ ]:
// Create the log server for TCP port 9999, backed by the typed web logs.
// Construction alone does not open the port; that happens in `start()`.
val server = new SocketHandler(sparkSession, 9999, weblogs)

server: SocketHandler = SocketHandler@34a68859


# Interactions Monitor
These two widgets will give us a view on connections and data being sent to a connecting client.

In [ ]:
// Render the connection monitor as this cell's output.
connectionWidget

res12: notebook.front.widgets.HtmlList = <HtmlList widget>


In [ ]:
// Render the sent-data monitor as this cell's output.
dataWidget 

res14: notebook.front.widgets.HtmlList = <HtmlList widget>


## Start the server accept process

In [ ]:
// Launch the accept loop on a background thread; this call returns immediately.
server.start()

# Stop the server
After experimenting with the TCP stream, execute the `stop` method below to stop the data stream.

*DO NOT* stop the server right after starting it. The command is commented out to prevent accidental execution. Uncomment and execute to stop this producer.  

In [ ]:
// Deliberately commented out so that running every cell in order does not stop
// the server right after starting it. Uncomment and execute to stop the producer.
// server.stop()