<a href="https://colab.research.google.com/github/smduarte/ps2024/blob/main/lab3/ps2024_lab2_sol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processamento de Streams 2024
## Lab 2 - (Unstructured) Spark Streaming - Solution
---
### Colab Setup



In [None]:
#@title Mount Google Drive
# Colab-only step: attaches the user's Google Drive to the VM file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
#@title Install PySpark
!pip install pyspark findspark --quiet
import findspark
findspark.init()
findspark.find()

---
### Weblog Sender
The stream server is a small python TCP server, listening
on port 7777 (localhost).

The stream will consist of a set of text lines, obtained from the output log of a webserver.



In [None]:
# Download the logsender archive and unpack it into the working directory
# (tar's error output is discarded).
!wget -q -O - https://github.com/smduarte/ps2024/raw/main/colab/logsender.tgz | tar xfz - 2> /dev/null

# Start three log servers in the background, one per log file, on ports
# 7777 (full log), 7778 (IPv4 subset) and 7779 (IPv6 subset); their output is discarded.
!nohup python logsender/server.py logsender/web.log 7777 > /dev/null 2> /dev/null &
!nohup python logsender/server.py logsender/webipv4.log 7778 > /dev/null 2> /dev/null &
!nohup python logsender/server.py logsender/webipv6.log 7779 > /dev/null 2> /dev/null &

In [None]:
%%bash
# Install net-tools so that netstat is available, then list the TCP entries
# containing "777" to check that the log servers (7777-7779) are up.
apt-get -qq install -y net-tools
netstat -a | grep tcp | grep 777

The Python code below shows the basics needed to process data from a socket source using PySpark.

The Spark Streaming Python API documentation can be found [here](https://spark.apache.org/docs/latest/api/python/reference/pyspark.streaming.html).

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Minimal socket-streaming example: print the raw log lines received on port 7777.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try :
  # 1-second micro-batches. Only the generated temporary path is used as the
  # checkpoint location; the TemporaryDirectory object itself is discarded.
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name, setupFunc = lambda : StreamingContext(sc, 1))
  lines = ssc.socketTextStream("localhost", 7777)

  lines.pprint()

  ssc.start()
  # Let the job run for at most 10 seconds.
  ssc.awaitTermination(10)
except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

---
# Exercises

## Exercise 1

In a denial-of-service event it is important to identify the IP sources that might be attacking the system, by issuing a large number of requests.

Write a program to find the IP sources that have done more than 50 requests in the last 10 seconds -- dump this information every 5 seconds.


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Exercise 1: source IPs with more than 50 requests in the last 10 seconds,
# reported every 5 seconds.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample-Ex1').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try :
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name,
                                           setupFunc = lambda : StreamingContext(sc, 1))

  lines = ssc.socketTextStream("localhost", 7777)

  # 10-second window sliding every 5 seconds. Field 1 of each space-separated
  # line is used as the source IP; requests are counted per IP and only the
  # IPs above the threshold are kept.
  results = lines.window(10, 5)\
            .filter( lambda line : len(line) > 0 ) \
            .map( lambda line: line.split(' ')) \
            .map( lambda values : (values[1], 1)) \
            .reduceByKey( lambda v1, v2: v1 + v2) \
            .filter( lambda kv : kv[1] > 50)

  results.pprint()

  ssc.start()
  # Let the job run for at most 120 seconds.
  ssc.awaitTermination(120)
except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

## Exercise 2

#### a)
Write a program to dump the number of requests, the minimum processing time and the maximum processing time of the requests made in the last 10 seconds, **for all** source IPs that performed more than 100 requests -- dump this information every 5 seconds.

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Exercise 2a: per source IP, (request count, min time, max time) over the last
# 10 seconds, for the IPs with more than 100 requests; reported every 5 seconds.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample-Ex2a').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try:
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name,
                                           setupFunc = lambda : StreamingContext(sc, 1))
  lines = ssc.socketTextStream("localhost", 7777)

  # Each line becomes (ip, (1, time, time)); the reducer then accumulates
  # (count, min, max). Field 1 is used as the source IP and field 5 as the
  # processing time of the request.
  results = lines.window(10, 5)\
            .filter( lambda line : len(line) > 0 ) \
            .map( lambda line: line.split(' ')) \
            .map( lambda values : (values[1], (1, float(values[5]), float(values[5])))) \
            .reduceByKey( lambda a, b: (a[0] + b[0], min( a[1], b[1] ), max( a[2], b[2]))) \
            .filter( lambda kv : kv[1][0] > 100)


  results.pprint()

  ssc.start()
  # Let the job run for at most 120 seconds.
  ssc.awaitTermination(120)

except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

#### b)

Write a program to dump the number of requests, the minimum processing time and the maximum processing time of the requests made in the last 10 seconds for every source IP, but **only if at least one** source IP has performed more than 100 requests -- dump this information every 5 seconds.

In [None]:
#@title Using DStream.join()
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Exercise 2b (join variant): dump the per-IP statistics of the window only
# when at least one IP exceeded 100 requests.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample-Ex2b').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try :
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name,
                                           setupFunc = lambda : StreamingContext(sc, 1))
  lines = ssc.socketTextStream("localhost", 7777)


  # Per IP: (count, min time, max time) over a 10-second window sliding every 5 seconds.
  results = lines.window(10, 5)\
            .filter( lambda line : len(line) > 0 ) \
            .map( lambda line: line.split(' ')) \
            .map( lambda values : (values[1], (1, float(values[5]), float(values[5])))) \
            .reduceByKey( lambda a, b: (a[0] + b[0], min( a[1], b[1] ), max( a[2], b[2])))

  # Collapses to a single (None, None) record when some IP exceeded 100
  # requests, and to an empty stream otherwise.
  moreThan100 = results.filter( lambda t : t[1][0] > 100) \
                  .map( lambda t : (None, None)) \
                  .reduceByKey( lambda a, b : None)

  # Joining on the constant None key keeps every per-IP record when the marker
  # record exists and drops all of them when it does not.
  results = results.map( lambda t : (None, t)) \
                  .join( moreThan100 ) \
                  .map( lambda t: t[1][0])

  results.pprint()
  ssc.start()
  # Let the job run for at most 120 seconds.
  ssc.awaitTermination(120)
except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

In [None]:
#@title Using DStream.glom()

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Exercise 2b (glom variant): dump the per-IP statistics of the window only
# when at least one IP exceeded 100 requests.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample-Ex2b').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try :
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name,
                                           setupFunc = lambda : StreamingContext(sc, 1))
  lines = ssc.socketTextStream("localhost", 7777)

  # repartition(1) + glom() gathers all per-IP records of the window into one
  # list, which is kept whole or replaced by an empty list. any() is used
  # rather than max() so that a window without data (empty list) yields no
  # output instead of raising "max() arg is an empty sequence".
  results = lines.window(10, 5)\
            .filter( lambda line : len(line) > 0 ) \
            .map( lambda line: line.split(' ')) \
            .map( lambda values : (values[1], (1, float(values[5]), float(values[5])))) \
            .reduceByKey( lambda a, b: (a[0] + b[0], min( a[1], b[1] ), max( a[2], b[2]))) \
            .repartition(1) \
            .glom() \
            .map( lambda l : l if any(p[1][0] > 100 for p in l) else [] ) \
            .flatMap( lambda l : l)

  results.pprint()

  ssc.start()
  # Let the job run for at most 120 seconds.
  ssc.awaitTermination(120)
except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

## Exercise 3
Write a program to dump the IP sources that deviate most from the average in terms of the number of requests made in the last 30 seconds - dump this information every 5 seconds.

In [None]:
#@title Using DStream.join()
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Exercise 3: the source IPs whose request count over the last 30 seconds
# deviates most from the average count; reported every 5 seconds.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample-Ex3').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try :
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name,
                                           setupFunc = lambda : StreamingContext(sc, 1))
  lines = ssc.socketTextStream("localhost", 7777)

  # Requests per source IP (field 1) over a 30-second window sliding every 5 seconds.
  counts = lines.window(30, 5)\
            .filter( lambda line : len(line) > 0 ) \
            .map( lambda line: line.split(' ')) \
            .map( lambda values : (values[1], 1)) \
            .reduceByKey( lambda a, b: a + b)

  # Uncomment to inspect the per-IP counts:
  #counts.pprint()

  # Average count per IP: sum of counts / number of IPs, under the constant key None.
  average = counts.map( lambda t : (None, (t[1], 1))) \
            .reduceByKey( lambda a, b: (a[0] + b[0], a[1] + b[1])) \
            .map( lambda t : (None, t[1][0] / t[1][1]))

  # Uncomment to inspect the average:
  #average.map(lambda t : t[1]).pprint()

  # Attach the average to every (ip, count) record through the constant key,
  # compute |count - average|, and keep the 5 records with the largest
  # deviation as (ip, count, deviation).
  results = counts.map( lambda t : (None, t)) \
              .join( average) \
              .map( lambda t : (t[1][0][0], t[1][0][1], abs(t[1][0][1] - t[1][1]))) \
              .transform(lambda rdd:
                  rdd.sortBy(lambda x: x[2], ascending=False) \
                     .zipWithIndex() \
                     .filter( lambda v: v[1] < 5) \
                     .map( lambda v: v[0]))
  results.pprint()


  ssc.start()
  # Let the job run for at most 300 seconds.
  ssc.awaitTermination(300)
except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

## Exercise 4

Run additional logsender servers for subsets of the logs (IPv4 and IPv6 logs), using the following commands.

```
!nohup python logsender/server.py logsender/webipv4.log 7778 > /dev/null 2> /dev/null &
!nohup python logsender/server.py logsender/webipv6.log 7779 > /dev/null 2> /dev/null &
```

Write a program that combines the two streams, dumping the number of requests made in the last 15 seconds - dump this information every 5 seconds.

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Exercise 4: merge the IPv4 and IPv6 streams and count the requests per source
# IP over the last 15 seconds; reported every 5 seconds.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample-Ex4').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try :
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name, setupFunc = lambda : StreamingContext(sc, 1))

  # One socket stream per log server.
  ipv4Sources = ssc.socketTextStream("localhost", 7778)
  ipv6Sources = ssc.socketTextStream("localhost", 7779)

  # The union is windowed (15 seconds, sliding every 5) and then counted per
  # source IP (field 1 of each space-separated line).
  results = ipv6Sources.union(ipv4Sources) \
            .window(15, 5)\
            .filter( lambda line : len(line) > 0 ) \
            .map( lambda line: line.split(' ')) \
            .map( lambda values : (values[1], 1)) \
            .reduceByKey( lambda a, b: a + b)

  results.pprint()

  ssc.start()
  # Let the job run for at most 120 seconds.
  ssc.awaitTermination(120)
except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

## Exercise 5

Write a program that combines the two streams from the previous exercise and dumps the proportion of IPv4 vs IPv6 requests in the last 20 seconds - dump this information every 5 seconds.


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pyspark.sql import *
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import tempfile

# Exercise 5: proportion of IPv4 vs IPv6 requests over the last 20 seconds,
# reported every 5 seconds.
spark = SparkSession.builder.master('local[*]').appName('WebLogExample-Ex5').getOrCreate()
sc = spark.sparkContext

ssc = None  # stays None if the streaming context cannot be obtained
try :
  ssc = StreamingContext.getActiveOrCreate(checkpointPath = tempfile.TemporaryDirectory().name,
                                           setupFunc = lambda : StreamingContext(sc, 1))

  # Tag each line with its protocol; empty lines contribute 0 to the count.
  ipv4Sources = ssc.socketTextStream("localhost", 7778) \
            .map( lambda line : ('ipv4', 1 if len(line) > 0 else 0))

  ipv6Sources = ssc.socketTextStream("localhost", 7779) \
            .map( lambda line : ('ipv6', 1 if len(line) > 0 else 0))

  # Per-protocol counts in the window, re-keyed under the constant key None so
  # that they can be joined with the overall total: (None, (protocol, count)).
  counts = ipv4Sources.union( ipv6Sources) \
            .window(20, 5) \
            .reduceByKey( lambda a, b: a + b) \
            .map( lambda t : (None, t))

  counts.pprint()

  # Strip the protocol tag BEFORE reducing so that the reducer adds integers.
  # Reducing the (protocol, count) tuples directly only yields a number when
  # both protocols are present in the window; with a single protocol the
  # reducer never runs and the "total" would remain a tuple.
  total = counts.map( lambda t : (t[0], t[1][1])) \
            .reduceByKey( lambda a, b: a + b)

  # (protocol, count / total), rounded to 3 decimals; a zero total (only empty
  # lines in the window) yields 0.0 instead of a division by zero.
  results = counts.join( total ) \
        .map( lambda t : t[1]) \
        .map( lambda t : (t[0][0], round(t[0][1] / t[1], 3) if t[1] else 0.0))

  results.pprint()

  ssc.start()
  # Let the job run for at most 60 seconds.
  ssc.awaitTermination(60)
except Exception as err:
  # Report the failure instead of silently swallowing it.
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt) so that
  # the next cell does not get back this already-started context. The
  # SparkContext is kept alive for the following cells.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

In [None]:
from pyspark import SparkConf, SparkContext
import socket
from pyspark.streaming import StreamingContext

# Exercise 5, alternative setup: a StreamingContext with 5-second batches built
# directly on a SparkContext.
# getOrCreate reuses a SparkContext that is already running (calling the
# SparkContext constructor a second time in the same process raises an error);
# the configuration only takes effect when a new context has to be created.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[4]").setAppName("WebLogExample-ex5"))

# Only one StreamingContext can be active at a time: stop any context left
# running by a previous cell, keeping the SparkContext alive.
leftover = StreamingContext.getActive()
if leftover is not None:
  leftover.stop(stopSparkContext=False, stopGraceFully=False)

ssc = None  # stays None if the streaming context cannot be created
try:
  ssc = StreamingContext(sc, 5)

  # Tag each line with its protocol; empty lines contribute 0 to the count.
  ipv4Sources = ssc.socketTextStream("localhost", 7778) \
            .map( lambda line : ('ipv4', 1 if len(line)>0 else 0))

  ipv6Sources = ssc.socketTextStream("localhost", 7779) \
            .map( lambda line : ('ipv6', 1 if len(line)>0 else 0))


  # Per-protocol counts over the last 20 seconds, re-keyed under the constant
  # key None: (None, (protocol, count)).
  counts = ipv4Sources.union( ipv6Sources) \
            .window(20, 5) \
            .reduceByKey( lambda a, b: a+b) \
            .map( lambda t : (None, t))

  # Strip the protocol tag BEFORE reducing so that the reducer adds integers;
  # otherwise a window containing a single protocol leaves a tuple as "total".
  total = counts.map( lambda t : (t[0], t[1][1])) \
            .reduceByKey( lambda a, b: a + b)

  # (protocol, count / total), rounded to 3 decimals; a zero total yields 0.0
  # instead of a division by zero.
  results = counts.join( total ) \
        .map( lambda t : t[1]) \
        .map( lambda t : (t[0][0], round(t[0][1] / t[1], 3) if t[1] else 0.0))

  results.pprint()

  ssc.start()
  # Let the job run for at most 120 seconds.
  ssc.awaitTermination(120)
except Exception as err:
  print(err)
finally:
  # Always stop the streaming context (on timeout, error or interrupt), but
  # keep the SparkContext so the cell can be re-run.
  if ssc is not None:
    ssc.stop(stopSparkContext=False, stopGraceFully=False)