In [1]:
# 1. Criar uma aplicação em scala usando o spark para ler os dados da porta 9999 e exibir no console

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from time import sleep

# If netcat is missing (e.g. after the environment was taken down), install it
# from a terminal:
#   apt update
#   apt install netcat
#
# Before starting the query, start the listener in a separate shell. It is a
# shell command, not Python, so it must not be placed in this cell as code:
#   nc -lp 9999

# `spark` is assumed to be the SparkSession provided by the notebook kernel.
# load() has to be *called*; without the parentheses the name is bound to the
# method object and `.writeStream` below fails with AttributeError.
porta_leitura = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
    .load()
)
porta_saida = porta_leitura.writeStream.format("console").start()

# In Jupyter the original note suggests the "memory" sink instead of "console".
# The memory sink needs a query name, which becomes the in-memory table name:
# porta_saida = porta_leitura.writeStream.format("memory").queryName("porta_9999").start()


In [7]:
# 2. Ler os arquivos csv “hdfs://namenode:8020/user/<nome>/data/iris/*.data” em modo streaming com o seguinte schema:

# List the *.data files inside the iris directory on HDFS
!hdfs dfs -ls /user/marcos/data/exercises-data/iris/*.data

# bezdekIris.data (150 lines)
# iris.data

-rw-r--r--   3 root supergroup       4551 2021-06-23 21:56 /user/marcos/data/exercises-data/iris/bezdekIris.data
-rw-r--r--   3 root supergroup       4551 2021-06-23 21:56 /user/marcos/data/exercises-data/iris/iris.data


In [13]:
# para trabalhar com o readStream precisando setar os schemas
from pyspark.sql.types import StructType


In [14]:
# Explicit schema for the iris CSV files: four float measurements followed by
# the class label. NOTE: "lenght" is a misspelling of "length", kept as-is
# because the column names are what the rest of the notebook displays.
iris_schema = (
    StructType()
    .add("sepal_lenght", "float")
    .add("sepal_width", "float")
    .add("petal_lenght", "float")
    .add("petal_width", "float")
    .add("class", "string")
)

In [15]:
# 3. Display the schema of the data

print(iris_schema)

StructType(List(StructField(sepal_lenght,FloatType,true),StructField(sepal_width,FloatType,true),StructField(petal_lenght,FloatType,true),StructField(petal_width,FloatType,true),StructField(class,StringType,true)))


In [19]:
# 4. Salvar os dados no diretório “hdfs://namenode:8020/user/<nome>/stream_iris/path” e o checkpoint em “hdfs://namenode:8020/user/<nome>/stream_iris/check”
# Batch-read every file matching *.data as CSV, applying the iris_schema
# defined earlier, and preview the first 5 rows.
# show() only prints and returns None, so the DataFrame is assigned first and
# show() is called on it afterwards; chaining would leave `iris` set to None.
iris = spark.read.schema(iris_schema).csv("/user/marcos/data/exercises-data/iris/*.data")
iris.show(5)


+------------+-----------+------------+-----------+-----------+
|sepal_lenght|sepal_width|petal_lenght|petal_width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [17]:
# printSchema() only prints and returns None, so keep the DataFrame in `iris`
# and call printSchema() separately instead of assigning its result.
iris = spark.read.schema(iris_schema).csv("/user/marcos/data/exercises-data/iris/*.data")
iris.printSchema()

root
 |-- sepal_lenght: float (nullable = true)
 |-- sepal_width: float (nullable = true)
 |-- petal_lenght: float (nullable = true)
 |-- petal_width: float (nullable = true)
 |-- class: string (nullable = true)



In [20]:
# Streaming read: same CSV glob and schema as the batch read, via readStream
iris = (
    spark.readStream
    .schema(iris_schema)
    .csv("/user/marcos/data/exercises-data/iris/*.data")
)

In [22]:
# Print the schema of `iris` (assigned from spark.readStream in the previous cell)
iris.printSchema()

root
 |-- sepal_lenght: float (nullable = true)
 |-- sepal_width: float (nullable = true)
 |-- petal_lenght: float (nullable = true)
 |-- petal_width: float (nullable = true)
 |-- class: string (nullable = true)



In [27]:
# Start a streaming query that writes `iris` as CSV under stream_iris/path,
# with its checkpoint data under stream_iris/check.
iris_saida = (
    iris.writeStream
    .format("csv")
    .option("checkpointLocation", "/user/marcos/stream_iris/check")
    .option("path", "/user/marcos/stream_iris/path")
    .start()
)

In [28]:
# 5. Check the output in HDFS and understand how the data was saved
# using the streaming query's own inspection attributes
iris_saida.lastProgress

{'id': '58f55def-2f72-4f44-be8c-7c7c526b8905',
 'runId': '0ce02212-736d-46aa-8309-124c1deb3435',
 'name': None,
 'timestamp': '2021-07-06T00:48:25.191Z',
 'batchId': 1,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'getOffset': 16, 'triggerExecution': 16},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[hdfs://namenode:8020/user/marcos/data/exercises-data/iris/*.data]',
   'startOffset': {'logOffset': 0},
   'endOffset': {'logOffset': 0},
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'FileSink[/user/marcos/stream_iris/path]'}}

In [29]:
# Current status of the streaming query (displayed as the cell output)
iris_saida.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}