In [1]:
# https://datanoon.com/blog/loading_data_rest_api_to_spark/

In [2]:
import requests
import json
import time
import pandas as pd

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext

# Create a local StreamingContext with a batch interval of 1 second.
conf = SparkConf().setAppName("app1").setMaster("local")
# getOrCreate() reuses an already-running SparkContext instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is executed again
# in the same kernel. If a context already exists, `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)
ssc = StreamingContext(sc, 1)  # second argument is the batch duration in seconds
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

In [4]:
def get_weather(city="barcelona", api_key=None, timeout=10):
    """Request the current weather for a city from the OpenWeatherMap API.

    Parameters
    ----------
    city : str
        Value sent as the ``q`` query parameter. Defaults to "barcelona",
        which is what the original zero-argument call requested.
    api_key : str or None
        Value sent as the ``appid`` query parameter. When None, the
        ``OPENWEATHER_API_KEY`` environment variable is used if it is set.
    timeout : float
        Seconds to wait for the server before ``requests`` gives up; without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    requests.Response
        The raw response object; callers read ``.text`` for the JSON payload.
        The HTTP status is not checked here.
    """
    import os  # local import: keeps this cell independent of the import cell

    if api_key is None:
        # NOTE(review): the literal below is the key that was previously
        # embedded in the URL. It is kept only so existing calls keep working;
        # credentials should not live in a notebook, so set
        # OPENWEATHER_API_KEY and remove (and rotate) this fallback.
        api_key = os.environ.get(
            "OPENWEATHER_API_KEY", "24b234cb23d2bddfc71ee6db0e1b1d6c"
        )

    url = "https://api.openweathermap.org/data/2.5/weather"
    # Let requests build and URL-encode the query string.
    return requests.get(url, params={"q": city, "appid": api_key}, timeout=timeout)

In [5]:
# Create the queue through which RDDs can be pushed to
# a QueueInputDStream
num_measurements = 6
interval = 10 # interval between calls to API in seconds

calls = []  # NOTE(review): not used in this cell; kept in case a later cell reads it
rddQueue = []
for i in range(num_measurements):
    # Wait only *between* calls: sleeping before the very first request
    # just delays the cell by one extra interval.
    if i > 0:
        time.sleep(interval)
    weather = get_weather()
    # Each API response body becomes a single-element RDD in the queue.
    rddQueue.append(ssc.sparkContext.parallelize([weather.text]))

In [6]:
# https://stackoverflow.com/questions/35245648/how-to-convert-spark-streaming-data-into-spark-dataframe/50894876#50894876
def process_stream(record, spark, queries):
    """Turn one non-empty batch into a DataFrame, display it and keep it.

    Parameters
    ----------
    record
        Object exposing ``isEmpty()``; passed to ``spark.read.json`` when it
        is not empty.
    spark
        Object whose ``read.json`` builds the DataFrame from ``record``.
    queries : list
        Accumulator; the DataFrame built for this batch is appended to it.

    Returns
    -------
    None
        Empty batches are skipped without touching ``queries``.
    """
    if record.isEmpty():
        return

    frame = spark.read.json(record)
    frame.show()
    queries.append(frame)

In [7]:
# https://stackoverflow.com/questions/36421619/whats-the-meaning-of-dstream-foreachrdd-function

# Create a DStream ("discretized stream") called inputStream, an abstraction that
# breaks a continuous stream of data into small chunks. Here it is fed from the
# pre-built list of RDDs in rddQueue.
inputStream = ssc.queueStream(rddQueue)
queries = []
# Save output to text files using the 'streaming_weather' prefix
inputStream.saveAsTextFiles('streaming_weather')
# Show output on display and collect each batch's DataFrame in `queries`
inputStream.foreachRDD(lambda rdd: process_stream(rdd, spark, queries))

ssc.start()
try:
    # Wait num_measurements seconds (the context was created with a 1-second
    # batch interval). Unlike time.sleep(), this re-raises an error reported
    # by a streaming batch instead of silently ignoring it.
    ssc.awaitTerminationOrTimeout(num_measurements)
finally:
    # Always shut the streaming context down, even if the wait above raised.
    # NOTE(review): stopSparkContext=True also stops `sc`; DataFrames stored in
    # `queries` are presumably unusable afterwards -- confirm that no later
    # cell evaluates them.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

+--------+------+---+-------------+----------+-------+--------------------+---------+--------------------+--------+----------+--------------------+----------+
|    base|clouds|cod|        coord|        dt|     id|                main|     name|                 sys|timezone|visibility|             weather|      wind|
+--------+------+---+-------------+----------+-------+--------------------+---------+--------------------+--------+----------+--------------------+----------+
|stations|  [20]|200|[41.39, 2.16]|1607292088|3128760|[277.39, 61, 1005...|Barcelona|[ES, 6398, 160723...|    3600|     10000|[[few clouds, 02n...|[340, 3.1]|
+--------+------+---+-------------+----------+-------+--------------------+---------+--------------------+--------+----------+--------------------+----------+

+--------+------+---+-------------+----------+-------+--------------------+---------+--------------------+--------+----------+--------------------+----------+
|    base|clouds|cod|        coord|        dt