In [1]:
# https://datanoon.com/blog/loading_data_rest_api_to_spark/

# Makes a call to a REST API and converts the resulting JSON to a Spark DF
# All of the save commands in this file save to a folder rather than straight to a single .csv file
# Can save components of the JSON, but not the whole thing due to complex data types

In [2]:
import json
import os
import shutil

import requests

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession

# Build a single configuration and actually hand it to Spark. Previously the
# SparkConf was created but never used, so its app name was silently ignored
# in favour of the positional arguments given to SparkContext.
conf = SparkConf().setAppName("CurrentWeather").setMaster("local")

# getOrCreate() reuses an already-running session/context, so re-running this
# cell in a live kernel no longer fails with "Cannot run multiple SparkContexts".
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Keep the `sc` name: later cells call sc.parallelize(...) and sc.stop().
sc = spark.sparkContext

In [4]:
# function to make the call to the REST API service
def get_weather(city="barcelona", api_key=None, timeout=10):
    """Request the current weather for ``city`` from the OpenWeatherMap REST API.

    Parameters
    ----------
    city : str, optional
        Value sent as the ``q`` query parameter. Defaults to ``"barcelona"``,
        the city this notebook originally queried.
    api_key : str, optional
        Value sent as the ``appid`` query parameter. When omitted, it is read
        from the ``OPENWEATHER_API_KEY`` environment variable so that no
        credential has to live in the notebook source.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``; without one the
        call can block indefinitely on an unresponsive server.

    Returns
    -------
    requests.Response
        The HTTP response; use ``.text`` or ``.json()`` to read the payload.

    Raises
    ------
    RuntimeError
        If no API key was passed and ``OPENWEATHER_API_KEY`` is not set.
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    """
    if api_key is None:
        api_key = os.environ.get("OPENWEATHER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No OpenWeatherMap API key: pass api_key=... or set the "
            "OPENWEATHER_API_KEY environment variable."
        )

    # Passing `params` lets requests URL-encode the values (e.g. "London,uk"
    # or city names containing spaces) instead of splicing them into the URL.
    response = requests.get(
        "https://api.openweathermap.org/data/2.5/weather",
        params={"q": city, "appid": api_key},
        timeout=timeout,
    )
    # Fail here rather than letting an error payload reach Spark and surface
    # later as a confusing missing-column error.
    response.raise_for_status()
    return response

In [5]:
# Call the API once and keep the response object; later cells reuse `weather`
# (both its raw `.text` and its parsed `.json()`).
weather = get_weather()
# Print the raw response body so the structure of the JSON can be inspected.
print(weather.text)

{"coord":{"lon":2.16,"lat":41.39},"weather":[{"id":801,"main":"Clouds","description":"few clouds","icon":"02n"}],"base":"stations","main":{"temp":281.31,"feels_like":277.32,"temp_min":279.82,"temp_max":282.59,"pressure":1005,"humidity":61},"visibility":10000,"wind":{"speed":3.1,"deg":340},"clouds":{"all":20},"dt":1607292749,"sys":{"type":1,"id":6398,"country":"ES","sunrise":1607238205,"sunset":1607271720},"timezone":3600,"id":3128760,"name":"Barcelona","cod":200}


In [6]:
# Convert the JSON to an RDD, and then to a DataFrame
# The response text is wrapped in a one-element list, so the RDD holds a single JSON string.
json_rdd = sc.parallelize([weather.text])
# Let Spark read that string as JSON; as the output below shows, nested objects
# such as `main`, `wind` and `coord` end up as single bracketed (nested) columns.
json_df = spark.read.json(json_rdd)
json_df.show()

+--------+------+---+-------------+----------+-------+--------------------+---------+--------------------+--------+----------+--------------------+----------+
|    base|clouds|cod|        coord|        dt|     id|                main|     name|                 sys|timezone|visibility|             weather|      wind|
+--------+------+---+-------------+----------+-------+--------------------+---------+--------------------+--------+----------+--------------------+----------+
|stations|  [20]|200|[41.39, 2.16]|1607292749|3128760|[277.32, 61, 1005...|Barcelona|[ES, 6398, 160723...|    3600|     10000|[[few clouds, 02n...|[340, 3.1]|
+--------+------+---+-------------+----------+-------+--------------------+---------+--------------------+--------+----------+--------------------+----------+



In [7]:
# https://api.openweathermap.org/data/2.5/weather?q=London,uk&APPID=<YOUR_API_KEY>
# https://stackoverflow.com/questions/50350496/apache-spark-cant-save-grouped-data-as-csv

# columns = [base, clouds, cod, coord, dt, id, main, name, sys, timezone, visibility, weather, wind]

# Save select results of API call to .csv
# Only leaf fields are selected, because the CSV writer rejects struct columns
# (see the error quoted at the bottom of this cell).
# mode("overwrite") makes the cell re-runnable: with the default mode the write
# fails as soon as the output folder from a previous run exists.
(
    json_df.select(
        'main.temp',
        'main.temp_min',
        'main.temp_max',
        'main.pressure',
        'main.humidity',
        'wind.speed',
        'wind.deg',
        'clouds.all',
    )
    .write.mode("overwrite")
    .option("header", "true")
    .csv("current_weather_csv")
)

# Save all results of API call to text file
# RDD.saveAsTextFile has no overwrite mode, so the folder left by a previous run
# is removed first. NOTE(review): this assumes the relative path resolves to the
# local working directory (master is "local") -- confirm if run against a cluster.
shutil.rmtree('current_weather_text', ignore_errors=True)
json_df.rdd.saveAsTextFile('current_weather_text')

# Attempt to save all results of API call to CSV file
# Error: AnalysisException: 'CSV data source does not support struct<all:bigint> data type.;'
# json_df.write.option("header", "true").csv("TEST.csv")

In [8]:
# https://datanoon.com/blog/loading_data_rest_api_to_spark/

# Select only parts of the JSON response and load each into its own DataFrame
# This method allows us to see and save the nested components of the JSON, though
# only one struct at a time

def _section_to_df(payload, key):
    """Return a Spark DataFrame built from one top-level entry of ``payload``.

    ``payload`` is the parsed API response (a dict) and ``key`` names the entry
    to extract, e.g. ``'main'``. The entry is serialised back to a JSON string
    and read by Spark, exactly as the cells above do for the whole response.
    """
    section_json = json.dumps(payload[key])
    return spark.read.json(sc.parallelize([section_json]))


# Parse the response body once instead of once per section.
weather_payload = weather.json()

weather_df = _section_to_df(weather_payload, 'main')
weather_df.show()
# mode("overwrite") keeps this cell re-runnable when "rel_data_csv" already exists.
weather_df.write.mode("overwrite").option("header", "true").csv("rel_data_csv")

# 'weather' is a JSON array in the response, so it is dumped and read as an array.
weather_df1 = _section_to_df(weather_payload, 'weather')
weather_df1.show()

weather_df2 = _section_to_df(weather_payload, 'wind')
weather_df2.show()

+----------+--------+--------+------+--------+--------+
|feels_like|humidity|pressure|  temp|temp_max|temp_min|
+----------+--------+--------+------+--------+--------+
|    277.32|      61|    1005|281.31|  282.59|  279.82|
+----------+--------+--------+------+--------+--------+

+-----------+----+---+------+
|description|icon| id|  main|
+-----------+----+---+------+
| few clouds| 02n|801|Clouds|
+-----------+----+---+------+

+---+-----+
|deg|speed|
+---+-----+
|340|  3.1|
+---+-----+



In [9]:
# Stop the SparkContext created at the top of the notebook now that every
# DataFrame has been shown and saved.
sc.stop() 