# JSON as a file format

In [None]:
import json

# Parse the JSON document into plain Python objects (dict / list / scalars).
# JSON text is UTF-8 by specification, so pin the encoding instead of relying
# on the platform-dependent default that open() would otherwise use.
with open("germany.json", "r", encoding="utf-8") as f:
    data = json.load(f)
print(data)

{'name': 'Germany', 'area_km2': 357022, 'population': 83555478, 'cities': [{'name': 'Berlin', 'population': 3748000}, {'name': 'Hamburg', 'population': 1841000}, {'name': 'Munich', 'population': 1488200}]}


In [None]:
# List the top-level keys of the parsed JSON object.
data.keys()

dict_keys(['name', 'area_km2', 'population', 'cities'])

In [None]:
# "cities" holds a list with one object per city.
data["cities"]

[{'name': 'Berlin', 'population': 3748000},
 {'name': 'Hamburg', 'population': 1841000},
 {'name': 'Munich', 'population': 1488200}]

In [None]:
# Lists are indexed by position: take the first city object.
data["cities"][0]

{'name': 'Berlin', 'population': 3748000}

In [None]:
# Chain key and index lookups to reach a nested value.
data["cities"][0]["name"]

'Berlin'

# JSON as a string-typed column

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally; "local[*]" asks for as
# many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Load the CSV, taking column names from its first line. The `price` column
# carries a JSON document as text.
df = (
    spark.read
    .option("header", "true")
    .csv("products-json.csv")
)
df.show(truncate=False)

+----------+--------------------------------+
|name      |price                           |
+----------+--------------------------------+
|Laptop    |{"price_netto": 3000, "tax": 23}|
|Smartphone|{"price_netto": 1500, "tax": 23}|
|Headphones|{"price_netto": 250, "tax": 23} |
+----------+--------------------------------+



In [None]:
# No schema was supplied when reading the CSV, so `price` is still a plain
# string at this point.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- price: string (nullable = true)



In [None]:
from pyspark.sql.functions import from_json, col

# DDL-style schema describing the fields expected inside the JSON text.
schema_str = "price_netto integer, tax integer"

# Parse the JSON text in `price` into a typed struct column; the original
# string column is kept next to it.
df = df.withColumn(
    "price_json",
    from_json(col("price"), schema_str),
)
df.show()

+----------+--------------------+----------+
|      name|               price|price_json|
+----------+--------------------+----------+
|    Laptop|{"price_netto": 3...|{3000, 23}|
|Smartphone|{"price_netto": 1...|{1500, 23}|
|Headphones|{"price_netto": 2...| {250, 23}|
+----------+--------------------+----------+



In [None]:
# `price_json` is now a struct with typed fields; the raw `price` string is
# still present alongside it.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- price: string (nullable = true)
 |-- price_json: struct (nullable = true)
 |    |-- price_netto: integer (nullable = true)
 |    |-- tax: integer (nullable = true)



In [None]:
# Promote the struct fields to top-level columns; `price` and `price_json`
# are not selected and therefore dropped.
df = df.select(
    "name",
    col("price_json").getField("price_netto").alias("price_netto"),
    col("price_json").getField("tax").alias("tax"),
)

df.show()

+----------+-----------+---+
|      name|price_netto|tax|
+----------+-----------+---+
|    Laptop|       3000| 23|
|Smartphone|       1500| 23|
|Headphones|        250| 23|
+----------+-----------+---+



In [None]:
from pyspark.sql.functions import to_json, struct

# Pack every column of the row into one struct and serialise that struct as a
# JSON string.
df = df.withColumn(
    "all_as_json",
    to_json(struct("*")),
)
df.show(truncate=False)

+----------+-----------+---+-------------------------------------------------+
|name      |price_netto|tax|all_as_json                                      |
+----------+-----------+---+-------------------------------------------------+
|Laptop    |3000       |23 |{"name":"Laptop","price_netto":3000,"tax":23}    |
|Smartphone|1500       |23 |{"name":"Smartphone","price_netto":1500,"tax":23}|
|Headphones|250        |23 |{"name":"Headphones","price_netto":250,"tax":23} |
+----------+-----------+---+-------------------------------------------------+



In [None]:
# `all_as_json` is a plain string column, not a struct.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- price_netto: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- all_as_json: string (nullable = true)



# Reading and processing JSON files

In [None]:
# Disabled on purpose: the same read without the multiline option.
# NOTE(review): Spark's JSON reader expects one JSON object per line by
# default, so this presumably fails to parse a document that spans several
# lines — confirm, then either delete this cell or show the failure explicitly.
# df = spark.read.json("/content/germany.json")
# df.show(truncate=False)

In [1]:
# Read the file as one JSON document. Spark's JSON reader defaults to JSON Lines
# (one object per line), so a document spanning several lines needs `multiline`.
# Requires the `spark` session created earlier in the notebook — run the cells
# top to bottom.
# The path is relative, like the other reads in this notebook, rather than the
# Colab-only "/content/..." absolute location.
df = spark.read.option("multiline", True).json("germany.json")
df.show(truncate=False)

NameError: name 'spark' is not defined

In [None]:
# Show the schema Spark inferred from the JSON file.
df.printSchema()

root
 |-- area_km2: long (nullable = true)
 |-- cities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- population: long (nullable = true)
 |-- name: string (nullable = true)
 |-- population: long (nullable = true)



In [None]:
from pyspark.sql.functions import explode

# Produce one row per element of the `cities` array; the element lands in a
# new struct column `city`, and the original array column is kept.
df_exploded = df.withColumn(
    "city",
    explode("cities"),
)

df_exploded.show(truncate=False)

+--------+----------------------------------------------------------+-------+----------+------------------+
|area_km2|cities                                                    |name   |population|city              |
+--------+----------------------------------------------------------+-------+----------+------------------+
|357022  |[{Berlin, 3748000}, {Hamburg, 1841000}, {Munich, 1488200}]|Germany|83555478  |{Berlin, 3748000} |
|357022  |[{Berlin, 3748000}, {Hamburg, 1841000}, {Munich, 1488200}]|Germany|83555478  |{Hamburg, 1841000}|
|357022  |[{Berlin, 3748000}, {Hamburg, 1841000}, {Munich, 1488200}]|Germany|83555478  |{Munich, 1488200} |
+--------+----------------------------------------------------------+-------+----------+------------------+



In [None]:
# "city.*" expands the struct into its fields. Those fields are also called
# `name` and `population`, so the result has duplicate column names.
df_exploded.select(
    "name",
    "area_km2",
    "population",
    "city.*",
).show(truncate=False)


+-------+--------+----------+-------+----------+
|name   |area_km2|population|name   |population|
+-------+--------+----------+-------+----------+
|Germany|357022  |83555478  |Berlin |3748000   |
|Germany|357022  |83555478  |Hamburg|1841000   |
|Germany|357022  |83555478  |Munich |1488200   |
+-------+--------+----------+-------+----------+



In [None]:
from pyspark.sql.functions import col

# Explode the array and give the city fields their own aliases so they do not
# collide with the country-level `name` / `population` columns.
df_exploded = (
    df
    .withColumn("city", explode(col("cities")))
    .select(
        "name",
        "area_km2",
        "population",
        col("city.name").alias("city_name"),
        col("city.population").alias("city_population"),
    )
)

df_exploded.show()

+-------+--------+----------+---------+---------------+
|   name|area_km2|population|city_name|city_population|
+-------+--------+----------+---------+---------------+
|Germany|  357022|  83555478|   Berlin|        3748000|
|Germany|  357022|  83555478|  Hamburg|        1841000|
|Germany|  357022|  83555478|   Munich|        1488200|
+-------+--------+----------+---------+---------------+



In [None]:
# Confirm the result is flat: no array or struct columns remain.
df_exploded.printSchema()

root
 |-- name: string (nullable = true)
 |-- area_km2: long (nullable = true)
 |-- population: long (nullable = true)
 |-- city_name: string (nullable = true)
 |-- city_population: long (nullable = true)



In [None]:
# Explicit DDL schema for germany.json, used instead of schema inference.
# NOTE(review): the per-city `population` is declared as string here, whereas
# the country-level `population` is long — confirm this is intentional.
schema = (
    "area_km2 long, name string, population long, "
    "cities array<struct<name string, population string>>"
)

In [None]:
# Read the same document again, this time applying the explicit `schema`
# instead of letting Spark infer the types.
# The path is relative, like the other reads in this notebook, rather than the
# Colab-only "/content/..." absolute location.
df = (
    spark.read
    .option("multiline", True)
    .schema(schema)
    .json("germany.json")
)
df.show(truncate=False)

+--------+-------+----------+----------------------------------------------------------+
|area_km2|name   |population|cities                                                    |
+--------+-------+----------+----------------------------------------------------------+
|357022  |Germany|83555478  |[{Berlin, 3748000}, {Hamburg, 1841000}, {Munich, 1488200}]|
+--------+-------+----------+----------------------------------------------------------+



In [None]:
# Verify that the explicit schema (column order and types) was applied.
df.printSchema()

root
 |-- area_km2: long (nullable = true)
 |-- name: string (nullable = true)
 |-- population: long (nullable = true)
 |-- cities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- population: string (nullable = true)



In [None]:
# Load a file whose records nest the country data under `country_details`.
df_countries = (
    spark.read
    .option("multiLine", "true")
    .json("countries.json")
)
df_countries.printSchema()
df_countries.show(truncate=False)


root
 |-- country_details: struct (nullable = true)
 |    |-- area_km2: long (nullable = true)
 |    |-- cities: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- population: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- population: long (nullable = true)
 |-- country_name: string (nullable = true)

+---------------------------------------------------------------------------------------+------------+
|country_details                                                                        |country_name|
+---------------------------------------------------------------------------------------+------------+
|{357022, [{Berlin, 3748000}, {Hamburg, 1841000}, {Munich, 1488200}], Germany, 83555478}|Germany     |
|{551695, [{Paris, 2148000}, {Lyon, 522969}, {Marseille, 870321}], France, 67508400}    |France      |
+---------------------------------------------------------

In [None]:
# Dotted paths reach into the struct; each selected field becomes a top-level
# column named after the leaf field.
(
    df_countries
    .select(
        "country_name",
        "country_details.area_km2",
        "country_details.population",
        "country_details.cities",
    )
    .show(truncate=False)
)

+------------+--------+----------+----------------------------------------------------------+
|country_name|area_km2|population|cities                                                    |
+------------+--------+----------+----------------------------------------------------------+
|Germany     |357022  |83555478  |[{Berlin, 3748000}, {Hamburg, 1841000}, {Munich, 1488200}]|
|France      |551695  |67508400  |[{Paris, 2148000}, {Lyon, 522969}, {Marseille, 870321}]   |
+------------+--------+----------+----------------------------------------------------------+



In [2]:
# Explode the nested `cities` array into one row per city, then flatten the
# country-level and city-level fields into plain columns.
# Relies on `df_countries`, `explode` and `col` from earlier cells.
(
    df_countries
    .withColumn("city", explode(col("country_details.cities")))
    .select(
        "country_name",
        "country_details.area_km2",
        "country_details.population",
        col("city.name").alias("city_name"),
        col("city.population").alias("city_population"),
    )
    .show(truncate=False)
)

NameError: name 'df_countries' is not defined

In [None]:
# `pd` is not referenced in this cell; toPandas() needs pandas to be installed
# but does not use this name.
import pandas as pd

# Collect the Spark DataFrame to the driver as a pandas DataFrame and write it
# out as CSV without the index column.
# NOTE(review): if `df` is still the frame read with the explicit schema,
# `cities` is an array of structs and will not serialise cleanly to CSV —
# confirm whether the flat `df_exploded` was meant here.
df_pandas = df.toPandas()
df_pandas.to_csv("delme_final_output.csv", index=False)
