### System Config:

In [None]:
#initial config work

import os
import sys
        
# Locate the Spark installation. A missing variable still raises KeyError
# (as a plain os.environ lookup would), but with a message that says what to do.
if 'SPARK_HOME' not in os.environ:
    raise KeyError("SPARK_HOME is not set; point it at the Spark installation directory")
SPARK_HOME = os.environ['SPARK_HOME']

import fnmatch  # only needed here, to discover the bundled py4j archive

# Make the PySpark sources that ship with Spark importable.
# Every entry is inserted at index 0, so the entry inserted last is searched first.
_spark_python_lib = os.path.join(SPARK_HOME, "python", "lib")
sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
sys.path.insert(0, _spark_python_lib)
sys.path.insert(0, os.path.join(_spark_python_lib, "pyspark.zip"))

# The py4j archive name carries a version number, so look for whatever archive
# is actually present instead of assuming one specific version. When nothing
# matches (or the lib directory is missing) fall back to the historical name so
# the resulting sys.path is the same as before.
_py4j_archives = []
if os.path.isdir(_spark_python_lib):
    _py4j_archives = sorted(fnmatch.filter(os.listdir(_spark_python_lib), "py4j-*-src.zip"))
_py4j_archive = _py4j_archives[-1] if _py4j_archives else "py4j-0.10.7-src.zip"
sys.path.insert(0, os.path.join(_spark_python_lib, _py4j_archive))

### Spark Session Config:

In [None]:
#create spark session
from pyspark.sql import SparkSession

# Obtain the Spark session for this scratch notebook: local master, fixed
# application name, and the two resource settings below. getOrCreate() hands
# back an existing session when there is one instead of building a new one.
spark = (
    SparkSession.builder
    .appName("complex-type-scratch")
    .master("local")
    .config("spark.cores.max", "2")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

### Read a custom JSON object:

In [None]:

# Build a one-row DataFrame (spark.range(1)) and project a single string
# column named `categoryData`. The column value is a SQL string literal
# (single-quoted inside the selectExpr text) holding a nested JSON document:
# a top-level "categories" array with 2 categories, each with 2 sections,
# each section with 3 articles; every level carries an "id" and a "name".
# Because the JSON sits inside a single-quoted SQL literal, the payload must
# not contain single quotes.
categoryData = (spark.range(1)
                     .selectExpr("""
                                     '
                                         {"categories" :

                                            [{"id": 1, 
                                            "name" : "cat_1", 
                                            "sections":[
                                                {
                                                "id": 1,
                                                "name": "sec_1",
                                                "articles":[{
                                                                "id":11,
                                                                "name": "art_11"
                                                            },
                                                            {
                                                                "id":12,
                                                                "name": "art_12"
                                                            },
                                                            {
                                                                "id":13,
                                                                "name": "art_13"
                                                            }]
                                                },
                                                {
                                                "id": 2,
                                                "name": "sec_2",
                                                "articles":[{
                                                                "id":21,
                                                                "name": "art_21"
                                                            },
                                                            {
                                                                "id":22,
                                                                "name": "art_22"
                                                            },
                                                            {
                                                                "id":23,
                                                                "name": "art_23"
                                                            }]
                                                }]
                                                },
                                            {"id": 2,
                                            "name" : "cat_2", 
                                            "sections":[
                                                {
                                                "id": 3,
                                                "name": "sec_3",
                                                "articles":[{
                                                                "id":31,
                                                                "name": "art_31"
                                                            },
                                                            {
                                                                "id":32,
                                                                "name": "art_32"
                                                            },
                                                            {
                                                                "id":33,
                                                                "name": "art_33"
                                                            }]
                                                },
                                                {
                                                "id": 4,
                                                "name": "sec_4",
                                                "articles":[{
                                                                "id":41,
                                                                "name": "art_41"
                                                            },
                                                            {
                                                                "id":42,
                                                                "name": "art_42"
                                                            },
                                                            {
                                                                "id":43,
                                                                "name": "art_43"
                                                            }]
                                                }]
                                                }
                                                ]
                                                }'
                                                    as categoryData
                                            
                                         """))



In [None]:
from pyspark.sql.functions import get_json_object, json_tuple, col

# Keep the raw JSON column and append three string columns, each extracted
# from it with a JSONPath expression: article ids, section ids, category ids
# (appended in that order).
flatdata_df = categoryData.select(
    '*',
    get_json_object(
        col('categoryData'), '$.categories[*].sections[*].articles[*].id'
    ).alias('article_id'),
    get_json_object(
        col('categoryData'), '$.categories[*].sections[*].id'
    ).alias('section_id'),
    get_json_object(
        col('categoryData'), '$.categories[*].id'
    ).alias('category_id'),
)



In [None]:
from pyspark.sql.functions import explode
# Display the three extracted id columns. The values are JSON array strings
# and the article list alone is longer than show()'s default 20-character
# cell width, so truncation is disabled to print the full values.
flatdata_df.select('article_id', 'section_id', 'category_id').show(truncate=False)


#### Read JSON File:

In [None]:
# Load every JSON file under ../datasets/cms into one DataFrame. The
# multiLine option is enabled so a JSON record may span several lines.
cms_df = (
    spark.read
    .option('multiLine', True)
    .json('./../datasets/cms/*.json')
)

# Print the schema Spark inferred for the loaded files.
cms_df.printSchema()

In [None]:
# Print a preview of the DataFrame loaded from the CMS JSON files.
cms_df.show()

In [None]:
# from pyspark.sql.functions import to_json
# category_df = category_df.withColumn('category_json', to_json(col('categories')))
# category_df.printSchema()

#### Transform to CSV format:

In [None]:
from pyspark.sql.functions import get_json_object, json_tuple, col, expr, explode

# Flatten the nested categories -> sections -> articles structure one level
# at a time. Each step explodes one array into rows and lifts the parent's
# id/name out of its struct into plain columns.

# One row per category struct.
category_breakdown_df = cms_df.select(explode('categories').alias('category'))

# One row per section; columns: section, category_id, category_name.
section_breakdown_df = category_breakdown_df.select(
    explode(col('category.sections')).alias('section'),
    col('category.id').alias('category_id'),
    col('category.name').alias('category_name'),
)

# One row per article; columns: category_id, category_name, article,
# section_id, section_name.
article_breakdown_df = section_breakdown_df.select(
    'category_id',
    'category_name',
    explode(col('section.articles')).alias('article'),
    col('section.id').alias('section_id'),
    col('section.name').alias('section_name'),
)

# Fully flat result: only scalar id/name columns for category, section and
# article remain.
flat_df = article_breakdown_df.select(
    'category_id',
    'category_name',
    'section_id',
    'section_name',
    col('article.id').alias('article_id'),
    col('article.name').alias('article_name'),
)

flat_df.printSchema()

In [None]:
# Count the rows of the flattened DataFrame (one row per exploded article).
# As the last expression of the cell, the result is what the notebook displays.
flat_df.count()

### Session Stop:

In [None]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()