# Importing Spark Libraries

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Let's read the multiline JSON; explode() is used below to flatten the `projects` array
from pyspark.sql.functions import explode

# Obtain the notebook's Spark session via the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()
# Bare expression as the cell's last line so the notebook displays the session object.
spark

# Import 2015 and Older Data

In [2]:
import os

# Root folder holding the 2015-and-older Kickstarter dumps (multiline JSON files
# whose top-level `projects` field is an array).
OLDER_DATA_DIR = r"D:\Data\Kickstarter Data\2015_and_older"

# Build one DataFrame (`data`) with one row per project across every dump.
# `data` is reset first so that re-running the cell never silently reuses a
# DataFrame left over from a previous execution.
data = None

for dirname, subdirs, filenames in os.walk(OLDER_DATA_DIR):
    # os.walk yields entries in arbitrary order; sorting makes the traversal
    # (and therefore the "first file" and the union order) deterministic.
    subdirs.sort()
    for filename in sorted(filenames):
        # Skip anything that is not a JSON dump (e.g. OS-generated metadata files).
        if not filename.lower().endswith('.json'):
            continue

        # Each file is a single multiline JSON document; explode() turns the
        # `projects` array into one row per project.
        df = (
            spark.read.json(os.path.join(dirname, filename), multiLine=True)
            .withColumn('projects', explode('projects'))
        )

        if data is None:
            data = df
            print(f'First file: {filename} processed')
        else:
            # allowMissingColumns=True tolerates schema drift between dumps:
            # columns absent from one side are filled with nulls.
            data = data.unionByName(df, allowMissingColumns=True)
            print(f'Union Succesful for {filename}')

# Kept for compatibility with any later cell that inspects this flag.
first_file_processed = data is not None

# Fail loudly here instead of with a NameError/AttributeError in a later cell.
if not first_file_processed:
    raise FileNotFoundError(f'No .json files found under {OLDER_DATA_DIR}')

First file: Kickstarter_2014-04-22.json processed
Union Succesful for Kickstarter_2014-08-13.json
Union Succesful for Kickstarter_2014-10-17.json
Union Succesful for Kickstarter_2014-12-02.json
Union Succesful for Kickstarter_2015-04-02.json


In [3]:
# Print the schema of the combined older-data DataFrame to inspect the nested `projects` struct.
data.printSchema()

root
 |-- projects: struct (nullable = true)
 |    |-- backers_count: long (nullable = true)
 |    |-- blurb: string (nullable = true)
 |    |-- category: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- parent_id: long (nullable = true)
 |    |    |-- position: long (nullable = true)
 |    |    |-- slug: string (nullable = true)
 |    |    |-- urls: struct (nullable = true)
 |    |    |    |-- web: struct (nullable = true)
 |    |    |    |    |-- discover: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- created_at: long (nullable = true)
 |    |-- creator: struct (nullable = true)
 |    |    |-- avatar: struct (nullable = true)
 |    |    |    |-- medium: string (nullable = true)
 |    |    |    |-- small: string (nullable = true)
 |    |    |    |-- thumb: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |

In [4]:
# Sanity check: show the nested projects.creator.name field for the first 5 rows.
data.select('projects.creator.name').show(5)

+--------------------+
|                name|
+--------------------+
|      Maridee Slater|
|April Yvette Thom...|
|        Lucile Scott|
|  Three Day Hangover|
|      Throes Theater|
+--------------------+
only showing top 5 rows



# Code to import 2015/10/22

In [5]:
# Load the 2015-10-22 dump. Unlike the older files it is read with
# multiLine=False, and the project array lives under the nested field
# `data.projects`.
data1 = (
    spark.read.json(
        r"D:\Data\Kickstarter Data\2015_new\Kickstarter_2015-10-22T09_57_48_703Z.json",
        multiLine=False,
    )
    # Explode the nested array straight into a top-level `projects` column
    # (one row per project) instead of going through a temporary column whose
    # literal name contains a dot.
    .withColumn('projects', explode('data.projects'))
    # Drop the original wrapper struct and the other top-level columns so that
    # only `projects` remains from them, matching the layout of the older data.
    .drop('data',
          'created_at',
          'id',
          'robot_id',
          'run_id',
          'table_id')
)

In [6]:
# Sanity check: the same nested path used on the older data should resolve here too.
data1.select('projects.creator.name').show(5)

+----------------+
|            name|
+----------------+
|Jamie Schumacher|
|    LittleBerlin|
|    Jerry Snyder|
| Blake McConnell|
|       Owen Cook|
+----------------+
only showing top 5 rows



In [7]:
# Load the 2015-11-01 dump with no extra reader options. The top-level `data`
# column is renamed to `projects` as-is (no explode), and the remaining listed
# top-level columns are discarded.
data2 = (
    spark.read.json(r"D:\Data\Kickstarter Data\2015_new\Kickstarter_2015-11-01T14_09_04_557Z.json")
    .drop('created_at', 'id', 'robot_id', 'run_id', 'table_id')
    .withColumnRenamed('data', 'projects')
)



In [8]:
# Sanity check: show the nested projects.creator.name field for the first 5 rows.
data2.select('projects.creator.name').show(5)

+-------------------+
|               name|
+-------------------+
| Reedsburg ArtsLink|
|      Natasha Marin|
|Luci Riffel (LUCI∆)|
|        Brian Quinn|
|            Rebecca|
+-------------------+
only showing top 5 rows



In [9]:
# Attempt to read the 2015-08-21 dump with an explicit encoding and line separator.
# NOTE(review): the schema printed for this DataFrame contains only
# `_corrupt_record`, i.e. nothing was parsed as JSON with these options.
# The older dumps in this notebook are read with multiLine=True; possibly this
# file needs the same treatment — TODO confirm against the file's actual layout.
data_bad = spark.read.json(r"D:\Data\Kickstarter Data\Kickstarter_2015-08-21.json", encoding = 'ISO-8859-1', lineSep='\n')

In [10]:
# Inspect the schema Spark inferred for the 2015-08-21 file.
data_bad.printSchema()

root
 |-- _corrupt_record: string (nullable = true)

