
##PySpark Datasources


###PySpark Read CSV file into DataFrame


In [0]:
import pyspark
from pyspark.sql import SparkSession

In [0]:
spark=SparkSession.builder \
      .master('local[*]') \
      .appName('SparkByExamples') \
      .getOrCreate()

In [0]:
#https://github.com/spark-examples/pyspark-examples/blob/master/resources/zipcodes.csv 

Filepath : "File uploaded to /FileStore/tables/zipcodes.csv"
df = spark.read.options(delimiter=",").csv("/FileStore/tables/zipcodes.csv", header=True, inferSchema=True)
# or 

df = spark.read.options(header='True', inferSchema='True', delimiter=',').csv('/FileStore/tables/zipcodes.csv')
df.printSchema()

# Read Multiple CSV Files
# df = spark.read.csv("path1,path2,path3")




root
 |-- RecordNumber: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Xaxis: double (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- TaxReturnsFiled: integer (nullable = true)
 |-- EstimatedPopulation: integer (nullable = true)
 |-- TotalWages: integer (nullable = true)
 |-- Notes: string (nullable = true)



#### Reading CSV files with a user-specified custom schema

In [0]:
from pyspark.sql.types import StructType, IntegerType, StringType, DoubleType, BooleanType

schema = StructType() \
      .add("RecordNumber",IntegerType(),True) \
      .add("Zipcode",IntegerType(),True) \
      .add("ZipCodeType",StringType(),True) \
      .add("City",StringType(),True) \
      .add("State",StringType(),True) \
      .add("LocationType",StringType(),True) \
      .add("Lat",DoubleType(),True) \
      .add("Long",DoubleType(),True) \
      .add("Xaxis",IntegerType(),True) \
      .add("Yaxis",DoubleType(),True) \
      .add("Zaxis",DoubleType(),True) \
      .add("WorldRegion",StringType(),True) \
      .add("Country",StringType(),True) \
      .add("LocationText",StringType(),True) \
      .add("Location",StringType(),True) \
      .add("Decommisioned",BooleanType(),True) \
      .add("TaxReturnsFiled",StringType(),True) \
      .add("EstimatedPopulation",IntegerType(),True) \
      .add("TotalWages",IntegerType(),True) \
      .add("Notes",StringType(),True)
      
df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/FileStore/tables/zipcodes.csv")

df_with_schema.printSchema()

root
 |-- RecordNumber: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Xaxis: integer (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- TaxReturnsFiled: string (nullable = true)
 |-- EstimatedPopulation: integer (nullable = true)
 |-- TotalWages: integer (nullable = true)
 |-- Notes: string (nullable = true)



####Saving mode

In [0]:
df_with_schema.write.mode('overwrite').csv('FileStore/tables/modified_zipcoe.csv')


### PySpark Read and Write Parquet File


* **What is Parquet File?**

Apache parquet file is a column storage format available to any project in the hadoop ecosystem

- Advantages: 
  
  - While querying columnar storage, it skips the non-relevant data very quickly, making fatser query execution. As a result aggrgation queries consule less time compared to row-oriented databases

In [0]:
import pyspark
from pyspark.sql import SparkSession

data = [("James ","","Smith","36636","M",3000),
              ("Michael ","Rose","","40288","M",4000),
              ("Robert ","","Williams","42114","M",4000),
              ("Maria ","Anne","Jones","39192","F",4000),
              ("Jen","Mary","Brown","","F",-1)]

columns = ["firstname","middlename","lastname","dob","gender","salary"]

df = spark.createDataFrame(data, columns)
df.write.mode('overwrite').parquet('/FileStore/tables/people.parquet')

In [0]:
parDF1= spark.read.parquet('/FileStore/tables/people.parquet')
parDF1.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [0]:
parDF1.createOrReplaceTempView('parquetTable')
spark.sql('select * from parquetTable').show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [0]:
df.write.partitionBy('gender','salary').mode('overwrite').parquet('/FileStore/tables/people_partitions.parquet')

In [0]:
parDF2=spark.read.parquet('/FileStore/tables/people_partitions.parquet/gender=M/salary=4000')
parDF2.show()

+---------+----------+--------+-----+
|firstname|middlename|lastname|  dob|
+---------+----------+--------+-----+
|  Robert |          |Williams|42114|
| Michael |      Rose|        |40288|
+---------+----------+--------+-----+



In [0]:
spark.sql('create temporary view person2 using parquet options(path \"/FileStore/tables/people_partitions.parquet/gender=F\")')

Out[19]: DataFrame[]

In [0]:
spark.sql('select * from person2').show()

+---------+----------+--------+-----+------+
|firstname|middlename|lastname|  dob|salary|
+---------+----------+--------+-----+------+
|   Maria |      Anne|   Jones|39192|  4000|
|      Jen|      Mary|   Brown|     |    -1|
+---------+----------+--------+-----+------+



###PySpark Read JSON file into DataFrame


In [0]:
# df= spark.read.json("resource/zipcodes.json")

#read multiline json file

# multiline_df = spark.read.option('multiline', 'true').json('resources/multiline-zipcode.json')
# multiline_df.show()

#read multiple files
# df2=spark.read.json(['resources/zipcode2.json', 'resources/zipcode1.json'])
# df2.show()

#Read All JSON files from a directory
# df3=spark.read.json('resources/*.json')
# df3.show()

# Create a table from json File
# spark.sql("create or replace temporary table zipcode3 using json options" + "(path 'resources/zipcodes.json')")
# spark.sql('select * from zipcode3').show()


# PySpark write Parquet File
# df2.write.mode('Overwrite').json('/tmp/spark_output/zipcodes.json')
