In [1]:
import os
import sys
from pyspark.sql import SparkSession

# Windows-only setup: point Spark at the local winutils/Hadoop distribution.
hadoop_home = "C:\\Users\\SkJain\\Downloads\\Compressed\\winutils-master\\hadoop-3.2.2"
hadoop_bin = hadoop_home + "\\bin"

os.environ["HADOOP_HOME"] = hadoop_home

# winutils.exe / hadoop.dll are native binaries that are located through the
# OS PATH; sys.path only affects Python's module search, so PATH must be
# extended as well. The membership check keeps re-runs of this cell idempotent.
path_entries = [entry for entry in os.environ.get("PATH", "").split(os.pathsep) if entry]
if hadoop_bin not in path_entries:
    os.environ["PATH"] = os.pathsep.join([hadoop_bin] + path_entries)

# Retained from the original cell for backward compatibility; guarded so that
# re-running the cell does not add duplicate entries.
if hadoop_bin not in sys.path:
    sys.path.append(hadoop_bin)

In [1]:
# Show the OS account name the kernel is running under.
import getpass
getpass.getuser()

'SkJain'

## Spark Context and Spark Session
- SparkSession is a class in the pyspark.sql package
- it is a wrapper on top of SparkContext
- when we create a SparkSession, it automatically creates a SparkContext if one doesn't already exist
- SparkContext is the driver-side entry point to Spark (it also serves the Spark web UI); it is created when a Spark app is launched through spark-submit, spark-shell or pyspark
- it maintains the context of all jobs that are submitted until it is stopped

In [2]:
# Build (or reuse) a local SparkSession with Hive support.
# "spark.ui.port" = "0" is passed through as-is from the original configuration.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .enableHiveSupport()
    .appName('Pyspark - Part 1')
    .master('local')
    .getOrCreate()
)

In [3]:
# Evaluate the session object so the notebook displays it.
spark

## Read APIs

In [8]:
# spark.read.csv? # we can use this in jupyter based IDEs
# help(spark.read.csv) # this can be used anywhere

In [4]:
# Read the orders CSV with an explicit DDL-string schema and print 20 untruncated rows.
#
# NOTE: the orders file has no header row, so header=True makes Spark consume the
# first line as a header. The column names still look correct only because the
# schema is supplied explicitly -- the first record (order_id 1) is missing below.
filename = './datasets/orders'
(
    spark.read
    .csv(
        filename,
        schema=''' 
        order_id INT,
        order_date STRING,
        order_cust_id INT,
        order_status STRING
        ''',
        header=True,
    )
    .show(n=20, truncate=False)
)

+--------+---------------------+-------------+---------------+
|order_id|order_date           |order_cust_id|order_status   |
+--------+---------------------+-------------+---------------+
|2       |2013-07-25 00:00:00.0|256          |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111        |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827         |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318        |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130         |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530         |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911         |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657         |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648         |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918          |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837         |CLOSED         |
|13      |2013-07-25 00:00:00.0|9149         |PENDING_PAYMENT|
|14      |2013-07-25 00:00:00.0|9842         |PROCESSIN

In [5]:
# Self-describing formats such as JSON need no header-style arguments: every value
# comes with a key, and that key is used as the column name.
# The schema is inferred automatically, although an explicit schema can still be
# supplied when needed.

json_file_path = 'C:/Users/SkJain/Downloads/Compressed/data-master/retail_db_json/orders'
(
    spark.read
    .format('json')
    .load(json_file_path)
    .show(n=20, truncate=False)
)

+-----------------+---------------------+--------+---------------+
|order_customer_id|order_date           |order_id|order_status   |
+-----------------+---------------------+--------+---------------+
|11599            |2013-07-25 00:00:00.0|1       |CLOSED         |
|256              |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111            |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827             |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318            |2013-07-25 00:00:00.0|5       |COMPLETE       |
|7130             |2013-07-25 00:00:00.0|6       |COMPLETE       |
|4530             |2013-07-25 00:00:00.0|7       |COMPLETE       |
|2911             |2013-07-25 00:00:00.0|8       |PROCESSING     |
|5657             |2013-07-25 00:00:00.0|9       |PENDING_PAYMENT|
|5648             |2013-07-25 00:00:00.0|10      |PENDING_PAYMENT|
|918              |2013-07-25 00:00:00.0|11      |PAYMENT_REVIEW |
|1837             |2013-07-25 00:00:00.0|12      |CLOSED      

#### read airlines data
- first read with read.text to understand delimiter, header and all
- then read with read.csv using this information

In [6]:
# Peek at the raw lines first to learn the delimiter and whether a header row exists.
airlines_folder = 'C:/Users/SkJain/Downloads/Compressed/data-master/airlines_all/airlines'
(
    spark.read
    .text(airlines_folder)
    .show(n=5, truncate=False)
)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Year,Month,Dayo

- from above we can infer that
    - header is present
    - comma separated

In [7]:
# Read the whole airlines folder as CSV, using the first line as column names.
(
    spark.read
    .option('header', True)
    .csv(airlines_folder)
    .show(n=5, truncate=False)
)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|10   |14  

In [8]:
# inferSchema lets Spark work out the column types of a dataset, but it has to
# scan the records to do so, which is expensive on large datasets.
# When many files share one schema, infer it from a single file instead.

airlines_single_file = f'{airlines_folder}/part-00000'
airlines_partial_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(airlines_single_file)
)

In [9]:
# Preview the single-file DataFrame without truncating cell values.
airlines_partial_df.show(n=5, truncate=False)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|10   |14  

In [25]:
# Print the column names and types that inferSchema produced for the single file.
airlines_partial_df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [10]:
# The inferred schema is available as a StructType via .schema; it can be passed
# to the schema parameter of the read APIs when reading other files.
airlines_partial_df.schema

StructType(List(StructField(Year,IntegerType,true),StructField(Month,IntegerType,true),StructField(DayofMonth,IntegerType,true),StructField(DayOfWeek,IntegerType,true),StructField(DepTime,StringType,true),StructField(CRSDepTime,IntegerType,true),StructField(ArrTime,StringType,true),StructField(CRSArrTime,IntegerType,true),StructField(UniqueCarrier,StringType,true),StructField(FlightNum,IntegerType,true),StructField(TailNum,StringType,true),StructField(ActualElapsedTime,StringType,true),StructField(CRSElapsedTime,IntegerType,true),StructField(AirTime,StringType,true),StructField(ArrDelay,StringType,true),StructField(DepDelay,StringType,true),StructField(Origin,StringType,true),StructField(Dest,StringType,true),StructField(Distance,StringType,true),StructField(TaxiIn,StringType,true),StructField(TaxiOut,StringType,true),StructField(Cancelled,IntegerType,true),StructField(CancellationCode,StringType,true),StructField(Diverted,IntegerType,true),StructField(CarrierDelay,StringType,true),Str

In [11]:
# Same idea in a single expression: infer from one file and keep only the schema.
airlines_single_file = f'{airlines_folder}/part-00000'
airlines_schema = (
    spark.read
    .csv(airlines_single_file, inferSchema=True, header=True)
    .schema
)

In [28]:
# Display the StructType captured from the single-file read.
airlines_schema

StructType(List(StructField(Year,IntegerType,true),StructField(Month,IntegerType,true),StructField(DayofMonth,IntegerType,true),StructField(DayOfWeek,IntegerType,true),StructField(DepTime,StringType,true),StructField(CRSDepTime,IntegerType,true),StructField(ArrTime,StringType,true),StructField(CRSArrTime,IntegerType,true),StructField(UniqueCarrier,StringType,true),StructField(FlightNum,IntegerType,true),StructField(TailNum,StringType,true),StructField(ActualElapsedTime,StringType,true),StructField(CRSElapsedTime,IntegerType,true),StructField(AirTime,StringType,true),StructField(ArrDelay,StringType,true),StructField(DepDelay,StringType,true),StructField(Origin,StringType,true),StructField(Dest,StringType,true),StructField(Distance,StringType,true),StructField(TaxiIn,StringType,true),StructField(TaxiOut,StringType,true),StructField(Cancelled,IntegerType,true),StructField(CancellationCode,StringType,true),StructField(Diverted,IntegerType,true),StructField(CarrierDelay,StringType,true),Str

In [12]:
# A schema can be passed either as a DDL string or as a StructType.
# The orders read used the string form; here the StructType inferred earlier is reused,
# so the full folder is read without another inference pass.

airlines_full_df = spark.read.csv(
    airlines_folder,
    schema=airlines_schema,
    header=True,
)

In [13]:
# Total number of rows across all files in the airlines folder.
airlines_full_df.count()

1290395

In [14]:
# Number of distinct rows; a value lower than count() means the data holds exact duplicates.
airlines_full_df.distinct().count()

1290323

### creating dataframe from python collection

In [41]:
# Sample employee records as plain Python tuples (one tuple per row).
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states"),
    (2, "Henry", "Ford", 1250.0, "India"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA"),
]

In [42]:
# Build a DataFrame from the in-memory list, naming and typing the columns with a
# DDL-style schema string (column order must match the tuple order in `employees`).
emp_df = spark.createDataFrame(employees,
                               schema = """ employee_id INT,
                                           f_name STRING,
                                           l_name STRING,
                                           salary FLOAT,
                                           country STRING"""
                              )

In [43]:
# Confirm the column names and types declared in the schema string were applied.
emp_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- f_name: string (nullable = true)
 |-- l_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- country: string (nullable = true)



In [50]:
# Preview the full airlines DataFrame without truncating cell values.
airlines_full_df.show(n=5, truncate=False)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|1987|10   |14  

In [52]:
# select() keeps only the listed columns.
(
    airlines_full_df
    .select("FlightNum", "Cancelled")
    .show(n=5, truncate=False)
)

+---------+---------+
|FlightNum|Cancelled|
+---------+---------+
|1451     |0        |
|1451     |0        |
|1451     |0        |
|1451     |0        |
|1451     |0        |
+---------+---------+
only showing top 5 rows



In [54]:
# drop() is the inverse of select(): every column except the listed ones is kept.
(
    airlines_full_df
    .drop("Year", "Month")
    .show(n=5)
)

+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|IsArrDelayed|IsDepDelayed|
+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+------------+
|        14|        3|    741|       730|    912|

In [56]:
from pyspark.sql.functions import *

In [58]:
# Derive a "fullDate" column by joining the three date parts with "-" via concat_ws.
(
    airlines_full_df
    .select("Year", "Month", "DayofMonth")
    .withColumn("fullDate", concat_ws("-", "Year", "Month", "DayofMonth"))
    .show(n=5)
)

+----+-----+----------+----------+
|Year|Month|DayofMonth|  fullDate|
+----+-----+----------+----------+
|1987|   10|        14|1987-10-14|
|1987|   10|        15|1987-10-15|
|1987|   10|        17|1987-10-17|
|1987|   10|        18|1987-10-18|
|1987|   10|        19|1987-10-19|
+----+-----+----------+----------+
only showing top 5 rows



In [59]:
# With concat(), a constant must be wrapped in lit(); a bare "-" string would be
# interpreted as the name of a column called '-'.

(
    airlines_full_df
    .select("Year", "Month", "DayofMonth")
    .withColumn(
        "fullDate",
        concat("Year", lit("-"), "Month", lit("-"), "DayofMonth"),
    )
    .show(n=5)
)

+----+-----+----------+----------+
|Year|Month|DayofMonth|  fullDate|
+----+-----+----------+----------+
|1987|   10|        14|1987-10-14|
|1987|   10|        15|1987-10-15|
|1987|   10|        17|1987-10-17|
|1987|   10|        18|1987-10-18|
|1987|   10|        19|1987-10-19|
+----+-----+----------+----------+
only showing top 5 rows



In [63]:
# selectExpr() accepts SQL-style expressions, so the same derived column can be
# written as a SQL concat(...) with an alias.
(
    airlines_full_df
    .select("Year", "Month", "DayofMonth")
    .selectExpr(
        "Year",
        "Month",
        "DayofMonth",
        "concat(Year, '-', Month, '-', DayOfMonth) as fullDate",
    )
    .show()
)

+----+-----+----------+----------+
|Year|Month|DayofMonth|  fullDate|
+----+-----+----------+----------+
|1987|   10|        14|1987-10-14|
|1987|   10|        15|1987-10-15|
|1987|   10|        17|1987-10-17|
|1987|   10|        18|1987-10-18|
|1987|   10|        19|1987-10-19|
|1987|   10|        21|1987-10-21|
|1987|   10|        22|1987-10-22|
|1987|   10|        23|1987-10-23|
|1987|   10|        24|1987-10-24|
|1987|   10|        25|1987-10-25|
|1987|   10|        26|1987-10-26|
|1987|   10|        28|1987-10-28|
|1987|   10|        29|1987-10-29|
|1987|   10|        31|1987-10-31|
|1987|   10|         1| 1987-10-1|
|1987|   10|         2| 1987-10-2|
|1987|   10|         3| 1987-10-3|
|1987|   10|         4| 1987-10-4|
|1987|   10|         5| 1987-10-5|
|1987|   10|         6| 1987-10-6|
+----+-----+----------+----------+
only showing top 20 rows



### Write APIs
- when we pass options to the write APIs, a misspelled option name will not fail the query; the unrecognized option is simply ignored
- by default the number of files created in the output directory is equal to the number of tasks in the last stage, but we can control the number of files so that we don't get too many small files
- that can be done using **coalesce**, which has to be called on the dataframe before invoking write

In [66]:
# Load the orders CSV with an explicit schema. No header option is passed here,
# so the first line of the file is treated as data.
filename = './datasets/orders'
orders_df = (
    spark.read
    .csv(
        filename,
        schema=''' 
        order_id INT,
        order_date STRING,
        order_cust_id INT,
        order_status STRING
        ''',
    )
)

In [67]:
# Confirm the explicit schema was applied to the orders DataFrame.
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [68]:
# Preview the first rows (long values are truncated by default).
orders_df.show(n=5)

+--------+--------------------+-------------+---------------+
|order_id|          order_date|order_cust_id|   order_status|
+--------+--------------------+-------------+---------------+
|       1|2013-07-25 00:00:...|        11599|         CLOSED|
|       2|2013-07-25 00:00:...|          256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|        12111|       COMPLETE|
|       4|2013-07-25 00:00:...|         8827|         CLOSED|
|       5|2013-07-25 00:00:...|        11318|       COMPLETE|
+--------+--------------------+-------------+---------------+
only showing top 5 rows



In [69]:
# Row count of the orders DataFrame, taken before writing it out.
orders_df.count()

68883

In [70]:
# Write orders as uncompressed Parquet, replacing any existing output folder.
# Variant 1: everything passed as keyword arguments to parquet().
orders_df.write.parquet(
    path='./ordersOutputFolder',
    mode='overwrite',
    compression='none',
)

In [71]:
# Variant 2: same write, with mode and compression set through the fluent writer API.
(
    orders_df.write
    .mode('overwrite')
    .option('compression', 'none')
    .parquet('./ordersOutputFolder')
)

In [72]:
# Variant 3: same write, selecting the file type with format() and finishing with save().
(
    orders_df.write
    .mode('overwrite')
    .option('compression', 'none')
    .format('parquet')
    .save('./ordersOutputFolder')
)

#### write json to 
- pipe delimited file 
- with gzip compression
- in a single file
- ignore if target location already exists

In [73]:
# Load the order_items JSON dataset; column names and types are inferred from the records.
json_file_path = 'C:/Users/SkJain/Downloads/Compressed/data-master/retail_db_json/order_items'
order_items_df = (
    spark.read
    .format('json')
    .load(json_file_path)
)

In [74]:
# Inspect the schema Spark inferred from the JSON records.
order_items_df.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



In [75]:
# Preview the first few order items.
order_items_df.show(n=5)

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            1|                  1|                  957|                  299.98|                  1|             299.98|
|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|            3|                  2|                  502|                    50.0|                  5|              250.0|
|            4|                  2|                  403|                  129.99|                  1|             129.99|
|            5|                  4|                  897|                   24.99|                  2|              49.98|
+-------------+-

In [76]:
# Row count of the order_items DataFrame, taken before writing it out.
order_items_df.count()

172198

In [79]:
# Write order_items as a single gzip-compressed, pipe-delimited CSV file.
#
# Save modes:
#   ignore    - used here: if the target folder already exists the write is skipped,
#               so re-running this cell leaves the folder's timestamp unchanged
#   overwrite - replace whatever already exists at the target
#   append    - add another file to the existing folder (a single file in this
#               case, because of coalesce(1))
#   error     - fail with an error if the target folder already exists

(
    order_items_df
    .coalesce(1)
    .write
    .mode('ignore')
    .option('compression', 'gzip')
    .option('sep', '|')
    .csv('./orderItemsOutputFolder')
)