In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session on YARN; the warehouse directory
# and the application name are derived from the current OS user.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [2]:
%%sh

# List the part files matching part-0000* with human-readable sizes (-h)
hdfs dfs -ls -h /public/airlines_all/airlines/part-0000*

-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:56 /public/airlines_all/airlines/part-00000
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 09:34 /public/airlines_all/airlines/part-00001
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 07:44 /public/airlines_all/airlines/part-00002
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 10:44 /public/airlines_all/airlines/part-00003
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:01 /public/airlines_all/airlines/part-00004
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 10:51 /public/airlines_all/airlines/part-00005
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 11:02 /public/airlines_all/airlines/part-00006
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 09:12 /public/airlines_all/airlines/part-00007
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:48 /public/airlines_all/airlines/part-00008
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:53 /public/airlines_all/airlines/part-00009


In [3]:
%%sh

hdfs dfs -ls /public/airlines_all/airlines/ | wc -l
# Counts the lines printed by the listing (1923 at the time of writing).
# NOTE(review): when listing a directory, `hdfs dfs -ls` prints a "Found N items"
# header line first, so this is presumably one more than the number of entries;
# confirm with `hdfs dfs -count /public/airlines_all/airlines/`.

1923


In [4]:
# Infer the schema once from a single part file so that it can be reused
# when reading further files, instead of inferring it again on every read.
airlines_schema = (
    spark.read
    .csv(
        "/public/airlines_all/airlines/part-00000",
        header=True,
        inferSchema=True,
    )
    .schema
)

In [5]:
# Read the ten files matching part-0000* using the previously inferred schema
# (no inferSchema here, so the types come from airlines_schema).
airlines = (
    spark.read
    .schema(airlines_schema)
    .csv(
        "/public/airlines_all/airlines/part-0000*",
        header=True,
    )
)

In [6]:
# Column names and types come from airlines_schema, which was inferred from part-00000
airlines.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

##### Check if dataframe has duplicate records or not

In [7]:
# Total number of rows across the files matched by part-0000*
airlines.count()

6489231

In [8]:
# Number of rows left after removing exact duplicate rows; a value lower than
# airlines.count() means the data contains duplicate records
airlines.distinct().count()

6489146

##### Create dataframe from collection

In [9]:
# Sample records, one tuple per employee:
# (employee_id, first_name, last_name, salary, nationality)
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states"),
    (2, "Henry", "Ford", 1250.0, "India"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA"),
]

In [10]:
# The collection passed to createDataFrame is a plain Python list
type(employees)

list

In [11]:
# Each element of the list is a tuple holding one employee record
type(employees[1])

tuple

In [12]:
# Build a DataFrame from the in-memory list of tuples. The schema is passed as a
# DDL-formatted string, so column names and types are declared explicitly
# instead of being inferred from the Python values.
employeesDf = spark \
    .createDataFrame(employees,
                    schema = """employee_id INT,
                                first_name STRING,
                                last_name STRING,
                                salary FLOAT,
                                nationality STRING
                    """)

In [13]:
# Print the DataFrame contents as a text table
employeesDf.show()

+-----------+----------+---------+------+--------------+
|employee_id|first_name|last_name|salary|   nationality|
+-----------+----------+---------+------+--------------+
|          1|     Scott|    Tiger|1000.0| united states|
|          2|     Henry|     Ford|1250.0|         India|
|          3|      Nick|   Junior| 750.0|united KINGDOM|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|
+-----------+----------+---------+------+--------------+



In [14]:
# Check that the types declared in the DDL schema string were applied
employeesDf.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- nationality: string (nullable = true)



##### * Project employee first name and last name.

In [15]:
# select() returns a new DataFrame containing only the named columns
employeesDf.select("first_name", "last_name").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Scott|    Tiger|
|     Henry|     Ford|
|      Nick|   Junior|
|      Bill|    Gomes|
+----------+---------+



##### * Project all the fields except for Nationality

In [16]:
# drop() returns a new DataFrame without the given column; employeesDf itself is not modified
employeesDf.drop("nationality").show()

+-----------+----------+---------+------+
|employee_id|first_name|last_name|salary|
+-----------+----------+---------+------+
|          1|     Scott|    Tiger|1000.0|
|          2|     Henry|     Ford|1250.0|
|          3|      Nick|   Junior| 750.0|
|          4|      Bill|    Gomes|1500.0|
+-----------+----------+---------+------+



##### * Add a column called fullname

In [17]:
# NOTE(review): the wildcard import puts every public function of
# pyspark.sql.functions into the notebook namespace, including names such as
# sum, min, max and round that shadow the Python builtins. The cells below use
# concat and lit from it; consider importing the module under an alias instead.
from pyspark.sql.functions import *

# Without a separator the two names are joined directly, e.g. "ScottTiger"
(
    employeesDf
    .withColumn("fullname", concat("first_name", "last_name"))
    .show()
)

+-----------+----------+---------+------+--------------+----------+
|employee_id|first_name|last_name|salary|   nationality|  fullname|
+-----------+----------+---------+------+--------------+----------+
|          1|     Scott|    Tiger|1000.0| united states|ScottTiger|
|          2|     Henry|     Ford|1250.0|         India| HenryFord|
|          3|      Nick|   Junior| 750.0|united KINGDOM|NickJunior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA| BillGomes|
+-----------+----------+---------+------+--------------+----------+



In [23]:
# add a space between first and last name

# concat() interprets a plain Python string as a column name, so " " is looked up
# as a column and the analyzer fails with:
#   cannot resolve '` `' given input columns: ...
# The failure is the point of this cell, so it is caught and printed; an uncaught
# exception would stop a Restart & Run All at this cell.
from pyspark.sql.utils import AnalysisException

try:
    employeesDf \
        .withColumn("fullname", concat("first_name", " ", "last_name")).show()
except AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: "cannot resolve '` `' given input columns: [last_name, first_name, salary, nationality, employee_id];;\n'Project [employee_id#237, first_name#238, last_name#239, salary#240, nationality#241, concat(first_name#238, ' , last_name#239) AS fullname#395]\n+- LogicalRDD [employee_id#237, first_name#238, last_name#239, salary#240, nationality#241], false\n"

In [22]:
# Wrapping " " in lit() turns it into a literal column, so concat() no longer
# tries to resolve it as a column name and the error goes away.
(
    employeesDf
    .withColumn(
        "full_name",
        concat("first_name", lit(" "), "last_name"),
    )
    .show()
)

+-----------+----------+---------+------+--------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|  full_name|
+-----------+----------+---------+------+--------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states|Scott Tiger|
|          2|     Henry|     Ford|1250.0|         India| Henry Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|Nick Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA| Bill Gomes|
+-----------+----------+---------+------+--------------+-----------+



In [19]:
# Same result through a SQL expression string: inside SQL, ' ' is already a
# string literal, so lit() is not needed.
(
    employeesDf
    .selectExpr("*", "concat(first_name, ' ',last_name) as full_name")
    .show()
)

+-----------+----------+---------+------+--------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|  full_name|
+-----------+----------+---------+------+--------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states|Scott Tiger|
|          2|     Henry|     Ford|1250.0|         India| Henry Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|Nick Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA| Bill Gomes|
+-----------+----------+---------+------+--------------+-----------+

