In [2]:
# Create SparkSession from builder
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('chapter01_to_05') \
                    .getOrCreate()

In [3]:
flightData2015 =  spark.read.option("inferSchema","true").\
                  option("header","true").\
                  csv("spark_practice/datas/flight-data/csv/2015-summary.csv")

In [4]:
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [5]:
flightData2015.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



In [6]:
flightData2015.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [7]:
flightData2015.sort('count').explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#19 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#19 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=45]
      +- FileScan csv [DEST_COUNTRY_NAME#17,ORIGIN_COUNTRY_NAME#18,count#19] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/fm-pc-lt-342/Documents/Spark Docx/spark_practice/datas/flig..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




### Shuffle Default Partition

In [8]:
spark.conf.get('spark.sql.shuffle.partitions')

'200'

### Repartition to 5

In [9]:
spark.conf.set('spark.sql.shuffle.partitions',"5")

In [10]:
flightData2015.sort('DEST_COUNTRY_NAME').show()
"""This code in PySpark sorts the DataFrame flightData2015 by the column named 'DEST_COUNTRY_NAME' in ascending order and then displays the first 20 rows of the resulting DataFrame
. The sort() function is used to sort one or more columns in a DataFrame, and the show() function is used to display the resulting DataFrame"""

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|             Algeria|      United States|    4|
|              Angola|      United States|   15|
|            Anguilla|      United States|   41|
| Antigua and Barbuda|      United States|  126|
|           Argentina|      United States|  180|
|               Aruba|      United States|  346|
|           Australia|      United States|  329|
|             Austria|      United States|   62|
|          Azerbaijan|      United States|   21|
|             Bahrain|      United States|   19|
|            Barbados|      United States|  154|
|             Belgium|      United States|  259|
|              Belize|      United States|  188|
|             Bermuda|      United States|  183|
|             Bolivia|      United States|   30|
|Bonaire, Sint Eus...|      United States|   58|
|              Brazil|      United States|  853|
|British Virgin Is..

"This code in PySpark sorts the DataFrame flightData2015 by the column named 'DEST_COUNTRY_NAME' in ascending order and then displays the first 20 rows of the resulting DataFrame\n. The sort() function is used to sort one or more columns in a DataFrame, and the show() function is used to display the resulting DataFrame"

#### Any dataFrame can be made into a table or view with a simple method called **createOrReplaceTempView**

In [11]:
flightData2015.createOrReplaceTempView("2015_temp_view")

In [12]:
sqlWay = spark.sql("SELECT DEST_COUNTRY_NAME, count(1) FROM 2015_temp_view GROUP BY DEST_COUNTRY_NAME")
sqlWay.show(10)

+--------------------+--------+
|   DEST_COUNTRY_NAME|count(1)|
+--------------------+--------+
|             Moldova|       1|
|             Bolivia|       1|
|             Algeria|       1|
|Turks and Caicos ...|       1|
|            Pakistan|       1|
|    Marshall Islands|       1|
|            Suriname|       1|
|              Panama|       1|
|         New Zealand|       1|
|             Liberia|       1|
+--------------------+--------+
only showing top 10 rows



### Converting to Spark Types (Literals)
Sometimes, we need to pass explicit values into Spark that are just a value (rather than a newcolumn). This might be a constant value or something we’ll need to compare to later on. The
way we do this is through literals. This is basically a translation from a given programming language’s literal value to one that Spark understands. Literals are expressions and you can use
them in the same way:

In [13]:
from pyspark.sql.functions import lit
literals = flightData2015.select("*",lit(1).alias("One")).show()

+--------------------+-------------------+-----+---+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+--------------------+-------------------+-----+---+
|       United States|            Romania|   15|  1|
|       United States|            Croatia|    1|  1|
|       United States|            Ireland|  344|  1|
|               Egypt|      United States|   15|  1|
|       United States|              India|   62|  1|
|       United States|          Singapore|    1|  1|
|       United States|            Grenada|   62|  1|
|          Costa Rica|      United States|  588|  1|
|             Senegal|      United States|   40|  1|
|             Moldova|      United States|    1|  1|
|       United States|       Sint Maarten|  325|  1|
|       United States|   Marshall Islands|   39|  1|
|              Guyana|      United States|   64|  1|
|               Malta|      United States|    1|  1|
|            Anguilla|      United States|   41|  1|
|             Bolivia|      United States|   3

### Adding Columns
PySpark withColumn() is a transformation function of DataFrame which is used to change the value, convert the datatype of an existing column, create a new column, and many more. In this post, I will walk you through commonly used PySpark DataFrame column operations using withColumn() examples.

In [16]:
from pyspark.sql.functions import expr, col
adding_new_column = flightData2015.withColumn("new_column",expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))
adding_new_column.show(5)

+-----------------+-------------------+-----+----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|new_column|
+-----------------+-------------------+-----+----------+
|    United States|            Romania|   15|     false|
|    United States|            Croatia|    1|     false|
|    United States|            Ireland|  344|     false|
|            Egypt|      United States|   15|     false|
|    United States|              India|   62|     false|
+-----------------+-------------------+-----+----------+
only showing top 5 rows



In [17]:
new_col = flightData2015.withColumn("new_column_2",col("count") * 2)
new_col.show(5)

+-----------------+-------------------+-----+------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|new_column_2|
+-----------------+-------------------+-----+------------+
|    United States|            Romania|   15|          30|
|    United States|            Croatia|    1|           2|
|    United States|            Ireland|  344|         688|
|            Egypt|      United States|   15|          30|
|    United States|              India|   62|         124|
+-----------------+-------------------+-----+------------+
only showing top 5 rows



### Renaming Columns
Though we cannot rename a column using withColumn, still we wanted to cover this as renaming is one of the common operations we perform on DataFrame. To rename an existing column use withColumnRenamed() function on DataFrame.

In [18]:
renaming_column = new_col.withColumnRenamed("new_column_2","Count * 2")
renaming_column.show(5)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Count * 2|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|       30|
|    United States|            Croatia|    1|        2|
|    United States|            Ireland|  344|      688|
|            Egypt|      United States|   15|       30|
|    United States|              India|   62|      124|
+-----------------+-------------------+-----+---------+
only showing top 5 rows



### Case Sensitivity
By default Spark is case insensitive; however, you can make Spark case sensitive by setting the configuration:

-- in SQL

set spark.sql.caseSensitive true

### Removing Columns
df.drop("ORIGIN_COUNTRY_NAME")

We can drop multiple columns by passing in multiple columns as arguments:

df.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")

In [19]:
drop_column = renaming_column.drop(col("`Count * 2`"))
drop_column.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



### Changing a Column’s Type (cast)

In [20]:
drop_column.withColumn("count",col("count").cast("long")).printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



### Filtering Rows
To filter rows, we create an expression that evaluates to true or false. There are two methods to perform this operation: we can use where or filter
and they both will perform the same operation and accept the same argument types when used
with DataFrames.

In [21]:
flightData2015.filter(flightData2015["count"]<2).show(3)
# flightData2015.filter(col("count") < 2).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 3 rows



In [22]:
flightData2015.where(col("count")>5).show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
+-----------------+-------------------+-----+
only showing top 3 rows



### 
Instinctually, you might want to put multiple filters into the same expression. Although this is possible, it is not always useful, because Spark automatically performs all filtering operations at
the same time regardless of the filter ordering. This means that if you want to specify multiple
AND filters, just chain them sequentially and let Spark handle the rest:

In [23]:
flightData2015.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") != "Croatia").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



### Getting Unique Rows

In [24]:
flightData2015.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").distinct().count()

256

### Sorting Rows

In [25]:
# flightData2015.sort("count").show(5)
# flightData2015.orderBy("count", "DEST_COUNTRY_NAME").show(5)
flightData2015.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [26]:
# To more explicitly specify sort direction, you need to use the asc and desc functions if operating
# on a column. These allow you to specify the order in which a given column should be sorted:
from pyspark.sql.functions import desc, asc

flightData2015.orderBy(expr("count desc")).show(2)
flightData2015.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(2)


# An advanced tip is to use asc_nulls_first, desc_nulls_first, asc_nulls_last, or
# desc_nulls_last to specify where you would like your null values to appear in an ordered
# DataFrame.

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
+-----------------+-------------------+------+
only showing top 2 rows



In [27]:
flightData2015.rdd.getNumPartitions() # 1

1

In [28]:
flightData2015.repartition(5)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: int]

### Collect, Take and Show

collect() : collect gets all data from the entire DataFrame.

take(N) : take gets the first N rows.

show() : how prints out a number of rows nicely.