In [None]:
#Basic Structured Operations

In [3]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the cells below; getOrCreate() hands back an
# already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("StructuredOperations")
    .getOrCreate()
)


In [4]:
# Load the 2015 flight summary; no schema is supplied, so Spark derives one
# from the JSON itself.
flightDF = spark.read.json("/content/2015-summary.json")

In [10]:
# Print the DataFrame's schema as a tree (column name, type, nullability).
flightDF.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [12]:
import pprint
# Show the same schema as the underlying StructType object instead of a tree.
pprint.pprint(flightDF.schema)

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])


In [None]:
# The example that follows shows how to create and enforce a specific schema on a DataFrame.


In [13]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Hand-written schema for the flight data: two nullable string columns plus a
# long `count` column declared non-nullable and tagged with custom metadata.
myManualSchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), nullable=True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), nullable=True),
        StructField("count", LongType(), nullable=False, metadata={"hello": "world"}),
    ]
)


In [17]:
# Read the same JSON file, this time applying the manual schema rather than
# letting Spark derive one.
flightdf2 = (
    spark.read
    .format("json")
    .schema(myManualSchema)
    .load("/content/2015-summary.json")
)

flightdf2.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [None]:
# We must use Spark transformations within a DataFrame to modify the contents of a column

In [22]:
# List the DataFrame's column names.
flightdf2.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [24]:
# Fetch the first record of the DataFrame as a Row object.
flightdf2.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [None]:
#Creating Rows
 #You can create rows by manually instantiating a Row object with the values that belong in each column.

In [29]:
from pyspark.sql import Row

# Build two Row objects by passing their column values positionally.
myRow = [
    Row("rashid", None, 1, False),
    Row("shuja", None, 1, False),
]


#Creating DataFrames

In [6]:

# Reload the flight data and register it as the temp view `dfTable` so it can
# be queried through Spark SQL.
mynewdf = spark.read.json("/content/2015-summary.json")
mynewdf.createOrReplaceTempView("dfTable")

In [35]:
# Query the `dfTable` temp view through Spark SQL and preview the result.
spark.sql("select * from dfTable").show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [36]:
 from pyspark.sql import Row
 from pyspark.sql.types import StructField, StructType, StringType, LongType

 # NOTE: this cell rebinds `myManualSchema` and `myRow`, which earlier cells
 # defined with different contents.
 myManualSchema = StructType(
     [
         StructField("some", StringType(), nullable=True),
         StructField("col", StringType(), nullable=True),
         StructField("names", LongType(), nullable=False),
     ]
 )

 # A single row whose values line up positionally with the three fields above.
 myRow = Row("Hello", None, 1)

 # Build a one-row DataFrame from the Row and the explicit schema.
 myDf = spark.createDataFrame(data=[myRow], schema=myManualSchema)
 myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|NULL|    1|
+-----+----+-----+



#select and selectExpr
 `select` and `selectExpr` allow you to do the DataFrame equivalent of SQL queries on a table of data:


In [37]:
# Project a single column by name and preview two rows.
(
    mynewdf
    .select("DEST_COUNTRY_NAME")
    .show(2)
)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows


In [38]:
# Project two columns by name and preview two rows.
(
    mynewdf
    .select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME")
    .show(2)
)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows


In [39]:
from pyspark.sql.functions import expr, col, column

# expr(), col() and column() all refer to the same column here, so the result
# shows DEST_COUNTRY_NAME three times.
(
    mynewdf
    .select(
        expr("DEST_COUNTRY_NAME"),
        col("DEST_COUNTRY_NAME"),
        column("DEST_COUNTRY_NAME"),
    )
    .show(2)
)

+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|
|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows


expr is the most flexible reference that we can use. It can refer to a plain
 column or a string manipulation of a column.

In [40]:
 # Rename the column inside the expression string using SQL's AS.
 (
     mynewdf
     .select(expr("DEST_COUNTRY_NAME AS destination"))
     .show(2)
 )

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows


In [41]:
# The outer alias() is applied last, so the result column is named
# DEST_COUNTRY_NAME rather than "destination".
(
    mynewdf
    .select(
        expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")
    )
    .show(2)
)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows


 Because select followed by a series of expr is such a common pattern, Spark has a shorthand for doing this efficiently: selectExpr.

In [42]:
# selectExpr takes SQL expression strings directly: one aliased copy of the
# column plus the original column.
(
    mynewdf
    .selectExpr(
        "DEST_COUNTRY_NAME as newColumnName",
        "DEST_COUNTRY_NAME",
    )
    .show(2)
)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows


Here’s a simple example that adds a new column, `withinCountry`, to our DataFrame:

In [43]:
# Keep every original column and append a Boolean column that is true when the
# destination and origin countries match.
(
    mynewdf
    .selectExpr(
        "*",  # all original columns
        "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry",
    )
    .show(2)
)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows


In [50]:
# Aggregate over the whole DataFrame: number of distinct destination countries.
(
    mynewdf
    .selectExpr("count(distinct(DEST_COUNTRY_NAME))")
    .show()
)

+---------------------------------+
|count(DISTINCT DEST_COUNTRY_NAME)|
+---------------------------------+
|                              132|
+---------------------------------+



In [49]:
# Two aggregates in one pass: the mean of `count` and the number of distinct
# destination countries.
(
    mynewdf
    .selectExpr(
        "avg(count)",
        "count(distinct(DEST_COUNTRY_NAME))",
    )
    .show()
)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



**Converting to Spark Types (Literals)**

Sometimes we need to pass explicit values into Spark that are just a value (rather than a new column). This might be a constant value or something we’ll need to compare against later on. `lit` turns such a value into a Spark literal.

In [53]:
from pyspark.sql.functions import lit

# Append a constant literal column named "One" to every row.
# SQL equivalent: SELECT *, 1 AS One FROM dfTable LIMIT 4
(
    mynewdf
    .select(expr("*"), lit(1).alias("One"))
    .show(4)
)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
|    United States|            Ireland|  344|  1|
|            Egypt|      United States|   15|  1|
+-----------------+-------------------+-----+---+
only showing top 4 rows


 **Adding Columns**

In [54]:
 # withColumn takes the new column's name and the expression that produces it;
 # here the expression is the literal 1.
 (
     mynewdf
     .withColumn("numberOne", lit(1))
     .show(2)
 )

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows


In [59]:
# Add a Boolean flag that is true when the origin country equals the
# destination country.
(
    mynewdf
    .withColumn("withinCountry", expr("DEST_COUNTRY_NAME=ORIGIN_COUNTRY_NAME"))
    .show(2)
)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows


In [60]:
# withColumn with a plain column expression adds a COPY of DEST_COUNTRY_NAME
# under the new name; the original column is kept, so this is not a true
# rename (withColumnRenamed does that).
(
    mynewdf
    .withColumn("Destination", expr("DEST_COUNTRY_NAME"))
    .columns
)

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count', 'Destination']

**Renaming Columns**

In [61]:
# Rename DEST_COUNTRY_NAME to "dest" and list the resulting column names.
(
    mynewdf
    .withColumnRenamed("DEST_COUNTRY_NAME", "dest")
    .columns
)

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

 **Removing Columns**

In [62]:
# Drop one column and list the columns that remain.
(
    mynewdf
    .drop("ORIGIN_COUNTRY_NAME")
    .columns
)

['DEST_COUNTRY_NAME', 'count']

**Changing a Column’s Type (cast)**

In [63]:
# Add a column `count2` holding `count` cast to long. `count` is already a
# long in this DataFrame, so count2 ends up as a bigint copy of it.
(
    mynewdf
    .withColumn("count2", col("count").cast("long"))
)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: bigint]

 **Filtering Rows**

In [64]:
# Keep only the rows whose count is below 2, using a Column expression.
(
    mynewdf
    .filter(col("count") < 2)
    .show(2)
)


+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows


In [65]:
# Same filter written as a SQL expression string passed to where().
(
    mynewdf
    .where("count < 2")
    .show(2)
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows


You might want to put multiple filters into the same expression. To require several conditions at once, chain `where` calls one after another:

In [66]:
# Chain two where() calls: a row must pass both conditions to be kept.
(
    mynewdf
    .where(col("count") < 2)
    .where(col("ORIGIN_COUNTRY_NAME") != "Croatia")
    .show(2)
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows


 **Getting Unique Rows**

In [67]:
# Count the distinct (origin, destination) pairs.
(
    mynewdf
    .select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")
    .distinct()
    .count()
)

256

In [68]:
# Count the distinct origin countries.
(
    mynewdf
    .select("ORIGIN_COUNTRY_NAME")
    .distinct()
    .count()
)

125

**Random Samples**

Random Sampling means selecting a random subset of rows from a DataFrame

**Replacement**
With replacement, the same row can appear multiple times in the sample. This is used in bootstrapping.

**Seed**
When Spark performs a random operation (like `sample()`), it uses a pseudo-random number generator, and the seed initializes that generator.
Same seed + same data + same operation → same result.
Without a seed, every run produces a different random result.

**Fraction**
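The fraction parameter is the proportion of rows you want to randomly select. The resulting sample size is approximate: Spark does not guarantee that exactly this fraction of the rows is returned.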




In [94]:
# Sampling parameters: a fixed seed, sampling with replacement, and a fraction
# of 0.1.
seed = 42
withReplacement = True
fraction = 0.1

sampledf = mynewdf.sample(
    withReplacement=withReplacement,
    fraction=fraction,
    seed=seed,
)
sampledf.show()

+-----------------+--------------------+-----+
|DEST_COUNTRY_NAME| ORIGIN_COUNTRY_NAME|count|
+-----------------+--------------------+-----+
|         Pakistan|       United States|   12|
|     Sint Maarten|       United States|  325|
|    United States|              Cyprus|    1|
|    United States|           Guatemala|  318|
|         Colombia|       United States|  873|
|    United States|            Malaysia|    3|
|         Thailand|       United States|    3|
|    United States|               China|  920|
|    United States|    Saint Barthelemy|   41|
|    United States|              Turkey|  129|
|    United States|              Panama|  465|
|    United States|            Thailand|    4|
|    United States|Turks and Caicos ...|  236|
|          Croatia|       United States|    2|
|            Haiti|       United States|  226|
|          Finland|       United States|   26|
|        Singapore|       United States|    3|
|             Peru|       United States|  279|
|   Czech Rep

**Random Splits**

Random splits can be helpful when you need to break up your DataFrame into random “splits” of the original.
Here we’ll split our DataFrame into two different DataFrames:

In [95]:
 # Split into two DataFrames with weights 0.25 / 0.75, passing the `seed`
 # variable to the split.
 dataFrames = mynewdf.randomSplit(weights=[0.25, 0.75], seed=seed)
 # Check whether the first split ended up with more rows than the second.
 dataFrames[0].count() > dataFrames[1].count()

False

 **Concatenating and Appending Rows (Union)**

  To union two DataFrames, you must be sure that they have the same schema and
 number of columns; otherwise, the union will fail. Note that `union` matches columns by position, not by name.

In [98]:
from pyspark.sql import Row

# Reuse mynewdf's schema for the DataFrame that will be unioned with it.
schema = mynewdf.schema

# Two extra rows; each tuple's values are passed positionally to Row.
newRows = [
    Row(*values)
    for values in (
        ("New Country", "Other Country", 5),
        ("New Country 2", "Other Country 3", 1),
    )
]

# Turn the rows into an RDD, then build a DataFrame from it with that schema.
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [99]:
# Append the new rows, then keep only count == 1 rows whose origin is not the
# United States.
(
    mynewdf
    .union(newDF)
    .where("count = 1")
    .where(col("ORIGIN_COUNTRY_NAME") != "United States")
    .show()
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



**Sorting Rows**

In [100]:
# Sort by `count` and preview the first five rows.
(
    mynewdf
    .sort("count")
    .show(5)
)


+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows


In [101]:
 # Order by `count` first, then by destination country name.
 (
     mynewdf
     .orderBy("count", "DEST_COUNTRY_NAME")
     .show(5)
 )

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows


**Repartition and Coalesce**

In [20]:
 # Report how many partitions currently back the DataFrame.
 mynewdf.rdd.getNumPartitions()

1

In [21]:

# Bind the repartitioned result to a NEW name instead of overwriting `mynewdf`.
# Rebinding the input would change the partitioning (and potentially the row
# order) seen by every other cell that reads `mynewdf`, leaving previously
# displayed outputs stale whenever cells are re-run.
mynewdf_repartitioned = mynewdf.repartition(5)
print(mynewdf_repartitioned.rdd.getNumPartitions())

5


If you know that you’re going to be filtering by a certain column often, it can be worth repartitioning based on that column:

In [11]:
 from pyspark.sql.functions import expr, col, column
 # Repartition by destination country. The result is only displayed, not
 # assigned, so `mynewdf` itself is left as it was.
 mynewdf.repartition(col("DEST_COUNTRY_NAME"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [17]:
# Repartition by destination country into an explicit five partitions. The
# result is only displayed, not assigned.
mynewdf.repartition(5, col("DEST_COUNTRY_NAME"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [19]:
# Shuffle the data into five partitions keyed on the destination country name,
# then coalesce them down to two (coalesce does so without a full shuffle).
(
    mynewdf
    .repartition(5, col("DEST_COUNTRY_NAME"))
    .coalesce(2)
)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]