In [None]:
# 
%run "D:\GitLocal\big_data\Spark\spark the definitive guide\my_code\data_read.py"

# Reserved Characters and Keywords: 
In Spark, Reserved characters like spaces or dashes in column names handled by using backtick(`) characters.    

In [10]:
from pyspark.sql.functions import col, expr, column

dfWithLongColName = df.withColumn("This Long Column-Name", expr("Origin_Country_Name"))
dfWithLongColName.show(1)

+-----------------+-------------------+-----+---------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|This Long Column-Name|
+-----------------+-------------------+-----+---------------------+
|    United States|            Romania|   15|              Romania|
+-----------------+-------------------+-----+---------------------+
only showing top 1 row



In [14]:
dfWithLongColName.selectExpr("`This Long Column-Name`",
                             "`This Long Column-Name` as `new col`")\
                             .show(2)

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows



In [28]:
dfWithLongColName.select(expr("`This Long Column-Name`")).columns

['This Long Column-Name']

In [29]:
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

# Removing Columns:


In [34]:
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [35]:
df.drop("ORIGIN_COUNTRY_NAME").show(5)

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|    United States|   15|
|    United States|    1|
|    United States|  344|
|            Egypt|   15|
|    United States|   62|
+-----------------+-----+
only showing top 5 rows



In [36]:
dfWithLongColName.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count', 'This Long Column-Name']

In [38]:
dfWithLongColName.drop("ORIGIN_COUNTRY_NAME", "count").show(2)

+-----------------+---------------------+
|DEST_COUNTRY_NAME|This Long Column-Name|
+-----------------+---------------------+
|    United States|              Romania|
|    United States|              Croatia|
+-----------------+---------------------+
only showing top 2 rows



# Changing a Column's Types (cast):


In [41]:
df.printSchema()
df.withColumn("count2", col("count").cast("int")).printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)
 |-- count2: integer (nullable = true)



# Filtering Rows:

In [44]:
df.filter(col("count") <= 2).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
|            Malta|      United States|    1|
|    United States|          Gibraltar|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [46]:
df.where(col("count") <= 2).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
|            Malta|      United States|    1|
|    United States|          Gibraltar|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [48]:
df.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") != "Croatia").show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
|               Malta|      United States|    1|
|       United States|          Gibraltar|    1|
|Saint Vincent and...|      United States|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



# Getting Unique Rows:

In [49]:
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").distinct().count()

256

In [50]:
df.select("ORIGIN_COUNTRY_NAME").distinct().count()

125

# Random Samples:

In [51]:
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

138

# Concatenating and Appending Rows (Union):

In [55]:
from pyspark.sql import Row

schema = df.schema
print(schema)

newRows = [
    Row("New Country", "Other Country", 5),
    Row("New Country 2", "Other Country 3", 1)
]

prallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(prallelizedRows, schema)

newDF.show()

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|      New Country|      Other Country|    5|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



In [57]:
df.union(newDF)\
    .where("count = 1")\
    .where(col("ORIGIN_COUNTRY_NAME") != "United States")\
    .show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



# Sorting Rows:

In [59]:
df.sort("count").show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Estonia|    1|
|              Kosovo|      United States|    1|
|              Zambia|      United States|    1|
|       United States|   Papua New Guinea|    1|
|               Malta|      United States|    1|
|       United States|          Gibraltar|    1|
|            Suriname|      United States|    1|
|       United States|            Croatia|    1|
|            Djibouti|      United States|    1|
|        Burkina Faso|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|             Cyprus|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
|              Cyprus|      United States|    1|
|       United States|          Lithuania|    1|
|       United States|           Bulgaria|    1|
|       United State

In [61]:
df.orderBy("count", "DEST_COUNTRY_NAME").show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|        Burkina Faso|      United States|    1|
|       Cote d'Ivoire|      United States|    1|
|              Cyprus|      United States|    1|
|            Djibouti|      United States|    1|
|           Indonesia|      United States|    1|
|                Iraq|      United States|    1|
|              Kosovo|      United States|    1|
|               Malta|      United States|    1|
|             Moldova|      United States|    1|
|       New Caledonia|      United States|    1|
|Saint Vincent and...|      United States|    1|
|            Suriname|      United States|    1|
|       United States|            Estonia|    1|
|       United States|             Cyprus|    1|
|       United States|          Singapore|    1|
|       United States|   Papua New Guinea|    1|
|       United States|            Bahrain|    1|
|       United State

In [62]:
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|        Burkina Faso|      United States|    1|
|       Cote d'Ivoire|      United States|    1|
|              Cyprus|      United States|    1|
|            Djibouti|      United States|    1|
|           Indonesia|      United States|    1|
|                Iraq|      United States|    1|
|              Kosovo|      United States|    1|
|               Malta|      United States|    1|
|             Moldova|      United States|    1|
|       New Caledonia|      United States|    1|
|Saint Vincent and...|      United States|    1|
|            Suriname|      United States|    1|
|       United States|            Estonia|    1|
|       United States|             Cyprus|    1|
|       United States|          Singapore|    1|
|       United States|   Papua New Guinea|    1|
|       United States|            Bahrain|    1|
|       United State

In [69]:
from pyspark.sql.functions import asc, desc

df.orderBy(expr("count desc")).show(2)

df.orderBy(expr("count asc")).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [70]:
df.orderBy(expr("count").desc(),  col("DEST_COUNTRY_NAME").asc()).show()

+------------------+-------------------+------+
| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+------------------+-------------------+------+
|     United States|      United States|370002|
|     United States|             Canada|  8483|
|            Canada|      United States|  8399|
|     United States|             Mexico|  7187|
|            Mexico|      United States|  7140|
|    United Kingdom|      United States|  2025|
|     United States|     United Kingdom|  1970|
|             Japan|      United States|  1548|
|     United States|              Japan|  1496|
|           Germany|      United States|  1468|
|     United States| Dominican Republic|  1420|
|Dominican Republic|      United States|  1353|
|     United States|            Germany|  1336|
|       South Korea|      United States|  1048|
|     United States|        The Bahamas|   986|
|       The Bahamas|      United States|   955|
|     United States|             France|   952|
|            France|      United States|

In [None]:
spark.read.format("json").load("D:\\GitLocal\\Spark-The-Definitive-Guide\\data\\flight-data\\json\\*-summary.json")\
    .sortWithinPartitions("count")