# Setup
Install Java (I am on 11), Spark and Python (I have 3.8). 

Install Jupyter server and jupyter plugin in vscode.

Set env variables as below (Change values according to your setup)

```
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export SPARK_HOME=/home/kamal/spark-3.1.2-bin-hadoop3.2
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
export PYTHONPATH=${SPARK_HOME}/python/:$(echo ${SPARK_HOME}/python/lib/py4j-*-src.zip):${PYTHONPATH}
```


# Create Sessions
## Create Spark Session

In [1]:
from pyspark.sql import SparkSession

# getOrCreate() returns the active session when one exists, otherwise it starts one.
spark = SparkSession.builder.appName("airline").getOrCreate()


21/09/26 18:11:52 WARN Utils: Your hostname, kamal-Lenovo-G505 resolves to a loopback address: 127.0.1.1; using 192.168.1.14 instead (on interface wlp3s0)
21/09/26 18:11:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/09/26 18:11:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Create Delta spark session
* This requires the Delta Lake jar to be downloaded from the Maven repository and placed in SPARK_HOME/jars. Restart the Jupyter server after that.  
* Install delta-spark package (pip install delta-spark)

In [None]:
from delta import *

# Delta needs both its SQL extension and its catalog implementation set on the builder.
builder = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip comes from the `delta` package imported above.
# NOTE(review): a session named `spark` already exists at this point, so getOrCreate()
# presumably returns that same session -- confirm the Delta settings actually take effect.
delta_spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Read and operate on Data
## Departure Delays - Read CSV with string schema
Data used here is available from databricks github [page](https://github.com/databricks/LearningSparkV2.git)

In [28]:
# An explicit DDL schema pins delay/distance to integers instead of relying on inference.
csv_file = "./databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
schema = "date string, delay int, distance int, origin string, destination string"

flights_reader = spark.read.schema(schema).option("header", "true")
df = flights_reader.csv(csv_file).cache()  # cached because many later cells reuse it

df.printSchema()
df.first()

root
 |-- date: string (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)





Row(date='01011245', delay=6, distance=602, origin='ABE', destination='ATL')

## Operate on Columns, with col

In [29]:
# Imported here so the cell runs on a fresh kernel: `col` is otherwise only
# imported by cells further down the notebook.
from pyspark.sql.functions import col

# Column arithmetic; the result column is auto-named "(distance / 1000)".
df.select(col("distance") / 1000).show(2)

+-----------------+
|(distance / 1000)|
+-----------------+
|            0.602|
|            0.369|
+-----------------+
only showing top 2 rows



## Using withColumn() to add new computed column

In [30]:
# withColumn keeps every existing column and appends the computed one.
scaled_distance = col("distance") / 1000
df.withColumn("calculatedCol", scaled_distance).show(2)

+--------+-----+--------+------+-----------+-------------+
|    date|delay|distance|origin|destination|calculatedCol|
+--------+-----+--------+------+-----------+-------------+
|01011245|    6|     602|   ABE|        ATL|        0.602|
|01020600|   -8|     369|   ABE|        DTW|        0.369|
+--------+-----+--------+------+-----------+-------------+
only showing top 2 rows



## Timestamp from millisecond (since epoch)

In [31]:
# timestamp_millis converts milliseconds since the epoch into a timestamp value.
millis_query = "select timestamp_millis(1631163333673) as timestampdata"
spark.sql(millis_query).show(truncate=False)

+-----------------------+
|timestampdata          |
+-----------------------+
|2021-09-09 10:25:33.673|
+-----------------------+



## Working with JSON

### Extract with from_json()

In [33]:
# Imported here so the cell does not depend on a wildcard import that only
# happens in a later cell (it would raise NameError on a fresh kernel).
from pyspark.sql.functions import from_json

# One row whose "value" column holds a JSON document as text.
data = [(1, '''{"a": 1,"b": 2.3}''')]
schema = 'a INT, b FLOAT'
df_json = spark.createDataFrame(data, ("key", "value"))

# from_json parses the text into a struct using `schema`; the source column can be
# given either by name or as a Column object -- both forms are shown.
df_json.select(from_json("value", schema)["a"].alias("id"),
from_json(df_json.value, schema)["b"].alias("value")).show()

+---+-----+
| id|value|
+---+-----+
|  1|  2.3|
+---+-----+



### Extract with get_json_object()

In [34]:
# Imported here so the cell runs on a fresh kernel (no earlier cell imports it).
from pyspark.sql.functions import get_json_object

# get_json_object extracts a single value by JSON path; no schema is needed.
df_json.select(get_json_object(df_json.value,"$.a").alias("id"),
get_json_object("value","$.b").alias("value")).show()

+---+-----+
| id|value|
+---+-----+
|  1|  2.3|
+---+-----+



# Filter Dataframe

In [35]:
from pyspark.sql.functions import col

df.printSchema()

# Distinct (distance, origin, destination) combinations with distance above 1000.
long_routes = (
    df.select("distance", "origin", "destination")
    .filter(col("distance") > 1000)
    .distinct()
)
long_routes.show(5)

root
 |-- date: string (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)





+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    1036|   DFW|        DCA|
|    2115|   EWR|        PDX|
|    1853|   EWR|        PHX|
|    2071|   KOA|        SJC|
|    1014|   MDW|        FLL|
+--------+------+-----------+
only showing top 5 rows





# Create Views
* Global views are used, in queries, with "global_temp" prefix.

In [36]:
# Temp view: queried in SQL simply as "Airline".
df.createOrReplaceTempView("Airline")
# Global temp view: queried in SQL with the "global_temp." prefix.
df.createOrReplaceGlobalTempView("GlobalAirlineView")

In [37]:
# Query the temp view registered above.
spark.sql("select * from Airline limit 2").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
+--------+-----+--------+------+-----------+



In [38]:
# Label every flight with a delay bucket; rows are sorted by origin, then by
# delay from largest to smallest.
delay_bucket_sql = """SELECT delay, origin, destination, 
              CASE
                  WHEN delay > 360 THEN 'Very Long Delays'
                  WHEN delay >= 120 AND delay <= 360 THEN 'Long Delays'
                  WHEN delay >= 60 AND delay < 120 THEN 'Short Delays'
                  WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
                  WHEN delay = 0 THEN 'No Delays'
                  ELSE 'Early'
               END AS Flight_Delays
               FROM Airline
               ORDER BY origin, delay DESC"""
spark.sql(delay_bucket_sql).show(10)



+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows





In [39]:
# Three-way classification (EARLY / ON-TIME / DELAY) added as an extra column.
fl_delay_sql = """SELECT *,
                CASE
                    WHEN delay < 0 THEN 'EARLY'
                    WHEN delay == 0 THEN 'ON-TIME'
                    ELSE 'DELAY'
                END AS FL_Delay
             FROM Airline"""
spark.sql(fl_delay_sql).show(6)

+--------+-----+--------+------+-----------+--------+
|    date|delay|distance|origin|destination|FL_Delay|
+--------+-----+--------+------+-----------+--------+
|01011245|    6|     602|   ABE|        ATL|   DELAY|
|01020600|   -8|     369|   ABE|        DTW|   EARLY|
|01021245|   -2|     602|   ABE|        ATL|   EARLY|
|01020605|   -4|     602|   ABE|        ATL|   EARLY|
|01031245|   -4|     602|   ABE|        ATL|   EARLY|
|01030605|    0|     602|   ABE|        ATL| ON-TIME|
+--------+-----+--------+------+-----------+--------+
only showing top 6 rows



# Add column to DF
## Use multiple conditions on column with when-otherwise

In [40]:
from pyspark.sql.functions import *
# when() clauses are chained; otherwise() supplies the value for rows that match none.
fl_delay = (
    when(col("delay") > 0, 'DELAYED')
    .when(col("delay") < 0, 'EARLY')
    .otherwise('ON-TIME')
)
df.withColumn('FL_delay', fl_delay).show(6)

+--------+-----+--------+------+-----------+--------+
|    date|delay|distance|origin|destination|FL_delay|
+--------+-----+--------+------+-----------+--------+
|01011245|    6|     602|   ABE|        ATL| DELAYED|
|01020600|   -8|     369|   ABE|        DTW|   EARLY|
|01021245|   -2|     602|   ABE|        ATL|   EARLY|
|01020605|   -4|     602|   ABE|        ATL|   EARLY|
|01031245|   -4|     602|   ABE|        ATL|   EARLY|
|01030605|    0|     602|   ABE|        ATL| ON-TIME|
+--------+-----+--------+------+-----------+--------+
only showing top 6 rows



# Set shuffle partitions

In [41]:
spark.conf.set("spark.sql.shuffle.partitions",4)

# mode("overwrite") keeps the cell re-runnable: with the default save mode a second
# run fails because the table already exists.
(df.write
    .mode("overwrite")
    .option("path", "/home/kamal/airline_data")
    .format("csv")
    .saveAsTable("Airline"))

# Number of partitions backing df itself.
df.rdd.getNumPartitions()



4

# Create, Use and drop databases

In [42]:
spark.catalog.listDatabases()

# Drop and recreate newdb at an explicit location so the cell can be re-run.
for statement in (
    "drop database if exists newdb cascade",
    "create database newdb location '/home/kamal/spark-warehouse/newdatabase'",
):
    spark.sql(statement)

# Displayed as the cell output: should now include newdb.
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/home/kamal/Documents/dev/spark/spark_notebook/spark-warehouse'),
 Database(name='newdb', description='', locationUri='file:/home/kamal/spark-warehouse/newdatabase')]

In [43]:
# Make newdb the current database for the statements that follow.
spark.sql("use newdb")

DataFrame[]

In [46]:
# Save df as a parquet table named airline_date, replacing it if present.
df.write.mode("overwrite").format("parquet").saveAsTable("airline_date")



In [64]:
# Expose the SFO-origin rows as a global temp view, then count them through that view.
sfo_flights = spark.sql("select * from airline_date where origin = 'SFO'")
sfo_flights.createOrReplaceGlobalTempView("us_origin_airport_SFO_global")
spark.sql("select * from global_temp.us_origin_airport_SFO_global").count()



39483

In [70]:
# List views; in the recorded output only the temp view "airline" is listed.
spark.sql("show views").show()
# Remove the global temp view created in the previous cell.
spark.catalog.dropGlobalTempView("us_origin_airport_SFO_global")

+---------+--------+-----------+
|namespace|viewName|isTemporary|
+---------+--------+-----------+
|         | airline|       true|
+---------+--------+-----------+



# Working with AVRO
Avro package is still external (but supported), so needs to be downloaded and placed in classpath. Download spark-avro jar from [mavenrepo](https://mvnrepository.com/artifact/org.apache.spark/spark-avro_2.12/3.1.2),  and place it in $SPARK_HOME/jars. Then restart Jupyter server 

In [None]:
# The Avro reader comes from the spark-avro jar placed in $SPARK_HOME/jars (see the
# note above). Setting "spark.jars.packages" on an already running session cannot
# load it, and the old coordinate pointed at the wrong group, so that call is dropped.
# "avro" is the data source's short name.
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW episode_tbl
    USING avro
    OPTIONS (
      path "./databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"
    )
""")

In [None]:
summary_query = "select * from episode_tbl"
df_fl_summary = spark.sql(summary_query)

# The count is computed but not displayed (it is not the cell's last expression).
df_fl_summary.count()
df_fl_summary.show(5)

# UDFs

In [None]:
from pyspark.sql.types import LongType
import pandas as pd
def cubed(s: pd.Series) -> pd.Series:
    """Return the element-wise cube of ``s`` (computed as s*s*s)."""
    squared = s * s
    return squared * s

# Wrap `cubed` as a pandas UDF that returns longs.
cubed_udf = pandas_udf(cubed, returnType=LongType())

# Test input: a single "id" column produced by range(1, 9).
df_range = spark.range(1, 9)

In [None]:
# Apply the UDF through the DataFrame API.
cubed_col = cubed_udf("id").alias("cubed")
df_range.select("id", cubed_col).show()

In [None]:
# Register the UDF under the SQL name "cubed_udf" so SQL queries can call it.
spark.udf.register("cubed_udf",cubed_udf)

In [None]:
# Call the registered UDF from SQL against a temp view of the range.
df_range.createOrReplaceTempView("udf_test")
udf_query = "select id, cubed_udf(id) from udf_test"
spark.sql(udf_query).show()

# Delta Read and Write

In [None]:
# Source rows come from the global temp view registered earlier.
data = delta_spark.sql("select * from global_temp.GlobalAirlineView").cache()

# Overwrite the Delta table at /tmp/delta-table/, with schema merging enabled.
(data.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("/tmp/delta-table/"))

In [None]:
# Read the Delta table back under its own name. Assigning it to `df` would silently
# replace the cached airline DataFrame that earlier cells use when they are re-run.
delta_airline_df = spark.read.format("delta").load("/tmp/delta-table")
delta_airline_df.show(5)

# Spark Streaming

## foreachBatch Function

foreachBatch lets us use "update", "append" and "complete" output modes. Whereas file sink only allows append.

In [None]:
from functools import partial


def foreachBatchFunction(format, location, df, batchId):
    """Write one micro-batch into its own directory, replacing any previous content.

    Meant to be passed to ``writeStream.foreachBatch`` after ``format`` and
    ``location`` have been bound with ``functools.partial``.

    Args:
        format: Data source format name handed to ``DataFrameWriter.format``.
        location: Path prefix; the batch id is appended directly, so it should end with "/".
        df: The micro-batch DataFrame.
        batchId: Id of the micro-batch; becomes the last path component.
    """
    target = location + str(batchId)
    df.write.mode("overwrite").format(format).save(target)


# One pre-bound writer per experiment, so each stream writes to its own directory.
writeMicroBatchUpdateToJSON = partial(foreachBatchFunction, "JSON", "/home/kamal/jsonUpdate/")
writeMicroBatchAppendToJSON = partial(foreachBatchFunction, "JSON", "/home/kamal/jsonAppend/")
writeMicroBatchCompleteToJSON = partial(foreachBatchFunction, "JSON", "/home/kamal/jsonComplete/")
writeMicroBatchAppendToJSON10Sec = partial(foreachBatchFunction, "JSON", "/home/kamal/jsonAppend10sec/")
# Fixed: this writer used to be bound to "JSON" despite its name and its sparkcsv target.
writeMicroBatchToCSV = partial(foreachBatchFunction, "CSV", "/home/kamal/sparkcsv/")

## Word Count
Generate test data using [datastreamer](https://github.com/skamalj/datagenerator) with the following config. It creates a space-separated list of 6 words in each line.
>`lorem|words|textline|6`

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
sparkstream = SparkSession.builder.appName("airline").getOrCreate()

# Socket source on localhost:4000; the incoming text is read from column "value".
socket_reader = (sparkstream.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 4000))
lines = socket_reader.load()

# Each payload is JSON; the sentence sits in its "textline" field.
linesnojson = lines.select(get_json_object("value","$.textline").alias("line"))

# One row per word, then a count per word.
line_words = split(col("line"), " ")
words = linesnojson.select(explode(line_words).alias("word"))
counts = words.groupBy("word").count()

checkpointDir = "/home/kamal/sparkcheckpoint"
checkpointFileSyncDir = "/home/kamal/sparkcheckpointcsv"

#streamingQuery = (counts
#  .writeStream
#  .format("console")
#  .outputMode("complete")
#  .trigger(processingTime="20 second")
#  .option("checkpointLocation", checkpointDir)
#  .start())

# Every 20 seconds the batch is handed to writeMicroBatchToCSV, in "update" output mode.
word_count_writer = (counts.writeStream
    .outputMode("update")
    .trigger(processingTime="20 second")
    .option("checkpointLocation", checkpointFileSyncDir)
    .foreachBatch(writeMicroBatchToCSV))
streamingQueryFile = word_count_writer.start()

In [None]:
#streamingQuery.stop()
# Stop the word-count query started in the previous cell.
streamingQueryFile.stop()

## Streaming with eventtime
Use generator with following config
>`datatype|number|id|{"min":0,"max":10}`

>`datatype|float|temp|{"min":20,"max":50}`

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# NOTE(review): earlier cells already created a session, so getOrCreate() presumably
# returns it and the "avgtemp" app name may not apply -- confirm.
sparkstream = (SparkSession.builder
.appName("avgtemp").getOrCreate())
# Socket source on localhost:4000; every row is stamped with the processing time.
readings = (sparkstream
  .readStream.format("socket")
  .option("host", "localhost")
  .option("port", 4000)
  .load().withColumn("current_timestamp", current_timestamp()))

# Pull id, temp and eventtime out of the JSON payload. eventtime is divided by 1000 and
# rounded before timestamp_seconds(), so it is presumably epoch milliseconds -- confirm
# against the generator. `round` here is the pyspark function from the wildcard import.
readings_tab = readings.select(get_json_object("value","$.id").alias("id"),
get_json_object("value","$.temp").alias("temp"),
timestamp_seconds(round(get_json_object("value","$.eventtime") / 1000)).alias("eventtime"),
"current_timestamp")

# Shift the event time back by 10 seconds into a separate column.
readings_tab = readings_tab.withColumn("delayed_eventtime", col("eventtime") - expr("INTERVAL 10 seconds"))

# Column used for both the watermark and the window below.
eventcol = "delayed_eventtime"

# Per id and 20-second window: sample count, average temp, and the latest processing
# and event timestamps. The watermark on eventcol is 10 seconds.
# `max`, `count` and `avg` are the pyspark functions from the wildcard import.
samples = (readings_tab.withWatermark(eventcol, "10 seconds")
.groupBy("id", window(eventcol, "20 second"))
.agg(count("id").alias("samples"),avg("temp").alias("averageTemp"),
max("current_timestamp").alias("processed_at"), max(eventcol).alias("generated_at")))

## Comment / uncomment the blocks below as needed

#streamingQuery = (samples
#  .writeStream
#  .format("console")
#  .option("truncate", "false")
#  .outputMode("append")
#  .trigger(processingTime="10 second")
#  .start())
#
#streamingQueryJsonAppend = (samples
#  .writeStream
#  .foreachBatch(writeMicroBatchAppendToJSON10Sec)
#  .outputMode("append")
#  .option("checkpointLocation", "/home/kamal/spark/checkpoint/jsonappend10sec")
#  .trigger(processingTime="10 second")
#  .start())   

# Append mode with a 20-second trigger; each batch goes through writeMicroBatchAppendToJSON.
append_writer = (samples.writeStream
    .outputMode("append")
    .trigger(processingTime="20 second")
    .option("checkpointLocation", "/home/kamal/spark/checkpoint/jsonappend")
    .foreachBatch(writeMicroBatchAppendToJSON))
streamingQueryJsonAppend = append_writer.start()

#streamingQueryJsonUpdate = (samples
#  .writeStream
#  .foreachBatch(writeMicroBatchUpdateToJSON)
#  .outputMode("update")
#  .option("checkpointLocation", "/home/kamal/spark/checkpoint/jsonupdate")
#  .trigger(processingTime="20 second")
#  .start())   

#streamingQueryJsonComplete = (samples
#  .writeStream
#  .foreachBatch(writeMicroBatchCompleteToJSON)
#  .outputMode("complete")
#  .option("checkpointLocation", "/home/kamal/spark/checkpoint/jsoncomplete")
#  .start())

## Stop Streaming Queries

In [None]:
#streamingQueryJsonUpdate.stop()
# Stop the append-mode query started above.
streamingQueryJsonAppend.stop()

In [None]:
# NOTE(review): no cell in this notebook writes /tmp/deltaeventscomplete -- presumably
# it comes from a separate run; confirm before relying on this cell.
complete_events_path = "/tmp/deltaeventscomplete"
dfc = spark.read.format("delta").load(complete_events_path).cache()
dfc.filter("id == 2").show(15, truncate=False)


In [None]:
# NOTE(review): no cell in this notebook writes /tmp/deltaevents -- presumably it comes
# from a separate run; confirm. This also rebinds `df`, which the join cell below uses.
events_path = "/tmp/deltaevents"
df = spark.read.format("delta").load(events_path).cache()
df.filter("id == 2").show(15, truncate=False)

In [None]:

from pyspark.sql.functions import col
# Full outer join on (id, window) puts the sample counts from both tables side by side.
joined = df.join(dfc, on=['id', 'window'], how="fullOuter")
compared = joined.select(
    "id", "window", dfc.samples.alias("complete"), df.samples, dfc.processed_at
)
compared.orderBy("id").filter("id == 10").show(50, truncate=False)

# Scala exercise from [Sample question](https://databricks-prod-cloudfront.cloud.databricks.com/public/793177bc53e528530b06c78a4fa0e086/0/6221173/100020/latest.html). Translated to PySpark

## Imports and Create DF

In [10]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

#schema = StructType([
#  StructField("name", StringType(), True),
#  StructField("department", IntegerType(), True),
#  StructField("score", ArrayType(IntegerType()), True)
#])

# Works same as above
schema = "name string, department string, score array<int>"

# The department values are Python ints although the schema declares a string column;
# the schema printed below reports department as StringType.
people_rows = [
    ("Ali", 0, [100]),
    ("Barbara", 1, [300, 250, 100]),
    ("Cesar", 1, [350, 100]),
    ("Dongmei", 1, [400, 100]),
    ("Eli", 2, [250]),
    ("Florita", 2, [500, 300, 100]),
    ("Gatimu", 3, [300, 100]),
]
peopleDF = spark.createDataFrame(people_rows, schema)

In [80]:
# Display the schema that createDataFrame produced from the DDL string.
peopleDF.schema

StructType(List(StructField(name,StringType,true),StructField(department,StringType,true),StructField(score,ArrayType(IntegerType,true),true)))

## Create windowSpec

In [None]:
from pyspark.sql.window import Window

# Within each department, order rows by exploded score from highest to lowest.
by_department = Window.partitionBy("department")
windowSpec = by_department.orderBy(col("escore").desc())


## Execute using windowSpec

In [93]:
# Top score per department: explode the score array, rank within each department
# by score, and keep the rank-1 rows.
# Fixed: `.show()` used to be part of the assigned expression, which left `people`
# bound to None instead of the DataFrame.
people = (peopleDF
    .select("*", explode("score").alias("escore"))
    .drop("score")
    .select("name", expr("department *1.0").alias("Dept"), "escore",
            rank().over(windowSpec).alias("maxrank"))
    .filter("maxrank == 1")
    .drop("maxrank")
    .orderBy("department", ascending=False))

people.show(5)




+-------+----+------+
|   name|Dept|escore|
+-------+----+------+
| Gatimu| 3.0|   300|
|Florita| 2.0|   500|
|Dongmei| 1.0|   400|
|    Ali| 0.0|   100|
+-------+----+------+





In [73]:
mnmdata = "/home/kamal/Documents/dev/spark/LearningSparkV2/chapter2/py/src/data/mnm_dataset.csv"

# CSV with a header row; column types are inferred.
mnmDF = (
    spark.read
    .format("csv")
    .options(header="True", inferschema="True")
    .load(mnmdata)
)

# Total count per (State, Color), largest totals first.
# `sum` is the pyspark function brought in by the earlier wildcard import.
totals_by_state_color = mnmDF.groupBy("State", "Color").agg(sum("count").alias("Total"))
resultDF = totals_by_state_color.orderBy("Total", ascending=False)



In [74]:
# Number of (State, Color) groups in the aggregated result.
resultDF.count()



60

# Show current spark context, get the UI url

In [75]:
# Display the SparkContext behind the session.
spark.sparkContext

In [96]:
# One list per blog author: id, first, last, url, published date, hits, campaigns.
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535,
     ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908,
     ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659,
     ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
     ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578,
     ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
     ["twitter", "LinkedIn"]],
]

# DDL schema matching the row layout above (built from two adjacent literals).
schema = (
    "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, "
    "`Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
)

In [97]:
# Build the blogs DataFrame from the rows and DDL schema defined in the previous cell.
blogs = spark.createDataFrame(data,schema)
#blogs.write.format("json").save("blogs-json")

In [102]:
# Boolean column from a SQL expression, then keep authors whose first name ends in "i".
with_big_hitters = blogs.withColumn("BigHitters", expr("Hits > 10000"))
with_big_hitters.filter(col("First").endswith("i")).show()

+---+-----+-------+-----------------+---------+-----+--------------------+----------+
| Id|First|   Last|              Url|Published| Hits|           Campaigns|BigHitters|
+---+-----+-------+-----------------+---------+-----+--------------------+----------+
|  5|Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|      true|
+---+-----+-------+-----------------+---------+-----+--------------------+----------+



# Fire Department Dataset exercise

In [3]:
# NOTE(review): samplingRatio is set but inferSchema is not -- presumably the ratio
# has no effect without schema inference; confirm.
firecallsDF = (
    spark.read
    .format("csv")
    .options(samplingRatio=.001, header=True)
    .load("databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")
)

distinct_call_types = firecallsDF.select("CallType").distinct()
distinct_call_types.show()



+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|         Marine Fire|
|  Aircraft Emergency|
|Confined Space / ...|
|      Administrative|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|Watercraft in Dis...|
|           Explosion|
|           Oil Spill|
|        Vehicle Fire|
|  Suspicious Package|
|Extrication / Ent...|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
+--------------------+
only showing top 20 rows



In [5]:
from pyspark.sql.functions import *
# Parse the three date/time text columns into timestamps under new names.
parsed_timestamps = [
    to_timestamp(col("CallDate"), "MM/dd/yyyy").alias("IncidentDate"),
    to_timestamp(col("WatchDate"), "MM/dd/yyyy").alias("OnWatchDate"),
    to_timestamp(col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a").alias("AvailableDtTs"),
]

# Keep rows with a non-null IncidentDate and drop the original text columns.
new_fire_df = (
    firecallsDF
    .select("*", *parsed_timestamps)
    .where(col("Incidentdate").isNotNull())
    .drop("CallDate", "WatchDate", "AvailableDtTm")
)

### Which calltype generated most calls

In [17]:
# Calls per non-null CallType, most frequent first.
call_type_counts = (
    firecallsDF
    .select("CallType")
    .where(col("CallType").isNotNull())
    .groupBy("CallType")
    .agg(count("CallType").alias("count"))
)
call_type_counts.orderBy("count", ascending=False).show(30, truncate=False)



+--------------------------------------------+------+
|CallType                                    |count |
+--------------------------------------------+------+
|Medical Incident                            |113794|
|Structure Fire                              |23319 |
|Alarms                                      |19406 |
|Traffic Collision                           |7013  |
|Citizen Assist / Service Call               |2524  |
|Other                                       |2166  |
|Outside Fire                                |2094  |
|Vehicle Fire                                |854   |
|Gas Leak (Natural and LP Gases)             |764   |
|Water Rescue                                |755   |
|Odor (Strange / Unknown)                    |490   |
|Electrical Hazard                           |482   |
|Elevator / Escalator Rescue                 |453   |
|Smoke Investigation (Outside)               |391   |
|Fuel Spill                                  |193   |
|HazMat                     



### What were all the different types of fire calls in 2018?

In [7]:
# Restrict to incidents whose year is 2018, then count calls per CallType.
calls_2018 = (
    new_fire_df
    .select("CallType", year("IncidentDate").alias("Year"))
    .filter("Year == 2018")
)
calls_2018.groupBy("CallType").agg(count("CallType").alias("count")).show(2)



+--------------------+-----+
|            CallType|count|
+--------------------+-----+
|Elevator / Escala...|   36|
|              Alarms| 1144|
+--------------------+-----+
only showing top 2 rows





In [12]:
# List the column names after the timestamp conversion.
new_fire_df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallFinalDisposition',
 'Address',
 'City',
 'Zipcode',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumAlarms',
 'UnitType',
 'UnitSequenceInCallDispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'Neighborhood',
 'Location',
 'RowID',
 'Delay',
 'IncidentDate',
 'OnWatchDate',
 'AvailableDtTs']

### Which neighborhood in San Francisco generated the most fire calls in 2018

In [27]:
# 2018 calls in San Francisco (three spellings of the city) whose type contains "Fire".
sf_fire_calls_2018 = (
    new_fire_df
    .select("city", "Neighborhood", "CallType", year("IncidentDate").alias("Year"))
    .where("city in ('San Francisco','SF', 'SAN FRANCISCO')")
    .where(col("Year") == 2018)  # can also be written as "Year == 2018"
    .where(col("CallType").like('%Fire%'))
)

# Neighborhoods ranked by number of such calls.
calls_per_neighborhood = (
    sf_fire_calls_2018
    .groupBy("Neighborhood")
    .agg(count("Neighborhood").alias("TotalCalls"))
)
calls_per_neighborhood.orderBy("TotalCalls", ascending=False).show(3, truncate=False)



+------------------------------+----------+
|Neighborhood                  |TotalCalls|
+------------------------------+----------+
|Tenderloin                    |105       |
|Financial District/South Beach|92        |
|Mission                       |80        |
+------------------------------+----------+
only showing top 3 rows





In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import LongType

def cube(s):
    """Return ``s`` raised to the third power, computed as s*s*s."""
    squared = s * s
    return squared * s

# Register `cube` as a SQL function returning a long. The recorded warning below shows
# that a function named "cube" was already registered and got replaced.
spark.udf.register("cube", cube, LongType())

# Temp view with ids from range(1, 5) to try the function on.
spark.range(1,5).createOrReplaceTempView("udf_test")

21/09/26 18:17:09 WARN SimpleFunctionRegistry: The function cube replaced a previously registered function.


In [7]:
# Call the registered cube function from SQL.
spark.sql("select id, cube(id) from udf_test").show()



+---+--------+
| id|cube(id)|
+---+--------+
|  1|       1|
|  2|       8|
|  3|      27|
|  4|      64|
+---+--------+



In [9]:
# Wrap the plain `cube` function as a pandas UDF returning longs.
# This rebinds `cubed_udf`, which an earlier cell defined from `cubed`.
cubed_udf = pandas_udf(cube, returnType=LongType())

df_id = spark.range(1, 4)
cube_col = cubed_udf("id")
df_id.select("id", cube_col).show()



+---+--------+
| id|cube(id)|
+---+--------+
|  1|       1|
|  2|       8|
|  3|      27|
+---+--------+





In [14]:
# explode() turns each element of the score array into its own row.
single_score = explode("score").alias("score")
people_listscore = peopleDF.select("name", "department", single_score)

In [15]:
# collect_list() gathers the exploded scores back into one list per name.
scores_per_person = people_listscore.groupBy("name").agg(collect_list("score"))
scores_per_person.show(20)



+-------+-------------------+
|   name|collect_list(score)|
+-------+-------------------+
| Gatimu|         [300, 100]|
|Barbara|    [300, 250, 100]|
|  Cesar|         [350, 100]|
|Florita|    [500, 300, 100]|
|    Eli|              [250]|
|Dongmei|         [400, 100]|
|    Ali|              [100]|
+-------+-------------------+



In [29]:
from pyspark.sql.functions import expr
tripdelaysFilePath = "databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
airportsnaFilePath =  "databricks-datasets/learning-spark-v2/flights/airport-codes-na.txt"

# Airports data set: tab-separated, with header, types inferred.
airports_reader = spark.read.format("csv").options(header="true", inferSchema="true", sep="\t")
airportsna = airports_reader.load(airportsnaFilePath)
airportsna.createOrReplaceTempView("airports_na")

# Departure delays: read with a header and no schema, then cast the two numeric columns.
raw_delays = spark.read.format("csv").options(header="true").load(tripdelaysFilePath)
departureDelays = (
    raw_delays
    .withColumn("delay", expr("CAST(delay as INT) as delay"))
    .withColumn("distance", expr("CAST(distance as INT) as distance"))
)
departureDelays.createOrReplaceTempView("departureDelays")

In [None]:
# The query reads global_temp.peopleDF, which no earlier cell creates, so register
# peopleDF as a global temp view first.
peopleDF.createOrReplaceGlobalTempView("peopleDF")

# Higher-order filter(): keep only the array elements equal to 250.
spark.sql("select filter(score, s -> s = 250) as new_score from global_temp.peopleDF").show(truncate=False)

In [36]:
# "set -v" returns the SQL configuration entries; show just key and value.
all_settings = spark.sql("set -v")
all_settings.select("key", "value").show(truncate=False)

+-----------------------------------------------------------+--------------------------------------------------------------------+
|key                                                        |value                                                               |
+-----------------------------------------------------------+--------------------------------------------------------------------+
|spark.sql.adaptive.advisoryPartitionSizeInBytes            |<value of spark.sql.adaptive.shuffle.targetPostShuffleInputSize>    |
|spark.sql.adaptive.coalescePartitions.enabled              |true                                                                |
|spark.sql.adaptive.coalescePartitions.initialPartitionNum  |<undefined>                                                         |
|spark.sql.adaptive.coalescePartitions.minPartitionNum      |<undefined>                                                         |
|spark.sql.adaptive.enabled                                 |false                 

In [42]:
# One million ids spread over 8 partitions.
df_big_range2 = spark.range(1000000).repartition(8)
# The partition count is computed but not displayed (not the last expression).
df_big_range2.rdd.getNumPartitions()
# Mark for caching, then run count() as the action that triggers it.
df_big_range2.cache()
df_big_range2.count()

21/09/26 20:55:24 WARN CacheManager: Asked to cache already cached data.
21/09/26 20:55:28 WARN MemoryStore: Not enough space to cache rdd_176_4 in memory! (computed 48.2 MiB so far)
21/09/26 20:55:28 WARN BlockManager: Persisting block rdd_176_4 to disk instead.
21/09/26 20:55:55 WARN MemoryStore: Not enough space to cache rdd_176_4 in memory! (computed 48.2 MiB so far)


1000000

21/09/26 20:56:00 WARN MemoryStore: Not enough space to cache rdd_176_6 in memory! (computed 48.2 MiB so far)
21/09/26 20:56:01 WARN MemoryStore: Not enough space to cache rdd_176_5 in memory! (computed 48.2 MiB so far)
21/09/26 20:56:04 WARN MemoryStore: Not enough space to cache rdd_176_7 in memory! (computed 48.2 MiB so far)


In [43]:
# Count again, this time with the DataFrame already marked as cached.
df_big_range2.count()

1000000