# SQL Options in Spark Code

In [1]:
%config Completer.use_jedi = False

import os
import sys
import shutil

# Resolve the directory two levels above the current working directory and put
# it on the import path (presumably where the `utils` module imported below
# lives — confirm against the repository layout).
BASE_DIR = os.path.realpath(os.path.join(os.getcwd(), "..", ".."))

# Guarded append keeps the cell idempotent: re-running never adds a duplicate.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)
    
from utils import extract_zip

# Directory that holds the zipped datasets.
# NOTE(review): this is a relative path, so it presumably resolves against the
# current working directory rather than BASE_DIR — confirm.
DATASETS_PATH = "datasets/"

In [3]:
import pyspark 

from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)
# Bare expression: display the session summary as the cell output.
spark

In [5]:
# Pull the crime CSV out of its zip archive. The returned value is what gets
# handed to `spark.read.csv` later — presumably the extracted file's path.
data_file = extract_zip(
    member="rec-crime-pfa.csv",
    zip_file=os.path.join(DATASETS_PATH, "rec-crime-pfa.csv.zip"),
)

In [8]:
# Load the CSV: the header row supplies the column names and Spark infers the
# column types from the data instead of reading everything as strings.
crime = spark.read.csv(
    data_file,
    header=True,
    inferSchema=True,
)

In [9]:
# Number of rows in the loaded dataset.
crime.count()

46469

In [10]:
# Inspect the column names and the types Spark inferred for them.
crime.printSchema()

root
 |-- 12 months ending: string (nullable = true)
 |-- PFA: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Offence: string (nullable = true)
 |-- Rolling year total number of offences: integer (nullable = true)



In [11]:
# Preview the first five rows; converting to pandas gives a rich table display.
(
    crime
    .limit(5)
    .toPandas()
)

Unnamed: 0,12 months ending,PFA,Region,Offence,Rolling year total number of offences
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561


In [12]:
# Give the offence-count column a short, space-free name so it is easy to
# reference from SQL statements.
df = crime.withColumnRenamed(
    existing="Rolling year total number of offences",
    new="Count",
)

In [13]:
# Confirm the rename: the last column should now be called `Count`.
df.printSchema()

root
 |-- 12 months ending: string (nullable = true)
 |-- PFA: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Offence: string (nullable = true)
 |-- Count: integer (nullable = true)



In [14]:
# Register the DataFrame under the name `tempview` so it can be queried with
# `spark.sql`; an existing view with the same name is replaced.
df.createOrReplaceTempView("tempview")

In [15]:
# Plain SQL against the registered view; show the first five rows.
(
    spark
    .sql("SELECT * FROM tempview")
    .limit(5)
    .toPandas()
)

Unnamed: 0,12 months ending,PFA,Region,Offence,Count
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561


In [16]:
# Keep only the rows whose offence count exceeds 1000.
(
    spark
    .sql("SELECT * FROM tempview WHERE Count > 1000")
    .limit(5)
    .toPandas()
)

Unnamed: 0,12 months ending,PFA,Region,Offence,Count
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202
3,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561
4,31/03/2003,Avon and Somerset,South West,Drug offences,2308


In [18]:
# Filter on Count > 1000 and project only the Region and Offence columns.
spark.sql(
    "SELECT Region, Offence FROM tempview WHERE Count > 1000"
).limit(5).toPandas()

Unnamed: 0,Region,Offence
0,South West,All other theft offences
1,South West,Bicycle theft
2,South West,Criminal damage and arson
3,South West,Domestic burglary
4,South West,Drug offences


In [19]:
# `spark.sql` returns a regular DataFrame, so the query result can be stored
# and reused like any other DataFrame.
results = spark.sql(
    "SELECT Region, Offence FROM tempview WHERE Count > 1000"
)
(
    results
    .limit(5)
    .toPandas()
)

Unnamed: 0,Region,Offence
0,South West,All other theft offences
1,South West,Bicycle theft
2,South West,Criminal damage and arson
3,South West,Domestic burglary
4,South West,Drug offences


In [20]:
# Aggregate in SQL: total number of offences per region.
spark.sql(
    "SELECT Region, sum(Count) AS Total FROM tempview GROUP BY Region"
).toPandas()

Unnamed: 0,Region,Total
0,Fraud: CIFAS,7678981
1,North West,30235732
2,British Transport Police,3029117
3,Wales,11137260
4,London,42691902
5,South East,30911995
6,Fraud: Action Fraud,5921984
7,Fraud: UK Finance,2925861
8,South West,17985880
9,East,19890612


## SQLTransformer

In [21]:
from pyspark.ml.feature import SQLTransformer

In [27]:
# NOTE: inside the statement, the input DataFrame must be referred to with the
# exact placeholder __THIS__ — no other table name is accepted.
agent = SQLTransformer(
    statement="SELECT PFA, Region, Offence FROM __THIS__",
)

In [28]:
# Run the statement with `df` standing in for __THIS__ and print five rows.
agent.transform(df).show(5)

+-----------------+----------+--------------------+
|              PFA|    Region|             Offence|
+-----------------+----------+--------------------+
|Avon and Somerset|South West|All other theft o...|
|Avon and Somerset|South West|       Bicycle theft|
|Avon and Somerset|South West|Criminal damage a...|
|Avon and Somerset|South West|Death or serious ...|
|Avon and Somerset|South West|   Domestic burglary|
+-----------------+----------+--------------------+
only showing top 5 rows



In [31]:
# Transformer that totals the offence counts per offence type.
agent = SQLTransformer(
    statement="SELECT Offence, SUM(Count) as Total FROM __THIS__ GROUP BY Offence",
)

In [33]:
# Apply the aggregation to `df`; `show()` with no argument prints the top 20 rows.
agent.transform(df).show()

+--------------------+--------+
|             Offence|   Total|
+--------------------+--------+
|Public order offe...|10925676|
|       Bicycle theft| 5297006|
|Residential burglary| 1671469|
|Violence without ...|16590158|
|All other theft o...|30979393|
|             Robbery| 3788128|
|               CIFAS| 7678981|
|      Fraud offences| 2596554|
|     Sexual offences| 4006741|
|Criminal damage a...|37767463|
|            Homicide|   34154|
|Possession of wea...| 1555951|
|          UK Finance| 2925861|
|Stalking and hara...| 5587434|
|Theft from the pe...| 5105153|
|         Shoplifting|16781641|
|       Drug offences| 9999435|
|    Vehicle offences|26075670|
|   Domestic burglary|11694636|
|Miscellaneous cri...| 3143136|
+--------------------+--------+
only showing top 20 rows



In [55]:
# Collapse the whole table into a single row holding the grand total.
agent = SQLTransformer(statement="SELECT SUM(Count) as Total FROM __THIS__")

res = agent.transform(df)

# An aggregate without GROUP BY yields exactly one row; read the scalar by its
# column name rather than by position.
total = res.first()["Total"]

# End on the bare name so the value is displayed as the cell output on re-run.
total



244720928

In [67]:
from pyspark.sql.functions import expr

# Option 1 — `withColumn` + `expr`: add each row's share of the grand total as
# a percentage rounded to two decimals. The column is referenced by its
# declared name (`Count`) so the expression also resolves when
# `spark.sql.caseSensitive` is enabled.
df.withColumn("Percent", expr(f"round((Count/{total})*100, 2)")).show(truncate=False)

+----------------+-----------------+----------+-------------------------------------------------+-----+-------+
|12 months ending|PFA              |Region    |Offence                                          |Count|Percent|
+----------------+-----------------+----------+-------------------------------------------------+-----+-------+
|31/03/2003      |Avon and Somerset|South West|All other theft offences                         |25959|0.01   |
|31/03/2003      |Avon and Somerset|South West|Bicycle theft                                    |3090 |0.0    |
|31/03/2003      |Avon and Somerset|South West|Criminal damage and arson                        |26202|0.01   |
|31/03/2003      |Avon and Somerset|South West|Death or serious injury caused by illegal driving|2    |0.0    |
|31/03/2003      |Avon and Somerset|South West|Domestic burglary                                |14561|0.01   |
|31/03/2003      |Avon and Somerset|South West|Drug offences                                    |2308 |0

In [68]:
# Option 2 — `select` + `expr`: keep every existing column ("*") and append the
# percentage, aliased inside the SQL expression itself. The column is
# referenced by its declared name (`Count`) so the expression also resolves
# when `spark.sql.caseSensitive` is enabled.
df.select("*", expr(f"round((Count/{total})*100, 2) as Percent")).show(truncate=False)

+----------------+-----------------+----------+-------------------------------------------------+-----+-------+
|12 months ending|PFA              |Region    |Offence                                          |Count|Percent|
+----------------+-----------------+----------+-------------------------------------------------+-----+-------+
|31/03/2003      |Avon and Somerset|South West|All other theft offences                         |25959|0.01   |
|31/03/2003      |Avon and Somerset|South West|Bicycle theft                                    |3090 |0.0    |
|31/03/2003      |Avon and Somerset|South West|Criminal damage and arson                        |26202|0.01   |
|31/03/2003      |Avon and Somerset|South West|Death or serious injury caused by illegal driving|2    |0.0    |
|31/03/2003      |Avon and Somerset|South West|Domestic burglary                                |14561|0.01   |
|31/03/2003      |Avon and Somerset|South West|Drug offences                                    |2308 |0

In [70]:
# Option 3 — `selectExpr`: same result as select + expr, but the SQL strings
# are passed directly without wrapping them in `expr`. The column is referenced
# by its declared name (`Count`) so the expression also resolves when
# `spark.sql.caseSensitive` is enabled.
df.selectExpr("*", f"round((Count/{total})*100, 2) as Percent").show(truncate=False)

+----------------+-----------------+----------+-------------------------------------------------+-----+-------+
|12 months ending|PFA              |Region    |Offence                                          |Count|Percent|
+----------------+-----------------+----------+-------------------------------------------------+-----+-------+
|31/03/2003      |Avon and Somerset|South West|All other theft offences                         |25959|0.01   |
|31/03/2003      |Avon and Somerset|South West|Bicycle theft                                    |3090 |0.0    |
|31/03/2003      |Avon and Somerset|South West|Criminal damage and arson                        |26202|0.01   |
|31/03/2003      |Avon and Somerset|South West|Death or serious injury caused by illegal driving|2    |0.0    |
|31/03/2003      |Avon and Somerset|South West|Domestic burglary                                |14561|0.01   |
|31/03/2003      |Avon and Somerset|South West|Drug offences                                    |2308 |0