# String functions

https://www.kaggle.com/datasets/thedevastator/global-shark-attack-incidents?select=Sharks+Attack+Men+More+.json

In [1]:
#!/bin/bash
! curl -L -o incidents.zip https://www.kaggle.com/api/v1/datasets/download/thedevastator/global-shark-attack-incidents
! unzip incidents.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  593k  100  593k    0     0   585k      0  0:00:01  0:00:01 --:--:--  585k
Archive:  incidents.zip
  inflating: GSAF5.xls.csv           
  inflating: Sharks Attack Men More .csv  
  inflating: Sharks Attack Men More .json  
  inflating: download-2019-02-20T23-55-18-113Z.png  


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("myApp").master("local[*]").getOrCreate()

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, \
                              BooleanType, DateType, FloatType

from pyspark.sql.functions import col

file_path = "GSAF5.xls.csv"
shark_df = spark.read.\
  csv(file_path, header=True, inferSchema=True)\
  .select("index", "Date", "Year", "Type", "Country", "Area", "Location",
          "Activity", "Name", "Age", "Injury", "Fatal (Y/N)", "Time",
          "Species ")\
  .withColumnRenamed("Species ", "Species")\
  .withColumn("Year", col("Year").cast(IntegerType()))\
  .withColumn("Age", col("Age").cast(IntegerType()))\
  .withColumn("Fatal", col("Fatal (Y/N)").cast(BooleanType()))\
  .drop("Fatal (Y/N)")

shark_df.show()
shark_df.printSchema()

+-----+--------------------+----+------------+----------------+--------------------+--------------------+--------------------+-----------------+----+--------------------+-------------+--------------------+-----+
|index|                Date|Year|        Type|         Country|                Area|            Location|            Activity|             Name| Age|              Injury|         Time|             Species|Fatal|
+-----+--------------------+----+------------+----------------+--------------------+--------------------+--------------------+-----------------+----+--------------------+-------------+--------------------+-----+
|    0|         05-Feb-2020|2020|  Unprovoked|             USA|                Maui|                NULL|Stand-Up Paddle b...|             NULL|NULL|No injury, but pa...|        09h40|         Tiger shark|false|
|    1|Reported 30-Jan-2020|2020|    Provoked|         BAHAMAS|              Exumas|                NULL|            Floating|  Ana Bruna Avila|  24|PRO

In [None]:
# shark_df.select("Country").distinct().sort("Country").show(999)

In [4]:
italy_count = shark_df.filter(shark_df.Country == 'ITALY').count()
italy_count

71

In [5]:
shark_df\
  .filter(shark_df.Country == 'ITALY')\
  .select("year", "area", "location", "age")\
  .sort("year", ascending=False)\
  .show(italy_count, truncate=False)

+----+-----------------------------------+-----------------------------------------------------------------------------+----+
|year|area                               |location                                                                     |age |
+----+-----------------------------------+-----------------------------------------------------------------------------+----+
|2015|Sardinia                           |NULL                                                                         |43  |
|2012|Sardinia                           |Muravera                                                                     |57  |
|2006|NULL                               |Lampedusa Island                                                             |NULL|
|2001|NULL                               |Rimini                                                                       |NULL|
|1999|Adriatic Sea                       |San Benedetto                                                               

In [6]:
from pyspark.sql.functions import when

shark_df\
  .filter(shark_df.Country == 'ITALY')\
  .select("year", "area", "location", "age")\
  .withColumn("adult", when(col("age") >= 18, "ADULT")\
                       .when(col("age") < 18, "CHILD")\
                       .otherwise("---"))\
  .sort("year", ascending=False)\
  .show()

+----+--------------------+--------------------+----+-----+
|year|                area|            location| age|adult|
+----+--------------------+--------------------+----+-----+
|2015|            Sardinia|                NULL|  43|ADULT|
|2012|            Sardinia|            Muravera|  57|ADULT|
|2006|                NULL|    Lampedusa Island|NULL|  ---|
|2001|                NULL|              Rimini|NULL|  ---|
|1999|        Adriatic Sea|       San Benedetto|NULL|  ---|
|1998|      Marches region|12 miles off Seni...|NULL|  ---|
|1991|        Ligurian Sea|Portofino, 20 mil...|  40|ADULT|
|1989|             Tuscany|Marinella, betwee...|NULL|  ---|
|1989|      Tyrrhenian Sea|Golfo di Baratti,...|  47|ADULT|
|1988|         Manfredonia|           Ippocampo|  16|CHILD|
|1987|      Tyrrhenian Sea|Marciana Marina, ...|NULL|  ---|
|1985|              Sicily|         Punta Secca|  13|CHILD|
|1984|      Tyrrhenian Sea|Marciana Marina, ...|NULL|  ---|
|1983|     Northwest Italy|Riomaggiore (

In [7]:
shark_df\
  .filter(shark_df.Country == 'ITALY')\
  .select("year", "area", "location", "age")\
  .withColumn("adult", when(col("age") >= 18, "ADULT")\
                       .when(col("age") < 18, "CHILD")\
                       .otherwise("---"))\
  .withColumn("area", when(col("area").isNull(), col("location")).otherwise(col("area")))\
  .withColumn("location", when(col("location").isNull(), col("area")).otherwise(col("location")))\
  .sort("year", ascending=False)\
  .show()


+----+--------------------+--------------------+----+-----+
|year|                area|            location| age|adult|
+----+--------------------+--------------------+----+-----+
|2015|            Sardinia|            Sardinia|  43|ADULT|
|2012|            Sardinia|            Muravera|  57|ADULT|
|2006|    Lampedusa Island|    Lampedusa Island|NULL|  ---|
|2001|              Rimini|              Rimini|NULL|  ---|
|1999|        Adriatic Sea|       San Benedetto|NULL|  ---|
|1998|      Marches region|12 miles off Seni...|NULL|  ---|
|1991|        Ligurian Sea|Portofino, 20 mil...|  40|ADULT|
|1989|             Tuscany|Marinella, betwee...|NULL|  ---|
|1989|      Tyrrhenian Sea|Golfo di Baratti,...|  47|ADULT|
|1988|         Manfredonia|           Ippocampo|  16|CHILD|
|1987|      Tyrrhenian Sea|Marciana Marina, ...|NULL|  ---|
|1985|              Sicily|         Punta Secca|  13|CHILD|
|1984|      Tyrrhenian Sea|Marciana Marina, ...|NULL|  ---|
|1983|     Northwest Italy|Riomaggiore (

In [8]:
from pyspark.sql.functions import regexp_replace

shark_df\
  .filter(shark_df.Country == 'ITALY')\
  .select("year", "area", "location", "age")\
  .withColumn("adult", when(col("age") >= 18, "ADULT")\
                       .when(col("age") < 18, "CHILD")\
                       .otherwise("---"))\
  .withColumn("area", when(col("area").isNull(), col("location")).otherwise(col("area")))\
  .withColumn("location", when(col("location").isNull(), col("area")).otherwise(col("location")))\
  .withColumn("area", regexp_replace("area", "(?i)sard.*nia", "Italy"))\
  .sort("year", ascending=False)\
  .show()


+----+--------------------+--------------------+----+-----+
|year|                area|            location| age|adult|
+----+--------------------+--------------------+----+-----+
|2015|               Italy|            Sardinia|  43|ADULT|
|2012|               Italy|            Muravera|  57|ADULT|
|2006|    Lampedusa Island|    Lampedusa Island|NULL|  ---|
|2001|              Rimini|              Rimini|NULL|  ---|
|1999|        Adriatic Sea|       San Benedetto|NULL|  ---|
|1998|      Marches region|12 miles off Seni...|NULL|  ---|
|1991|        Ligurian Sea|Portofino, 20 mil...|  40|ADULT|
|1989|             Tuscany|Marinella, betwee...|NULL|  ---|
|1989|      Tyrrhenian Sea|Golfo di Baratti,...|  47|ADULT|
|1988|         Manfredonia|           Ippocampo|  16|CHILD|
|1987|      Tyrrhenian Sea|Marciana Marina, ...|NULL|  ---|
|1985|              Sicily|         Punta Secca|  13|CHILD|
|1984|      Tyrrhenian Sea|Marciana Marina, ...|NULL|  ---|
|1983|     Northwest Italy|Riomaggiore (

https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#string-functions

In [9]:
from pyspark.sql.functions import expr
arm_df = shark_df.filter(expr("regexp_like(Injury, '(?i)arm')")).select("Injury")

# arm_df = shark_df.filter(col("Injury").rlike("(?i)arm")).select("Injury")
print(arm_df.count())
arm_df.show(truncate=False)

605
+-------------------------------------------------------------------------------+
|Injury                                                                         |
+-------------------------------------------------------------------------------+
|Arm bitten                                                                     |
|Arm bitten                                                                     |
|Lacerations to arm from hooked shark PROVOKED INCIDENT                         |
|Multiple lacerations to right arm                                              |
|Bite to right forearm                                                          |
|Severe injuries to arms and chest                                              |
|Laceration to arm                                                              |
|Laceration to left arm and hands                                               |
|3 puncture marks to left forearm                                               |
|Lacerations

In [10]:
leg_df = shark_df.filter(col("Injury").rlike("(?i)leg")).select("Injury")
print(leg_df.count())
leg_df.show(truncate=False)

1004
+-----------------------------------------------------------------------------------------------+
|Injury                                                                                         |
+-----------------------------------------------------------------------------------------------+
|Minor injury to lower leg                                                                      |
|Lacerations to leg                                                                             |
|Right leg bitten                                                                               |
|Right leg bitten                                                                               |
|Raddon’s right foot was severed and Maggs sustained serious lacerations to his lower right leg.|
|Hooked shark bit his leg. PROVOKED INCIDENT                                                    |
|Puncture marks to lower leg                                                                    |
|Arc of punctur