# Batch processing

## Examples

In [None]:
import findspark
findspark.init()
import pyspark
import random

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. `getOrCreate()` keeps the
# cell safe to re-run: constructing `pyspark.SparkContext(...)` directly raises
# "Cannot run multiple SparkContexts at once" when a context is already active
# in the kernel.
spark = SparkSession.builder.appName("SPARK_API").getOrCreate()

# Keep `sc` defined for any cell that needs the low-level SparkContext API.
sc = spark.sparkContext


In [None]:
# Passenger head-count for each ticket class (Pclass) in the Titanic dataset.

from pyspark.sql.functions import count

# First row holds the column names; let Spark infer the column types.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

passengerCounts = (
    df.groupBy("Pclass")
      .agg(count("*").alias("count"))
)

passengerCounts.show()



In [None]:
# Number of passengers per 10-year age bracket (0-10, 11-20, ..., 71-80), broken down by class.

from pyspark.sql.functions import col, count, floor, when

df = spark.read.option("header", "true").option("inferSchema", "true").csv("titanic.csv")

# Inclusive (lower, upper) bounds, in whole years, of each age bracket.
ageRanges = [(0, 10), (11, 20), (21, 30), (31, 40), (41, 50), (51, 60), (61, 70), (71, 80)]

# Age holds fractional values (the dataset's minimum is 0.42), so compare the age in
# completed years. Testing the raw value with between() would silently drop any age
# that falls between two brackets: 20.5, for instance, is neither in 11-20 nor in 21-30.
ageInYears = floor(col("Age"))

# Add one 0/1 indicator column per bracket; a missing Age yields 0 in every bracket.
ageRangesDf = df
for lower, upper in ageRanges:
    ageRangesDf = ageRangesDf.withColumn(
        f"AgeRange_{lower}_to_{upper}",
        when(ageInYears.between(lower, upper), 1).otherwise(0),
    )

# One count per bracket, generated from ageRanges so the bracket list is declared only once.
bracketCounts = [
    count(when(col(f"AgeRange_{lower}_to_{upper}") == 1, True)).alias(f"{lower}-{upper}")
    for lower, upper in ageRanges
]

passengerCountsByAge = ageRangesDf.groupBy("Pclass").agg(*bracketCounts)

passengerCountsByAge.show()


In [None]:
# Passenger count for every (class, gender) combination.

from pyspark.sql.functions import count

# First row holds the column names; let Spark infer the column types.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

passengerCountsBySex = (
    df.groupBy("Pclass", "Sex")
      .agg(count("*").alias("count"))
)

passengerCountsBySex.show()


In [None]:
# Number of survivors versus non-survivors (one row per value of Survived).

from pyspark.sql.functions import count

# First row holds the column names; let Spark infer the column types.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

survivorsCounts = (
    df.groupBy("Survived")
      .agg(count("*").alias("count"))
)

survivorsCounts.show()


In [None]:
# Mean passenger age for every (class, gender) combination.

from pyspark.sql.functions import avg

# First row holds the column names; let Spark infer the column types.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

avgAgeBySexAndClass = (
    df.groupBy("Pclass", "Sex")
      .agg(avg("Age").alias("avg_age"))
)

avgAgeBySexAndClass.show()


In [None]:
# Mean passenger age for survivors and for non-survivors.

from pyspark.sql.functions import avg

# First row holds the column names; let Spark infer the column types.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

avgAgeBySurvival = (
    df.groupBy("Survived")
      .agg(avg("Age").alias("avg_age"))
)

avgAgeBySurvival.show()


In [None]:
# Mean age of the passengers travelling in first class (Pclass == 1).

from pyspark.sql.functions import avg

# First row holds the column names; let Spark infer the column types.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

# Restrict to first class, then aggregate the remaining rows into a single value.
firstClassPassengers = df.where(df["Pclass"] == 1)
avgAgeFirstClass = firstClassPassengers.agg(avg("Age").alias("avg_age"))

avgAgeFirstClass.show()


In [None]:
# Mean passenger age for every (class, survival status) combination.

from pyspark.sql.functions import avg

# First row holds the column names; let Spark infer the column types.
df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

avgAgeByClassAndSurvival = (
    df.groupBy("Pclass", "Survived")
      .agg(avg("Age").alias("avg_age"))
)

avgAgeByClassAndSurvival.show()


## Exercises to solve:

In [None]:
# Determine the age of the oldest and youngest passengers.




Expected output:

```
+--------+--------+  
|min(Age)|max(Age)|  
+--------+--------+  
|    0.42|    80.0|  
+--------+--------+    
```


In [None]:
# Determining the number of women and men aged 30 and over in first class




Expected output:

```
First class, number of women over 30: 50
First class, age 30 and over: 75
```


In [None]:
# Passenger survival rates by cabin.




Expected output:

```
+-------+------------------+
|  Cabin|      SurvivalRate|
+-------+------------------+
|    A23|               1.0|
|    B79|               1.0|
|    E44|               0.5|
|  F E69|               1.0|
|    D28|               1.0|
|    C78|               0.5|
|    C95|               0.0|
|  F G73|               0.0|
|B58 B60|               0.5|
|     D7|               1.0|
|   C128|               0.0|
|    B39|               1.0|
|    B22|               0.5|
|   C110|               0.0|
|    D21|               1.0|
|     F2|0.6666666666666666|
|    B30|               0.0|
|   C104|               1.0|
|    B50|               1.0|
|     A6|               1.0|
+-------+------------------+
only showing top 20 rows
```


In [None]:
# Passenger survival based on the place of embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)




Expected output:

```
+--------+-------------------+
|Embarked|       SurvivalRate|
+--------+-------------------+
|       Q|0.38961038961038963|
|    null|                1.0|
|       C| 0.5535714285714286|
|       S|0.33695652173913043|
+--------+-------------------+
```


In [None]:
# The most common age groups among passengers, with ages rounded to multiples of 10 years (0, 10, 20, etc.).




Expected output:

```
+-----------+-----+
|Age_rounded|count|
+-----------+-----+
|       30.0|  201|
|       20.0|  200|
|       40.0|  120|
|       50.0|   73|
|        0.0|   40|
|       10.0|   38|
|       60.0|   31|
|       70.0|   10|
|       80.0|    1|
|       null|    0|
+-----------+-----+
```


In [None]:
# The survival rate of passengers in the most common age group.




Expected output:

```
+-----------+-----+-------------------+
|Age_rounded|count|        AvgSurvived|
+-----------+-----+-------------------+
|       30.0|  201| 0.3880597014925373|
|       20.0|  200|              0.365|
|       40.0|  120|              0.425|
|       50.0|   73|  0.410958904109589|
|        0.0|   40|              0.675|
|       10.0|   38|0.47368421052631576|
|       60.0|   31| 0.3870967741935484|
|       70.0|   10|                0.0|
|       80.0|    1|                1.0|
|       null|    0| 0.2937853107344633|
+-----------+-----+-------------------+
```


In [None]:
# Average age and class of passengers by survival.




Expected output:

```
+--------+------------------+------------------+
|Survived|          avg(Age)|       avg(Pclass)|
+--------+------------------+------------------+
|       1|28.343689655172415|1.9502923976608186|
|       0| 30.62617924528302|2.5318761384335153|
+--------+------------------+------------------+
```


In [None]:
# Total number of passengers, by class and gender.




Expected output:

```
+------+------+------------------+
|Pclass|   Sex|count(PassengerId)|
+------+------+------------------+
|     2|female|                76|
|     3|  male|               347|
|     1|  male|               122|
|     3|female|               144|
|     1|female|                94|
|     2|  male|               108|
+------+------+------------------+
```


In [None]:
# Survival rate among passengers whose cabin is unknown.




Expected output:

```
+-------------------+
|      avg(Survived)|
+-------------------+
|0.29985443959243085|
+-------------------+
```


In [None]:
# Total number of passengers with no known cabin, by class and survival status.




Expected output:

```
+------+--------+------------------+
|PClass|Survived|count(PassengerId)|
+------+--------+------------------+
|     1|       0|                21|
|     3|       1|               113|
|     1|       1|                19|
|     2|       1|                74|
|     2|       0|                94|
|     3|       0|               366|
+------+--------+------------------+
```
