In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [3]:
spark = SparkSession.builder.getOrCreate()

In [5]:
df = spark.read.csv("train.csv", header=True, inferSchema=True)

In [6]:
df.createOrReplaceTempView("titanic")

##### SCOPE
Here I try to get the requests with spark methods and sql queries. The requests according to `titanic_simple_analysis.py` file:
1. Schema of the dataset.
2. Gender distribution of cleaned dataset.
3. City distribution of cleaned dataset.
4. Average of descriptive stats.
5. Describe of descriptive stats.
6. Median of descriptive stats.
7. Price averages according to classes.
8. Association between price and age.
9. Family size table.
10. Age group table. <br> <br> <br>
***
##### TODO
- [x] Schema of the dataset.
- [x] Gender distribution of cleaned dataset.
- [x] City distribution of cleaned dataset.
- [x] Average of descriptive stats.
- [x] Describe of descriptive stats.
- [ ] Median of descriptive stats.
- [x] Price averages according to classes.
- [x] Association between price and age.
- [ ] Family size table.
- [ ] Age group table.

In [21]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
spark.sql("DESCRIBE titanic;").show()

+-----------+---------+-------+
|   col_name|data_type|comment|
+-----------+---------+-------+
|PassengerId|      int|   NULL|
|   Survived|      int|   NULL|
|     Pclass|      int|   NULL|
|       Name|   string|   NULL|
|        Sex|   string|   NULL|
|        Age|   double|   NULL|
|      SibSp|      int|   NULL|
|      Parch|      int|   NULL|
|     Ticket|   string|   NULL|
|       Fare|   double|   NULL|
|      Cabin|   string|   NULL|
|   Embarked|   string|   NULL|
+-----------+---------+-------+



In [8]:
cleaned_df = df.na.drop()
cleaned_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|   PC 17599| 71.2833|        C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|     113803|    53.1|       C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|      17463| 51.8625|        E46|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|    PP 9549|    16.7|         G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|     113783|   26.55|       C103|       S|
|         22|       1|     2|Beesley, Mr. Lawr...|  male|34.0|  

In [9]:
spark.sql(
    """
    SELECT * FROM titanic
    WHERE PassengerId IS NOT NULL
    AND Survived IS NOT NULL
    AND Pclass IS NOT NULL
    AND Name IS NOT NULL
    AND Sex IS NOT NULL
    AND Age IS NOT NULL
    AND SibSp IS NOT NULL
    AND Parch IS NOT NULL
    AND Ticket IS NOT NULL
    AND Fare IS NOT NULL
    AND Cabin IS NOT NULL
    AND Embarked IS NOT NULL
    """
).show()

+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|     Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-----------+--------+-----------+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|   PC 17599| 71.2833|        C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|     113803|    53.1|       C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|      17463| 51.8625|        E46|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|    PP 9549|    16.7|         G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|     113783|   26.55|       C103|       S|
|         22|       1|     2|Beesley, Mr. Lawr...|  male|34.0|  

In [9]:
cleaned_df.groupBy("Sex").count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|   88|
|  male|   95|
+------+-----+



In [20]:
spark.sql(
    """
    SELECT COUNT(Sex) FROM titanic
    WHERE PassengerId IS NOT NULL
    AND Survived IS NOT NULL
    AND Pclass IS NOT NULL
    AND Name IS NOT NULL
    AND Sex IS NOT NULL
    AND Age IS NOT NULL
    AND SibSp IS NOT NULL
    AND Parch IS NOT NULL
    AND Ticket IS NOT NULL
    AND Fare IS NOT NULL
    AND Cabin IS NOT NULL
    AND Embarked IS NOT NULL
    GROUP BY Sex;
    """
).show()

+----------+
|count(Sex)|
+----------+
|        88|
|        95|
+----------+



In [18]:
cleaned_df.groupBy("Embarked").count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|    2|
|       C|   65|
|       S|  116|
+--------+-----+



In [21]:
spark.sql(
    """
    SELECT COUNT(Embarked) FROM titanic
    WHERE PassengerId IS NOT NULL
    AND Survived IS NOT NULL
    AND Pclass IS NOT NULL
    AND Name IS NOT NULL
    AND Sex IS NOT NULL
    AND Age IS NOT NULL
    AND SibSp IS NOT NULL
    AND Parch IS NOT NULL
    AND Ticket IS NOT NULL
    AND Fare IS NOT NULL
    AND Cabin IS NOT NULL
    AND Embarked IS NOT NULL
    GROUP BY Embarked;
    """
).show()

+---------------+
|count(Embarked)|
+---------------+
|              2|
|             65|
|            116|
+---------------+



descriptive stats
1. Pclass
2. Age
3. SibSp
4. Parch
5. Fare

In [37]:
df.select("*").groupBy().avg("Pclass", "Age", "SibSp", "Parch", "Fare").show()

+-----------------+-----------------+------------------+-------------------+----------------+
|      avg(Pclass)|         avg(Age)|        avg(SibSp)|         avg(Parch)|       avg(Fare)|
+-----------------+-----------------+------------------+-------------------+----------------+
|2.308641975308642|29.69911764705882|0.5230078563411896|0.38159371492704824|32.2042079685746|
+-----------------+-----------------+------------------+-------------------+----------------+



In [40]:
spark.sql(
    """ 
    SELECT AVG(Pclass), AVG(Age), AVG(SibSp), AVG(Parch), AVG(Fare) FROM titanic;
    """
).show()

+-----------------+-----------------+------------------+-------------------+----------------+
|      avg(Pclass)|         avg(Age)|        avg(SibSp)|         avg(Parch)|       avg(Fare)|
+-----------------+-----------------+------------------+-------------------+----------------+
|2.308641975308642|29.69911764705882|0.5230078563411896|0.38159371492704824|32.2042079685746|
+-----------------+-----------------+------------------+-------------------+----------------+



In [42]:
df.describe("Pclass", "Age", "SibSp", "Parch", "Fare").show()

+-------+------------------+------------------+------------------+-------------------+-----------------+
|summary|            Pclass|               Age|             SibSp|              Parch|             Fare|
+-------+------------------+------------------+------------------+-------------------+-----------------+
|  count|               891|               714|               891|                891|              891|
|   mean| 2.308641975308642| 29.69911764705882|0.5230078563411896|0.38159371492704824| 32.2042079685746|
| stddev|0.8360712409770491|14.526497332334035|1.1027434322934315| 0.8060572211299488|49.69342859718089|
|    min|                 1|              0.42|                 0|                  0|              0.0|
|    max|                 3|              80.0|                 8|                  6|         512.3292|
+-------+------------------+------------------+------------------+-------------------+-----------------+



24/12/03 16:05:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [20]:
spark.sql(
    """
    SELECT
        'Pclass' AS Pclass,
        COUNT(Pclass) AS count,
        AVG(Pclass) AS mean,
        STDDEV(Pclass) AS stddev,
        MIN(Pclass) AS min,
        MAX(Pclass) AS max,
        PERCENTILE_APPROX(Pclass, 0.5) AS median
    FROM titanic
    UNION
    SELECT
        'Age' AS Age,
        COUNT(Age) AS count,
        AVG(Age) AS mean,
        STDDEV(Age) AS stddev,
        MIN(Age) AS min,
        MAX(Age) AS max,
        PERCENTILE_APPROX(Age, 0.5) AS median
    FROM titanic
    UNION
    SELECT
        'SibSp' AS SibSp,
        COUNT(SibSp) AS count,
        AVG(SibSp) AS mean,
        STDDEV(SibSp) AS stddev,
        MIN(SibSp) AS min,
        MAX(SibSp) AS max,
        PERCENTILE_APPROX(SibSp, 0.5) AS median
    FROM titanic
    UNION
    SELECT
        'Parch' AS Parch,
        COUNT(Parch) AS count,
        AVG(Parch) AS mean,
        STDDEV(Parch) AS stddev,
        MIN(Parch) AS min,
        MAX(Parch) AS max,
        PERCENTILE_APPROX(Parch, 0.5) AS median
    FROM titanic
    UNION
    SELECT
        'Fare' AS Fare,
        COUNT(Fare) AS count,
        AVG(Fare) AS mean,
        STDDEV(Fare) AS stddev,
        MIN(Fare) AS min,
        MAX(Fare) AS max,
        PERCENTILE_APPROX(Fare, 0.5) AS median
    FROM titanic;
    """
).show()

+------+-----+-------------------+------------------+----+--------+-------+
|Pclass|count|               mean|            stddev| min|     max| median|
+------+-----+-------------------+------------------+----+--------+-------+
|Pclass|  891|  2.308641975308642|0.8360712409770491| 1.0|     3.0|    3.0|
|   Age|  714|  29.69911764705882|14.526497332334035|0.42|    80.0|   28.0|
| SibSp|  891| 0.5230078563411896|1.1027434322934315| 0.0|     8.0|    0.0|
| Parch|  891|0.38159371492704824|0.8060572211299488| 0.0|     6.0|    0.0|
|  Fare|  891|   32.2042079685746| 49.69342859718089| 0.0|512.3292|14.4542|
+------+-----+-------------------+------------------+----+--------+-------+



In [43]:
df.select("Pclass", "Fare").groupBy("Pclass").avg("Fare").orderBy("Pclass").show()

+------+------------------+
|Pclass|         avg(Fare)|
+------+------------------+
|     1| 84.15468749999992|
|     2| 20.66218315217391|
|     3|13.675550101832997|
+------+------------------+



In [28]:
spark.sql(
    """
    SELECT Pclass, AVG(Fare) FROM titanic
    GROUP BY Pclass ORDER BY Pclass;
    """
).show()

+------+------------------+
|Pclass|         avg(Fare)|
+------+------------------+
|     1| 84.15468749999992|
|     2| 20.66218315217391|
|     3|13.675550101832997|
+------+------------------+



In [31]:
price_age_df = df.filter(df.Fare.isNotNull())
p1 = price_age_df.filter((df.Age >= 0) & (df.Age <= 10)).agg(f.avg("Fare").alias("0-10"))
p2 = price_age_df.filter((df.Age >= 10) & (df.Age <= 20)).agg(f.avg("Fare").alias("10-20"))
p3 = price_age_df.filter((df.Age >= 20) & (df.Age <= 30)).agg(f.avg("Fare").alias("20-30"))
p4 = price_age_df.filter((df.Age >= 30) & (df.Age <= 40)).agg(f.avg("Fare").alias("30-40"))
p5 = price_age_df.filter((df.Age >= 40) & (df.Age <= 50)).agg(f.avg("Fare").alias("40-50"))
p6 = price_age_df.filter((df.Age >= 50) & (df.Age <= 60)).agg(f.avg("Fare").alias("50-60"))
p7 = price_age_df.filter((df.Age >= 60) & (df.Age <= 70)).agg(f.avg("Fare").alias("60-70"))
p8 = price_age_df.filter((df.Age >= 70) & (df.Age <= 80)).agg(f.avg("Fare").alias("70-80"))
columns = ["Age Range","Fare Average"]
datas = [
    ("0-10", p1.collect()[0][0]),
    ("10-20", p2.collect()[0][0]),
    ("20-30", p3.collect()[0][0]),
    ("30-40", p4.collect()[0][0]),
    ("40-50", p5.collect()[0][0]),
    ("50-60", p6.collect()[0][0]),
    ("60-70", p7.collect()[0][0]),
    ("70-80", p8.collect()[0][0])
]
new_df = spark.createDataFrame(datas, columns)
new_df.show()

+---------+------------------+
|Age Range|      Fare Average|
+---------+------------------+
|     0-10|30.434439062500008|
|    10-20|  29.4696247863248|
|    20-30|  27.1016648979592|
|    30-40|40.141317777777786|
|    40-50| 40.63093636363638|
|    50-60| 48.47692307692308|
|    60-70|47.642061904761896|
|    70-80|30.169057142857145|
+---------+------------------+



In [17]:
spark.sql(
    """
    SELECT 
        CASE
            WHEN Age > 0 AND Age < 10 THEN '0-10'
            WHEN Age >= 10 AND Age < 20 THEN '10-20'
            WHEN Age >= 20 AND Age < 30 THEN '20-30'
            WHEN Age >= 30 AND Age < 40 THEN '30-40'
            WHEN Age >= 40 AND Age < 50 THEN '40-50'
            WHEN Age >= 50 AND Age < 60 THEN '50-60'
            WHEN Age >= 60 AND Age < 70 THEN '60-70'
            WHEN Age >= 70 AND Age < 80 THEN '70-80'
            ELSE '80+'
        END AS AgeRange,
        AVG(Fare) AS AvgFare
    FROM titanic
    WHERE Age IS NOT NULL
    GROUP BY AgeRange
    ORDER BY AgeRange
    """
).show()

+--------+------------------+
|AgeRange|           AvgFare|
+--------+------------------+
|    0-10| 30.57667903225807|
|   10-20| 32.53513235294118|
|   20-30|27.278937272727294|
|   30-40|40.377294011976055|
|   40-50| 38.00229662921349|
|   50-60| 47.93333333333334|
|   60-70| 48.36754210526315|
|   70-80|30.197233333333333|
|     80+|              30.0|
+--------+------------------+



In [61]:
descriptive_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
for column in descriptive_columns:
    quantiles = df.filter(df[column].isNotNull()).approxQuantile(column, [0.5], 0.01)
    print(f"Median of {column} Column: {float(quantiles[0])}")

Median of Pclass Column: 3.0
Median of Age Column: 28.0
Median of SibSp Column: 0.0
Median of Parch Column: 0.0
Median of Fare Column: 14.4542


In [44]:
spark.sql(
    """
    SELECT 
    PERCENTILE_APPROX(Fare, 0.5) AS median
    FROM titanic
    """
).show()

+-------+
| median|
+-------+
|14.4542|
+-------+



24/12/16 15:45:52 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

In [40]:
ordered_column = spark.sql(
    """
    SELECT Fare, ROW_NUMBER() OVER (ORDER BY Fare) AS rn, (SELECT COUNT(*) FROM titanic) AS total_count FROM titanic;
    """
)

In [None]:
spark.sql(
    """

    """
).show()

In [41]:
ordered_column.createOrReplaceTempView("OrderedColumn")

In [43]:
spark.sql(
    """
    SELECT 
    CASE 
        WHEN total_count % 2 = 1 THEN (SELECT Fare FROM OrderedColumn WHERE rn = (total_count + 1) / 2)
        ELSE (SELECT AVG(Fare) FROM OrderedColumn WHERE rn IN (total_count / 2, total_count / 2 + 1))
    END AS median
    FROM OrderedColumn
    LIMIT 1
    """
).show()

24/12/16 15:39:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/16 15:39:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/16 15:39:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/16 15:39:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/16 15:39:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/16 15:39:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/16 1

+-------+
| median|
+-------+
|14.4542|
+-------+



24/12/16 15:39:02 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

In [76]:
spark.sql(
    """
    SELECT Pclass, Age, SibSp, Parch, Fare FROM titanic 
    WHERE
    Pclass IS NOT NULL AND
    Age IS NOT NULL AND
    SibSp IS NOT NULL AND
    Parch IS NOT NULL AND
    Fare IS NOT NULL
    ORDER BY Pclass, Age, SibSp, Parch, Fare;
    """
).show()

+------+----+-----+-----+--------+
|Pclass| Age|SibSp|Parch|    Fare|
+------+----+-----+-----+--------+
|     1|0.92|    1|    2|  151.55|
|     1| 2.0|    1|    2|  151.55|
|     1| 4.0|    0|    2| 81.8583|
|     1|11.0|    1|    2|   120.0|
|     1|14.0|    1|    2|   120.0|
|     1|15.0|    0|    1|211.3375|
|     1|16.0|    0|    0|    86.5|
|     1|16.0|    0|    1|    39.4|
|     1|16.0|    0|    1| 57.9792|
|     1|17.0|    0|    2|110.8833|
|     1|17.0|    1|    0|    57.0|
|     1|17.0|    1|    0|   108.9|
|     1|18.0|    0|    2|   79.65|
|     1|18.0|    1|    0|   108.9|
|     1|18.0|    1|    0| 227.525|
|     1|18.0|    2|    2| 262.375|
|     1|19.0|    0|    0|    30.0|
|     1|19.0|    0|    2| 26.2833|
|     1|19.0|    1|    0|    53.1|
|     1|19.0|    1|    0| 91.0792|
+------+----+-----+-----+--------+
only showing top 20 rows

