In [4]:
import os
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

In [5]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import countDistinct, avg
from pyspark.sql.functions import dayofmonth, dayofyear, year, month, hour, weekofyear, date_format
from pyspark.sql.functions import col as func_col

In [6]:
app_name = 'Saravanan Titanic : Spark SQL'
app_name

'Saravanan Titanic : Spark SQL'

In [7]:
spark = SparkSession.builder.appName(app_name).getOrCreate()

In [73]:
tdf = spark.read.csv("hdfs://localhost:9000/module6_datasets/DataSets/Titanic.txt",inferSchema=True)

In [74]:
tdf.createOrReplaceTempView('titanic')

In [75]:
spark.sql("select count(*) as cnt from titanic").show()

+----+
| cnt|
+----+
|1313|
+----+



In [76]:
tdf.count()

1313

In [77]:
tdf.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)



In [85]:
tdf.describe().show()

+-------+-----------------+----+-------------------+--------------------+------------------+-----------+-------------------+------+------------------+-----------------+------+
|summary|              _c0| _c1|                _c2|                 _c3|               _c4|        _c5|                _c6|   _c7|               _c8|              _c9|  _c10|
+-------+-----------------+----+-------------------+--------------------+------------------+-----------+-------------------+------+------------------+-----------------+------+
|  count|             1313|1313|               1313|                1313|              1313|        821|                754|    77|                69|              347|  1313|
|   mean|            657.0|null|  0.341964965727342|                null| 31.19418104265403|       null|               null|2131.0|          101216.5| 7.69620253164557|  null|
| stddev|379.1747618183468|null|0.47454867068071604|                null|14.747525275652208|       null|               n

In [78]:
spark.sql("select * from titanic").show()

+---+---+---+--------------------+-------+-----------+--------------------+-------+-----------------+-----+------+
|_c0|_c1|_c2|                 _c3|    _c4|        _c5|                 _c6|    _c7|              _c8|  _c9|  _c10|
+---+---+---+--------------------+-------+-----------+--------------------+-------+-----------------+-----+------+
|  1|1st|  1|Allen, Miss Elisa...|29.0000|Southampton|        St Louis, MO|    B-5|       24160 L221|    2|female|
|  2|1st|  0|Allison, Miss Hel...| 2.0000|Southampton|Montreal, PQ / Ch...|    C26|             null| null|female|
|  3|1st|  0|Allison, Mr Hudso...|30.0000|Southampton|Montreal, PQ / Ch...|    C26|             null|(135)|  male|
|  4|1st|  0|Allison, Mrs Huds...|25.0000|Southampton|Montreal, PQ / Ch...|    C26|             null| null|female|
|  5|1st|  1|Allison, Master H...| 0.9167|Southampton|Montreal, PQ / Ch...|    C22|             null|   11|  male|
|  6|1st|  1|  Anderson, Mr Harry|47.0000|Southampton|        New York, NY|   E-

In [79]:
spark.sql("select split(_c3,', ')[1],split(_c3,', ')[0] from titanic").show(truncate=False)

+-----------------------------------------+---------------------+
|split(_c3, , , -1)[1]                    |split(_c3, , , -1)[0]|
+-----------------------------------------+---------------------+
|Miss Elisabeth Walton                    |Allen                |
|Miss Helen Loraine                       |Allison              |
|Mr Hudson Joshua Creighton               |Allison              |
|Mrs Hudson J.C. (Bessie Waldo Daniels)   |Allison              |
|Master Hudson Trevor                     |Allison              |
|Mr Harry                                 |Anderson             |
|Miss Kornelia Theodosia                  |Andrews              |
|Mr Thomas                                |Andrews              |
|Mrs Edward Dale (Charlotte Lamson)       |Appleton             |
|Mr Ramon                                 |Artagaveytia         |
|Colonel John Jacob                       |Astor                |
|Mrs John Jacob (Madeleine Talmadge Force)|Astor                |
|Mrs Leont

#### 1: Find the average age of people who died and who survived













In [80]:
spark.sql("select avg(_c4) from titanic where _c2=0").show()

+-----------------+
|         avg(_c4)|
+-----------------+
|32.24810596590909|
+-----------------+



#### 2:	Number of males and females survived in following age range: (age <= 20), (20 < age <= 50) and (age > 50 and age = NA)

In [81]:
spark.sql("select CASE WHEN _c4<=20 THEN '<=20' WHEN (_c4>20 and _c4<= 50) THEN '20-50' ELSE '50_NA' END as age_range,_c10, count(*) as cnt  from titanic group by age_range,_c10 order by age_range,_c10" ).show()

+---------+------+---+
|age_range|  _c10|cnt|
+---------+------+---+
|    20-50|female|151|
|    20-50|  male|269|
|    50_NA|female|245|
|    50_NA|  male|503|
|     <=20|female| 67|
|     <=20|  male| 78|
+---------+------+---+



In [82]:
spark.sql("select _c10,count(_c4) from titanic where _c4<=20 group by _c10").show()

+------+----------+
|  _c10|count(_c4)|
+------+----------+
|female|        67|
|  male|        78|
+------+----------+



#### 3	embarked locations and their count

In [99]:
spark.sql("select _c5, count(_c5) as cnt from titanic group by _c5").show()

+-----------+---+
|        _c5|cnt|
+-----------+---+
|       null|  0|
| Queenstown| 45|
|Southampton|573|
|  Cherbourg|203|
+-----------+---+



In [108]:
spark.sql("select _c5,count(case when isnull(_c5) then 'NULL' else _c5 end) from titanic group by _c5").show()

+-----------+-----------------------------------------------------+
|        _c5|count(CASE WHEN (_c5 IS NULL) THEN NULL ELSE _c5 END)|
+-----------+-----------------------------------------------------+
|       null|                                                  492|
| Queenstown|                                                   45|
|Southampton|                                                  573|
|  Cherbourg|                                                  203|
+-----------+-----------------------------------------------------+



#### 4: Number of people survived in each class

In [64]:
spark.sql("select _c1,sum(_c2) from titanic group by _c1").show()

+---+--------+
|_c1|sum(_c2)|
+---+--------+
|2nd|   119.0|
|1st|   193.0|
|3rd|   137.0|
+---+--------+



#### 5: Number of males survived whose age is less than 30 and travelling in 2nd class

In [68]:
spark.sql("select count(*) from titanic where _c10='male' and _c4<30 and _c1='2nd'").show()

+--------+
|count(1)|
+--------+
|      69|
+--------+



In [93]:
spark.sql("select count(*) from titanic where _c10='male' and _c4<30 and _c1='2nd'").show()


+--------+
|count(1)|
+--------+
|      69|
+--------+



In [95]:
tdf.describe().toPandas().set_index('summary').transpose()

summary,count,mean,stddev,min,max
_c0,1313,657.0,379.1747618183468,1,1313
_c1,1313,,,1st,3rd
_c2,1313,0.341964965727342,0.474548670680716,0,1
_c3,1313,,,"""Brown, Mrs James Joseph (Margaret ""Molly"" Tob...","del Carlo, Mrs Sebastiano (Argenia Genovese)"
_c4,1313,31.19418104265403,14.747525275652208,0.1667,
_c5,821,,,Cherbourg,Southampton
_c6,754,,,"?Havana, Cuba","Zurich, Switzerland"
_c7,77,2131.0,,2131,F-?
_c8,69,101216.5,140047.94688015297,,L15 1s
_c9,347,7.69620253164557,3.894871999310186,(101),D
