In [10]:
import os
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

In [31]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import countDistinct, avg
from pyspark.sql.functions import dayofmonth, dayofyear, year, month, hour, weekofyear, date_format
from pyspark.sql.functions import col as func_col

In [12]:
app_name = 'Saravanan : Spark SQL'
app_name

'Saravanan : Spark SQL'

In [13]:
spark = SparkSession.builder.appName(app_name).getOrCreate()

In [14]:
odf = spark.read.csv("hdfs://localhost:9000/module6_datasets/DataSets/olympic_Data.txt")

#### KPI-1: No of athletes participated in each Olympic event

In [15]:
odf.createOrReplaceTempView('olymp')

In [38]:
#### Answer
1: No of athletes participated in each Olympic event
spark.sql("SELECT _c3, count(*) FROM olympic group by _c3").show

In [37]:
spark.sql("select _c3 olympicYear,count(*) No_of_Athletes from olymp group by _c3").show()

+-----------+--------------+
|olympicYear|No_of_Athletes|
+-----------+--------------+
|       2012|          1776|
|       2000|          1840|
|       2002|           407|
|       2006|           443|
|       2004|          1839|
|       2008|          1872|
|       2010|           441|
+-----------+--------------+



#### 2: No of medals each country won in each Olympic in ascending order

In [None]:
2: No of medals each country won in each Olympic in ascending order
spark.sql("SELECT _c2, _c3, sum(_c9) as cnt FROM olympic group by _c2, _c3 order by _c3, cnt DESC").show
spark.sql("SELECT _c2, _c3, sum(_c9) as cnt FROM olympic group by _c2, _c3 order by cnt").show

In [39]:
spark.sql("select _c2 as country,_c3 as year_of_olympic, sum(_c9) as number_of_medals from olymp group by _c2,_c3 order by number_of_medals asc").show()

+-------------------+---------------+----------------+
|            country|year_of_olympic|number_of_medals|
+-------------------+---------------+----------------+
|          Venezuela|           2008|             1.0|
|              India|           2004|             1.0|
|          Mauritius|           2008|             1.0|
|Trinidad and Tobago|           2004|             1.0|
|           Mongolia|           2004|             1.0|
|              Japan|           2006|             1.0|
|             Uganda|           2012|             1.0|
|           Barbados|           2000|             1.0|
|           Cameroon|           2004|             1.0|
|          Venezuela|           2012|             1.0|
|            Uruguay|           2000|             1.0|
|             Kuwait|           2000|             1.0|
|            Eritrea|           2004|             1.0|
|              Sudan|           2008|             1.0|
|             Israel|           2000|             1.0|
|         

#### 3: Top 10 athletes who won highest gold medals in all the Olympic events

In [None]:
3: Top 10 athletes who won highest gold medals in all the Olympic events
spark.sql("SELECT _c0, sum(_c9) as cnt FROM olympic group by _c0 order by cnt DESC").show

In [29]:
spark.sql("select _c0 atheltes, sum(_c9) no_of_medals from olymp group by _c0 order by no_of_medals desc").show()

+--------------------+------------+
|            atheltes|no_of_medals|
+--------------------+------------+
|      Michael Phelps|        22.0|
|    Natalie Coughlin|        12.0|
|         Ryan Lochte|        11.0|
|          Ian Thorpe|         9.0|
|Ole Einar Bjørndalen|         9.0|
|        Leisel Jones|         9.0|
|    Apolo Anton Ohno|         8.0|
|      Katalin Kovács|         8.0|
|      Inge de Bruijn|         8.0|
|         Dara Torres|         8.0|
|         Jason Lezak|         8.0|
|Libby Lenton-Tric...|         7.0|
|       Petria Thomas|         7.0|
|     Kirsty Coventry|         7.0|
|     Kosuke Kitajima|         7.0|
|       Grant Hackett|         7.0|
|Veronica Campbell...|         7.0|
|            Yang Wei|         7.0|
|        Kati Wilhelm|         7.0|
|      Felix Gottwald|         7.0|
+--------------------+------------+
only showing top 20 rows



#### 4: No of athletes who won gold and whose age is less than 20

In [None]:
4: No of athletes who won gold and whose age is less than 20
spark.sql("SELECT count(*) FROM olympic where _c1 < 20 and _c6 > 0").show

In [57]:
spark.sql("select count(distinct(_c0)) from olymp where _c1<20 and _c6>0").show()

+-------------------+
|count(DISTINCT _c0)|
+-------------------+
|                187|
+-------------------+



#### 5: Youngest athlete who won gold in each category of sports in each Olympic

In [55]:
spark.sql("select _c3,_c5,min(_c1) from olymp group by _c3, _c5").show() 

+----+-------------------+--------+
| _c3|                _c5|min(_c1)|
+----+-------------------+--------+
|2000|            Archery|      17|
|2000|          Athletics|      17|
|2000|          Badminton|      21|
|2000|           Baseball|      19|
|2000|         Basketball|      19|
|2000|   Beach Volleyball|      25|
|2000|             Boxing|      19|
|2000|           Canoeing|      18|
|2000|            Cycling|      20|
|2000|             Diving|      16|
|2000|         Equestrian|      25|
|2000|            Fencing|      15|
|2000|           Football|      16|
|2000|         Gymnastics|      15|
|2000|           Handball|      21|
|2000|             Hockey|      19|
|2000|               Judo|      18|
|2000|  Modern Pentathlon|      24|
|2000|Rhythmic Gymnastics|      15|
|2000|             Rowing|      20|
+----+-------------------+--------+
only showing top 20 rows



In [None]:
#### 6: No of atheletes from each country who has won a medal in each Olympic in each sports

In [65]:
spark.sql("select _c2,_c3,_c5, count(*) from olymp where _c9>0 group by _c2,_c3,_c5  ").show()

+-------------+----+--------------------+--------+
|          _c2| _c3|                 _c5|count(1)|
+-------------+----+--------------------+--------+
|        China|2008|            Swimming|      12|
|      Germany|2010|Cross Country Skiing|       7|
|        Japan|2008|           Wrestling|       6|
|        Egypt|2004|           Wrestling|       1|
|    Venezuela|2004|           Taekwondo|       1|
|       France|2010|        Snowboarding|       3|
|   Tajikistan|2008|                Judo|       1|
|      Estonia|2004|           Athletics|       1|
|United States|2010|       Alpine Skiing|       4|
|  New Zealand|2008|             Cycling|       5|
|Great Britain|2004|   Modern Pentathlon|       1|
|         Cuba|2008|                Judo|       6|
|       Canada|2006|      Figure Skating|       1|
|  Switzerland|2008|          Equestrian|       4|
|     Bulgaria|2000|           Athletics|       1|
|       Mexico|2012|             Archery|       2|
|  Netherlands|2008|          E

### 7: No of athletes won at least a medal in each events in all the Olympics

In [66]:
spark.sql("select count(*) from olymp where _c9>0").show()

+--------+
|count(1)|
+--------+
|    8618|
+--------+



In [77]:
spark.sql("select _c5,count(distinct(_c0)) cnt from olymp where _c9>0 group by _c5 order by cnt desc").show()

+-------------+---+
|          _c5|cnt|
+-------------+---+
|    Athletics|568|
|       Rowing|444|
|     Swimming|367|
|     Football|352|
|   Ice Hockey|313|
|     Handball|303|
|       Hockey|299|
|    Waterpolo|235|
|   Basketball|223|
|   Volleyball|215|
|    Wrestling|210|
|      Cycling|207|
|     Canoeing|206|
|     Baseball|194|
|         Judo|188|
|      Fencing|183|
|       Boxing|172|
|   Gymnastics|170|
|      Sailing|170|
|Weightlifting|159|
+-------------+---+
only showing top 20 rows



In [76]:
spark.sql("SELECT DISTINCT _c5, count(*) as cnt FROM olymp group by _c5 order by cnt DESC").show()

+----------+---+
|       _c5|cnt|
+----------+---+
| Athletics|687|
|    Rowing|567|
|  Swimming|487|
|  Football|407|
|    Hockey|388|
|Ice Hockey|384|
|  Handball|351|
| Waterpolo|306|
|  Canoeing|295|
|Basketball|287|
|Volleyball|281|
|   Cycling|261|
| Wrestling|245|
|   Fencing|230|
|      Judo|224|
|  Baseball|216|
|   Sailing|210|
|Gymnastics|194|
|    Boxing|188|
|  Shooting|181|
+----------+---+
only showing top 20 rows



#### -8: Country won highest no of medals in wrestling in 2012

In [82]:
spark.sql("select _c2,sum(_c9) cnt from olymp where _c3=2012 and _c5='Wrestling' group by _c2 order by cnt desc limit 1").show()

+------+----+
|   _c2| cnt|
+------+----+
|Russia|11.0|
+------+----+



In [84]:
odf.describe().toPandas().set_index('summary').transpose()

summary,count,mean,stddev,min,max
_c0,8613,,,A. J. Mleczko,Živko Gocic
_c1,8613,26.40543364681296,5.10211819196138,15,61
_c2,8618,,,Afghanistan,Zimbabwe
_c3,8618,2005.978881411,4.289946258614372,2000,2012
_c4,8618,,,10/1/2000,8/29/2004
_c5,8618,,,Alpine Skiing,Wrestling
_c6,8618,0.3647017869575307,0.5453488740297667,0,8
_c7,8618,0.3633093525179856,0.5116126702333476,0,3
_c8,8618,0.3776978417266187,0.5050049522149438,0,3
_c9,8618,1.105708981202135,0.4088919859318833,1,8
