In [3]:
import pyspark
from pyspark.sql import SparkSession,SQLContext
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
# Obtain the notebook's SparkSession. getOrCreate() returns the session that is
# already running when there is one (the WARN line in this cell's output is
# that case), so re-running the cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('Covid sample analysis')
    .getOrCreate()
)

25/02/05 14:47:58 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### Practicing Spark operations using a COVID dataset. The tasks include:

#### 1. Reading a CSV file
#### 2. Changing column names and data types
#### 3. Applying filters and transformations
#### 4. Performing joins
#### 5. Using Spark SQL
#### 6. Implementing Spark UDFs
#### 7. Applying window functions


#### 1. Reading a CSV file


In [5]:

# Load the case-level CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# The lowercase `inferschema` key is honoured - the schema printed by this cell
# contains int and boolean columns.
# NOTE(review): latitude/longitude are inferred as string, presumably because
# some rows hold a '-' placeholder; cast them explicitly before numeric use.
covid_data = (
    spark.read
    .format("csv")
    .options(sep=',', inferschema=True, header=True)
    .load('./source_data/Case.csv')
)
# Bare last expression: displays the DataFrame's column/type summary.
covid_data

DataFrame[ case_id: int, province: string, city: string, group: boolean, infection_case: string, confirmed: int, latitude: string, longitude: string]

In [6]:
# Preview the raw frame. The arguments spell out show()'s defaults: at most
# 20 rows, with long strings cut to 20 characters (hence the '...' endings).
covid_data.show(n=20, truncate=True)

+--------+--------+---------------+-----+--------------------+---------+---------+----------+
| case_id|province|           city|group|      infection_case|confirmed| latitude| longitude|
+--------+--------+---------------+-----+--------------------+---------+---------+----------+
| 1000001|   Seoul|     Yongsan-gu| true|       Itaewon Clubs|      139|37.538621|126.992652|
| 1000002|   Seoul|      Gwanak-gu| true|             Richway|      119| 37.48208|126.901384|
| 1000003|   Seoul|        Guro-gu| true| Guro-gu Call Center|       95|37.508163|126.884387|
| 1000004|   Seoul|   Yangcheon-gu| true|Yangcheon Table T...|       43|37.546061|126.874209|
| 1000005|   Seoul|      Dobong-gu| true|     Day Care Center|       43|37.679422|127.044374|
| 1000006|   Seoul|        Guro-gu| true|Manmin Central Ch...|       41|37.481059|126.894343|
| 1000007|   Seoul|from other city| true|SMR Newly Planted...|       36|        -|         -|
| 1000008|   Seoul|  Dongdaemun-gu| true|       Dongan Churc

In [8]:
# Rename two columns; withColumnRenamed returns a new DataFrame, so the
# original `covid_data` is left untouched.
# NOTE(review): 'continment_area' looks like a typo for 'containment_area'.
# It is kept as-is because later cells may reference the column by this name.
covid_datas = (
    covid_data
    .withColumnRenamed('infection_case', 'continment_area')
    .withColumnRenamed('confirmed', 'confirmed_cases')
)
covid_datas.show()

+--------+--------+---------------+-----+--------------------+---------------+---------+----------+
| case_id|province|           city|group|     continment_area|confirmed_cases| latitude| longitude|
+--------+--------+---------------+-----+--------------------+---------------+---------+----------+
| 1000001|   Seoul|     Yongsan-gu| true|       Itaewon Clubs|            139|37.538621|126.992652|
| 1000002|   Seoul|      Gwanak-gu| true|             Richway|            119| 37.48208|126.901384|
| 1000003|   Seoul|        Guro-gu| true| Guro-gu Call Center|             95|37.508163|126.884387|
| 1000004|   Seoul|   Yangcheon-gu| true|Yangcheon Table T...|             43|37.546061|126.874209|
| 1000005|   Seoul|      Dobong-gu| true|     Day Care Center|             43|37.679422|127.044374|
| 1000006|   Seoul|        Guro-gu| true|Manmin Central Ch...|             41|37.481059|126.894343|
| 1000007|   Seoul|from other city| true|SMR Newly Planted...|             36|        -|         -|


In [9]:
# Rows with the fewest confirmed cases first (ascending is the default order).
covid_datas.orderBy('confirmed_cases', ascending=True).show()

+--------+-----------------+---------------+-----+--------------------+---------------+---------+----------+
| case_id|         province|           city|group|     continment_area|confirmed_cases| latitude| longitude|
+--------+-----------------+---------------+-----+--------------------+---------------+---------+----------+
| 7000002|          Jeju-do|              -|false|contact with patient|              0|        -|         -|
| 3000007|       Gangwon-do|              -|false|contact with patient|              0|        -|         -|
| 1000030|            Seoul|     Gangseo-gu| true|SJ Investment Cal...|              0|37.559649|126.835102|
| 1100007|            Busan|from other city| true|Cheongdo Daenam H...|              1|        -|         -|
| 5000003|     Jeollabuk-do|from other city| true|  Shincheonji Church|              1|        -|         -|
| 1000028|            Seoul|from other city| true|Anyang Gunpo Past...|              1|        -|         -|
| 1000025|         

In [10]:
# Largest clusters first: order by confirmed_cases in descending order.
covid_datas.orderBy('confirmed_cases', ascending=False).show()

+--------+-----------------+---------------+-----+--------------------+---------------+---------+----------+
| case_id|         province|           city|group|     continment_area|confirmed_cases| latitude| longitude|
+--------+-----------------+---------------+-----+--------------------+---------------+---------+----------+
| 1200001|            Daegu|         Nam-gu| true|  Shincheonji Church|           4511| 35.84008|  128.5667|
| 1200009|            Daegu|              -|false|contact with patient|            917|        -|         -|
| 1200010|            Daegu|              -|false|                 etc|            747|        -|         -|
| 6000001| Gyeongsangbuk-do|from other city| true|  Shincheonji Church|            566|        -|         -|
| 2000020|      Gyeonggi-do|              -|false|     overseas inflow|            305|        -|         -|
| 1000036|            Seoul|              -|false|     overseas inflow|            298|        -|         -|
| 1200002|         