* @Author: Sankar
* @Date: 2021-05-13 07:52:25
* @Last Modified by: Sankar
* @Last Modified time: 2021-05-14 19:58:09
* @Title : Preprocessing of day_wise.csv data

In [1]:
from pyspark.sql import *
from pyspark.sql.types import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1620973165995_0020,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
from pyspark.sql.functions import UserDefinedFunction, col, isnan, when, col, count, isnull, mean, to_date, to_timestamp

In [3]:
# Load the raw day-wise CSV from blob storage. The first row supplies the
# column names (header=True) and column types are inferred from the data
# (inferSchema=True).
spark_df = spark.read.csv(
    'wasbs://hadoopcluster@mydemostorage1234.blob.core.windows.net/raw_input/day_wise.csv',
    header=True,
    inferSchema=True
)

In [4]:
# Build the summary-statistics DataFrame for the raw data.
# NOTE: without .show() only the DataFrame's repr (column names/types) is
# displayed, not the statistics themselves.
spark_df.describe()

DataFrame[summary: string, Confirmed: string, Deaths: string, Recovered: string, Active: string, New cases: string, New deaths: string, New recovered: string, Deaths / 100 Cases: string, Recovered / 100 Cases: string, Deaths / 100 Recovered: string, No. of countries: string]

In [5]:
# Inspect the column types produced by schema inference on the raw CSV.
spark_df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Active: integer (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)
 |-- Deaths / 100 Cases: double (nullable = true)
 |-- Recovered / 100 Cases: double (nullable = true)
 |-- Deaths / 100 Recovered: double (nullable = true)
 |-- No. of countries: integer (nullable = true)

In [6]:
# Peek at the first five records; head(n) returns them as a list of Row objects.
spark_df.head(5)

[Row(Date=datetime.datetime(2020, 1, 22, 0, 0), Confirmed=555, Deaths=17, Recovered=28, Active=510, New cases=0, New deaths=0, New recovered=0, Deaths / 100 Cases=3.06, Recovered / 100 Cases=5.05, Deaths / 100 Recovered=60.71, No. of countries=6), Row(Date=datetime.datetime(2020, 1, 23, 0, 0), Confirmed=654, Deaths=18, Recovered=30, Active=606, New cases=99, New deaths=1, New recovered=2, Deaths / 100 Cases=2.75, Recovered / 100 Cases=4.59, Deaths / 100 Recovered=60.0, No. of countries=8), Row(Date=datetime.datetime(2020, 1, 24, 0, 0), Confirmed=941, Deaths=26, Recovered=36, Active=879, New cases=287, New deaths=8, New recovered=6, Deaths / 100 Cases=2.76, Recovered / 100 Cases=3.83, Deaths / 100 Recovered=72.22, No. of countries=9), Row(Date=datetime.datetime(2020, 1, 25, 0, 0), Confirmed=1434, Deaths=42, Recovered=39, Active=1353, New cases=493, New deaths=16, New recovered=3, Deaths / 100 Cases=2.93, Recovered / 100 Cases=2.72, Deaths / 100 Recovered=107.69, No. of countries=11), Ro

In [7]:
# Add a DATE-typed column "DateDayWise" derived from the TIMESTAMP column
# "Date". The original "Date" column is kept; it is dropped later when the
# interesting columns are selected.
# NOTE(review): "Date" is already a timestamp, so the "yyyy-MM-dd" format
# argument is presumably redundant -- confirm before removing it.
df_conv = spark_df.withColumn(
    "DateDayWise",
    to_date(col("Date"), "yyyy-MM-dd")
)

In [8]:
# Verify that the new DateDayWise column was appended with type `date`.
df_conv.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Active: integer (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)
 |-- Deaths / 100 Cases: double (nullable = true)
 |-- Recovered / 100 Cases: double (nullable = true)
 |-- Deaths / 100 Recovered: double (nullable = true)
 |-- No. of countries: integer (nullable = true)
 |-- DateDayWise: date (nullable = true)

In [9]:
# Helper function to drop unused columns and rename interesting columns.
def selectInterestingColumns(rawDf):
    """Keep only the columns used downstream and give them meaningful names.

    Columns are picked by their position in ``rawDf.columns``:
    1 -> ConfirmedCases, 2 -> ConfirmedDeaths, 3 -> RecoveredCases and
    12 -> DateDayWise.

    The selected columns are emitted in ascending source-position order, so
    the resulting column order is the same on every Python version instead of
    depending on dict iteration order.

    Args:
        rawDf: DataFrame whose columns at positions 1, 2, 3 and 12 hold the
            data to keep (assumes the day-wise frame with DateDayWise
            appended as the 13th column -- confirm if the input changes).

    Returns:
        A new DataFrame containing only the four renamed columns.

    Raises:
        IndexError: if ``rawDf`` has fewer than 13 columns.
    """
    # Mapping column index to name.
    columnNames = {12: "DateDayWise", 1: "ConfirmedCases", 2: "ConfirmedDeaths", 3: "RecoveredCases"}

    # Alias each source column to its new name. Iterating over sorted indices
    # (rather than the dict directly) pins the output column order.
    cols = [col(rawDf.columns[i]).alias(columnNames[i]) for i in sorted(columnNames)]

    # Selecting only the aliased columns drops everything we are not using.
    return rawDf.select(cols)

In [10]:
# Reduce the converted frame to the renamed columns of interest and mark it
# for caching, since it is reused by the cells below.
df = selectInterestingColumns(df_conv)
df = df.cache()
# count() is an action: it forces evaluation and returns the number of rows.
df.count()

188

In [12]:
# Check the names, types and order of the selected columns.
df.printSchema()

root
 |-- ConfirmedCases: integer (nullable = true)
 |-- ConfirmedDeaths: integer (nullable = true)
 |-- RecoveredCases: integer (nullable = true)
 |-- DateDayWise: date (nullable = true)

In [14]:
# Put the date column first, followed by the three count columns.
df = df.select(
    "DateDayWise",
    "ConfirmedCases",
    "ConfirmedDeaths",
    "RecoveredCases"
)

In [15]:
# Confirm the new column order (DateDayWise first).
df.printSchema()

root
 |-- DateDayWise: date (nullable = true)
 |-- ConfirmedCases: integer (nullable = true)
 |-- ConfirmedDeaths: integer (nullable = true)
 |-- RecoveredCases: integer (nullable = true)

In [16]:
# Count NULLs per column: when() produces a non-null value only for rows where
# the column is NULL, and count() tallies the non-null results.
df.select(
    [
        count(when(col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in df.columns
    ]
).show()

+-----------+--------------+---------------+--------------+
|DateDayWise|ConfirmedCases|ConfirmedDeaths|RecoveredCases|
+-----------+--------------+---------------+--------------+
|          0|             0|              0|             0|
+-----------+--------------+---------------+--------------+

In [17]:
# Preview the cleaned data (show() prints the first 20 rows by default).
df.show()

+-----------+--------------+---------------+--------------+
|DateDayWise|ConfirmedCases|ConfirmedDeaths|RecoveredCases|
+-----------+--------------+---------------+--------------+
| 2020-01-22|           555|             17|            28|
| 2020-01-23|           654|             18|            30|
| 2020-01-24|           941|             26|            36|
| 2020-01-25|          1434|             42|            39|
| 2020-01-26|          2118|             56|            52|
| 2020-01-27|          2927|             82|            61|
| 2020-01-28|          5578|            131|           107|
| 2020-01-29|          6166|            133|           125|
| 2020-01-30|          8234|            171|           141|
| 2020-01-31|          9927|            213|           219|
| 2020-02-01|         12038|            259|           281|
| 2020-02-02|         16787|            362|           459|
| 2020-02-03|         19887|            426|           604|
| 2020-02-04|         23898|            

In [18]:
# Persist the cleaned frame as CSV under the "processed" location.
# mode='overwrite' replaces the output of a previous run; with the default
# save mode this cell raises an error when the path already exists, so the
# notebook could not be re-run end to end.
# The header option is left at its default, so the file layout is unchanged.
df.write.csv(
    'wasbs://hadoopcluster@mydemostorage1234.blob.core.windows.net/processed/day_wise.csv',
    mode='overwrite'
)

In [19]:
# Save the cleaned frame as the table "hvdwise".
# mode("overwrite") replaces the table if it already exists; with the default
# save mode a second run of this cell fails because the table is present.
df.write.mode("overwrite").saveAsTable("hvdwise")