<a href="https://colab.research.google.com/github/scar110497/Shubham/blob/main/SPARK_Assignment_1_COVID_19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#install Apache Spark 3.0.1 with Hadoop 2.7 from here.
!wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz

# Now, we just need to unzip that folder.
!tar -xvzf spark-3.0.0-bin-hadoop2.7.tgz
!pip install findspark


import os
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop2.7"
import findspark
findspark.init()

In [46]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, DoubleType
spark = SparkSession.builder.master("local").appName("SparkDemoApp").getOrCreate()
sc = spark.sparkContext
print(type(spark))

<class 'pyspark.sql.session.SparkSession'>


In [49]:
#Load the covid 19 data in to a spark dataframe (country_wise_latest.csv) with the correct schema definition

schema = StructType([
    StructField("Country/Region", StringType()),
    StructField("Confirmed", IntegerType()),
    StructField("Deaths", IntegerType()),
    StructField("Recovered", IntegerType()),
    StructField("Active", IntegerType()),
    StructField("New cases", IntegerType()),
    StructField("New deaths", IntegerType()),
    StructField("New recovered", IntegerType()),
    StructField("Deaths / 100 Cases", DoubleType()),
    StructField("Recovered / 100 Cases", DoubleType()),
    StructField("Deaths / 100 Recovered", DoubleType()),
    StructField("Confirmed last week", IntegerType()),
    StructField("1 week change", IntegerType()),
    StructField("1 week % increase", DoubleType()),
    StructField("WHO Region", StringType())
])
user_df=spark.read.csv(path='/content/country_wise_latest.csv', header=True, schema=schema)
user_df.show()

#The are some column names which are long, contains special characters, spaces etc. Rename all such column names accordingly. Example Country/Region → country New cases → New_cases etc
renamed_user_df = user_df.withColumnRenamed("Country/Region", "Country_Reg").withColumnRenamed("New cases", "New_Cases").withColumnRenamed("New deaths", "New_deaths").withColumnRenamed("New recovered", "New_recovered").withColumnRenamed("Deaths / 100 Cases", "Deaths_100cases").withColumnRenamed("Recovered / 100 Cases", "Recovered_100cases").withColumnRenamed("Deaths / 100 Recovered", "Deaths_100Recovered").withColumnRenamed("Confirmed last week", "Confirmed_last_week").withColumnRenamed("1 week change", "1_week_change").withColumnRenamed("1 week % increase", "1_week_per_increase").withColumnRenamed("WHO Region", "WHO_Region")
renamed_user_df.show()




+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+
|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|Deaths / 100 Cases|Recovered / 100 Cases|Deaths / 100 Recovered|Confirmed last week|1 week change|1 week % increase|          WHO Region|
+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+
|        Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|               3.5|                69.49|                  5.04|              35526|          737|             2.07|Eastern Mediterra...|
|            Albania|     4880|   144|     2745|  1991|      117|         6|           6

In [50]:
# Count and check if there any null values in any of the columns

for column in renamed_user_df.columns:
    null_count = renamed_user_df.where(renamed_user_df[column].isNull()).count()
    print(f"Column '{column}' has {null_count} null values.")

total_null_count = renamed_user_df.na.drop().count()
print(f"\nTotal null values in the DataFrame: {total_null_count}")

Column 'Country_Reg' has 0 null values.
Column 'Confirmed' has 0 null values.
Column 'Deaths' has 0 null values.
Column 'Recovered' has 0 null values.
Column 'Active' has 0 null values.
Column 'New_Cases' has 0 null values.
Column 'New_deaths' has 0 null values.
Column 'New_recovered' has 0 null values.
Column 'Deaths_100cases' has 0 null values.
Column 'Recovered_100cases' has 0 null values.
Column 'Deaths_100Recovered' has 5 null values.
Column 'Confirmed_last_week' has 0 null values.
Column '1_week_change' has 0 null values.
Column '1_week_per_increase' has 0 null values.
Column 'WHO_Region' has 0 null values.

Total null values in the DataFrame: 182


In [54]:
#What are the top 10 countries under the WHO region with covid 19 Confirmed cases


from pyspark.sql import SparkSession
from pyspark.sql.functions import *

top_10 = (renamed_user_df.groupBy("WHO_Region", "Country_Reg") .agg(
    max("Confirmed").alias("max_confirmed")).orderBy("max_confirmed", ascending=False)
    .limit(10))

top_10.show(truncate=False)

+---------------------+--------------+-------------+
|WHO_Region           |Country_Reg   |max_confirmed|
+---------------------+--------------+-------------+
|Americas             |US            |4290259      |
|Americas             |Brazil        |2442375      |
|South-East Asia      |India         |1480073      |
|Europe               |Russia        |816680       |
|Africa               |South Africa  |452529       |
|Americas             |Mexico        |395489       |
|Americas             |Peru          |389717       |
|Americas             |Chile         |347923       |
|Europe               |United Kingdom|301708       |
|Eastern Mediterranean|Iran          |293606       |
+---------------------+--------------+-------------+



In [55]:
#What are the bottom 10 countries under the WHO region with covid 19 Confirmed cases

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

top_10 = (renamed_user_df.groupBy("WHO_Region", "Country_Reg") .agg(
    max("Confirmed").alias("max_confirmed")).orderBy("max_confirmed", ascending=True)
    .limit(10))

top_10.show(truncate=False)

+---------------+---------------------+-------------+
|WHO_Region     |Country_Reg          |max_confirmed|
+---------------+---------------------+-------------+
|Africa         |Western Sahara       |10           |
|Europe         |Holy See             |12           |
|Europe         |Greenland            |14           |
|Americas       |Saint Kitts and Nevis|17           |
|Americas       |Dominica             |18           |
|Western Pacific|Laos                 |20           |
|Americas       |Grenada              |23           |
|Americas       |Saint Lucia          |24           |
|South-East Asia|Timor-Leste          |24           |
|Western Pacific|Fiji                 |27           |
+---------------+---------------------+-------------+



In [58]:
#What are the total number of countries/ total no. of WHO regions and also list the various WHO regions

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

Country_Total = renamed_user_df.select("Country_Reg").distinct().count()
WHO_Total = renamed_user_df.select("WHO_Region").distinct().count()

print(f"Total countries: {Country_Total}")
print(f"Total WHO regions: {WHO_Total}")
print("List of WHO regions:")
renamed_user_df.select("WHO_Region").distinct().show(truncate=False)

Total countries: 187
Total WHO regions: 6
List of WHO regions:
+---------------------+
|WHO_Region           |
+---------------------+
|Europe               |
|Western Pacific      |
|Africa               |
|Eastern Mediterranean|
|Americas             |
|South-East Asia      |
+---------------------+

