<a href="https://colab.research.google.com/github/siddhusalvi/Covid-Data-Visualization/blob/master/Covid_Data_Visualisation_using_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### File System and Spark Setup


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that the
# data file referenced later in the notebook can be read from it.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

### Plotly Setup

In [None]:
!pip install plotly==4.7.1
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4
import plotly.graph_objects as go

### DataFrame and SQL Operations



In [13]:
# Location of the COVID JSON data file inside the mounted Drive folder.
path = '/content/drive/My Drive/Siddhu/data/covidData.json'

In [18]:
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType,DateType
from pyspark.sql.functions import col,when

In [89]:
# Explicit schema for the patient records. Every column is nullable; the two
# date columns use DateType and all remaining columns are kept as strings.
schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in [
        ("agebracket", StringType()),
        ("backupnotes", StringType()),
        ("contractedfromwhichpatientsuspected", StringType()),
        ("currentstatus", StringType()),
        ("dateannounced", DateType()),
        ("detectedcity", StringType()),
        ("detecteddistrict", StringType()),
        ("detectedstate", StringType()),
        ("estimatedonsetdate", StringType()),
        ("gender", StringType()),
        ("nationality", StringType()),
        ("notes", StringType()),
        ("numcases", StringType()),
        ("patientnumber", StringType()),
        ("source1", StringType()),
        ("source2", StringType()),
        ("source3", StringType()),
        ("statecode", StringType()),
        ("statepatientnumber", StringType()),
        ("statuschangedate", DateType()),
        ("typeoftransmission", StringType()),
    ]
])

In [90]:
# Load the JSON file with the explicit schema; date columns are parsed using
# the day/month/year pattern given as the reader's dateFormat option.
covidDF = (
    spark.read
    .schema(schema)
    .option("dateFormat", "dd/MM/yyyy")
    .json(path)
)

In [21]:
# Total number of rows loaded into covidDF.
covidDF.count()

28183

In [None]:
# Print a preview of the first rows of covidDF (show() prints 20 rows by default).
covidDF.show()

In [None]:
# Print the column names and types of covidDF as a tree.
# printSchema is a method and has to be called: without the parentheses the
# cell only displays the bound-method object instead of the schema.
covidDF.printSchema()

#### Infected Patients by Gender


In [87]:
# Number of patient rows for each distinct value of the gender column.
(
    covidDF
    .groupBy("gender")
    .count()
    .show()
)

+------+-----+
|gender|count|
+------+-----+
|     F| 1766|
|     M| 3547|
|      |22870|
+------+-----+



#### Infected Patients by City



In [43]:
# Five cities with the most patient rows, skipping rows whose city is blank.
(
    covidDF
    .groupBy("detectedcity")
    .count()
    .where(col("detectedcity") != "")
    .orderBy(col("count").desc())
    .show(5)
)

+------------+-----+
|detectedcity|count|
+------------+-----+
|       Thane|  108|
|        MCGM|  108|
| Navi Mumbai|   71|
|         PMC|   69|
| Vasai-Virar|   61|
+------------+-----+
only showing top 5 rows



#### Recovered Patients by City



In [45]:
# Five cities with the most rows whose current status is "Recovered",
# skipping rows whose city is blank.
(
    covidDF
    .where(col("currentstatus") == "Recovered")
    .groupBy("detectedcity")
    .count()
    .where(col("detectedcity") != "")
    .orderBy(col("count").desc())
    .show(5)
)


+------------+-----+
|detectedcity|count|
+------------+-----+
|   Bengaluru|   23|
|    Gurugram|   12|
|       Ranni|    7|
|       Kochi|    6|
|Gauribidanur|    5|
+------------+-----+
only showing top 5 rows



#### Splitting Patients into Age Groups




In [109]:
# Bucket patients into age groups and count the rows in each group.
#   Minor: age < 15, Young: 15-30, Adult: 31-50, Older: > 50
(
    covidDF
    # agebracket is read as a string; values that cannot be cast become null.
    .withColumn("age", col("agebracket").cast("Integer"))
    .withColumn(
        "agegroup",
        # when() branches are tested in order, so each branch only needs its
        # upper bound. There is deliberately no otherwise(): rows with a null
        # age match no branch and keep a null agegroup.
        when(col("age") < 15, "Minor")
        .when(col("age") < 31, "Young")
        .when(col("age") < 51, "Adult")
        .when(col("age") > 50, "Older")
    )
    # Drop rows without an age group using an explicit null test rather than a
    # comparison against the string "null".
    .filter(col("agegroup").isNotNull())
    .groupby("agegroup")
    .count()
    .show()
)

+--------+-----+
|agegroup|count|
+--------+-----+
|   Older|  584|
|   Minor|  161|
|   Adult|  920|
|   Young|  677|
+--------+-----+

