In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Thanks to Greg Hogg for this great tutorial: https://www.youtube.com/watch?v=v1cKMCSS3PY&ab_channel=GregHogg

In [None]:
!pip install pyspark
!pip install findspark

In [None]:
import findspark
# Actually call init(): the bare attribute reference `findspark.init` only
# evaluated the function object and did nothing.
findspark.init()

In [None]:
from pyspark.sql import SparkSession  # required to create a DataFrame

# Get (or create) the Spark session for this notebook, registered under the
# application name "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

import pyspark.sql.functions

In [None]:
# Location of the state-wise COVID-19 CSV inside the Kaggle input directory.
COVID_CSV_PATH = "/kaggle/input/latest-covid19-india-statewise-data/Latest Covid-19 India Status.csv"

# Use the first line as column names and let Spark infer each column's type.
df = spark.read.csv(COVID_CSV_PATH, header=True, inferSchema=True)

In [None]:
from pyspark.sql import functions as F

In [None]:
# Select every column under an alias in which spaces are replaced by underscores.
underscored_columns = [
    F.col(name).alias(name.replace(' ', '_'))
    for name in df.columns
]
renamed_df = df.select(underscored_columns)

In [None]:
# Preview the DataFrame after spaces in the column names were replaced.
renamed_df.show()

In [None]:
# Strip the "_(%)" suffix from the ratio columns and replace the "/" in the
# state column name, applying all four renames as one chained expression.
renamed_df = (
    renamed_df
    .withColumnRenamed('Active_Ratio_(%)', 'Active_Ratio')
    .withColumnRenamed('Discharge_Ratio_(%)', 'Discharged_Ratio')
    .withColumnRenamed('Death_Ratio_(%)', 'Death_Ratio')
    .withColumnRenamed('State/UTs', 'State_UTs')
)

In [None]:
# Register the DataFrame as the temporary view 'Data' so the SQL queries
# below can refer to it by name.
renamed_df.createOrReplaceTempView('Data')

In [None]:
# Select every column of the 'Data' view through Spark SQL and display the result.
spark.sql('Select * from Data').show()

In [None]:
# Count the rows of 'Data' that have a non-null State_UTs value.
spark.sql('select count(State_UTs) from Data').show()

In [None]:
# The five rows with the largest Active values.
spark.sql('select * from Data order by Active desc limit 5').show()

In [None]:
# The five rows with the largest Death_Ratio values.
spark.sql('select * from Data order by Death_Ratio desc limit 5').show()

In [None]:
# Totals of Total_Cases and Active summed over all rows of 'Data'.
spark.sql('select sum(Total_Cases), sum(Active) from Data').show()

In [None]:
# The five rows with the smallest Active values.
spark.sql('select * from Data order by Active asc limit 5').show()

In [None]:
import pandas as pd

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame for the pandas, plotly and
# scikit-learn steps below.
pd_df = renamed_df.toPandas()
# Show the first rows as a quick check of the conversion.
pd_df.head()

In [None]:
# Summary statistics of the pandas DataFrame.
pd_df.describe()

In [None]:
# Pairwise correlation between the numeric columns only. Non-numeric columns
# (such as State_UTs) are excluded explicitly, because DataFrame.corr() raises
# on them in pandas >= 2.0 instead of silently dropping them.
pd_df.select_dtypes(include='number').corr()

In [None]:
# Make 4 histograms: 'Total_Cases', 'Active', 'Deaths', 'Discharged'
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# One histogram per count column, laid out on a 2x2 grid.
# Each entry is (column in pd_df, trace name, subplot row, subplot col); the
# column name is also used as the subplot title.
panels = [
    ('Total_Cases', 'Total Cases', 1, 1),
    ('Active', 'Active', 1, 2),
    ('Deaths', 'Deaths', 2, 1),
    ('Discharged', 'Discharged', 2, 2),
]

fig = make_subplots(rows=2, cols=2, subplot_titles=[column for column, *_ in panels])

# add_trace is the single-trace API; row/col are passed by keyword so the
# target subplot is explicit.
for column, label, row, col in panels:
    fig.add_trace(
        go.Histogram(x=pd_df[column], nbinsx=20, name=label),
        row=row,
        col=col,
    )

# The subplot titles already identify each panel, so the legend is hidden.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# One horizontal box plot per count column, laid out on a 2x2 grid.
# Each entry is (column in pd_df, trace name, subplot row, subplot col); the
# column name is also used as the subplot title.
panels = [
    ('Total_Cases', 'Total Cases', 1, 1),
    ('Active', 'Active', 1, 2),
    ('Deaths', 'Deaths', 2, 1),
    ('Discharged', 'Discharged', 2, 2),
]

fig = make_subplots(rows=2, cols=2, subplot_titles=[column for column, *_ in panels])

# add_trace is the single-trace API; row/col are passed by keyword so the
# target subplot is explicit. The State_UTs values are attached as point text.
for column, label, row, col in panels:
    fig.add_trace(
        go.Box(x=pd_df[column], name=label, text=pd_df['State_UTs']),
        row=row,
        col=col,
    )

# The subplot titles already identify each panel, so the legend is hidden.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# One bar chart per count column (one bar per row of pd_df), on a 2x2 grid.
# Each entry is (column in pd_df, trace name, subplot row, subplot col); the
# column name is also used as the subplot title.
panels = [
    ('Total_Cases', 'Total Cases', 1, 1),
    ('Active', 'Active', 1, 2),
    ('Deaths', 'Deaths', 2, 1),
    ('Discharged', 'Discharged', 2, 2),
]

fig = make_subplots(rows=2, cols=2, subplot_titles=[column for column, *_ in panels])

# add_trace is the single-trace API; row/col are passed by keyword so the
# target subplot is explicit. No x values are given, so State_UTs is supplied
# as hover text to identify each bar.
for column, label, row, col in panels:
    fig.add_trace(
        go.Bar(y=pd_df[column], name=label, hovertext=pd_df['State_UTs']),
        row=row,
        col=col,
    )

# The subplot titles already identify each panel, so the legend is hidden.
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Side-by-side bars of total and discharged cases for every State_UTs value.
states = pd_df['State_UTs']
grouped_bars = [
    go.Bar(x=states, y=pd_df['Total_Cases'], name='Total Cases', hovertext=states),
    go.Bar(x=states, y=pd_df['Discharged'], name='Discharged', hovertext=states),
]

fig = go.Figure(data=grouped_bars)
# barmode='group' places the two traces next to each other for each state.
fig.update_layout(barmode='group', title='Total & Discharged Cases')
fig.show()

In [None]:
# One pie chart per count column showing each State_UTs share, on a 2x2 grid.
# Each entry is (column in pd_df, trace name, subplot row, subplot col); the
# column name is also used as the subplot title.
panels = [
    ('Total_Cases', 'Total Cases', 1, 1),
    ('Active', 'Active', 1, 2),
    ('Deaths', 'Deaths', 2, 1),
    ('Discharged', 'Discharged', 2, 2),
]

# Pie traces need 'domain'-type subplot cells, so every cell of the grid is
# declared as such.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[column for column, *_ in panels],
    specs=[[{'type': 'domain'}] * 2 for _ in range(2)],
)

# add_trace is the single-trace API; row/col are passed by keyword so the
# target subplot is explicit.
for column, label, row, col in panels:
    fig.add_trace(
        go.Pie(values=pd_df[column], name=label, labels=pd_df['State_UTs']),
        row=row,
        col=col,
    )

# Hover shows percentage and label; slice text is drawn inside the slices.
fig.update_traces(hoverinfo='percent+label', textposition='inside')
fig.update_layout(showlegend=False)

# make_subplots already returns a Figure, so it is shown directly without
# re-wrapping it in go.Figure.
fig.show()

In [None]:
# Scatter of Deaths against Total_Cases, one marker per row of pd_df.
deaths_vs_cases = go.Scatter(
    x=pd_df['Total_Cases'],
    y=pd_df['Deaths'],
    mode='markers',
)

fig = go.Figure(data=[deaths_vs_cases])
fig.update_layout(
    title='Deaths vs Total Cases',
    xaxis_title='Total Cases',
    yaxis_title='Deaths',
)
fig.show()

In [None]:
# Extract the regressor (Total_Cases) and the target (Deaths) as NumPy arrays.
x, y = (pd_df[column].to_numpy() for column in ('Total_Cases', 'Deaths'))

# Display both shapes to confirm the arrays have the same length.
(x.shape, y.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of Deaths (y) on the single feature Total_Cases (x).
model = LinearRegression()
# Reshape the 1-D array into a single-column 2-D array for fit().
features = x.reshape(-1, 1)
model.fit(features, y)

# Display the fitted intercept and the slope of the single feature.
(model.intercept_, model.coef_[0])

In [None]:
# Score the fitted model on the same x/y data it was trained on
# (LinearRegression.score reports the R^2 coefficient of determination).
model.score(x.reshape((len(x), 1)), y)

In [None]:
# Squared Pearson correlation between Total_Cases and Deaths, computed from
# the data instead of a literal pasted from an earlier output (was 0.904746**2),
# so it stays correct when the dataset is refreshed.
# NOTE(review): assumes the old literal was this correlation, copied from the
# corr() table above — confirm. For a one-feature linear fit this value should
# match the model.score(...) result.
pd_df['Total_Cases'].corr(pd_df['Deaths']) ** 2

In [None]:
# Predict Deaths for every observed Total_Cases value with the fitted model;
# x is reshaped to a single-column 2-D array, as in the fit() call.
predictions = model.predict(x.reshape((len(x), 1)))
# Display the shape of the prediction array.
predictions.shape

In [None]:
# Overlay the model's predicted Deaths on the observed values.
actual_trace = go.Scatter(
    x=pd_df['Total_Cases'],
    y=pd_df['Deaths'],
    mode='markers',
    name='Actual Deaths',
)
predicted_trace = go.Scatter(
    x=pd_df['Total_Cases'],
    y=predictions,
    name='Predicted Deaths',
)

fig = go.Figure(data=[actual_trace, predicted_trace])
fig.update_layout(
    title='Deaths vs Total Cases',
    xaxis_title='Total Cases',
    yaxis_title='Deaths',
)
fig.show()