# Exploring COVID-19 data and analyzing various factors

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing required libraries

In [None]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import folium

# Reading required data for further analysis

In [None]:
# Locations of the CSV files supplied by the attached Kaggle datasets.
cases_csv = "/kaggle/input/covid19-in-india/covid_19_india.csv"
lat_long_csv = "/kaggle/input/lat-lon-indian-states/datasets_652259_1374178_Total_India_covid-19.csv"
popul_csv = "/kaggle/input/covid19-in-india/population_india_census2011.csv"

# Day-by-day cured, death and confirmed counts.
cases = pd.read_csv(cases_csv)
cases.head()

# Latitude and longitude coordinates of the Indian states.
lat_long = pd.read_csv(lat_long_csv)
lat_long.head()

# State-wise population figures, used later to express positive cases
# as a percentage of the population.
popul = pd.read_csv(popul_csv)
popul

# Checking whether the data contains any null values

In [None]:
# Print a summary of the columns (dtypes and non-null counts) so that any
# missing values can be spotted.
# When this notebook was run, no null values or null records were found.
cases.info()

# Renaming states that appear under different names
The given .csv file spells some states in more than one way (for example, "Telangana" also appears as "Telangana***" and "Telengana"), so we replace every variant with the single name "Telangana". The state column also contains entries such as "Unassigned" and "Cases being reassigned to states"; for now we ignore them and remove those rows from the data.

In [None]:
# List the distinct state names so that duplicate spellings and placeholder
# entries can be identified before cleaning.
cases['State/UnionTerritory'].unique()

In [None]:
# Map every spelling variant found in the raw data onto one canonical name.
state_aliases = {
    "Telengana": "Telangana",
    "Telengana***": "Telangana",
    "Telangana***": "Telangana",
    "Daman & Diu": "Dadra and Nagar Haveli and Daman and Diu",
    "Dadar Nagar Haveli": "Dadra and Nagar Haveli and Daman and Diu",
}

# Assign the result back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment, which is not guaranteed to modify
# `cases` (it is a no-op when pandas Copy-on-Write is enabled).
cases['State/UnionTerritory'] = cases['State/UnionTerritory'].replace(state_aliases)

# Drop the placeholder rows that do not correspond to a real state. The explicit
# copy makes `cases` an independent frame, so later in-place edits do not act on
# a slice of the original data.
placeholder_states = ['Unassigned', 'Cases being reassigned to states']
cases = cases[~cases['State/UnionTerritory'].isin(placeholder_states)].copy()

# Show the cleaned list of state names.
cases['State/UnionTerritory'].unique()

In [None]:
# Use the same key column name as `cases` so the two frames can be merged.
popul = popul.rename(columns={'State / Union Territory': 'State/UnionTerritory'})

# Series.replace returns a new Series; the result must be stored, otherwise the
# spelling fix is computed and then thrown away.
popul['State/UnionTerritory'] = popul['State/UnionTerritory'].replace({"Telengana": "Telangana"})

# Show the state names after the fix.
popul['State/UnionTerritory']

# Changing dates into datetime & removing unnecessary columns

In [None]:
# Parse the date strings (day comes first) into proper datetime values.
cases['Date'] = pd.to_datetime(cases['Date'], dayfirst=True)

# Discard the columns that are not used anywhere in the analysis.
unused_columns = ['Sno', 'Time', 'ConfirmedIndianNational', 'ConfirmedForeignNational']
cases = cases.drop(columns=unused_columns)
cases.head()

# Getting the time-period of given data

In [None]:
# Report the first and last dates covered by the data, taken from the raw
# NumPy datetime values of the Date column.
date_values = cases.Date.values
print("Starting date : ", date_values.min())
print("Ending date : ", date_values.max())

# Calculating total active cases per day across India

In [None]:
# Collapse the state-level rows into one nationwide row per date.
# numeric_only=True keeps the string state-name column out of the sum.
daily_cases = cases.groupby('Date').sum(numeric_only=True).reset_index()

# Active cases on a given day = confirmed - cured - deaths for that same day.
# This is computed for the whole column at once instead of looping and writing
# through `daily_cases['Active'].loc[...]`, which is chained assignment and is
# not guaranteed to update the frame.
# NOTE(review): the earlier loop subtracted the *previous* day's cured/deaths
# from the current day's confirmed count and hard-coded the first day to 1;
# that looked like an off-by-one-day error and is corrected here - confirm.
daily_cases['Active'] = (
    daily_cases['Confirmed'] - daily_cases['Cured'] - daily_cases['Deaths']
)

daily_cases

# Visualization of Cured, Death, Confirmed & Active Cases by Date 
The following graph shows us how the cases are increasing day by day.

In [None]:
# Draw one line per metric against the date axis, in a fixed order.
fig = go.Figure()
for column, label in [
    ('Confirmed', 'Confirmed'),
    ('Cured', 'Cured'),
    ('Deaths', 'Deaths'),
    ('Active', 'Active Cases'),
]:
    fig.add_trace(go.Scatter(x=daily_cases['Date'], y=daily_cases[column], name=label))

fig.update_layout(title='CORONA VIRUS CASES IN INDIA', yaxis_title='Cases Count (in lakhs)')

fig.show()

# Analyzing total cases up to now
We take each state's figures on the most recent date in the data, so that we can visualize the totals reached so far.

In [None]:
# Keep only the rows recorded on the most recent date in the data.
latest_date = cases['Date'].max()
country_cases = cases.loc[cases['Date'] == latest_date]
print(country_cases.shape)
country_cases.head()

The '**lat_long**' dataframe contains the latitude and longitude coordinates that we use to locate the states on the map, so that we can properly visualize the number of cases each state has had up to that date.

In [None]:
# Align the key column name with `country_cases` and keep only the columns
# needed to place each state on the map.
coordinates = lat_long.rename(columns={"State": "State/UnionTerritory"})
lat_long = coordinates[['State/UnionTerritory', 'Latitude', 'Longitude']]

# Attach the coordinates to the latest per-state figures.
country_cases = country_cases.merge(lat_long, on='State/UnionTerritory')
country_cases.head()

# Visualization of Confirmed cases on Indian map using Folium

In [None]:
# Make an empty map
m = folium.Map(location=[28,77], zoom_start=4)
country_cases['Confirmed'] = country_cases['Confirmed'].astype(float)

# I can add marker one by one on the map
for i in range(0,len(country_cases)):
    folium.Circle(location = [country_cases.iloc[i]['Latitude'], country_cases.iloc[i]['Longitude']],
                popup = [country_cases.iloc[i]['State/UnionTerritory'],country_cases.iloc[i]['Confirmed']],
                radius = country_cases.iloc[i]['Confirmed']/2,
                color = 'crimson', fill = True, fill_color='crimson').add_to(m)
    
m

# Calculating what percentage of the population has been infected to date

In [None]:
# We're saving it for further use
total_pop = popul['Population'].sum()
print("The total population of India is : ", total_pop)

popul = cases.merge(popul[['State/UnionTerritory', 'Population']])
popul['ConfirmPerc'] = 0
popul['ConfirmPerc'] = (popul['Confirmed']/popul['Population'])*100

# Visualizing how positive cases have increased in each state, as a percentage of its population

In [None]:
# One line per state, added in the order the states first appear in the data.
fig = go.Figure()
for st, df in popul.groupby('State/UnionTerritory', sort=False):
    fig.add_trace(go.Scatter(x=df['Date'], y=df['ConfirmPerc'], name=st))

fig.update_layout(title='Positive Cases Percentage Per Population', yaxis_title='Percentage (%)')
fig.show()

# Analyzing the data & calculating positive percentage on a day-to-day basis

In [None]:
# Here, we're grouping the data by date bcoz we want to visualize the data on per basis
popul_dates = popul.drop('ConfirmPerc', axis=1).groupby('Date').sum()

# In this, population should be same, as we're talking about the positive percentage per population
popul_dates['Population'] = total_pop

# Calculating total percentage of positive cases
popul_dates['TotConfirmPerc'] = (popul_dates['Confirmed']/popul_dates['Population'])*100
popul_dates

# Visualizing the increase in percentage

In [None]:
fig = go.Figure()
# Single line: nationwide positive percentage against date (the frame's index).
national_trend = go.Scatter(x=popul_dates.index, y=popul_dates['TotConfirmPerc'])
fig.add_trace(national_trend)
fig.update_layout(
    title='Percentage of positive cases across India',
    yaxis_title='Percentage (%)',
)
fig.show()

If you want to know how to flatten the curve, or want an idea of how many patients each state can handle, have a look at my other notebook, where I describe the capacity of hospitals to sustain patients and how many more beds they need.
**Link:-** https://www.kaggle.com/ayushirastogi15/covid-19-flattening-the-curve

If you want to see an interactive visualization of how coronavirus cases have increased in each state, take a look at my other notebook, which shows a bar-chart race of the Indian states.
**Link:-** https://www.kaggle.com/ayushirastogi15/covid-19-india-bar-chart-race

You can also look at my analysis and visualizations of coronavirus cases all over the world: how confirmed cases and deaths are increasing, and the recovery rate as well. The link to the notebook is given below.
**Link:-** https://www.kaggle.com/ayushirastogi15/covid-19

*If you find anything wrong, do let me know. I'll be happy to receive any kind of feedback.
Thanks in advance to the whole Kaggle data science community.*