# COVID19-India-Analysis [Kaggle Notebook](https://www.kaggle.com/samacker77k/covid19-india-analysis)
A notebook dedicated to data visualization and analysis of COVID19 Pandemic in India.

---

This notebook visualizes the effects of COVID19 pandemic in India to help understand the effect of the outbreak demographically.

Maintained by:
* Shivani Tyagi [LinkedIn](https://www.linkedin.com/in/shivani-tyagi-09/) [Github](https://github.com/shivitg)
* Nitika Kamboj [LinkedIn](https://linkedin.com/in/nitika-kamboj) [Github](https://github.com/nitika-kamboj)
* Samar Srivastava [LinkedIn](https://linkedin.com/in/samacker77l) [Github](https://github.com/samacker77)
 


<p style="color:red">The API that was previously used to fetch the data has been revoked, so we will be updating the dataset manually every 24 hours.</p>

---

### Importing libraries
---

In [1]:
import requests
import pandas as pd
import logging
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(font_scale=1.4)

In [4]:
def load_data(path='../input/data.csv'):
    """Read the notebook's dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to '../input/data.csv', the path
        that was previously hard-coded, so existing ``load_data()`` calls
        behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(path)

In [2]:
# Load the dataset; the cell that defines load_data() must be executed before this one.
data = load_data()

NameError: name 'load_data' is not defined

---
#### Now that the data has been loaded successfully, we will inspect it.

In [None]:
# Report the table dimensions; data.shape is (rows, columns), unpacked into the two placeholders.
print("Data Shape ~ Rows = {} | Columns = {}".format(*data.shape))

#### Checking dtypes

In [None]:
# Display the dtype pandas assigned to each column.
data.dtypes

> At first glance the attributes 'ID' and 'Unique id' look identical, so we check whether they have any values that differ. Before comparing, we convert 'Unique id' to int64.

In [None]:
# Cast 'Unique id' to int64 so it can be compared with 'ID'.
# NOTE(review): astype('int64') raises if the column contains missing values — confirm it has none.
data['Unique id'] = data['Unique id'].astype('int64')

In [None]:
# Show only the rows where the two identifier columns disagree;
# an empty result means 'ID' and 'Unique id' are identical.
data[data['ID'] != data['Unique id']]

> Since both columns hold the same values, we can drop one of them and use the other as the index.

In [None]:
# Remove the 'Unique id' column from `data` in place.
data.drop('Unique id',axis=1,inplace=True)

In [None]:
# Use 'ID' as the row index (modifies `data` in place).
data.set_index('ID',inplace=True)

In [None]:
# Report the dimensions again after the column drop and re-indexing.
print("Data Shape ~ Rows = {} | Columns = {}".format(*data.shape))

In [None]:
# Preview the first rows of the cleaned table.
data.head()

#### Now the data is ready for analysis and preprocessing

> Graph between the count of affected people and Nationality.

In [None]:
# Number of rows per distinct 'Nationality' value, most frequent first.
data['Nationality'].value_counts()

In [None]:
# Count each nationality once and reuse the result for both axes.
nationality_counts = data['Nationality'].value_counts()
plt.figure(figsize=(10,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=nationality_counts.values, y=nationality_counts.index, palette=sns.dark_palette("blue", reverse=True))
plt.title('Count of people affected and their Nationality')
plt.xlabel('Count of people affected', fontsize=12)
plt.ylabel('Nationality', fontsize=12)
plt.show()

> We can see from the above graph that 'India' and 'Indian' refer to the same nationality, so we replace 'Indian' with 'India'.

In [None]:
# Fold the 'Indian' spelling into 'India' so both are counted as one nationality.
data['Nationality'] = data['Nationality'].replace(to_replace='Indian', value='India')

In [None]:
# Re-count nationalities after the 'Indian' -> 'India' replacement.
data['Nationality'].value_counts()

In [None]:
# Count each nationality once (after the 'Indian' -> 'India' cleanup) and reuse it for both axes.
nationality_counts = data['Nationality'].value_counts()
plt.figure(figsize=(10,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=nationality_counts.values, y=nationality_counts.index, palette=sns.dark_palette("blue", reverse=True))
plt.title('Count of people affected and their Nationality')
plt.xlabel('Count of people affected', fontsize=12)
plt.ylabel('Nationality', fontsize=12)
plt.show()

> Graph between gender and count of affected people.

In [None]:
# Count each gender once and reuse the result for both axes.
gender_counts = data['Gender'].value_counts()
plt.figure(figsize=(6,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=gender_counts.index, y=gender_counts.values, palette=sns.dark_palette("blue", reverse=True))
plt.title('Count of people affected and their Gender')
plt.ylabel('Count of people affected', fontsize=12)
plt.xlabel('Gender', fontsize=12)
plt.show()

> Graph between different states and number of detected cases.

In [None]:
# Count detected cases per state once and reuse the result for both axes.
state_counts = data['Detected state'].value_counts()
plt.figure(figsize=(10,10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=state_counts.values, y=state_counts.index, palette=sns.dark_palette("blue", reverse=True))
plt.title('Detected Cases in different states of India.')
plt.xlabel('Count of people affected', fontsize=12)
plt.ylabel('State', fontsize=12)
plt.show()

> Graph displaying status and count of affected people.

In [None]:
# Count rows per current status once and reuse the result for both axes.
status_counts = data['Current status'].value_counts()
plt.figure(figsize=(5,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=status_counts.index, y=status_counts.values, palette=sns.dark_palette("blue", reverse=True))
plt.title('Status of affected people.')
plt.xlabel('Status', fontsize=12)
plt.ylabel('Count of people affected', fontsize=12)
plt.show()

In [None]:
# The ten districts with the most detected cases (value_counts sorts descending),
# computed once and reused for both axes.
top_districts = data['Detected district'].value_counts().head(10)
plt.figure(figsize=(25,20))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=top_districts.values, y=top_districts.index, palette=sns.dark_palette("blue", reverse=True))
plt.title('Count of detected cases in top 10 different districts of India.',fontsize=20)
plt.xlabel('Count of people detected', fontsize=12)
plt.ylabel('District', fontsize=20)
# These rc settings are global: they also apply to figures created by later cells.
plt.rc('xtick',labelsize=25)
plt.rc('ytick',labelsize=25)
plt.show()

In [None]:
# Age distribution per current status, one point per row.
plt.figure(figsize=(15,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.swarmplot(x=data['Current status'], y=data['Age'], palette=sns.dark_palette("blue", reverse=True))


#### No. of days between change in status

In [None]:
# Preview the rows again before deriving the status-change duration.
data.head()

#### Days taken to change status(Hospitalized/Death/Recover)

In [None]:
# Elapsed time between diagnosis and the recorded status change, stored as a new column.
# NOTE(review): pd.to_datetime is called without an explicit format/dayfirst; if the CSV stores
# dates day-first, ambiguous values may be parsed month-first — TODO confirm against the data.
data['status change difference'] = pd.to_datetime(data['Status change date'])-pd.to_datetime(data['Diagnosed date'])

In [None]:
# Frequency of each distinct duration; negative durations show up here as well.
data['status change difference'].value_counts()

### As we can see, many of the dates in the 'Status change date' column are incorrect, which is why the status change difference comes out negative.

---

In [None]:
# The derived duration column is unreliable, so remove it from `data` in place.
data.drop(columns=['status change difference'], inplace=True)

In [None]:
# Confirm the temporary column is gone.
data.head()

In [None]:
# True when a 'Contacts' value is recorded for the row, False when it is missing.
# Missing CSV cells are loaded by pandas as NaN, and `NaN == None` is False, so the
# previous `x == None` test marked every row as True; notna() covers both NaN and None.
data['Transmission info'] = data['Contacts'].notna()

In [None]:
# Preview the table with the new 'Transmission info' column.
data.head()

In [None]:
# How many rows are flagged True vs. False in 'Transmission info'.
data['Transmission info'].value_counts()

In [None]:
# Bar per 'Transmission info' value (True/False) with its row count.
plt.title('False if a patient did not spread COVID19 to others else True')
# The column is passed as x=...: on seaborn >= 0.12 a positional Series is taken as `data`, not `x`.
sns.countplot(x=data['Transmission info'], palette=sns.dark_palette('blue',reverse=True))

> We can estimate how many people are carriers of COVID19

In [None]:
# Summing a boolean column counts its True entries. Unlike value_counts()[1], this does not
# depend on whether 1 is read as a label or a position, and it still works (yielding 0 or the
# full row count) when only one of True/False is present.
print("COVID19 estimated to be spread by {} people".format(data['Transmission info'].sum()))

In [None]:
# Daily number of diagnosed cases: one row per date, ordered chronologically,
# with the date as index ('Dates') and the tally in 'Count'.
diagnose_data = (
    pd.to_datetime(data['Diagnosed date'])
    .value_counts()
    .sort_index()
    .rename_axis('Dates')
    .to_frame('Count')
)

In [None]:
# Line plot of the daily diagnosed-case count, with a square marker on every data point.
import matplotlib.dates as mdates
fig, ax = plt.subplots(figsize=(20,10))
graph=ax.plot(diagnose_data.Count,marker='s')
# Render x tick labels as day-month-year with a two-digit year.
myFmt = mdates.DateFormatter('%d-%m-%y')
ax.xaxis.set_major_formatter(myFmt)
# Global rc change: also applies to figures created by later cells.
plt.rc('xtick',labelsize=20)
# One tick per date present in diagnose_data, labels rotated vertically.
plt.xticks(diagnose_data.index.values,rotation=90)
plt.show()

In [None]:
# Running total of diagnosed cases over the date-sorted rows.
diagnose_data['cumsum']=diagnose_data['Count'].cumsum()

In [None]:
# Display the cumulative series.
diagnose_data['cumsum']

In [None]:
# Line plot of the cumulative diagnosed-case count, with a square marker on every data point.
# Relies on `mdates` (matplotlib.dates) having been imported by an earlier cell.
fig, ax = plt.subplots(figsize=(20,10))
graph=ax.plot(diagnose_data['cumsum'],marker='s')
# Render x tick labels as day-month-year with a two-digit year.
myFmt = mdates.DateFormatter('%d-%m-%y')
ax.xaxis.set_major_formatter(myFmt)
# Global rc change: also applies to figures created by later cells.
plt.rc('xtick',labelsize=20)
# One tick per date present in diagnose_data, labels rotated vertically.
plt.xticks(diagnose_data.index.values,rotation=90)
plt.show()

In [None]:
import matplotlib.ticker as ticker
import matplotlib.animation as animation

In [None]:
group = diagnose_data.set_index('diagnosed_date')['detected_state'].to_dict()
group1 = diagnose_data.set_index('detected_state')['diagnosed_date'].value_counts().to_dict()
group_lk = merge_data.set_index('state')['case_count'].to_dict()

In [None]:
df = {'date': list(group.keys()), 'state': list(group.values())}
df =  pd.DataFrame.from_dict(df)
df1 = {'date': list(group1.keys()), 'case_count': list(group1.values())}
df1 =  pd.DataFrame.from_dict(df1)
merge_data = df.merge(df1, left_on='date', right_on='date')

In [None]:
# Shared figure/axes; draw_barchart() clears and redraws `ax` on every call.
fig, ax = plt.subplots(figsize=(15, 8))

def draw_barchart(current_year):
    """Redraw the horizontal bar chart of case counts per state on the shared axes.

    Uses the module-level ``ax``, ``merge_data`` (columns 'state' and
    'case_count') and ``group_lk`` (looked up by state name). The axes are
    cleared first, so the function can be called repeatedly.

    Parameters
    ----------
    current_year
        Value printed as the large grey label on the right of the plot. It is
        display-only: the rows of ``merge_data`` are not filtered by it.
    """
    dff = merge_data
    ax.clear()
    ax.barh(dff['state'], dff['case_count'])
    # Scalar gap between a bar's end and its labels. Using the whole 'case_count'
    # Series here made `value - dx` a Series, which is not a valid x position.
    dx = dff['case_count'].max() / 200
    for i, (value, name) in enumerate(zip(dff['case_count'], dff['state'])):
        # Inside the bar: state name, with its group_lk entry underneath.
        ax.text(value-dx, i,     name,           size=14, weight=600, ha='right', va='bottom')
        ax.text(value-dx, i-.25, group_lk[name], size=10, color='#444444', ha='right', va='baseline')
        # Just past the bar end: the numeric value.
        ax.text(value+dx, i,     f'{value:,.0f}',  size=14, ha='left',  va='center')
    ax.text(1, 0.4, current_year, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    # Axis caption and title describe the plotted quantity (case counts per state); the
    # previous texts referred to city populations and did not match this data.
    ax.text(0, 1.06, 'Number of cases', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.15, 'COVID19 cases detected in different states of India',
            transform=ax.transAxes, size=24, weight=600, ha='left', va='top')
    ax.text(1, 0, 'by @pratapvardhan; credit @jburnmurdoch', transform=ax.transAxes, color='#777777', ha='right',
            bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    plt.box(False)
    
# Render a single frame. The argument is only printed as the large label on the chart.
# NOTE(review): 2018 looks like a leftover sample value — confirm the intended label.
draw_barchart(2018)