In [2]:
import matplotlib.pyplot as plt # For plots
import seaborn as sns # For graphs

import pandasql as ps # SQL package
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
import os

# List every file under the Kaggle input directory and remember each full
# path. The dataset path is chosen from this list rather than from the loop
# variables left over after the walk: the last directory visited may contain
# no files (IndexError), and nothing is defined at all if the directory is
# missing (NameError).
input_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)
        input_files.append(path)

# Pick the dataset deterministically: the first CSV in sorted order.
csv_files = sorted(p for p in input_files if p.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError("No CSV file found under /kaggle/input")
file_path = csv_files[0]

In [4]:
# Read the CSV located by the previous cell (`file_path`) into a DataFrame
df = pd.read_csv(file_path)

# Report the dataset's dimensions as a (rows, columns) tuple
print(f'Shape of dataset: {df.shape}')

# Preview the first rows (head() defaults to 5) as the cell's rich output
df.head()

In [5]:
# Random sample of 5 data points. The fixed seed keeps the displayed rows
# identical on every "Restart & Run All", so the output is reproducible.
df.sample(5, random_state=42)

In [6]:
# Print a concise summary of the frame: column names, dtypes and non-null counts
df.info()

In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the data
df.describe()

In [8]:
# Swap the original headers for short snake_case names that are easy to
# reference in SQL queries. Names are assigned positionally, one per column.
df.columns = [
    'state',
    'total', 'active', 'discharged', 'deaths',
    'active_ratio', 'discharge_ratio', 'death_ratio',
]
print(f'Column names: {df.columns}')

## HeatMap for the correlation matrix of Covid data

In [9]:
# Correlate only the numeric columns. `state` holds strings, and since
# pandas 2.0 `DataFrame.corr()` no longer drops non-numeric columns
# silently; it raises a ValueError instead. `select_dtypes` behaves the
# same way on older and newer pandas versions.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(8,6))
# annot=True writes each correlation coefficient inside its cell
sns.heatmap(numeric_corr, annot=True)
plt.title('HeatMap', fontsize=25)

### Strong positive correlations:  
- Total cases and Discharged
- Total cases and Deaths
- Discharged and Deaths

### Strong negative correlations:
- Discharge Ratio (%) and Active Ratio (%)

In [17]:
# Horizontal bars of the death ratio, one bar per state/UT. The tall figure
# leaves room for every state label on the y-axis.
fig, ax = plt.subplots(figsize=(15,14))
sns.barplot(data=df, x='death_ratio', y='state', ax=ax)
ax.set_xlabel("Death Ratio", fontsize=15)
ax.set_ylabel("States", fontsize=15)
ax.set_title("Death Ratio per State", fontsize=25)

## Bar graphs of the top ten states in every field
Here we plot a bar graph for each field (total cases, active, discharged, deaths and the ratios) showing the ten states/UTs with the highest values, to understand how Covid cases are distributed across regions.

In [18]:
# Ten states/UTs with the highest total case counts, drawn as horizontal bars
top_ten_total_cases = df.nlargest(10, 'total')

fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(data=top_ten_total_cases, x='total', y='state', orient='h', ax=ax)
ax.set_xlabel("Total Cases", fontsize=15)
ax.set_ylabel("States", fontsize=15)
ax.set_title("Total Cases for top ten states", fontsize=25)

In [19]:
# Bar graph of the ten states/UTs with the most active cases.
# The selection gets its own name instead of reusing `top_ten_total_cases`,
# which would mislabel data that is ranked by active cases.
top_ten_active = df.nlargest(10, 'active')

plt.figure(figsize=(8,4))
sns.barplot(y='state', x='active', orient='h', data=top_ten_active)
plt.xlabel("Active Cases", fontsize=15)
plt.ylabel("States", fontsize=15)
plt.title("Active Cases for top ten states", fontsize=25)

In [21]:
# Bar graph of the ten states/UTs with the most discharged cases.
# The selection gets its own name instead of reusing `top_ten_total_cases`,
# which would mislabel data that is ranked by discharged cases.
top_ten_discharged = df.nlargest(10, 'discharged')

plt.figure(figsize=(8,4))
sns.barplot(y='state', x='discharged', orient='h', data=top_ten_discharged)
plt.xlabel("Discharged Cases", fontsize=15)
plt.ylabel("States", fontsize=15)
plt.title("Discharged Cases for top ten states", fontsize=25)

In [22]:
# Bar graph of the ten states/UTs with the most deaths.
# The selection gets its own name instead of reusing `top_ten_total_cases`,
# which would mislabel data that is ranked by deaths.
top_ten_deaths = df.nlargest(10, 'deaths')

plt.figure(figsize=(8,4))
sns.barplot(y='state', x='deaths', orient='h', data=top_ten_deaths)
plt.xlabel("Deaths", fontsize=15)
plt.ylabel("States", fontsize=15)
plt.title("Deaths for top ten states", fontsize=25)

In [24]:
# Bar graph of the ten states/UTs with the highest active ratio.
# The selection gets its own name instead of reusing `top_ten_total_cases`,
# which would mislabel data that is ranked by active ratio.
top_ten_active_ratio = df.nlargest(10, 'active_ratio')

plt.figure(figsize=(8,4))
sns.barplot(y='state', x='active_ratio', orient='h', data=top_ten_active_ratio)
plt.xlabel("Active Ratio (%)", fontsize=15)
plt.ylabel("States", fontsize=15)
plt.title("Active Ratio in percent for top ten states", fontsize=25)

In [26]:
# Bar graph of the ten states/UTs with the highest discharge ratio.
# The selection gets its own name instead of reusing `top_ten_total_cases`,
# which would mislabel data that is ranked by discharge ratio.
top_ten_discharge_ratio = df.nlargest(10, 'discharge_ratio')

plt.figure(figsize=(8,4))
sns.barplot(y='state', x='discharge_ratio', orient='h', data=top_ten_discharge_ratio)
plt.xlabel("Discharge Ratio (%)", fontsize=15)
plt.ylabel("States", fontsize=15)
plt.title("Discharge ratio in percent for top ten states", fontsize=25)

### Conclusion
The state/UT that ranks first in each field:
- Total cases - Maharashtra
- Active - Kerala
- Discharged - Maharashtra
- Deaths - Maharashtra
- Active Ratio (%) - Mizoram
- Discharge Ratio (%) - Dadra and Nagar Haveli and Daman and Diu

In [28]:
# Practice only: run a SQL query against the DataFrame through pandasql.
# The query selects the `state` and `total` columns from the table `df`;
# the notebook namespace is handed to sqldf via locals().

q1 = """SELECT state, total
FROM df """
data = ps.sqldf(q1, locals())
# Preview the first rows of the query result
data.head()