# **Covid-19 India Analysis**

**Import Necessary Libraries**

In [None]:
import requests
import pandas as pd
from google.colab import files

**URLs for the data sources**

In [None]:
# covid19india.org v4 endpoints (minified JSON), fetched with HTTP GET below.
data_url = 'https://data.covid19india.org/v4/min/data.min.json'  # per-state meta, totals, deltas and districts
timeseries_url = 'https://data.covid19india.org/v4/min/timeseries.min.json'  # per-state totals keyed by date

**Using the requests library to send HTTP GET requests and retrieve the JSON data from both URLs**

In [None]:
# Fetch both payloads. The timeout keeps the cell from hanging indefinitely on
# a stalled connection, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception here instead of a confusing JSON decode error in
# the next cell.
data_response = requests.get(data_url, timeout=30)
data_response.raise_for_status()

timeseries_response = requests.get(timeseries_url, timeout=30)
timeseries_response.raise_for_status()

**Converting the JSON data into Python dictionaries using the json() method.**


In [None]:
# Parse each response body from JSON. The loops further down iterate both
# results with .items(), treating them as dicts keyed by state code.
data = data_response.json()
timeseries_data = timeseries_response.json()

**Creating a mapping from state codes to full state names (used later to replace the codes in each DataFrame)**

In [None]:
# Lookup from the two-letter codes used as top-level keys in the API payloads
# to full state / union-territory names. "TT" is mapped to "India"; rows
# carrying that name are filtered out of every DataFrame further down.
st_name={
    "AN": "Andaman and Nicobar Islands",
    "AP": "Andhra Pradesh",
    "AR": "Arunachal Pradesh",
    "AS": "Assam",
    "BR": "Bihar",
    "CH": "Chandigarh",
    "CT": "Chhattisgarh",
    "DL": "Delhi",
    "DN": "Dadra and Nagar Haveli and Daman and Diu",
    "GA": "Goa",
    "GJ": "Gujarat",
    "HP": "Himachal Pradesh",
    "HR": "Haryana",
    "JH": "Jharkhand",
    "JK": "Jammu and Kashmir",
    "KA": "Karnataka",
    "KL": "Kerala",
    "LA": "Ladakh",
    "LD": "Lakshadweep",
    "MH": "Maharashtra",
    "ML": "Meghalaya",
    "MN": "Manipur",
    "MP": "Madhya Pradesh",
    "MZ": "Mizoram",
    "NL": "Nagaland",
    "OR": "Odisha",
    "PB": "Punjab",
    "PY": "Puducherry",
    "RJ": "Rajasthan",
    "SK": "Sikkim",
    "TG": "Telangana",
    "TN": "Tamil Nadu",
    "TR": "Tripura",
    "TT": "India",
    "UP": "Uttar Pradesh",
    "UT": "Uttarakhand",
    "WB": "West Bengal",
}

**Prepare the state-level data**

In [None]:
# One record per top-level entry of the snapshot payload. Any missing section
# or field falls back to None, so every record has the same set of keys.
state_data = []

for state_code, state_info in data.items():
    # Look each section up once instead of once per field.
    meta = state_info.get('meta', {})
    total = state_info.get('total', {})
    delta = state_info.get('delta', {})
    delta7 = state_info.get('delta7', {})
    delta21_14 = state_info.get('delta21_14', {})

    state_data.append({
        # Holds the state code for now; a later cell swaps it for the full name.
        'state_name': state_code,
        'population': meta.get('population'),
        'total_confirmed': total.get('confirmed'),
        'total_recovered': total.get('recovered'),
        'total_deceased': total.get('deceased'),
        'total_tested': total.get('tested'),
        'delta_tested': delta.get('tested'),
        'delta7_tested': delta7.get('tested'),
        'delta21_14_tested': delta21_14.get('tested'),
        'total_vaccinated1': total.get('vaccinated1'),
        'total_vaccinated2': total.get('vaccinated2'),
        'delta_vaccinated1': delta.get('vaccinated1'),
        'delta_vaccinated2': delta.get('vaccinated2'),
        'delta7_vaccinated1': delta7.get('vaccinated1'),
        'delta7_vaccinated2': delta7.get('vaccinated2'),
        'delta21_14_confirmed': delta21_14.get('confirmed'),
        'total_doses': meta.get('vaccinated', {}).get('total_doses'),
    })


**Create state_df DataFrame**

In [None]:
# Build the state-level frame from the list of per-state records.
state_df = pd.DataFrame(state_data)

**Previewing the data**

In [None]:
# Show the first rows as a quick sanity check of the columns and values.
state_df.head()

Unnamed: 0,state_name,population,total_confirmed,total_recovered,total_deceased,total_tested,delta_tested,delta7_tested,delta21_14_tested,total_vaccinated1,total_vaccinated2,delta_vaccinated1,delta_vaccinated2,delta7_vaccinated1,delta7_vaccinated2,delta21_14_confirmed,total_doses
0,AN,397000,7651,7518,129,598033,1376.0,8936.0,,294001,200157,3.0,13.0,884,10640,9.0,
1,AP,52221000,2066450,2047722,14373,29518787,39848.0,254532.0,,32976969,20375181,20497.0,24137.0,1223010,1887005,3220.0,
2,AR,1504000,55155,54774,280,1185436,334.0,4788.0,,771875,534486,42.0,195.0,3312,23647,87.0,
3,AS,34293000,610645,600974,5997,24712042,15060.0,269097.0,,20172463,8068795,19124.0,37463.0,274869,849889,1499.0,
4,BR,119520000,726098,716390,9661,50531824,226443.0,1378539.0,,49874828,18346781,114694.0,145827.0,1286708,2144970,30.0,


**Replacing State Code with State Name**

In [None]:
# Swap each code for its full name via st_name; values that are not keys of
# st_name are left unchanged.
state_df['state_name'] = state_df['state_name'].replace(st_name)

**Removing the "India" (total) row, since it only holds the sum of every numerical column across all states; we will verify those totals ourselves later**

In [None]:
# Drop the national aggregate ("TT" -> "India") so only states/UTs remain.
# .copy() makes the result an independent frame, so the cleaning cells further
# down modify it directly and avoid pandas' SettingWithCopyWarning.
state_df = state_df[state_df['state_name'] != "India"].copy()

**Prepare the district-level data**

In [None]:
# One record per (state, district) pair found in the snapshot payload. Missing
# sections or fields fall back to None so every record has the same keys.
district_data = []

for state_code, state_info in data.items():
    # An entry without a 'districts' section simply contributes no rows.
    for district_name, district_info in state_info.get('districts', {}).items():
        district_total = district_info.get('total', {})
        district_data.append({
            # Holds the state code for now; a later cell swaps it for the full name.
            'state_name': state_code,
            'district_name': district_name,
            'population': district_info.get('meta', {}).get('population'),
            'confirmed': district_total.get('confirmed'),
            'recovered': district_total.get('recovered'),
            'deceased': district_total.get('deceased'),
            'tested': district_total.get('tested'),
        })


**Create a district_df DataFrame**

In [None]:
# Build the district-level frame from the list of per-district records.
district_df = pd.DataFrame(district_data)

**Previewing the data**

In [None]:
# Show the first rows as a quick sanity check of the columns and values.
district_df.head()

Unnamed: 0,state_name,district_name,population,confirmed,recovered,deceased,tested
0,AN,Nicobars,36842.0,,,,
1,AN,North and Middle Andaman,105597.0,,,,
2,AN,South Andaman,238142.0,,,,
3,AN,Unknown,,7651.0,7518.0,129.0,
4,AP,Anantapur,4083315.0,157843.0,156699.0,1093.0,787085.0


**Replacing State Code with State Name**




In [None]:
# Swap each code for its full name via st_name; values that are not keys of
# st_name are left unchanged.
district_df['state_name'] = district_df['state_name'].replace(st_name)

**Removing any "India" (total) rows, since they only hold the sum of every numerical column; we will verify those totals ourselves later**



In [None]:
# Drop any rows filed under the national aggregate ("TT" -> "India").
# .copy() makes the result an independent frame, so the cleaning cells further
# down modify it directly and avoid pandas' SettingWithCopyWarning.
district_df = district_df[district_df['state_name'] != "India"].copy()

**Prepare the state-wise timeseries data**

In [None]:
# One record per (state, date) pair in the timeseries payload. Missing
# sections or fields fall back to None so every record has the same keys.
state_timeseries = []

for state_code, state_info in timeseries_data.items():
    for date, daily_info in state_info.get('dates', {}).items():
        # Look the day's 'total' section up once instead of once per field.
        daily_total = daily_info.get('total', {})
        state_timeseries.append({
            # Holds the state code for now; a later cell swaps it for the full name.
            'state_name': state_code,
            'date': date,
            'confirmed': daily_total.get('confirmed'),
            'recovered': daily_total.get('recovered'),
            'deceased': daily_total.get('deceased'),
            'tested': daily_total.get('tested'),
        })



**Create a timeseries_df DataFrame**

In [None]:
# Build the timeseries frame from the list of per-state, per-date records.
timeseries_df = pd.DataFrame(state_timeseries)

**Having a look at the data**

In [None]:
# Show the last rows as a quick sanity check of the columns and values.
timeseries_df.tail()

Unnamed: 0,state_name,date,confirmed,recovered,deceased,tested
21673,WB,2021-10-27,1589042.0,1561973.0,19096.0,19039301.0
21674,WB,2021-10-28,1590032.0,1562818.0,19105.0,19084738.0
21675,WB,2021-10-29,1591014.0,1563678.0,19113.0,19133755.0
21676,WB,2021-10-30,1591994.0,1564558.0,19126.0,19180886.0
21677,WB,2021-10-31,1592908.0,1565471.0,19141.0,19228303.0


**Replacing State Code with State Name**

In [None]:
# Swap each code for its full name via st_name; values that are not keys of
# st_name are left unchanged.
timeseries_df['state_name'] = timeseries_df['state_name'].replace(st_name)

**Removing the "India" (total) rows, since they only hold the sum of every numerical column across all states; we will verify those totals ourselves later**

In [None]:
# Drop the national aggregate ("TT" -> "India") so only states/UTs remain.
# .copy() makes the result an independent frame, so the cleaning cells further
# down modify it directly and avoid pandas' SettingWithCopyWarning.
timeseries_df = timeseries_df[timeseries_df['state_name'] != "India"].copy()

**Prepare the vaccination data**

In [None]:
# One record per top-level entry of the snapshot payload, keeping only the
# vaccination totals and the date stored under meta.vaccinated. Missing
# sections or fields fall back to None.
vaccination_data = []

for state_code, state_info in data.items():
    state_total = state_info.get('total', {})
    vaccination_data.append({
        # Holds the state code for now; a later cell swaps it for the full name.
        'state_name': state_code,
        'total_vaccinated1': state_total.get('vaccinated1'),
        'total_vaccinated2': state_total.get('vaccinated2'),
        'date': state_info.get('meta', {}).get('vaccinated', {}).get('date'),
    })


**Create a vaccination_df DataFrame**

In [None]:
# Build the vaccination frame from the list of per-state records.
vaccination_df = pd.DataFrame(vaccination_data)

**Having a look at the data**

In [None]:
# Show the first rows as a quick sanity check of the columns and values.
vaccination_df.head()

Unnamed: 0,state_name,total_vaccinated1,total_vaccinated2,date
0,AN,294001,200157,2021-10-31
1,AP,32976969,20375181,2021-10-31
2,AR,771875,534486,2021-10-31
3,AS,20172463,8068795,2021-10-31
4,BR,49874828,18346781,2021-10-31


**Replacing State Code with State Name**

In [None]:
# Swap each code for its full name via st_name; values that are not keys of
# st_name are left unchanged.
vaccination_df['state_name'] = vaccination_df['state_name'].replace(st_name)

**Removing the "India" (total) row, since it only holds the sum of every numerical column across all states; we will verify those totals ourselves later**

In [None]:
# Drop the national aggregate ("TT" -> "India") so only states/UTs remain.
# .copy() makes the result an independent frame, so later cells modify it
# directly and avoid pandas' SettingWithCopyWarning.
vaccination_df = vaccination_df[vaccination_df['state_name'] != "India"].copy()

**Checking for null values**

In [None]:
# Per-column missing-value counts for each frame, separated by a dashed rule.
print(
    state_df.isnull().sum(),
    district_df.isnull().sum(),
    timeseries_df.isnull().sum(),
    vaccination_df.isnull().sum(),
    sep="\n----------------------\n",
)

state_name               0
population               0
total_confirmed          0
total_recovered          0
total_deceased           0
total_tested             0
delta_tested             1
delta7_tested            1
delta21_14_tested       36
total_vaccinated1        0
total_vaccinated2        0
delta_vaccinated1        3
delta_vaccinated2        1
delta7_vaccinated1       0
delta7_vaccinated2       0
delta21_14_confirmed     1
total_doses             36
dtype: int64
----------------------
state_name         0
district_name      0
population        38
confirmed         93
recovered         94
deceased         108
tested           210
dtype: int64
----------------------
state_name       0
date             0
confirmed      103
recovered      774
deceased      1804
tested         751
dtype: int64
----------------------
state_name           0
total_vaccinated1    0
total_vaccinated2    0
date                 0
dtype: int64


**Handling the null Values**

In [None]:
# The null counts above show 36 missing values in each of the two dropped
# columns, so they carry no usable information. The few remaining gaps are all
# in delta columns and are filled with 0.
# The result is reassigned rather than produced with inplace=True, so the
# filtered frame is never mutated in place.
state_df = (
    state_df
    .drop(columns=['delta21_14_tested', 'total_doses'])
    .fillna(0)
)

In [None]:
# Rows without a population figure are dropped; missing counts in the
# remaining rows are filled with 0.
# 'tested' is filled through the frame-level fillna: the previous
# district_df['tested'].fillna(..., inplace=True) acted on a column selection,
# which pandas does not guarantee to write back to the frame (it is a no-op
# under copy-on-write).
district_df = (
    district_df
    .dropna(subset=['population'])
    .fillna({'confirmed': 0, 'recovered': 0, 'deceased': 0, 'tested': 0})
)

In [None]:
# Missing case counts are filled with 0.
# NOTE(review): these columns come from each day's 'total' section and look
# like running totals; 0 is only right before a state's first report — confirm
# there are no gaps in the middle of a series.
timeseries_df = timeseries_df.fillna({'confirmed': 0, 'recovered': 0, 'deceased': 0})

# Carry the last known 'tested' value forward *within each state*. A plain
# ffill over the whole column would leak the final value of one state into the
# leading gaps of the next state, because the rows of all states are stacked
# in one frame. Anything still missing (dates before a state's first reported
# value) becomes 0.
timeseries_df['tested'] = (
    timeseries_df
    .groupby('state_name')['tested']
    .ffill()
    .fillna(0)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  timeseries_df.fillna({'confirmed': 0, 'recovered': 0, 'deceased': 0}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  timeseries_df['tested'].fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  timeseries_df.fillna({'tested': 0}, inplace=True)


**Rechecking Null Values**

In [None]:
# Re-run the missing-value counts; every column should now report 0.
print(
    state_df.isnull().sum(),
    district_df.isnull().sum(),
    timeseries_df.isnull().sum(),
    vaccination_df.isnull().sum(),
    sep="\n----------------------\n",
)

state_name              0
population              0
total_confirmed         0
total_recovered         0
total_deceased          0
total_tested            0
delta_tested            0
delta7_tested           0
total_vaccinated1       0
total_vaccinated2       0
delta_vaccinated1       0
delta_vaccinated2       0
delta7_vaccinated1      0
delta7_vaccinated2      0
delta21_14_confirmed    0
dtype: int64
----------------------
state_name       0
district_name    0
population       0
confirmed        0
recovered        0
deceased         0
tested           0
dtype: int64
----------------------
state_name    0
date          0
confirmed     0
recovered     0
deceased      0
tested        0
dtype: int64
----------------------
state_name           0
total_vaccinated1    0
total_vaccinated2    0
date                 0
dtype: int64


**Looking at the data types**

In [None]:
# Column dtypes of each frame, separated by a dashed rule.
print(
    state_df.dtypes,
    district_df.dtypes,
    timeseries_df.dtypes,
    vaccination_df.dtypes,
    sep="\n----------------------\n",
)

state_name               object
population                int64
total_confirmed           int64
total_recovered           int64
total_deceased            int64
total_tested              int64
delta_tested            float64
delta7_tested           float64
total_vaccinated1         int64
total_vaccinated2         int64
delta_vaccinated1       float64
delta_vaccinated2       float64
delta7_vaccinated1        int64
delta7_vaccinated2        int64
delta21_14_confirmed    float64
dtype: object
----------------------
state_name        object
district_name     object
population       float64
confirmed        float64
recovered        float64
deceased         float64
tested           float64
dtype: object
----------------------
state_name     object
date           object
confirmed     float64
recovered     float64
deceased      float64
tested        float64
dtype: object
----------------------
state_name           object
total_vaccinated1     int64
total_vaccinated2     int64
date              

**Changing the datatypes where required**

In [None]:
# These five columns show up as float64 above because they originally held
# missing values; with the gaps filled they can be stored as integers.
delta_columns = [
    'delta_tested',
    'delta7_tested',
    'delta_vaccinated1',
    'delta_vaccinated2',
    'delta21_14_confirmed',
]
state_df[delta_columns] = state_df[delta_columns].astype("int")

In [None]:
# All numeric district columns are float64 at this point; cast them to
# integers now that no missing values remain.
district_count_columns = ['population', 'confirmed', 'recovered', 'deceased', 'tested']
district_df[district_count_columns] = district_df[district_count_columns].astype(int)


In [None]:
# Cast the count columns to integers; the fillna(0) guards the cast against
# any value that is still missing.
timeseries_count_columns = ['confirmed', 'recovered', 'deceased', 'tested']
timeseries_df[timeseries_count_columns] = (
    timeseries_df[timeseries_count_columns]
    .fillna(0)
    .astype(int)
)

# Dates arrive as strings; convert them to datetime values.
timeseries_df['date'] = pd.to_datetime(timeseries_df['date'])

In [None]:
# The date arrives as a string (e.g. "2021-10-31" in the preview above);
# convert it to a datetime value.
vaccination_df['date'] = pd.to_datetime(vaccination_df['date'])

**Rechecking the datatypes**

In [None]:
# Re-print the dtypes of each frame to confirm the conversions took effect.
print(
    state_df.dtypes,
    district_df.dtypes,
    timeseries_df.dtypes,
    vaccination_df.dtypes,
    sep="\n----------------------\n",
)

state_name              object
population               int64
total_confirmed          int64
total_recovered          int64
total_deceased           int64
total_tested             int64
delta_tested             int64
delta7_tested            int64
total_vaccinated1        int64
total_vaccinated2        int64
delta_vaccinated1        int64
delta_vaccinated2        int64
delta7_vaccinated1       int64
delta7_vaccinated2       int64
delta21_14_confirmed     int64
dtype: object
----------------------
state_name       object
district_name    object
population        int64
confirmed         int64
recovered         int64
deceased          int64
tested            int64
dtype: object
----------------------
state_name            object
date          datetime64[ns]
confirmed              int64
recovered              int64
deceased               int64
tested                 int64
dtype: object
----------------------
state_name                   object
total_vaccinated1             int64
total_vacc

**Converting all the dataframe to CSV files**

In [None]:
# Write each cleaned frame to its own CSV file, without the row index.
for csv_name, frame in (
    ('state_data.csv', state_df),
    ('district_data.csv', district_df),
    ('timeseries_data.csv', timeseries_df),
    ('vaccination_data.csv', vaccination_df),
):
    frame.to_csv(csv_name, index=False)


**Downloading the CSV files for further analysis in SQL**

In [None]:
# Hand each exported CSV to google.colab's files.download, in export order.
for csv_name in (
    'state_data.csv',
    'district_data.csv',
    'timeseries_data.csv',
    'vaccination_data.csv',
):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>