In [None]:
import numpy as np 
import pandas as pd 
import re
import warnings

# NOTE(review): with no category or module argument this suppresses *every*
# warning for the rest of the session (deprecations, pandas
# SettingWithCopyWarning, ...). Consider narrowing the filter so genuine
# problems in later cells stay visible.
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Population table; the first CSV column is used as the index.
population_df = pd.read_csv(
    "/kaggle/input/covid19-in-india/population_india_census2011.csv",
    index_col=0,
)

# State-wise testing details; the first CSV column is used as the index.
statewise_testing_df = pd.read_csv(
    "/kaggle/input/covid19-in-india/StatewiseTestingDetails.csv",
    index_col=0,
)

In [None]:
# Preview the first five rows of the population table.
population_df.head(n=5)

In [None]:
# Preview the first five rows of the testing table.
statewise_testing_df.head(n=5)

In [None]:
# Turn the index back into an ordinary column, then parse 'Date' as
# datetimes using the ISO year-month-day layout.
statewise_testing_df = statewise_testing_df.reset_index()
statewise_testing_df = statewise_testing_df.assign(
    Date=pd.to_datetime(statewise_testing_df['Date'], format="%Y-%m-%d")
)

# Display the earliest and latest dates covered by the testing data.
testing_dates = statewise_testing_df['Date']
(testing_dates.min(), testing_dates.max())

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes each
# subset an independent frame, so later in-place edits (e.g. `.loc[...] = ...`)
# do not trigger pandas' SettingWithCopyWarning.
population_df = population_df[['State / Union Territory', 'Density']].copy()
statewise_testing_df = statewise_testing_df[['Date', 'TotalSamples', 'State', 'Positive']].copy()

In [None]:
# Count the distinct state names in the population table and keep them as a
# set for comparison with the testing data.
pop_state_names = population_df['State / Union Territory']
print(pop_state_names.nunique())
pop_states = set(pop_state_names.unique())

In [None]:
# Count the distinct state names in the testing table and keep them as a set.
testing_state_names = statewise_testing_df['State']
print(testing_state_names.nunique())
statewise_testing_states = set(testing_state_names.unique())

In [None]:
# State names present in the population table but absent from the testing data.
pop_states.difference(statewise_testing_states)

In [None]:
# State names present in the testing data but absent from the population table.
statewise_testing_states.difference(pop_states)

In [None]:
# Show the population rows whose state name contains the fragment 'ngana'.
name_has_ngana = population_df['State / Union Territory'].str.contains('ngana')
population_df.loc[name_has_ngana]

In [None]:
# Overwrite every state name containing 'ngana' with the spelling "Telangana"
# (presumably to line up with the spelling used in the testing data — verify).
state_col = 'State / Union Territory'
matches_telangana = population_df[state_col].str.contains('ngana')
population_df.loc[matches_telangana, state_col] = "Telangana"

### Joining statewise testing details with population information

In [None]:
# Attach the population columns to every testing row. Only states present in
# both tables survive the inner join. A column-on-column merge ignores the
# existing index and returns a fresh one, so no reset_index()/drop('index')
# round trip is needed.
statewise_features = statewise_testing_df.merge(
    population_df,
    how='inner',
    left_on='State',
    right_on='State / Union Territory',
).drop(columns="State / Union Territory")  # duplicate of 'State' after the join

In [None]:
# Spot-check the joined rows for one state.
west_bengal_rows = statewise_features.loc[statewise_features['State'] == 'West Bengal']
west_bengal_rows.head(n=5)

In [None]:
# Convert the free-text 'Density' column into a numeric 'pop_density' column.
#  - Thousands separators are removed with the vectorised, NaN-tolerant
#    .str.replace (a Python-level re.sub would raise on missing values).
#  - The first number in the text is extracted. The pattern is a raw string
#    (so "\d" is not an invalid escape) and accepts an optional decimal part,
#    so fractional densities are not truncated to their integer part.
density_text = statewise_features['Density'].str.replace(",", "", regex=False)
statewise_features['pop_density'] = (
    density_text.str.extract(r"(\d+(?:\.\d+)?)", expand=False).astype(float)
)
# The raw text column is no longer needed once the numeric value exists.
statewise_features = statewise_features.drop(columns="Density")

In [None]:
# Preview the first five rows of the joined feature table.
statewise_features.head(n=5)

In [None]:
# Print the column dtypes and non-null counts of the joined feature table.
statewise_features.info()

In [None]:
# Derive per-state daily increments from the 'Positive' and 'TotalSamples'
# columns by differencing consecutive rows within each state.
#   new_cases      = Positive[t]     - Positive[t-1]
#   samples_tested = TotalSamples[t] - TotalSamples[t-1]
# The first row of every state has no predecessor, so its deltas are NaN.
state_frames = []
for state in statewise_features['State'].unique():
    # .copy() gives an independent frame, so adding columns below does not
    # write into a slice of statewise_features (SettingWithCopyWarning).
    state_rows = statewise_features[statewise_features['State'] == state].copy()
    # The deltas are only meaningful in chronological order; a stable sort
    # leaves already-ordered data untouched.
    state_rows = state_rows.sort_values('Date', kind='stable')
    state_rows['new_cases'] = state_rows['Positive'].diff()
    state_rows['samples_tested'] = state_rows['TotalSamples'].diff()
    state_frames.append(state_rows)

# Concatenate once at the end instead of growing a frame inside the loop.
statewise_daily_df = pd.concat(state_frames, axis=0).set_index('Date')


In [None]:
# Preview the first five rows of the daily per-state table.
statewise_daily_df.head(n=5)

In [None]:
# Pairwise correlation of the numeric features, excluding the two running-total
# columns. Non-numeric columns (e.g. 'State') are filtered out explicitly:
# since pandas 2.0, DataFrame.corr() no longer drops them silently and raises
# on string data.
(
    statewise_daily_df
    .drop(columns=['TotalSamples', 'Positive'])
    .select_dtypes(include='number')
    .corr()
)

In [None]:
# Discard every row that has a missing value in any column, then write the
# remaining rows (without the two running-total columns) to CSV.
statewise_daily_df = statewise_daily_df.dropna()
export_df = statewise_daily_df.drop(columns=['TotalSamples', 'Positive'])
export_df.to_csv("./statewise_features.csv")