# Benford's Law
## Processing the given Data Frame

In [13]:
import pandas as pd
import pprint
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
import plotly.subplots as subplots
import plotly.io as pio
pio.templates.default = 'seaborn'
pio.renderers.default = 'notebook+pdf'
from plotly import tools
#init_notebook_mode()

In [14]:
import os
#Create the image output folder if it is missing. exist_ok=True makes the
#cell safe to re-run and avoids the gap between a separate existence check
#and the mkdir call.
os.makedirs("q3_images", exist_ok=True)

In [15]:
#Utilities.
my_printer = pprint.PrettyPrinter(depth=None)

#CSS rules applied to every preview table: pale-yellow header, verdana font,
#alternating light-blue / white rows and a pale-yellow hover highlight.
_TABLE_STYLES = [{
    'selector': 'th',
    'props': [('background', '#FFFEE3'), ('color', 'black'),
              ('font-family', 'verdana')]
}, {
    'selector': 'td',
    'props': [('font-family', 'verdana')]
}, {
    'selector': 'tr:nth-of-type(odd)',
    'props': [('background', '#ADD8E6')]
}, {
    'selector': 'tr:nth-of-type(even)',
    'props': [('background', 'white')]
}, {
    'selector': 'tr:hover',
    'props': [('background-color', '#FFFEE3')]
}]


def get_nrow_ncol(lst, col):
    """Return ``[rows, col]`` for laying out ``lst`` on a grid ``col`` wide.

    ``rows`` is ``len(lst) / col`` truncated towards zero, so a trailing
    partial row is not counted.
    """
    return [int(len(lst) / col), col]


def style_df(df):
    """Return a Styler showing the first five rows of ``df`` with the shared table styles."""
    return df.head(5).style.set_table_styles(_TABLE_STYLES)


#Use the fully-qualified option name: the abbreviated 'max_rows' pattern can
#match more than one option (e.g. 'styler.render.max_rows' on newer pandas),
#in which case pandas raises OptionError instead of setting anything.
pd.set_option('display.max_rows', None)

In [16]:
#Load the state-wise records from 'states.csv' (path is relative to the
#notebook's working directory) and preview the first five rows.
states_df = pd.read_csv(r'states.csv')
style_df(states_df)

Unnamed: 0,Date,State,Confirmed,Recovered,Deceased,Other,Tested
0,2020-01-30,Kerala,1,0,0,0,
1,2020-01-30,India,1,0,0,0,
2,2020-02-02,Kerala,2,0,0,0,
3,2020-02-02,India,2,0,0,0,
4,2020-02-03,Kerala,3,0,0,0,


In [17]:
#Keep only the columns needed for the digit analysis.
wanted_cols = ['Date', 'State', 'Confirmed', 'Deceased']
states_df = states_df.filter(wanted_cols)
#Restrict to the dates of interest (both ends inclusive) and drop the
#'India' rows since India is not a state.
in_window = states_df['Date'].between('2021-03-15', '2021-07-16')
is_state = states_df['State'] != 'India'
states_df = states_df[in_window & is_state].dropna()
#Pull out the first and second characters of each count as new columns.
for col in wanted_cols[2:]:
    as_text = states_df[col].astype(str)
    states_df[f'First Digit of {col}'] = as_text.str[0]
    states_df[f'Second Digit of {col}'] = as_text.str[1]
style_df(states_df)

Unnamed: 0,Date,State,Confirmed,Deceased,First Digit of Confirmed,Second Digit of Confirmed,First Digit of Deceased,Second Digit of Deceased
13128,2021-03-15,Andaman and Nicobar Islands,5031,62,5,0,6,2
13129,2021-03-15,Andhra Pradesh,892008,7185,8,9,7,1
13130,2021-03-15,Arunachal Pradesh,16840,56,1,6,5,6
13131,2021-03-15,Assam,217817,1099,2,1,1,0
13132,2021-03-15,Bihar,263051,1552,2,6,1,5


In [18]:
#Lists of states and columns of importance.
#Distinct state names, in order of first appearance.
states = states_df['State'].unique()
#Every column from the fifth onward, i.e. the digit columns added above.
cols = states_df.columns[4:].tolist()

In [19]:
class plotly_plots:
    """Helpers for drawing digit-count bar charts with plotly."""

    def __init__(self):
        #Placeholder attribute; nothing in this class assigns it a value yet.
        self.img_bytes = None

    def get_layout(self, title):
        """Return the layout dict for a single grouped bar chart.

        title: text shown as the figure title.
        """
        layout = {
            'title': {
                'text': title,
                'x': 0.4,
                'y': 0.9,
                'xanchor': 'center',
                'yanchor': 'bottom'
            },
            'xaxis': {
                'title': 'Digit',
                'tickvals': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            },
            'yaxis': {
                'title': 'Count'
            },
            'width': 800,
            'height': 400,
            'barmode': 'group'
        }
        return layout

    def get_trace(self, Dict):
        """Return one ``go.Bar`` trace per entry of ``Dict``.

        Dict: mapping of trace name -> {x value: y value}. The inner keys
        become the bar positions and the inner values the bar heights.
        """
        return [
            go.Bar(x=[*counts.keys()], y=[*counts.values()], name=name)
            for name, counts in Dict.items()
        ]

    def draw_bar_for_state(self, state, Dict):
        """Show a grouped bar chart for a single state.

        state: used as the figure title.
        Dict: dict of dictionaries (one per digit column) as accepted by
        ``get_trace``.
        """
        data = self.get_trace(Dict)
        layout = self.get_layout(state)
        fig = go.Figure(data=data, layout=layout)
        fig.show()

    ##########################################################################
    #####  Don't use what is below now. Under Construction. ###################
    ##########################################################################
    @staticmethod
    def _grid_rows(n_items, ncols):
        #Ceiling division so a final, partially filled row still gets a slot;
        #at least one row so an empty input does not request a 0-row grid.
        return max(1, -(-n_items // ncols))

    def update_pos_row(self, old_pos, ncols):
        """Wrap a ``[row, col]`` position onto the next row when needed.

        Returns a new list; ``old_pos`` itself is left untouched. If the
        column (last element) exceeds ``ncols`` the row (first element) is
        advanced by one and the column reset to 1.
        """
        new_pos = list(old_pos)
        if new_pos[-1] > ncols:
            new_pos[0] += 1
            new_pos[-1] = 1
        return new_pos

    def add_trace_to_fig(self, fig, trace_list, pos):
        """Add every trace in ``trace_list`` to ``fig`` at ``pos`` = (row, col) and return ``fig``."""
        row, col = pos
        for trace in trace_list:
            fig.add_trace(trace, row, col)
        return fig

    def draw_bar_for_all_states(self, DICT, cols):
        """Show one subplot per state on a grid ``cols`` columns wide.

        DICT: mapping of state name -> dict accepted by ``get_trace``.
        cols: number of subplot columns.
        """
        #Getting a list of all states.
        states = [*DICT.keys()]
        ncols = cols
        #Round the row count up; truncating would leave the last state
        #without a cell whenever len(states) is not a multiple of ncols.
        nrows = self._grid_rows(len(states), ncols)
        fig = subplots.make_subplots(rows=nrows,
                                     cols=ncols,
                                     subplot_titles=tuple(states),
                                     print_grid=False,
                                     horizontal_spacing=0.05,
                                     vertical_spacing=0.05)
        #The shared layout does not depend on the state, so apply it once.
        fig.update_layout(self.get_layout(""))
        pos = [1, 1]
        #Iterating through states.
        for state in states:
            trace_list = self.get_trace(DICT[state])
            pos = self.update_pos_row(pos, ncols)
            fig = self.add_trace_to_fig(fig, trace_list, pos)
            #Advance to the next column; wrapping happens on the next pass.
            pos[-1] = pos[-1] + 1
        fig.update_layout(showlegend=False, height=5000, width=800)
        fig.show()

## Getting the State-wise First and Second Digits of Confirmed and Deceased Cases

In [20]:
class second_wave_info(plotly_plots):
    """Per-state digit counts for the digit columns of a prepared data frame.

    data_frame must have a 'State' column; every column from the fifth
    onward is treated as a digit column to be counted.
    """

    def __init__(self, data_frame):
        #Run the base initialiser so inherited attributes exist on this object.
        super().__init__()
        self.df = data_frame  #The whole data frame
        #Positional selection: everything after the first four columns.
        self.cols = list(self.df.columns)[4:]
        self.states = self.df['State'].unique()
        #Filled by get_all_state_benford_digit_count():
        #{state: {column: {digit: count}}}
        self.all_state_benford_count = {}

    def get_list(self, df, col):
        """Return the values of ``df[col]`` as a sorted list of ints.

        NaN entries are skipped (NaN is the only value not equal to itself).
        """
        return sorted(int(v) for v in df[col].tolist() if v == v)

    def get_all_state_benford_digit_count(self):
        """Count how often each digit occurs per state and digit column.

        Stores the result on ``self.all_state_benford_count`` and returns it.
        Digits appear in ascending order inside each inner dict.
        """
        #Iterating through all states.
        for state in self.states:
            temp_df = self.df[(self.df['State'] == state)]
            temp_dict = {}
            #Getting the first and second digits of state in loop.
            for col in self.cols:
                #Single pass over the sorted digits instead of calling
                #list.count once per element.
                digit_counts = {}
                for digit in self.get_list(temp_df, col):
                    digit_counts[digit] = digit_counts.get(digit, 0) + 1
                temp_dict[col] = digit_counts
            self.all_state_benford_count[state] = temp_dict
        return self.all_state_benford_count

    def get_hist_for_state(self, state, all_state=False):
        """Draw the digit histogram(s).

        state: name of a single state to plot; pass a falsy value to skip.
        all_state: when true, also draw the combined all-states figure
        (still under construction).
        """
        #Compute the counts on demand so plotting works even if
        #get_all_state_benford_digit_count() has not been called yet.
        if not self.all_state_benford_count:
            self.get_all_state_benford_digit_count()
        if state:  #Produces histogram of a state passed as a string 'state'.
            state_benford = self.all_state_benford_count[state]
            #This class already inherits the plotting helpers.
            self.draw_bar_for_state(state, state_benford)
        ############################################################
        #####  Don't use what is below now. Under Construction #####
        ############################################################
        if all_state:  #Produces histogram for all states.
            self.draw_bar_for_all_states(self.all_state_benford_count, 2)

In [21]:
#Build the analysis object from the prepared frame and compute the
#per-state digit counts ({state: {column: {digit: count}}}).
India = second_wave_info(states_df)
indian_states_benford_dict = India.get_all_state_benford_digit_count()

* `indian_states_benford_dict` contains all the state-wise information needed for the study of Benford's Law.
* For example, to get the digit counts for a single state, say Tamil Nadu:

In [22]:
#Pretty-print the digit counts of every digit column for Tamil Nadu.
my_printer.pprint(indian_states_benford_dict['Tamil Nadu'])

{'First Digit of Confirmed': {1: 39, 2: 50, 8: 21, 9: 14},
 'First Digit of Deceased': {1: 68, 2: 24, 3: 32},
 'Second Digit of Confirmed': {0: 14,
                               1: 11,
                               2: 12,
                               3: 12,
                               4: 23,
                               5: 16,
                               6: 13,
                               7: 8,
                               8: 8,
                               9: 7},
 'Second Digit of Deceased': {0: 7,
                              1: 8,
                              2: 43,
                              3: 29,
                              4: 9,
                              5: 6,
                              6: 5,
                              7: 6,
                              8: 6,
                              9: 5}}


## Plotting the State-Wise Distribution of First and Second Digits

In [11]:
#One figure per state, in the order the states appear in the data.
for state_name in states:
    India.get_hist_for_state(state=state_name, all_state=False)

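### State-wise analysis
* For data that follows Benford's Law, the first-digit histogram is heavily skewed towards small digits: 1 is expected about 30% of the time, falling to under 5% for 9 (a leading digit is never 0). The second-digit distribution is much flatter, declining gently from 0 (about 12%) to 9 (about 8.5%).
* Assam, Goa, Mizoram, Sikkim and Tamil Nadu approximately follow Benford's Law, as can be inferred from their histograms: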

In [23]:
#States whose histograms look approximately Benford-like.
states_obey_benford = ['Assam', 'Goa', 'Mizoram', 'Sikkim', 'Tamil Nadu']
for state_name in states_obey_benford:
    India.get_hist_for_state(state=state_name, all_state=False)