# Essential Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from statsmodels.graphics.gofplots import qqplot
%matplotlib inline
import seaborn as sns
import plotly.express as px
import geopandas as gpd
import re

# Let's look at the data...

In [None]:
# Load the district-wise literacy table.
df = pd.read_csv('../input/india-literacy-data-district-wise/Literacy Data 2011.csv')
# Drop the first column (presumably a saved row index -- TODO confirm against the CSV).
df = df.iloc[:, 1:]
# Strip leading whitespace from the name columns so they can be used as join keys.
# The vectorized .str accessor leaves missing values as NaN instead of raising
# AttributeError the way a per-element `x.lstrip()` would.
for name_col in ('State', 'District'):
    df[name_col] = df[name_col].str.lstrip()
df.head()

# EDA

## Distribution of Literacy

In [None]:
def univariate_dist(data, col, color=None, theme='ggplot', figsize=(12, 10), hist_bins='auto'):
    """
    Plot the univariate distribution of one column of a pandas DataFrame.

    Three panels are drawn on a 2x3 grid: a histogram with a KDE overlay
    (top-left), a QQ plot with a 45-degree reference line (bottom-left) and a
    boxplot spanning both rows (right).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``col``.
    col : str
        Name of the column to plot.
    color : optional
        Colour forwarded to all three plotting calls.
    theme : str
        Matplotlib style applied while the figure is built.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    hist_bins : str or int
        ``bins`` argument forwarded to ``sns.histplot``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``data``.
    """
    # Missing values are removed up front for the QQ plot only; the seaborn
    # calls below receive the original column.
    qq_values = data[col].dropna()
    with plt.style.context(theme):
        fig = plt.figure(figsize=figsize)
        # Adjust spacing on this figure explicitly rather than on whatever
        # pyplot currently considers the active figure.
        fig.subplots_adjust(wspace=0.5, hspace=0.4)
        spec = gridspec.GridSpec(2, 3, figure=fig)
        # grid axes
        ax1 = fig.add_subplot(spec[0, :-1])  # top-left: histogram
        ax1.set_title('Histogram', color='crimson')
        ax2 = fig.add_subplot(spec[1, :-1])  # bottom-left: QQ plot
        ax2.set_title('QQ Plot', color='crimson')
        ax3 = fig.add_subplot(spec[:, -1:])  # right column: boxplot
        ax3.set_title('Boxplot', color='crimson')
        # plots
        sns.histplot(data=data, x=col, ax=ax1, color=color, kde=True, bins=hist_bins)
        qqplot(qq_values, fit=True, line='45', ax=ax2, color=color)
        sns.boxplot(y=data[col], ax=ax3, color=color)
        # plt.show() works on every backend; Figure.show() only warns on
        # non-GUI backends such as the notebook inline backend.
        plt.show()
    
# Distribution of the district-level literacy rate.
univariate_dist(data=df, col='Literacy', color='teal')

## Average Literacy Rate of Indian States

In [None]:
# Mean of the district literacy rates within each state, ordered from lowest to highest.
avg_state_literacy = (
    df.groupby('State')['Literacy']
    .mean()
    .reset_index()
    .sort_values(by='Literacy')
)

- The figure below shows the average literacy rate of each state, computed as the unweighted mean of the literacy rates of all districts within that state.

In [None]:
# India state boundaries (shapefile).
map_df = gpd.read_file('../input/india-states/Igismap/Indian_States.shp')
# Name corrections so the shapefile's state names line up with the literacy data.
map_df['st_nm'] = map_df['st_nm'].str.replace('&', 'and', regex=False)
# NOTE(review): these fixes are positional -- they assume column 0 holds the
# state name and that the rows appear in this shapefile's order; verify if the
# shapefile is ever replaced.
map_df.iloc[0, 0] = 'Andaman And Nicobar Islands'
map_df.iloc[1, 0] = 'Arunachal Pradesh'
map_df.iloc[6, 0] = 'Dadra and Nagar Haveli'
map_df.iloc[34, 0] = 'Orissa'
map_df.iloc[23, 0] = 'Delhi'
# Add a Telangana row with no literacy value (presumably absent from the 2011
# data -- TODO confirm). The guard keeps the cell idempotent: re-running it
# must not add the row a second time. pd.concat is used because
# DataFrame.append no longer exists in pandas >= 2.0.
if not avg_state_literacy['State'].eq('Telangana').any():
    telangana_row = pd.DataFrame([{'State': 'Telangana', 'Literacy': np.nan}])
    avg_state_literacy = pd.concat([avg_state_literacy, telangana_row], ignore_index=True)
# Join the state geometries (left side) with the average literacy per state.
merged = (map_df.set_index('st_nm').sort_index()).join(avg_state_literacy.set_index('State').sort_index())
# plot
with plt.style.context('ggplot'):
    fig, ax = plt.subplots(1, figsize=(15, 10))
    ax.axis('off')
    ax.set_title('Average Literacy Rates for each state', fontdict={'fontsize': '15', 'fontweight' : '3'})
    merged.plot(column='Literacy', cmap='RdYlBu', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)
    plt.show()

In [None]:
# Dot plot of the average literacy rate, one row per state.
state_scatter = px.scatter(
    data_frame=avg_state_literacy,
    x='Literacy',
    y='State',
    template='ggplot2',
    title='Average Literacy Rate',
    height=800,
)
state_scatter

- The figure below shows the literacy rate of Indian districts. The literacy rate is binned into 5 equal-width intervals, as shown in the legend.

In [None]:
# District boundaries: keep only the geometry and the district name.
district_map = gpd.read_file('../input/india-district-wise-shape-files/output.shp')[['geometry', 'distname']]

# Left-join the literacy table onto the district shapes by district name.
# NOTE(review): the join key is the district name alone; if the same name
# occurs in more than one state the join will produce duplicate rows -- verify.
district_shapes = district_map.set_index('distname').sort_index()
literacy_by_district = df.set_index('District').sort_index()
merged2 = district_shapes.join(literacy_by_district)
# Only districts that have both a shape and a literacy value are drawn.
merged2 = merged2[['geometry', 'Literacy']].dropna()

# Choropleth with the literacy rate split into 5 equal-interval classes.
title_font = {'fontsize': '25', 'fontweight' : '3'}
with plt.style.context('ggplot'):
    fig, ax = plt.subplots(1, figsize=(12, 12))
    ax.axis('off')
    ax.set_title('District wise Literacy', fontdict=title_font)
    merged2.plot(
        column='Literacy',
        cmap='RdYlBu',
        linewidth=0.8,
        ax=ax,
        edgecolor='0.8',
        legend=True,
        scheme='EqualInterval',
        k=5,
    )
    plt.show()

## Top 10 districts with the highest literacy rate

In [None]:
# Ten districts with the highest literacy rate, in ascending order (highest last).
# sort_values places NaN last by default, so a plain tail slice would return
# rows with missing Literacy; na_position='first' keeps them out of the tail.
(
    df.sort_values(by='Literacy', na_position='first')
    .set_index('District')
    .tail(10)
    .style.background_gradient(cmap='YlGnBu')
)

## Top 10 districts with the lowest literacy rate

In [None]:
# Ten districts with the lowest literacy rate, in ascending order (lowest first).
lowest_literacy = df.sort_values(by='Literacy').set_index('District').head(10)
lowest_literacy.style.background_gradient(cmap='RdYlBu')