In [9]:
# In this project we will look at literacy rates across the states of India and see what disparities 
# there are between rural and urban populations
# First, read in the dataset and import the right libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
literacy = pd.read_csv("C:\\Users\\user\\Desktop\\Data\\GOI.csv")
literacy.head()

Unnamed: 0,Category,Country/ States/ Union Territories Name,Literacy Rate (Persons) - Total - 2001,Literacy Rate (Persons) - Total - 2011,Literacy Rate (Persons) - Rural - 2001,Literacy Rate (Persons) - Rural - 2011,Literacy Rate (Persons) - Urban - 2001,Literacy Rate (Persons) - Urban - 2011
0,Country,INDIA,64.8,73.0,58.7,67.8,79.9,84.1
1,State,Andhra Pradesh,60.5,67.0,54.5,60.4,76.1,80.1
2,State,Arunachal Pradesh,54.3,65.4,47.8,59.9,78.3,82.9
3,State,Assam,63.3,72.2,59.7,69.3,85.3,88.5
4,State,Bihar,47.0,61.8,43.9,59.8,71.9,76.9


In [10]:
# First, we'll inspect the dataset to check for any null values
literacy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 8 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Category                                 36 non-null     object 
 1   Country/ States/ Union Territories Name  36 non-null     object 
 2   Literacy Rate (Persons) - Total - 2001   36 non-null     float64
 3   Literacy Rate (Persons) - Total - 2011   36 non-null     float64
 4   Literacy Rate (Persons) - Rural - 2001   36 non-null     float64
 5   Literacy Rate (Persons) - Rural - 2011   36 non-null     float64
 6   Literacy Rate (Persons) - Urban - 2001   36 non-null     float64
 7   Literacy Rate (Persons) - Urban - 2011   36 non-null     float64
dtypes: float64(6), object(2)
memory usage: 2.4+ KB


In [11]:
# So, we have 36 rows, one for each state or territory plus one for India itself. There are no null
# values, making this a nice dataset to work with.

# The columns show the literacy rates in 2001 and 2011, broken down by rural and urban population
# Let's first see which states have seen the biggest increase in literacy rate. We'll also rename our columns
# to give them heading that are easier to work with.
literacy.columns = ["Category", "Name", "2001", "2011", "2001 rural", "2011 rural", "2001 urban", "2011 urban"]
literacy.drop(["Category"], axis=1, inplace=True) # We don't need the category column
literacy.head()

Unnamed: 0,Name,2001,2011,2001 rural,2011 rural,2001 urban,2011 urban
0,INDIA,64.8,73.0,58.7,67.8,79.9,84.1
1,Andhra Pradesh,60.5,67.0,54.5,60.4,76.1,80.1
2,Arunachal Pradesh,54.3,65.4,47.8,59.9,78.3,82.9
3,Assam,63.3,72.2,59.7,69.3,85.3,88.5
4,Bihar,47.0,61.8,43.9,59.8,71.9,76.9


In [12]:
literacy["Change"] = literacy["2011"] - literacy["2001"]
print("Most improved:", literacy["Change"].max())
print("Least improved:", literacy["Change"].min())
print("Average improvement", literacy["Change"].mean())


Most improved: 18.6
Least improved: 2.5
Average improvement 8.27777777777778


In [13]:
# The biggest jump is 18.6, and the smallest 2.5. Let's see which states these were:
print(literacy[literacy["Change"] == 18.6])
print(literacy[literacy["Change"] == 2.5])

            Name  2001  2011  2001 rural  2011 rural  2001 urban  2011 urban  \
31  D & N Haveli  57.6  76.2        49.3        64.1        84.4        89.8   

    Change  
31    18.6  
       Name  2001  2011  2001 rural  2011 rural  2001 urban  2011 urban  \
18  Mizoram  88.8  91.3        81.3        84.1        96.1        97.6   

    Change  
18     2.5  


In [18]:
# So, D&N Haveli has seen the biggest jump, and Mizoram the least.

# If we group together the most and least improved, can we see where the improvements come from?
literacy.sort_values(by=["Change"], ascending=False, inplace=True)
literacy["Rural Change"] = literacy["2011 rural"] - literacy["2001 rural"]
literacy["Urban Change"] = literacy["2011 urban"] - literacy["2001 urban"]
top_5 = literacy[:5]
bottom_5 = literacy[-5:]
print("Top 5 average rural/urban change:", top_5["Rural Change"].mean(), "/", top_5["Urban Change"].mean())
print("Bottom 5 average rural/urban change:", bottom_5["Rural Change"].mean(), "/", bottom_5["Urban Change"].mean())



Top 5 average rural/urban change: 14.76 / 4.559999999999997
Bottom 5 average rural/urban change: 4.160000000000002 / 3.0199999999999987


In [20]:
# So, we can see that for those states that have increased their literacy rates the most, this has
# come through improvements in rural areas, where we can suppose the literacy rate tends to be lower
# We can easily check this:
print(literacy[literacy["Name"] == "INDIA"])


    Name  2001  2011  2001 rural  2011 rural  2001 urban  2011 urban  Change  \
0  INDIA  64.8  73.0        58.7        67.8        79.9        84.1     8.2   

   Rural Change  Urban Change  
0           9.1           4.2  


In [None]:
# Here we see that overall literacy has increased in India, but that rural literacy are generally lower
# than those in cities
# The big improvement across the country has come in rural areas, where there is more ground to make up