In [None]:
import warnings
# Silence every warning for the rest of the session, including deprecation warnings raised by libraries.
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Import Relevant Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [None]:
# Directory that holds the dataset CSV files. Keep the trailing slash: file names are appended to
# this string by plain concatenation when the CSVs are loaded.
root = '../input/indian-school-education-statistics/'

In [None]:
# Load the dropout-ratio table from the dataset directory.
drop_out = pd.read_csv(
    filepath_or_buffer=root + 'dropout-ratio-2012-2015.csv',
)


In [None]:
# Total number of missing cells across the whole table (per-column counts, then summed).
int(drop_out.isna().sum().sum())

In [None]:
# Preview the first two rows.
drop_out.iloc[:2]

In [None]:
# Distinct state / union-territory labels, in order of first appearance.
pd.unique(drop_out['State_UT'])

In [None]:
# Number of distinct state labels before cleaning (missing values, if any, count as one label).
drop_out['State_UT'].nunique(dropna=False)

### Some state names appear under two different spellings and are therefore counted as two different states. For example, "Tamil  Nadu" (with two spaces) and "Tamil Nadu" should be the same state. Let's correct that first.

In [None]:
# Collapse every run of whitespace inside a state name to a single space and trim the ends, so
# that spellings such as 'Tamil  Nadu' and 'Tamil Nadu' are counted as the same state.
# This covers the three known cases ('Arunachal  Pradesh', 'Madhya  Pradesh', 'Tamil  Nadu')
# as well as any other label that differs only by stray whitespace.
drop_out['State_UT'] = (
    drop_out['State_UT']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
# Number of distinct state labels after the name clean-up.
drop_out['State_UT'].nunique(dropna=False)

In [None]:
# Print the column names, non-null counts and dtypes of the table.
drop_out.info()

In [None]:
# Preview the first three rows.
drop_out.iloc[:3]

In [None]:
# Column labels of the dropout table as a plain Python list.
all_cols = list(drop_out.columns)

In [None]:
# Every column after the first two is treated as a measurement category to be averaged per state.
# NOTE(review): presumably the two skipped columns are the state and the year identifiers - confirm.
categories = all_cols[2:]

From the data description I learned that NR means Not-Recorded, so we can either drop every row that contains NR or fill it with 0. We'll go with the latter.
I also discovered that the string 'Uppe_r_Primary' appears as a value in a numerical feature, so we can treat it as a missing value as well.

In [None]:
# Both imputers overwrite one sentinel string with the constant 0; they differ only in the sentinel.
zero_fill = {'strategy': 'constant', 'fill_value': 0}
imputer = SimpleImputer(missing_values='NR', **zero_fill)
imputer_1 = SimpleImputer(missing_values='Uppe_r_Primary', **zero_fill)

In [None]:
# Replace the two sentinel strings with 0. SimpleImputer returns a plain NumPy array, so the
# result is wrapped back into a DataFrame carrying the original column labels.
drop_out = pd.DataFrame(
    imputer_1.fit_transform(imputer.fit_transform(drop_out)),
    columns=all_cols,
)
# The array round-trip leaves every column with dtype object, so the measurement columns are cast
# back to numeric before they are aggregated. Any remaining non-numeric token raises here instead
# of silently corrupting the per-state means.
drop_out[categories] = drop_out[categories].apply(pd.to_numeric)

In [None]:
# Preview the first three rows after imputation.
drop_out.iloc[:3]

In [None]:
# One single-column frame per category: the mean of that category within each State_UT group,
# stored under the column name 'mean_<category>'.
grouped_by_state = drop_out.groupby(['State_UT'])
tmp = [
    grouped_by_state[categ].mean().to_frame(name='mean_' + categ)
    for categ in categories
]

In [None]:
# Put the per-category mean frames side by side, aligned on their shared State_UT index.
# A single concat replaces the chain of pairwise merges and leaves the frames in `tmp` untouched,
# so this cell can be re-run safely. join='inner' keeps only the states present in every frame,
# which matches the default inner merge used previously.
drop_out_per_state = (
    pd.concat(tmp, axis=1, join='inner')
    .rename_axis('State_UT')  # guarantee the key keeps its label
    .reset_index()            # State_UT becomes a regular column, as the set_index cell below expects
)

In [None]:
# Promote the state name to the row index, then preview the first five rows.
drop_out_per_state = drop_out_per_state.set_index('State_UT')
drop_out_per_state.iloc[:5]

In [None]:
# Heat map of the per-state means: one row per state / UT, one column per averaged category.
# An explicit Axes is used so the title and labels are attached to this figure rather than to
# whatever pyplot considers "current"; the tall figure keeps one readable row per state.
fig, ax = plt.subplots(figsize=(10, 20))
sns.heatmap(drop_out_per_state, annot=True, ax=ax)
ax.set(
    title='Mean dropout ratio per state / UT',
    xlabel='Category',
    ylabel='State / UT',
)
plt.show()

From the heat map above, we can conclude that "Odisha" has the highest school dropout ratio.


In [None]:
# Load the boys' and girls' toilet tables from the dataset directory.
boys_t, girls_t = (
    pd.read_csv(root + csv_name)
    for csv_name in (
        'schools-with-boys-toilet-2013-2016.csv',
        'schools-with-girls-toilet-2013-2016.csv',
    )
)

In [None]:
# Preview the first three rows of the boys' toilet table.
boys_t.iloc[:3]

In [None]:
# Number of distinct state labels in the boys' toilet table (missing values, if any, count as one).
boys_t['State_UT'].nunique(dropna=False)

In [None]:
# One single-column frame per measurement column (everything after the first two columns):
# the mean of that column within each State_UT group, stored as 'mean_<column>'.
boys_by_state = boys_t.groupby(['State_UT'])
tmp_1 = [
    boys_by_state[col].mean().to_frame(name='mean_' + col)
    for col in boys_t.columns.to_list()[2:]
]

In [None]:
# Put the per-column mean frames side by side, aligned on their shared State_UT index.
# A single concat replaces the chain of pairwise merges and leaves the frames in `tmp_1` untouched,
# so this cell can be re-run safely. join='inner' keeps only the states present in every frame,
# which matches the default inner merge used previously.
boys_t_per_state = (
    pd.concat(tmp_1, axis=1, join='inner')
    .rename_axis('State_UT')  # guarantee the key keeps its label
    .reset_index()            # State_UT becomes a regular column, as the set_index cell below expects
)

In [None]:
# sum(boys_t_per_state.loc[boys_t_per_state.index[0]])

In [None]:
# Promote the state name to the row index, then preview the first five rows.
boys_t_per_state = boys_t_per_state.set_index('State_UT')
boys_t_per_state.iloc[:5]

In [None]:
# Bar chart of the per-state 'mean_All Schools' values (wide figure: one bar per state).
boys_t_per_state['mean_All Schools'].plot.bar(figsize=(30, 5))

In [None]:
# Bar chart of the summary statistics (count, mean, std, min, quartiles, max) of the per-state
# 'mean_All Schools' values.
all_schools_stats = boys_t_per_state['mean_All Schools'].describe()
all_schools_stats.plot.bar(figsize=(10, 4))