## The task is to use machine learning to forecast gross enrollment in 2015-16 using data from the 2012-15 sessions, such as dropout rate and water and computer facilities.
## In this notebook I approach the task as a classification problem and separate gross enrollment into 2 groups.

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Import Relevant Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, log_loss, f1_score, mean_squared_error

In [None]:
sm, md, lg = 13, 15, 20 
plt.rc('font', size=sm)
plt.rc('axes', labelsize=sm)
plt.rc('xtick', labelsize=md)
plt.rc('ytick', labelsize=md)
plt.rc('legend', fontsize=md)
plt.rc('figure', titlesize=lg)

In [None]:
root = '../input/indian-school-education-statistics/'

In [None]:
drop_out = pd.read_csv(root+'dropout-ratio-2012-2015.csv')
enrol = pd.read_csv(root+'gross-enrollment-ratio-2013-2016.csv')
comp = pd.read_csv(root+'percentage-of-schools-with-comps-2013-2016.csv')
elect = pd.read_csv(root+'percentage-of-schools-with-electricity-2013-2016.csv')
water = pd.read_csv(root+'percentage-of-schools-with-water-facility-2013-2016.csv')
boys = pd.read_csv(root+'schools-with-boys-toilet-2013-2016.csv')
girls = pd.read_csv(root+'schools-with-girls-toilet-2013-2016.csv')

In [None]:
# Map the double-spaced / abbreviated state spellings found in the dropout file
# onto single canonical spellings; every other value is left untouched.
state_spelling_fixes = {
    'Arunachal  Pradesh': "Arunachal Pradesh",
    'Madhya  Pradesh': "Madhya Pradesh",
    'Tamil  Nadu': "Tamil Nadu",
    'A & N Islands': "Andaman & Nicobar Islands",
}
drop_out['State_UT'] = drop_out['State_UT'].replace(state_spelling_fixes)

## Data preprocessing

In [None]:
drop_out.sort_values(by=['State_UT','year'],inplace=True,ignore_index=True)
enrol.sort_values(by=['State_UT','Year'], inplace=True,ignore_index=True)
comp.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)
elect.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)
water.sort_values(by=['State/UT','Year'], inplace=True,ignore_index=True)
boys.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)
girls.sort_values(by=['State_UT','year'], inplace=True,ignore_index=True)

In [None]:
elect.head(3)

In [None]:
comp.head(3)

In [None]:
def CreateDataSets(mean_dataframes, tier):
    '''
    Slice the three columns belonging to one student level out of a wide frame.

    The frame's first column is the state key and must be named 'State_UT';
    the following columns are read in consecutive groups of three, one group
    per level (columns 1-3 for tier 1, 4-6 for tier 2, 7-9 for tier 3 and
    10-12 for tier 4).

    mean_dataframes: wide DataFrame as described above.
    tier: which student level to create, 1 for primary, 2 for upper primary,
          3 for secondary, 4 for Higher secondary.
    result: DataFrame indexed by 'State_UT' containing only that level's
            three columns. When tier 4 has only two columns available, a
            constant column of ones named 0 fills the third slot so every
            tier has the same width (callers drop it by that name).
    raises: ValueError if tier is not 1, 2, 3 or 4.
    '''
    if tier not in (1, 2, 3, 4):
        raise ValueError('tier must be 1, 2, 3 or 4, got {!r}'.format(tier))

    columns = mean_dataframes.columns.to_list()
    print('Create Dataset with {} features'.format(len(columns)))

    # Position of the first of the three columns for this tier.
    first = 3 * (int(tier) - 1) + 1
    third = first + 2

    pieces = [mean_dataframes[columns[0]]]
    pieces.extend(mean_dataframes[columns[pos]] for pos in (first, first + 1))

    if tier == 4 and third >= len(columns):
        # Build the placeholder on the frame's own index; a default RangeIndex
        # would misalign in concat whenever the input is indexed differently.
        pieces.append(pd.Series(np.ones(len(mean_dataframes)),
                                index=mean_dataframes.index, name=0))
    else:
        pieces.append(mean_dataframes[columns[third]])

    result = pd.concat(pieces, axis=1)
    result.set_index('State_UT', inplace=True)
    return result

In [None]:
def get_df_name(df):
    '''
    Return the name of a module-level variable bound to `df`, followed by '_'.

    Lookup is by identity (`is`) over this module's globals. Names that do not
    start with an underscore are preferred, so interactive output-cache
    aliases such as `_` or `_12` do not win over the user's own variable name;
    an underscore name is used only when nothing else refers to the object.

    raises: IndexError if no global variable refers to `df`.
    '''
    # Snapshot the items so the scan is unaffected by concurrent namespace changes.
    bound_names = [name for name, value in list(globals().items()) if value is df]
    if not bound_names:
        raise IndexError('no global variable is bound to the given object')

    public_names = [name for name in bound_names if not name.startswith('_')]
    return (public_names or bound_names)[0] + '_'

def PreprocessFiles(dataframe, categories, year_filter=('2013-14', '2014-15'), year='year', city='State_UT'):
    '''
    Average the given columns per state over the selected years.

    dataframe: long-format frame with one row per state and year. It must be
               bound to a module-level variable, because that variable's name
               (via get_df_name) is used as the prefix of the output columns.
               NOTE: the `categories` columns are cast to float in place on
               this frame.
    categories: names of the columns to average.
    year_filter: years to be used while aggregating the dataset. Every listed
                 year is used; a single string is treated as one year.
    year: name of the year column.
    city: name of the state column.
    result: DataFrame with `city` as the first column, one row per state, and
            one '<frame name>_mean_<category>' column per category.
    raises: ValueError if `categories` or `year_filter` is empty.
    '''
    categories = list(categories)
    if not categories:
        raise ValueError('categories must name at least one column')

    if isinstance(year_filter, str):
        year_filter = [year_filter]
    wanted_years = list(year_filter)
    if not wanted_years:
        raise ValueError('year_filter must contain at least one year')

    # The raw files (and the imputer output) can carry numbers as objects/strings.
    for categ in categories:
        dataframe[categ] = dataframe[categ].astype(float)

    selected_rows = dataframe[dataframe[year].isin(wanted_years)]
    mean_dataframe_per_state = selected_rows.groupby(city)[categories].mean()

    # Name each output column explicitly rather than by position, so the state
    # key can never be mistaken for a feature column.
    prefix = get_df_name(dataframe)
    mean_dataframe_per_state.columns = [prefix + 'mean_' + categ for categ in categories]

    # Expose the state key as the first regular column.
    return mean_dataframe_per_state.reset_index()

## Drop Out Preprocessing

In [None]:
drop_out.head(3)

In [None]:
imputer = SimpleImputer(missing_values = 'NR', strategy='constant', fill_value=0)
imputer_1 = SimpleImputer(missing_values = 'Uppe_r_Primary', strategy='constant', fill_value=0)

In [None]:
drop_out_cols = drop_out.columns.to_list()
drop_out = imputer.fit_transform(drop_out)
drop_out = pd.DataFrame(imputer_1.fit_transform(drop_out), columns=drop_out_cols)

In [None]:
mean_drop_out_per_state = PreprocessFiles(drop_out, drop_out.columns[2:], year_filter=['2012-13', '2013-14'])

In [None]:
test_drop_out = PreprocessFiles(drop_out, drop_out.columns[2:], year_filter=['2014-15'])

In [None]:
mean_drop_out_per_state.head(3)

In [None]:
primary_drop_out = CreateDataSets(mean_drop_out_per_state, 1)
upp_drop_out = CreateDataSets(mean_drop_out_per_state, 2)
sec_drop_out = CreateDataSets(mean_drop_out_per_state, 3)
higher_drop_out = CreateDataSets(mean_drop_out_per_state, 4)

In [None]:
eval_primary_drop_out = CreateDataSets(test_drop_out, 1)
eval_upp_drop_out = CreateDataSets(test_drop_out, 2)
eval_sec_drop_out = CreateDataSets(test_drop_out, 3)
eval_higher_drop_out = CreateDataSets(test_drop_out, 4)

## Toilet Facilities Preprocessing

In [None]:
boys.head(3)

In [None]:
mean_boys_per_state = PreprocessFiles(boys, boys.columns[2:])

In [None]:
test_boys = PreprocessFiles(boys, boys.columns[2:], year_filter=['2015-16'])

In [None]:
mean_boys_per_state.head(3)

In [None]:
primary_boys = CreateDataSets(mean_boys_per_state, 1)
upp_boys = CreateDataSets(mean_boys_per_state, 2)
sec_boys = CreateDataSets(mean_boys_per_state, 3)
higher_boys = CreateDataSets(mean_boys_per_state, 4)

In [None]:
higher_boys.drop(columns=[0], inplace=True)

In [None]:
higher_boys.head(2)

In [None]:
eval_primary_boys = CreateDataSets(test_boys, 1)
eval_upp_boys = CreateDataSets(test_boys, 2)
eval_sec_boys = CreateDataSets(test_boys, 3)
eval_higher_boys = CreateDataSets(test_boys, 4)

In [None]:
eval_higher_boys.drop(columns=[0], inplace=True)

In [None]:
mean_girls_per_state = PreprocessFiles(girls, girls.columns[2:])

In [None]:
test_girls = PreprocessFiles(girls, girls.columns[2:], year_filter=['2015-16'])

In [None]:
mean_girls_per_state.head(3)

In [None]:
primary_girls = CreateDataSets(mean_girls_per_state, 1)
upp_girls = CreateDataSets(mean_girls_per_state, 2)
sec_girls = CreateDataSets(mean_girls_per_state, 3)
higher_girls = CreateDataSets(mean_girls_per_state, 4)

In [None]:
higher_girls.drop(columns=[0], inplace=True)

In [None]:
higher_girls.head(3)

In [None]:
eval_primary_girls = CreateDataSets(test_girls, 1)
eval_upp_girls = CreateDataSets(test_girls, 2)
eval_sec_girls = CreateDataSets(test_girls, 3)
eval_higher_girls = CreateDataSets(test_girls, 4)

In [None]:
eval_higher_girls.drop(columns=[0], inplace=True)

## Water Facilities 

In [None]:
water.head(3)

In [None]:
mean_water_facilities = PreprocessFiles(water, water.columns[2:], city ='State/UT', year = 'Year')

In [None]:
test_water_fac = PreprocessFiles(water, water.columns[2:],city ='State/UT', year = 'Year', year_filter=['2015-16'])
test_water_fac.rename(columns={'State/UT':'State_UT'}, inplace=True)

In [None]:
mean_water_facilities.head(3)

In [None]:
mean_water_facilities.rename(columns={'State/UT':'State_UT'}, inplace=True)

In [None]:
primary_water = CreateDataSets(mean_water_facilities, 1)
upp_water = CreateDataSets(mean_water_facilities, 2)
sec_water = CreateDataSets(mean_water_facilities, 3)
higher_water = CreateDataSets(mean_water_facilities, 4)

In [None]:
higher_water.drop(columns=[0], inplace=True)

In [None]:
higher_water.head(2)

In [None]:
eval_primary_water_fac = CreateDataSets(test_water_fac, 1)
eval_upp_water_fac = CreateDataSets(test_water_fac, 2)
eval_sec_water_fac = CreateDataSets(test_water_fac, 3)
eval_higher_water_fac = CreateDataSets(test_water_fac, 4)

In [None]:
eval_higher_water_fac.drop(columns=[0], inplace=True)

## Electricity Facilities

In [None]:
elect.head(3)

In [None]:
mean_elect_facilities = PreprocessFiles(elect, elect.columns[2:])

In [None]:
test_elect_fac = PreprocessFiles(elect, elect.columns[2:], year_filter=['2015-16'])

In [None]:
mean_elect_facilities.head(3)

In [None]:
primary_elect = CreateDataSets(mean_elect_facilities, 1)
upp_elect = CreateDataSets(mean_elect_facilities, 2)
sec_elect = CreateDataSets(mean_elect_facilities, 3)
higher_elect = CreateDataSets(mean_elect_facilities, 4)

In [None]:
higher_elect.drop(columns=[0], inplace=True)

In [None]:
upp_elect.head(2)

In [None]:
eval_primary_elect_fac = CreateDataSets(test_elect_fac, 1)
eval_upp_elect_fac = CreateDataSets(test_elect_fac, 2)
eval_sec_elect_fac = CreateDataSets(test_elect_fac, 3)
eval_higher_elect_fac = CreateDataSets(test_elect_fac, 4)

In [None]:
eval_higher_elect_fac.drop(columns=[0], inplace=True)

## Computer Facilities

In [None]:
comp.head(3)

In [None]:
mean_comp_facilities = PreprocessFiles(comp, comp.columns[2:])

In [None]:
test_comp_fac = PreprocessFiles(comp, comp.columns[2:], year_filter=['2015-16'])

In [None]:
mean_comp_facilities.head(3)

In [None]:
primary_comp = CreateDataSets(mean_comp_facilities, 1)
upp_comp = CreateDataSets(mean_comp_facilities, 2)
sec_comp = CreateDataSets(mean_comp_facilities, 3)
higher_comp = CreateDataSets(mean_comp_facilities, 4)

In [None]:
higher_comp.drop(columns=[0], inplace=True)

In [None]:
primary_comp.head(2)

In [None]:
eval_primary_comp_fac = CreateDataSets(test_comp_fac, 1)
eval_upp_comp_fac = CreateDataSets(test_comp_fac, 2)
eval_sec_comp_fac = CreateDataSets(test_comp_fac, 3)
eval_higher_comp_fac = CreateDataSets(test_comp_fac, 4)

In [None]:
eval_higher_comp_fac.drop(columns=[0], inplace=True)

In [None]:
def GenerateTrain(df_list, indexes=()):
    '''
    Join several frames side by side into one feature matrix.

    df_list:  a list of similar dataframes to join into one
    indexes:  positions in df_list to keep, in the order given; when empty
              (or None) every frame in df_list is used.
    result:   the selected frames concatenated column-wise (axis=1).
    '''
    if indexes is None or len(indexes) == 0:
        selected = list(df_list)
    else:
        selected = [df_list[position] for position in indexes]

    return pd.concat(selected, axis=1)

In [None]:
pry_useful = [primary_comp,primary_elect, primary_drop_out, primary_boys,primary_girls, primary_water]
pry_df = GenerateTrain(pry_useful)

In [None]:
upp_useful = [upp_comp,upp_elect, upp_drop_out, upp_boys,upp_girls, upp_water]
upp_df = GenerateTrain(upp_useful)

In [None]:
sec_useful = [sec_comp,sec_elect, sec_drop_out, sec_boys,sec_girls, sec_water]
sec_df = GenerateTrain(sec_useful)

In [None]:
high_useful = [higher_comp,higher_elect, higher_drop_out, higher_boys,higher_girls, higher_water]
high_df = GenerateTrain(high_useful)

Test

In [None]:
eval_pry_useful = [eval_primary_comp_fac,eval_primary_elect_fac,eval_primary_drop_out, eval_primary_boys, eval_primary_girls,eval_primary_water_fac]
eval_pry_df = GenerateTrain(eval_pry_useful)

In [None]:
eval_upp_useful = [eval_upp_comp_fac,eval_upp_elect_fac,eval_upp_drop_out, eval_upp_boys, eval_upp_girls,eval_upp_water_fac]
eval_upp_df = GenerateTrain(eval_upp_useful)

In [None]:
eval_sec_useful = [eval_sec_comp_fac,eval_sec_elect_fac,eval_sec_drop_out, eval_sec_boys, eval_sec_girls,eval_sec_water_fac]
eval_sec_df = GenerateTrain(eval_sec_useful)

In [None]:
eval_high_useful = [eval_higher_comp_fac,eval_higher_elect_fac,eval_higher_drop_out, eval_higher_boys, eval_higher_girls,eval_higher_water_fac]
eval_high_df = GenerateTrain(eval_high_useful)

## Gross Enrollment

what we're going to predict

In [None]:
# Normalise state spellings in the enrollment file. The result is assigned back
# to the column: an in-place replace on a column selection acts on a temporary
# object and is not guaranteed to update the frame.
# 'Uttaranchal' is the former name of Uttarakhand; mapping it to 'Uttar Pradesh'
# would average two different states together in the per-state groupby.
# NOTE(review): assumes the feature files spell the state 'Uttarakhand' - confirm.
enrol['State_UT'] = enrol['State_UT'].replace({
    'MADHYA PRADESH':'Madhya Pradesh',
    'Pondicherry':'Puducherry',
    'Uttaranchal':'Uttarakhand'
})

In [None]:
enrol.head(3)

In [None]:
imputer_2 = SimpleImputer(missing_values = 'NR', strategy='constant', fill_value=0)
imputer_3 = SimpleImputer(missing_values = '@', strategy='constant', fill_value=0)

In [None]:
enrol_col = enrol.columns.to_list()
enrol = imputer_2.fit_transform(enrol)
enrol = pd.DataFrame(imputer_3.fit_transform(enrol), columns=enrol_col)

In [None]:
mean_enrol_per_state = PreprocessFiles(enrol, enrol_col[2:], year='Year')

In [None]:
test_mean_enrol_per_state = PreprocessFiles(enrol, enrol_col[2:], year='Year', year_filter=['2015-16'])

In [None]:
mean_enrol_per_state.head(3)

In [None]:
enrol_primary = CreateDataSets(mean_enrol_per_state, 1)
enrol_upper_primary = CreateDataSets(mean_enrol_per_state, 2)
enrol_secondary = CreateDataSets(mean_enrol_per_state, 3)
enrol_higher = CreateDataSets(mean_enrol_per_state, 4)

In [None]:
enrol_primary.head(3)

In [None]:
test_enrol_primary = CreateDataSets(test_mean_enrol_per_state, 1)
test_enrol_upper_primary = CreateDataSets(test_mean_enrol_per_state, 2)
test_enrol_secondary = CreateDataSets(test_mean_enrol_per_state, 3)
test_enrol_higher = CreateDataSets(test_mean_enrol_per_state, 4)

In [None]:
def CreateTargetFeature(dataframe, gender=2):
    '''
    Binarise one column around its own median.

    gender: position of the column to use - 0 for male, 1 for female,
            2 for both male and female.
    result: Series with the frame's index and the chosen column's name,
            holding 0 (BAD) where the value is below the column median and
            1 (GOOD) where it is at or above it. Missing values stay missing.
    '''
    column_name = dataframe.columns.to_list()[gender]
    values = dataframe[column_name]

    # Take the median of the selected column itself. Reading position `gender`
    # out of describe() relies on integer-position access to a labelled Series
    # and points at the wrong column as soon as the frame has a non-numeric one.
    threshold = values.median()

    def thresholder(value):
        if value < threshold:
            return 0  # BAD
        if value >= threshold:
            return 1  # GOOD
        return None   # NaN compares false both ways

    return values.apply(thresholder)

In [None]:
primary_target = CreateTargetFeature(enrol_primary)
upper_primary_target = CreateTargetFeature(enrol_upper_primary)
secondary_target = CreateTargetFeature(enrol_secondary)
higher_target = CreateTargetFeature(enrol_higher)

In [None]:
eval_primary_target = CreateTargetFeature(test_enrol_primary)
eval_upper_primary_target = CreateTargetFeature(test_enrol_upper_primary)
eval_secondary_target = CreateTargetFeature(test_enrol_secondary)
eval_higher_target = CreateTargetFeature(test_enrol_higher)

In [None]:
def DropNa(df, target):
    '''
    Drop states from `df` that have no entry in `target`, then fill gaps.

    Works in place on `df` and returns None:
      1. rows whose index label does not occur in target.index are removed;
      2. remaining missing values are forward-filled from the row above
         (a gap in the very first row therefore stays missing).

    `target` itself is not modified and rows are not reordered.
    NOTE(review): callers that later pair `df` and `target` by position rely
    on both already sharing the same labels in the same order - confirm.
    '''
    absent_from_target = df.index[~df.index.isin(target.index)]
    df.drop(absent_from_target, inplace=True)
    # DataFrame.ffill replaces the deprecated fillna(method='ffill').
    df.ffill(inplace=True)

In [None]:
def blender(x):
    '''Turn a probability into a hard label: 1 at or above 0.5, 0 below it, None if it compares as neither (NaN).'''
    if x >= 0.5:
        return 1
    if x < 0.5:
        return 0
    return None

In [None]:
def ModelEvaluator(y_true, y_pred):
    '''
    Score predicted probabilities against the true binary labels.

    log_loss and roc_auc_score are computed on the probabilities as given;
    f1_score and accuracy_score are computed on hard labels obtained by
    passing each probability through blender().
    Returns a dict keyed by metric name.
    '''
    probability_loss = log_loss(y_true, y_pred)
    ranking_auc = roc_auc_score(y_true, y_pred,)#multi_class="ovr")

    hard_labels = pd.Series(y_pred).apply(blender)
    f1_value = f1_score(y_true, hard_labels,)#average='macro')
    accuracy_value = accuracy_score(y_true, hard_labels)

    return {
        'log_loss': probability_loss,
        'roc_auc_score': ranking_auc,
        'f1_score': f1_value,
        'accuracy_score': accuracy_value,
    }

In [None]:
def Model(train, target, test, algo=RandomForestClassifier, n_splits=9, n_estimators=100, random_state=2020):
    '''
    Fit one classifier per stratified fold and average their predictions on `test`.

    train: feature DataFrame; rows are paired with `target` by position.
    target: Series of class labels, same length as `train`.
    test: feature DataFrame to predict on (same columns as `train`).
    algo: estimator class accepting `n_estimators` and `random_state`, and
          exposing `predict_proba` and `feature_importances_` once fitted.
    n_splits: number of stratified folds.
    n_estimators, random_state: forwarded to `algo`.
    result: array with one value per row of `test` - the class-1 probability
            averaged over the fold models.

    For every fold the held-out log loss and the features with the highest and
    lowest importance are printed.
    '''
    fold_predictions = []
    splitter = StratifiedKFold(n_splits=n_splits)
    feature_names = train.columns.to_list()

    for fold_no, (train_index, test_index) in enumerate(splitter.split(train, target), start=1):
        print(str(fold_no) + ' iter')
        X_train, X_test = train.iloc[train_index], train.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        m1 = algo(n_estimators=n_estimators, random_state=random_state)
        m1.fit(X_train, y_train)

        # Held-out loss is reported for monitoring only; it does not influence the result.
        preds = m1.predict_proba(X_test)[:,1]
        print('err: ', log_loss(y_test, preds))

        fold_predictions.append(m1.predict_proba(test)[:,1])

        importances = m1.feature_importances_
        best_feature = feature_names[np.argmax(importances)]
        worst_feature = feature_names[np.argmin(importances)]
        print('Best Feature for the {} iteration is {}, while, the worst feature is {}'.format(fold_no, best_feature, worst_feature))

    return np.mean(fold_predictions, axis=0)

In [None]:
metric_list = []                 ### Store the metric results of all Student category types

## Primary Student Category
### datasets are arranged as comp, elect, drop_out, boys, girls, water
### genders in target are arranged as boys, girls, both_gender

## Boys 

In [None]:
# Training features come from the training-period frames (pry_useful); the
# held-out-year frames (eval_pry_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,3,5 = comp, elect, drop_out, boys' toilets, water.
boy_pry_df = GenerateTrain(pry_useful,[0,1,2,3,5])
boy_eval_pry_df = GenerateTrain(eval_pry_useful,[0,1,2,3,5]) 
boy_primary_target = CreateTargetFeature(enrol_primary,0)    #0 for boys
eval_boy_primary_target = CreateTargetFeature(test_enrol_primary,0) 

In [None]:
DropNa(boy_pry_df,boy_primary_target)          # Run cell only once
DropNa(boy_eval_pry_df,eval_boy_primary_target)

In [None]:
boy_primary_target.shape, boy_pry_df.shape, boy_eval_pry_df.shape, eval_boy_primary_target.shape

In [None]:
boy_pry_preds = Model(boy_pry_df, boy_primary_target, boy_eval_pry_df)

In [None]:
ModelEvaluator(eval_boy_primary_target, boy_pry_preds)

In [None]:
metric_list.append(['Primary Student Boys',ModelEvaluator(eval_boy_primary_target, boy_pry_preds)])   # run only once

## Girls

In [None]:
# Training features come from the training-period frames (pry_useful); the
# held-out-year frames (eval_pry_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,4,5 = comp, elect, drop_out, girls' toilets, water.
girl_pry_df = GenerateTrain(pry_useful,[0,1,2,4,5])
girl_eval_pry_df = GenerateTrain(eval_pry_useful,[0,1,2,4,5]) 
girl_primary_target = CreateTargetFeature(enrol_primary,1)    #1 for girls
eval_girl_primary_target = CreateTargetFeature(test_enrol_primary, 1)

In [None]:
DropNa(girl_pry_df, girl_primary_target)   # Run  only once
DropNa(girl_eval_pry_df, eval_girl_primary_target)

In [None]:
girl_primary_target.shape, girl_pry_df.shape, girl_eval_pry_df.shape, eval_girl_primary_target.shape

In [None]:
girl_pry_preds = Model(girl_pry_df, girl_primary_target, girl_eval_pry_df)

In [None]:
ModelEvaluator(eval_girl_primary_target, girl_pry_preds)

In [None]:
metric_list.append(['Primary Student Girls', ModelEvaluator(eval_girl_primary_target, girl_pry_preds) ])   # run only once

## Upper Primary Student Category
### datasets are arranged as comp, elect, drop_out, boys, girls, water
### genders in target are arranged as boys, girls, both_gender

## Boys

In [None]:
# Training features come from the training-period frames (upp_useful); the
# held-out-year frames (eval_upp_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,3,5 = comp, elect, drop_out, boys' toilets, water.
boy_upp_df = GenerateTrain(upp_useful,[0,1,2,3,5])
boy_eval_upp_df = GenerateTrain(eval_upp_useful,[0,1,2,3,5]) 
boy_upp_target = CreateTargetFeature(enrol_upper_primary,0)    #0 for boys
eval_boy_upp_target = CreateTargetFeature(test_enrol_upper_primary, 0)

In [None]:
DropNa(boy_upp_df, boy_upp_target)   # Run  only once
DropNa(boy_eval_upp_df, eval_boy_upp_target)

In [None]:
boy_upp_target.shape, boy_upp_df.shape, boy_eval_upp_df.shape, eval_boy_upp_target.shape

In [None]:
boy_upp_preds = Model(boy_upp_df, boy_upp_target, boy_eval_upp_df)

In [None]:
ModelEvaluator(eval_boy_upp_target, boy_upp_preds)

In [None]:
metric_list.append(['Upper Primary Boys',ModelEvaluator(eval_boy_upp_target, boy_upp_preds)])   # run only once 

## Girls

In [None]:
# Training features come from the training-period frames (upp_useful); the
# held-out-year frames (eval_upp_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,4,5 = comp, elect, drop_out, girls' toilets, water.
girl_upp_df = GenerateTrain(upp_useful,[0,1,2,4,5])
girl_eval_upp_df = GenerateTrain(eval_upp_useful,[0,1,2,4,5]) 
girl_upp_target = CreateTargetFeature(enrol_upper_primary,1)    #1 for girls
girl_eval_upp_target = CreateTargetFeature(test_enrol_upper_primary, 1)

In [None]:
DropNa(girl_upp_df, girl_upp_target)   # Run  only once
DropNa(girl_eval_upp_df, girl_eval_upp_target)

In [None]:
girl_upp_target.shape, girl_upp_df.shape, girl_eval_upp_df.shape, girl_eval_upp_target.shape

In [None]:
girl_upp_preds = Model(girl_upp_df, girl_upp_target, girl_eval_upp_df)

In [None]:
ModelEvaluator(girl_eval_upp_target, girl_upp_preds)

In [None]:
metric_list.append(['Upper Primary Girls', ModelEvaluator(girl_eval_upp_target, girl_upp_preds)])   #run only once

## Secondary Student Category
### datasets are arranged as comp, elect, drop_out, boys, girls, water
### genders in target are arranged as boys, girls, both_gender

## Boys

In [None]:
# Training features come from the training-period frames (sec_useful); the
# held-out-year frames (eval_sec_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,3,5 = comp, elect, drop_out, boys' toilets, water.
boy_sec_df = GenerateTrain(sec_useful,[0,1,2,3,5])
boy_eval_sec_df = GenerateTrain(eval_sec_useful,[0,1,2,3,5]) 
boy_sec_target = CreateTargetFeature(enrol_secondary,0)    #0 for boys
boy_eval_sec_target = CreateTargetFeature(test_enrol_secondary, 0)

In [None]:
DropNa(boy_sec_df, boy_sec_target)   # Run  only once
DropNa(boy_eval_sec_df, boy_eval_sec_target)

In [None]:
boy_sec_target.shape, boy_sec_df.shape, boy_eval_sec_df.shape, boy_eval_sec_target.shape

In [None]:
boy_sec_preds = Model(boy_sec_df, boy_sec_target, boy_eval_sec_df)

In [None]:
ModelEvaluator(boy_eval_sec_target, boy_sec_preds)

In [None]:
metric_list.append(['Secondary Student Boys',ModelEvaluator(boy_eval_sec_target, boy_sec_preds)])   # run only once

## Girls

In [None]:
# Training features come from the training-period frames (sec_useful); the
# held-out-year frames (eval_sec_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,4,5 = comp, elect, drop_out, girls' toilets, water.
girl_sec_df = GenerateTrain(sec_useful,[0,1,2,4,5])
girl_eval_sec_df = GenerateTrain(eval_sec_useful,[0,1,2,4,5]) 
girl_sec_target = CreateTargetFeature(enrol_secondary,1)    #1 for girls
girl_eval_sec_target = CreateTargetFeature(test_enrol_secondary,1)

In [None]:
DropNa(girl_sec_df, girl_sec_target)   # Run  only once
DropNa(girl_eval_sec_df, girl_eval_sec_target)

In [None]:
girl_sec_target.shape, girl_sec_df.shape, girl_eval_sec_df.shape, girl_eval_sec_target.shape

In [None]:
girl_sec_preds = Model(girl_sec_df, girl_sec_target, girl_eval_sec_df)

In [None]:
ModelEvaluator(girl_eval_sec_target, girl_sec_preds)

In [None]:
metric_list.append(['Secondary Student Girls', ModelEvaluator(girl_eval_sec_target, girl_sec_preds)])   #run only once

## Higher Secondary Student Category
### datasets are arranged as comp, elect, drop_out, boys, girls, water
### genders in target are arranged as boys, girls, both_gender

## Boys

In [None]:
# Training features come from the training-period frames (high_useful); the
# held-out-year frames (eval_high_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,3,5 = comp, elect, drop_out, boys' toilets, water.
boy_high_df = GenerateTrain(high_useful,[0,1,2,3,5])
boy_eval_high_df = GenerateTrain(eval_high_useful,[0,1,2,3,5]) 
boy_high_target = CreateTargetFeature(enrol_higher,0)    #0 for boys
boy_eval_high_target = CreateTargetFeature(test_enrol_higher, 0)

In [None]:
DropNa(boy_high_df, boy_high_target)   # Run  only once
DropNa(boy_eval_high_df, boy_eval_high_target)

In [None]:
boy_high_target.shape, boy_high_df.shape, boy_eval_high_df.shape, boy_eval_high_target.shape

In [None]:
boy_high_preds = Model(boy_high_df, boy_high_target, boy_eval_high_df)

In [None]:
ModelEvaluator(boy_eval_high_target, boy_high_preds)

In [None]:
metric_list.append(['Higher Secondary Student Boys', ModelEvaluator(boy_eval_high_target, boy_high_preds)])  # run only once

## Girls

In [None]:
# Training features come from the training-period frames (high_useful); the
# held-out-year frames (eval_high_useful) are used for evaluation only, otherwise
# the model is fit on the very rows it is scored on.
# Positions 0,1,2,4,5 = comp, elect, drop_out, girls' toilets, water.
girl_high_df = GenerateTrain(high_useful,[0,1,2,4,5])
girl_eval_high_df = GenerateTrain(eval_high_useful,[0,1,2,4,5]) 
girl_high_target = CreateTargetFeature(enrol_higher,1)    #1 for girls
girl_eval_high_target = CreateTargetFeature(test_enrol_higher,1)

In [None]:
DropNa(girl_high_df, girl_high_target)   # Run  only once
DropNa(girl_eval_high_df, girl_eval_high_target)

In [None]:
girl_high_target.shape, girl_high_df.shape, girl_eval_high_df.shape, girl_eval_high_target.shape

In [None]:
girl_high_preds = Model(girl_high_df, girl_high_target, girl_eval_high_df)

In [None]:
ModelEvaluator(girl_eval_high_target, girl_high_preds)

In [None]:
metric_list.append(['Higher Secondary Student Girls', ModelEvaluator(girl_eval_high_target, girl_high_preds)])  # run only once

In [None]:
len(metric_list)   # should be 8

In [None]:
student_type = pd.Series( [i[0] for i in metric_list], name='student_type' )
log_loss_metric = pd.Series( [i[1]['log_loss'] for i in metric_list], name='log_loss' )
roc_auc_score_metric = pd.Series( [i[1]['roc_auc_score'] for i in metric_list], name='roc_auc_score' )
f1_score_metric = pd.Series( [i[1]['f1_score'] for i in metric_list], name='f1_score' )
accuracy_metric = pd.Series( [i[1]['accuracy_score'] for i in metric_list], name='accuracy_score' )
metric_df = pd.DataFrame(pd.concat([student_type, log_loss_metric, roc_auc_score_metric, f1_score_metric, accuracy_metric], axis=1)).set_index('student_type')

In [None]:
metric_df

In [None]:
metric_df.plot(kind='bar', figsize=(27,7), title='Evaluation metrics')

In [None]:
metric_df.describe().drop('count').plot(kind='bar', figsize=(26,6), title='Distribution of metric performance')

## Translate Problem as a Regression Task

## Primary Student Boy

In [None]:
def CreateContinousTargetFeature(dataframe, gender=2):
    '''
    Return one column of the frame unchanged, chosen by position,
    for use as a continuous target.

    gender: 0 for male, 1 for female, 2 for both male and female.
    '''
    chosen_column = dataframe.columns.to_list()[gender]
    return dataframe[chosen_column]

In [None]:
boy_cont_primary_target = CreateContinousTargetFeature(enrol_primary, 0)

In [None]:
eval_boy_cont_primary_target = CreateContinousTargetFeature(test_enrol_primary,0)

In [None]:
boy_cont_primary_target.shape, boy_pry_df.shape, boy_eval_pry_df.shape, eval_boy_cont_primary_target.shape

In [None]:
regressor = RandomForestRegressor(random_state=1960)

In [None]:
regressor.fit(boy_pry_df, boy_cont_primary_target)

In [None]:
boy_pry_cont_preds = regressor.predict(boy_eval_pry_df)

In [None]:
np.sqrt(mean_squared_error(eval_boy_cont_primary_target, boy_pry_cont_preds))

In [None]:
boy_pry_cont_preds[0], eval_boy_cont_primary_target[0]

### The same steps can be repeated for the other student categories

In [None]:
!pip list > requirements.txt  