In [1]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy import interp

pd.set_option('display.max_columns', None)

In [2]:
DATA = pd.read_csv('ALL_SCHOOLS.csv')
before = DATA.shape[0]

In [3]:
#Only elementary infrastructure schools
#print(DF)
DATA= DATA.loc[(DATA.IN_AGUA_INEXISTENTE == 0) & (DATA.IN_ENERGIA_INEXISTENTE == 0) & (DATA.IN_ESGOTO_INEXISTENTE == 0)]
print('keeping', DATA.shape[0]/before*100, '%')

keeping 99.35674287613739 %


In [4]:
QT_to_IN = ['QT_EQUIP_DVD', 'QT_EQUIP_COPIADORA', 'QT_EQUIP_IMPRESSORA', 'QT_EQUIP_TV']
conds = [DATA[QT_to_IN].values == 0 , DATA[QT_to_IN].values > 0]
choices = [0, 1]
DATA[QT_to_IN] = pd.DataFrame(np.select(conds, choices), index=DATA[QT_to_IN].index, columns=DATA[QT_to_IN].columns)
#DF[QT_to_IN] = DF[QT_to_IN].astype('category')

In [5]:
controls = ["CO_ANO", 'IN_TP_ESCOLA']
infrastructure = [
'IN_LABORATORIO_INFORMATICA',
'IN_LABORATORIO_CIENCIAS',
'IN_SALA_ATENDIMENTO_ESPECIAL',
'IN_BIBLIOTECA',
'IN_SALA_LEITURA',
'IN_BANHEIRO_FORA_PREDIO',
'IN_BANHEIRO_PNE',
'IN_DEPENDENCIAS_PNE',
'QT_SALAS_UTILIZADAS',
'QT_EQUIP_TV',
'QT_EQUIP_DVD',
'QT_EQUIP_COPIADORA',
'QT_EQUIP_IMPRESSORA',
'QT_COMP_ALUNO',
'IN_INTERNET',
'IN_BANDA_LARGA',
'QT_FUNCIONARIOS',
'IN_ALIMENTACAO']

hypothesis = ['EDU_PAI','EDU_MAE', 'TITULACAO']

socioeconomic =['QT_PESSOAS_CASA', 'RENDA_MENSAL', 'TP_COR_RACA']

target = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO', 'NU_NOTA_GERAL']


In [6]:
print('Infra',len(infrastructure))
print('Hypothesis',len(hypothesis))
print('Socio',len(socioeconomic))
independents = infrastructure + hypothesis + socioeconomic
DATA = DATA[controls + independents +  target]

Infra 18
Hypothesis 3
Socio 3


Functions that will be called in Model

In [7]:
## Dropping hight mode frequencies (upper 90%)
def mode_high_frequencie(df):
    columns_dropped = []
    ammount=0
    before= df.shape[1]
    #print("Number of features: ", before)
    for i in df:
        mode = df[i].mode()[0]
        threshold = 0.9
        count = df[(df[i]== mode)].shape[0]
        freq = count/df.shape[0]
        if freq >= threshold:
            ammount +=1
            #print("drop out", [i], "mode = ", mode )
            #df.drop([i], inplace = True, axis=1)
            columns_dropped.append(i)
            
    #print("Total Dropped: ",ammount)
    #print ("Remainning: ", before-ammount)  
    return(columns_dropped)

In [8]:
def clip_tail(df):
    quantitative = df[(df.nunique() > 2).index[(df.nunique() > 2)]].columns.to_list()
    #print(len(quantitative)
    df[quantitative]=df[quantitative].apply(lambda x: x.clip(upper = (np.percentile(x, 97.5))))
    df[quantitative]=df[quantitative].apply(lambda x: x.clip(lower = (np.percentile(x, 2.5))))
    return(df)

In [9]:
def scaler (df):
    scaler = MinMaxScaler()
    x = df.values
    x_scaled = scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled, columns = df.columns)
    return (df)

In [10]:
def build_target(df):
    df['TARGET'] = pd.qcut (df.NU_NOTA_GERAL, 4, labels = [1,2,3,4]).map(lambda x : 0 if x!=4 else 1) 
    print((df.TARGET==0).sum()/(df.TARGET.count())*100, '% lowers quartis')
    return df['TARGET']

In [11]:
#DATA = DATA[DATA.CO_ANO == 2015]

In [12]:
skf = StratifiedKFold(n_splits=2,random_state=None)
classifier = LogisticRegression()
data = []
groups ={}        
columns=['year','tp_escola','fold','group', 'features','fpr','tpr','auc']
score = 0
        
        
for year in DATA.CO_ANO.unique().tolist():
    df_y= DATA[DATA['CO_ANO']== year]
    print(year)
    for tp_escola in df_y.IN_TP_ESCOLA.unique().tolist():
        df_tp_sch = df_y[df_y['IN_TP_ESCOLA']== tp_escola]
        y = build_target(df_tp_sch)
        df_tp_sch = df_tp_sch[independents]
        # collect independents variables with mode frequency upper 90%   
        bad_columns = mode_high_frequencie(df_tp_sch[independents])
        # update each group list, removing them
        independents = df_tp_sch[independents].loc[:, ~df_tp_sch[independents].columns.isin(bad_columns)].columns.to_list()
        
        socioeconomic = df_tp_sch[socioeconomic].loc[:, ~df_tp_sch[socioeconomic].columns.isin(bad_columns)].columns.to_list()
        
        infrastructure = df_tp_sch[infrastructure].loc[:, ~df_tp_sch[infrastructure].columns.isin(bad_columns)].columns.to_list()
        
        hypothesis = df_tp_sch[hypothesis].loc[:, ~df_tp_sch[hypothesis].columns.isin(bad_columns)].columns.to_list()
        
        
        groups['FULL'] = df_tp_sch[independents]
        groups['INFRA'] = df_tp_sch[infrastructure]
        groups['SOCIO'] = df_tp_sch[socioeconomic]
        groups['HYPOTHESIS'] = df_tp_sch[hypothesis]
        #groups['INFRA_SOCIO'] = df_tp_sch[infrastructure + socioeconomic]
        #groups['HYPO_SOCIO'] = df_tp_sch[hypothesis + socioeconomic ]
        #groups['HYPO_INFRA'] = df_tp_sch[hypothesis + infrastructure]
        print(tp_escola)

        for gp in groups:
                tprs = []
                aucs = []
                features = groups[gp].columns.tolist()
                mean_tpr = 0.0
                mean_fpr = np.linspace(0, 1, 100)
                i=1
                print(gp, groups[gp].shape)
                
                for train_index, test_index in skf.split(groups[gp],y):
                    #print('{} of KFold {} in group {}'.format(i,skf.n_splits, gp))
                    xtr,xts = groups[gp].iloc[train_index],groups[gp].iloc[test_index]
                    ytr,yts = y.iloc[train_index],y.iloc[test_index]
                   
                    #Clipping long tail in train subset only
                    xtr = clip_tail(xtr) 
                    #Scaling Train and test set with the train metrics
                    scaler = MinMaxScaler()
                    xtr = scaler.fit_transform(xtr)
                    xts = scaler.transform(xts)
                    #model
                    model = classifier.fit(xtr,ytr)       
                    yproba = model.predict_proba(xts)[:,1]
        
                    fpr, tpr, _ = roc_curve(yts,  yproba)
                    score = roc_auc_score(yts,yproba)
                    #print(score)
        
                    interp_tpr = interp(mean_fpr, fpr, tpr)
                    interp_tpr[0] = 0.0
                    tprs.append(interp_tpr)
                    aucs.append(score)
                    
                    values = [year,tp_escola,i, gp, features, fpr,tpr,score]
                    zipped = zip(columns, values)
                    dictionary = dict(zipped) 
                    data.append(dictionary)
             
        
                  
                    i+=1
        #print(values[7].mean())
#df.concat(data, ignore_index=True)
        

2009
74.99691396123936 % lowers quartis
Municipal+Estadual
FULL (16202, 18)
INFRA (16202, 12)
SOCIO (16202, 3)
HYPOTHESIS (16202, 3)
74.99640546369518 % lowers quartis
Privada
FULL (6955, 17)
INFRA (6955, 11)
SOCIO (6955, 3)
HYPOTHESIS (6955, 3)
75.0 % lowers quartis
Federal
FULL (200, 15)
INFRA (200, 9)
SOCIO (200, 3)
HYPOTHESIS (200, 3)
2010
74.99723665303415 % lowers quartis
Municipal+Estadual
FULL (18094, 15)
INFRA (18094, 9)
SOCIO (18094, 3)
HYPOTHESIS (18094, 3)
75.00336519046978 % lowers quartis
Privada
FULL (7429, 14)
INFRA (7429, 8)
SOCIO (7429, 3)
HYPOTHESIS (7429, 3)
74.8792270531401 % lowers quartis
Federal
FULL (207, 14)
INFRA (207, 8)
SOCIO (207, 3)
HYPOTHESIS (207, 3)
2011
75.00332579486498 % lowers quartis
Privada
FULL (7517, 14)
INFRA (7517, 8)
SOCIO (7517, 3)
HYPOTHESIS (7517, 3)
75.00545494217762 % lowers quartis
Municipal+Estadual
FULL (18332, 14)
INFRA (18332, 8)
SOCIO (18332, 3)
HYPOTHESIS (18332, 3)
74.78991596638656 % lowers quartis
Federal
FULL (238, 13)
INFRA 

In [13]:
RT = pd.DataFrame.from_dict(data)
RT.to_csv('RESULTS_GROUP_ANALYSIS.csv', index = False)