In [1]:
import time
import pandas as pd
import numpy as np
import warnings
import feather
warnings.filterwarnings('ignore')
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
pd.set_option('display.max_columns', None)

There are many insignificant features to our work, like codes, PK, FK and out of context. 
Based on prior works and experience of especialists, will be taken off nonsense features. 
A study was carried out on the evolution and correspondence of these resources in the ENEM and Census
data for the period 2009-2018. See (featuresOrganisaton.xls)

In [2]:
enem = ['NU_ANO','CO_ESCOLA','TP_ENSINO','NU_IDADE','TP_SEXO','TP_COR_RACA','TP_ST_CONCLUSAO','NU_NOTA_CN',
        'NU_NOTA_CH','NU_NOTA_LC','NU_NOTA_MT','NU_NOTA_REDACAO','Q02','Q03','Q01','Q04'
]

school = ['ANO_CENSO','PK_COD_ENTIDADE', 'FK_COD_MUNICIPIO','FK_COD_ESTADO','ID_DEPENDENCIA_ADM','ID_AGUA_INEXISTENTE',
                  'ID_ESGOTO_INEXISTENTE','ID_ENERGIA_INEXISTENTE',
                   'ID_LABORATORIO_INFORMATICA','ID_LABORATORIO_CIENCIAS','ID_SALA_ATENDIMENTO_ESPECIAL',
                   'ID_BIBLIOTECA','ID_SALA_LEITURA','ID_SANITARIO_FORA_PREDIO',
                   'ID_SANITARIO_PNE','ID_DEPENDENCIAS_PNE','NUM_SALAS_UTILIZADAS','ID_EQUIP_TV',
                   'ID_EQUIP_DVD','ID_EQUIP_COPIADORA','ID_EQUIP_IMPRESSORA','NUM_COMP_ADMINISTRATIVOS',
                   'NUM_COMP_ALUNOS','ID_INTERNET','ID_BANDA_LARGA','NUM_FUNCIONARIOS','ID_ALIMENTACAO']

teacher = ['FK_COD_DOCENTE', 'PK_COD_ENTIDADE','ID_ESPECIALIZACAO','ID_MESTRADO','ID_DOUTORADO']

In [4]:
#Students of ENEM
E = pd.read_csv('~/data/enem/2010/DADOS/MICRODADOS_ENEM_2010.csv', sep=';', usecols = enem)

In [5]:
#School census data
CE =    pd.read_csv("~/data/censo/2010/DADOS/ESCOLAS.CSV", sep='|', encoding="iso-8859-2", usecols = school)

In [6]:
#Teacher census data by Braazilian mesoregion
CD_CO = pd.read_csv("~/data/censo/2010/DADOS/DOCENTES_CO.CSV", sep='|', encoding="iso-8859-2", usecols=teacher)
CD_NE = pd.read_csv("~/data/censo/2010/DADOS/DOCENTES_NORDESTE.CSV", sep='|', encoding="iso-8859-2", usecols=teacher)
CD_N =  pd.read_csv("~/data/censo/2010/DADOS/DOCENTES_NORTE.CSV", sep='|', encoding="iso-8859-2", usecols=teacher)
CD_SE = pd.read_csv("~/data/censo/2010/DADOS/DOCENTES_SUDESTE.CSV", sep='|', encoding="iso-8859-2", usecols=teacher)
CD_S =  pd.read_csv("~/data/censo/2010/DADOS/DOCENTES_SUL.CSV", sep='|', encoding="iso-8859-2", usecols=teacher)

#All teacher census data
CT = pd.concat([CD_CO, CD_NE, CD_N, CD_SE, CD_S])

In [7]:
ENEM = E.copy()
C_SCHOOL = CE.copy()
C_TEACHER = CT.copy()

In [8]:
#ordering
ENEM = ENEM[enem]

newNames = [
'CO_ANO',
'CO_ESCOLA',
'TP_ENSINO',
'NU_IDADE',
'TP_SEXO',
'TP_COR_RACA',
'TP_ST_CONCLUSAO',
'NU_NOTA_CN',
'NU_NOTA_CH',
'NU_NOTA_LC',
'NU_NOTA_MT',
'NU_NOTA_REDACAO',
'EDU_PAI',
'EDU_MAE',
'QT_PESSOAS_CASA',
'RENDA_MENSAL'
]

#Rename
ENEM.columns = newNames

print('checking year',ENEM.CO_ANO.unique())
print('checking key',ENEM.CO_ESCOLA.dtypes)

checking year [2010]
checking key float64


In [9]:
C_SCHOOL = C_SCHOOL[school]

newNames = ['ANO_CENSO',
'CO_ESCOLA',
 'CO_MUNICIPIO',
 'CO_UF',
 'CO_DEPENDENCIA_ADM',
 'IN_AGUA_INEXISTENTE',
 'IN_ESGOTO_INEXISTENTE',
 'IN_ENERGIA_INEXISTENTE',
 'IN_LABORATORIO_INFORMATICA',
 'IN_LABORATORIO_CIENCIAS',
 'IN_SALA_ATENDIMENTO_ESPECIAL',
 'IN_BIBLIOTECA',
 'IN_SALA_LEITURA',
 'IN_BANHEIRO_FORA_PREDIO',
 'IN_BANHEIRO_PNE',
 'IN_DEPENDENCIAS_PNE',
 'QT_SALAS_UTILIZADAS',
 'QT_EQUIP_TV',
 'QT_EQUIP_DVD',
 'QT_EQUIP_COPIADORA',
 'QT_EQUIP_IMPRESSORA',
 'QT_COM_ADMINISTRATIVO',
 'QT_COMP_ALUNO',
 'IN_INTERNET',
 'IN_BANDA_LARGA',
 'QT_FUNCIONARIOS',
 'IN_ALIMENTACAO'
]

#Rename 
C_SCHOOL.columns =  newNames

print('Checking year of censo',C_SCHOOL.ANO_CENSO.unique())
C_SCHOOL.drop('ANO_CENSO', axis=1, inplace=True)
print('checking key',C_SCHOOL.CO_ESCOLA.dtypes)

Checking year of censo [2010]
checking key int64


In [10]:
C_TEACHER = C_TEACHER[teacher]

# New default column names for all Datasets
newNames = [
'CO_PROFESSOR',
'CO_ESCOLA',  
'IN_ESPECIALIZACAO',
'IN_MESTRADO',
'IN_DOUTORADO'
]
#It's necessary drop duplicates whose are teachers in the same school but in different classes. 
#This way, the teacher appear just one time in each school. 
C_TEACHER.columns = newNames
C_TEACHER.drop_duplicates(subset=['CO_PROFESSOR', 'CO_ESCOLA'], inplace=True)
C_TEACHER.drop('CO_PROFESSOR', axis=1, inplace = True)

print('checking key',C_TEACHER.CO_ESCOLA.dtypes)

checking key int64


### Minimum Scope Definition

In [11]:
ENEM.dropna(inplace = True, axis=0, subset = ['CO_ESCOLA'])


In [12]:
#should care about the nulls into scope filter features?
print(ENEM['TP_ENSINO'].value_counts())
print(ENEM['TP_ENSINO'].isnull().sum())
#NO, IT'S LOWER

1.0    1189039
2.0     132190
3.0      54155
4.0       4062
Name: TP_ENSINO, dtype: int64
1


In [13]:
print('drop out',((ENEM.TP_ENSINO!= 1)|(ENEM.TP_ENSINO!= 3)).sum(), 'who not attend regular and prof approachs schools')
ENEM = ENEM.loc[((ENEM.TP_ENSINO== 1)|(ENEM.TP_ENSINO== 3))]
ENEM.drop('TP_ENSINO', inplace = True, axis =1)

drop out 1379447 who not attend regular and prof approachs schools


In [14]:
fields = (['NU_NOTA_CN', 'NU_NOTA_CH','NU_NOTA_LC','NU_NOTA_MT', 'NU_NOTA_REDACAO'])
for i in fields:
    print ('null:', i, ENEM[i].isnull().sum())
    print ('zero:', i, (ENEM[i]== 0).sum())
    
    #, "and", ENEM.I==0)

null: NU_NOTA_CN 181240
zero: NU_NOTA_CN 0
null: NU_NOTA_CH 181240
zero: NU_NOTA_CH 0
null: NU_NOTA_LC 207128
zero: NU_NOTA_LC 0
null: NU_NOTA_MT 207128
zero: NU_NOTA_MT 0
null: NU_NOTA_REDACAO 1
zero: NU_NOTA_REDACAO 249454


In [15]:
ENEM.dropna(inplace = True, axis=0, subset=fields)
ENEM = ENEM.loc[~(ENEM[fields] == 0).any(axis=1)]

In [16]:
#ONLY STUDENTS AT  17 -19 YEAR OLD. WE WANT TO GET ONLY REGULAR STUDENTS THAT ARE IN LAST YEAR OF SECONDARY SCHOOL. 
#THE MICRODADOS HAVE A FIELD TO CONTROL IT. BUT IT IS NOT CONFIDENCE,DUE A TURN OUT TEAACHERS THAT EXTEND ACADEMIC YEAR
print('null', ENEM.NU_IDADE.isnull().sum())
#firts, fill NA with the mean of student school
ENEM.NU_IDADE = ENEM.groupby(ENEM['CO_ESCOLA'])['NU_IDADE'].apply(lambda x: x.fillna(round(x.mean())))
print('taking off',((ENEM.NU_IDADE <17)|(ENEM.NU_IDADE>19)).sum())
ENEM = ENEM.loc[(ENEM.NU_IDADE >16) &  (ENEM.NU_IDADE<20)]
ENEM.NU_IDADE.value_counts()

null 142
taking off 136715


17.0    496281
18.0    276658
19.0     81040
Name: NU_IDADE, dtype: int64

In [17]:
ENEM[fields].describe()

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO
count,853979.0,853979.0,853979.0,853979.0,853979.0
mean,492.224591,545.561963,518.482147,519.517297,601.872324
std,81.398116,90.23385,76.841159,115.586347,129.266552
min,297.3,269.1,284.7,313.4,250.0
25%,432.3,482.9,469.2,427.7,500.0
50%,489.6,547.2,523.6,512.4,600.0
75%,548.8,607.3,571.6,597.5,700.0
max,837.6,883.7,795.2,973.2,1000.0


### Transform features to default categories

In [18]:
fields = ['TP_SEXO', 'TP_COR_RACA', 'EDU_PAI', 'EDU_MAE','QT_PESSOAS_CASA', 'RENDA_MENSAL']
ENEM[fields].isnull().sum()

TP_SEXO            0
TP_COR_RACA        0
EDU_PAI            0
EDU_MAE            0
QT_PESSOAS_CASA    0
RENDA_MENSAL       0
dtype: int64

In [19]:
for i in fields:
    print(i,ENEM[i].value_counts().index)

TP_SEXO Index(['F', 'M'], dtype='object')
TP_COR_RACA Float64Index([1.0, 3.0, 2.0, 0.0, 4.0, 5.0], dtype='float64')
EDU_PAI Index(['C', 'A', 'B', 'D', 'I', 'E', 'H', 'F', 'G'], dtype='object')
EDU_MAE Index(['C', 'B', 'A', 'D', 'E', 'I', 'H', 'F', 'G'], dtype='object')
QT_PESSOAS_CASA Index(['B', 'A', 'C', 'D', 'E'], dtype='object')
RENDA_MENSAL Index(['B', 'A', 'C', 'D', 'G', 'E', 'F', 'H'], dtype='object')


In [20]:
#Order relation, is better for numeric categories.
categorical = ['TP_SEXO', 'EDU_PAI', 'EDU_MAE','QT_PESSOAS_CASA', 'RENDA_MENSAL']
for i in categorical:
    codes, uniques = pd.factorize(ENEM[i], sort=True)
    ENEM[i] = codes
    

In [21]:
ENEM['EDU_PAI_TEMP'] = ENEM['EDU_PAI']
ENEM['EDU_MAE_TEMP'] = ENEM['EDU_MAE']
ENEM['QT_PESSOAS_CASA_TEMP'] = ENEM['QT_PESSOAS_CASA']
ENEM['RENDA_MENSAL_TEMP'] = ENEM['RENDA_MENSAL']

In [22]:
# Paring with default categories

ENEM.loc[ENEM['EDU_PAI']==0, 'EDU_PAI_TEMP']=1
ENEM.loc[ENEM['EDU_PAI']==1, 'EDU_PAI_TEMP']=2
ENEM.loc[ENEM['EDU_PAI']==2, 'EDU_PAI_TEMP']=3
ENEM.loc[ENEM['EDU_PAI']==3, 'EDU_PAI_TEMP']=4
ENEM.loc[ENEM['EDU_PAI']==4, 'EDU_PAI_TEMP']=5
ENEM.loc[ENEM['EDU_PAI']==5, 'EDU_PAI_TEMP']=5
ENEM.loc[ENEM['EDU_PAI']==6, 'EDU_PAI_TEMP']=5
ENEM.loc[ENEM['EDU_PAI']==7, 'EDU_PAI_TEMP']=0
ENEM.loc[ENEM['EDU_PAI']==8, 'EDU_PAI_TEMP']=0


ENEM.loc[ENEM['EDU_MAE']==0, 'EDU_MAE_TEMP']=1
ENEM.loc[ENEM['EDU_MAE']==1, 'EDU_MAE_TEMP']=2
ENEM.loc[ENEM['EDU_MAE']==2, 'EDU_MAE_TEMP']=3
ENEM.loc[ENEM['EDU_MAE']==3, 'EDU_MAE_TEMP']=4
ENEM.loc[ENEM['EDU_MAE']==4, 'EDU_MAE_TEMP']=5
ENEM.loc[ENEM['EDU_MAE']==5, 'EDU_MAE_TEMP']=5
ENEM.loc[ENEM['EDU_MAE']==6, 'EDU_MAE_TEMP']=5
ENEM.loc[ENEM['EDU_MAE']==7, 'EDU_MAE_TEMP']=0
ENEM.loc[ENEM['EDU_MAE']==8, 'EDU_MAE_TEMP']=0


ENEM.loc[ENEM['QT_PESSOAS_CASA']==0, 'QT_PESSOAS_CASA_TEMP']=1
ENEM.loc[ENEM['QT_PESSOAS_CASA']==1, 'QT_PESSOAS_CASA_TEMP']=2
ENEM.loc[ENEM['QT_PESSOAS_CASA']==2, 'QT_PESSOAS_CASA_TEMP']=3
ENEM.loc[ENEM['QT_PESSOAS_CASA']==3, 'QT_PESSOAS_CASA_TEMP']=3
ENEM.loc[ENEM['QT_PESSOAS_CASA']==4, 'QT_PESSOAS_CASA_TEMP']=0


ENEM.loc[ENEM['RENDA_MENSAL']==0, 'RENDA_MENSAL_TEMP']=1
ENEM.loc[ENEM['RENDA_MENSAL']==1, 'RENDA_MENSAL_TEMP']=2
ENEM.loc[ENEM['RENDA_MENSAL']==2, 'RENDA_MENSAL_TEMP']=3
ENEM.loc[ENEM['RENDA_MENSAL']==3, 'RENDA_MENSAL_TEMP']=4
ENEM.loc[ENEM['RENDA_MENSAL']==4, 'RENDA_MENSAL_TEMP']=5
ENEM.loc[ENEM['RENDA_MENSAL']==5, 'RENDA_MENSAL_TEMP']=5
ENEM.loc[ENEM['RENDA_MENSAL']==6, 'RENDA_MENSAL_TEMP']=5
ENEM.loc[ENEM['RENDA_MENSAL']==7, 'RENDA_MENSAL_TEMP']=0




In [23]:
#back to original columns
ENEM['EDU_PAI'] = ENEM['EDU_PAI_TEMP']
ENEM['EDU_MAE'] = ENEM['EDU_MAE_TEMP']
ENEM['QT_PESSOAS_CASA'] = ENEM['QT_PESSOAS_CASA_TEMP']
ENEM['RENDA_MENSAL'] = ENEM['RENDA_MENSAL_TEMP']


In [24]:
#drop temps

ENEM.drop(['EDU_PAI_TEMP', 'EDU_MAE_TEMP', 'QT_PESSOAS_CASA_TEMP','RENDA_MENSAL_TEMP'], axis=1, inplace=True)

In [25]:
print('Total Students at ENEM:', ENEM.shape[0], 'of', ENEM.drop_duplicates('CO_ESCOLA').shape[0], 'schools')
print('Total Schools at Census:', C_SCHOOL.shape[0])
           
#######################################
print('########### Only about Schools at ENEM:')

schoolsEnem = ENEM['CO_ESCOLA'].dropna().unique()
schoolsCenso = C_SCHOOL['CO_ESCOLA'].dropna().unique()
schoolsTeacher = C_TEACHER['CO_ESCOLA'].dropna().unique()
schoolsEnem2 = ENEM['CO_ESCOLA'].dropna()

enem_censo = np.setdiff1d(schoolsEnem ,schoolsCenso)
print('Schools at ENEM out of School Census', sum(np.isin(schoolsEnem, schoolsCenso, invert=True)))

print('Schools at ENEM out of Teacher Census', sum(np.isin(schoolsEnem, schoolsTeacher, invert=True)))
print('Schools at ENEM and School Census out of Teacher Census', sum(np.isin(enem_censo, schoolsTeacher)))

print('Students Wasted*****', sum(np.isin(schoolsEnem2, schoolsCenso, invert=True)))

     

Total Students at ENEM: 853979 of 26506 schools
Total Schools at Census: 259831
########### Only about Schools at ENEM:
Schools at ENEM out of School Census 58
Schools at ENEM out of Teacher Census 639
Schools at ENEM and School Census out of Teacher Census 0
Students Wasted***** 92


### New Features

ENEM

In [26]:
#Final Media, Target
ENEM['NU_NOTA_GERAL'] = (ENEM.NU_NOTA_CN + ENEM.NU_NOTA_CH + ENEM.NU_NOTA_LC + ENEM.NU_NOTA_MT + ENEM.NU_NOTA_REDACAO)/5

75.00570857128805 % lowers quartis


SCHOOL CENSUS - Itś better include after merge all datasets years.

TEACHER CENSUS

In [27]:
C_TEACHER.isnull().sum()

CO_ESCOLA                 0
IN_ESPECIALIZACAO    862278
IN_MESTRADO          862278
IN_DOUTORADO         862278
dtype: int64

TEACHER CENSUS

In [28]:
#lets fill it with lower level
print('number of teacher who work at an school listed at ENEM and do not have study level assigned:',
      C_TEACHER['CO_ESCOLA'].isin(ENEM['CO_ESCOLA']).sum())


C_TEACHER['IN_ESPECIALIZACAO'].fillna(0, inplace = True)
C_TEACHER['IN_MESTRADO'].fillna(0, inplace = True)
C_TEACHER['IN_DOUTORADO'].fillna(0, inplace = True)

number of teacher who work at an school listed at ENEM and do not have study level assigned: 939612


In [41]:
C_TEACHER['TITULACAO'] = 0


C_TEACHER.loc[(C_TEACHER.IN_ESPECIALIZACAO==1.0) & (C_TEACHER.IN_MESTRADO==0.0)& 
             (C_TEACHER.IN_DOUTORADO ==0.0), 'TITULACAO'] = 1

C_TEACHER.loc[(C_TEACHER.IN_ESPECIALIZACAO==1.0) & (C_TEACHER.IN_MESTRADO==1.0)& 
             (C_TEACHER.IN_DOUTORADO ==0.0), 'TITULACAO'] = 2

C_TEACHER.loc[(C_TEACHER.IN_ESPECIALIZACAO==0.0) & (C_TEACHER.IN_MESTRADO==1.0)& 
             (C_TEACHER.IN_DOUTORADO ==0.0), 'TITULACAO'] = 2

C_TEACHER.loc[(C_TEACHER.IN_DOUTORADO ==1.0), 'TITULACAO'] = 3

SUM= C_TEACHER.groupby('CO_ESCOLA')['TITULACAO'].transform(sum)
COUNT =  C_TEACHER.groupby('CO_ESCOLA')['TITULACAO'].transform('count')
C_TEACHER['TITULACAO'] = (SUM)/(COUNT*3)   

#transformin to school grain
C_TEACHER.drop_duplicates('CO_ESCOLA', inplace = True)

## STUDENT GRAIN

In [32]:
# teacher to Schoool Grain
print('C_SCHOOL',C_SCHOOL.shape)
print('C_TEACHER',C_TEACHER.shape)
CENSO19 = pd.merge(C_SCHOOL, C_TEACHER, on = 'CO_ESCOLA', how = 'inner')
print('C_SCHOOL + C_TEACHER',CENSO19.shape)

#Now, all to Student Grain
print('ENEM',ENEM.shape)
ENEM_CENSO_19 = pd.merge(ENEM, CENSO19, on = 'CO_ESCOLA', how = 'inner')
print('FINAL',ENEM_CENSO_19.shape)

ENEM_CENSO_19.to_csv('STUDENT.csv')

C_SCHOOL (259831, 26)
C_TEACHER (196266, 5)
C_SCHOOL + C_TEACHER (196266, 30)
ENEM (853979, 17)
FINAL (850064, 46)


In [33]:
ENEM_CENSO_19.head()

Unnamed: 0,CO_ANO,CO_ESCOLA,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_ST_CONCLUSAO,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO,EDU_PAI,EDU_MAE,QT_PESSOAS_CASA,RENDA_MENSAL,NU_NOTA_GERAL,TARGET,CO_MUNICIPIO,CO_UF,CO_DEPENDENCIA_ADM,IN_AGUA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_LABORATORIO_INFORMATICA,IN_LABORATORIO_CIENCIAS,IN_SALA_ATENDIMENTO_ESPECIAL,IN_BIBLIOTECA,IN_SALA_LEITURA,IN_BANHEIRO_FORA_PREDIO,IN_BANHEIRO_PNE,IN_DEPENDENCIAS_PNE,QT_SALAS_UTILIZADAS,QT_EQUIP_TV,QT_EQUIP_DVD,QT_EQUIP_COPIADORA,QT_EQUIP_IMPRESSORA,QT_COM_ADMINISTRATIVO,QT_COMP_ALUNO,IN_INTERNET,IN_BANDA_LARGA,QT_FUNCIONARIOS,IN_ALIMENTACAO,IN_ESPECIALIZACAO,IN_MESTRADO,IN_DOUTORADO,TITULACAO
0,2010,33105405.0,17.0,0,1.0,2.0,643.1,697.6,635.0,868.3,750.0,4,3,1,5,718.8,1,3304557,33,4,0.0,0.0,0.0,1,1,0,1,0,0,0,0,15.0,1.0,1.0,0.0,1.0,3.0,22.0,1.0,1.0,67.0,0.0,0.0,0.0,0.0,0.031008
1,2010,33105405.0,18.0,0,1.0,2.0,631.5,623.1,584.1,680.0,700.0,3,5,1,5,643.74,1,3304557,33,4,0.0,0.0,0.0,1,1,0,1,0,0,0,0,15.0,1.0,1.0,0.0,1.0,3.0,22.0,1.0,1.0,67.0,0.0,0.0,0.0,0.0,0.031008
2,2010,33105405.0,17.0,0,1.0,2.0,681.6,694.2,678.7,815.6,1000.0,5,5,1,5,774.02,1,3304557,33,4,0.0,0.0,0.0,1,1,0,1,0,0,0,0,15.0,1.0,1.0,0.0,1.0,3.0,22.0,1.0,1.0,67.0,0.0,0.0,0.0,0.0,0.031008
3,2010,33105405.0,17.0,1,1.0,2.0,667.7,757.6,632.8,798.6,700.0,4,4,1,5,711.34,1,3304557,33,4,0.0,0.0,0.0,1,1,0,1,0,0,0,0,15.0,1.0,1.0,0.0,1.0,3.0,22.0,1.0,1.0,67.0,0.0,0.0,0.0,0.0,0.031008
4,2010,33105405.0,17.0,0,1.0,2.0,684.2,735.6,641.4,777.6,700.0,4,4,2,5,707.76,1,3304557,33,4,0.0,0.0,0.0,1,1,0,1,0,0,0,0,15.0,1.0,1.0,0.0,1.0,3.0,22.0,1.0,1.0,67.0,0.0,0.0,0.0,0.0,0.031008


## SCHOOL GRAIN

In [34]:
fields = ['TP_SEXO', 'TP_COR_RACA', 'EDU_PAI', 'EDU_MAE','QT_PESSOAS_CASA', 'RENDA_MENSAL', 'NU_IDADE', 'NU_NOTA_CN',
             'NU_NOTA_CH','NU_NOTA_LC','NU_NOTA_MT','NU_NOTA_REDACAO','NU_NOTA_GERAL' ]

for i in fields:   
    
            ENEM[i] = ENEM.groupby('CO_ESCOLA')[i].transform('mean')
            
            

In [35]:
# SCHOOL GRAIN AND SHOW THE NEW NOTA DISTRIBUTION
ENEM.drop_duplicates('CO_ESCOLA', inplace=True)
print((ENEM.TARGET==0).sum()/(ENEM.TARGET.count())*100, '% lowers quartis')

71.31592846902588 % lowers quartis


In [36]:
#recalculating target for new  Nota Geral distribution 
ENEM['TARGET'] = pd.qcut (ENEM.NU_NOTA_GERAL, 4, labels = [1,2,3,4]).map(lambda x : 0 if x!=4 else 1) 
print((ENEM.TARGET==0).sum()/(ENEM.TARGET.count())*100, '% lowers quartis')

74.99811363464876 % lowers quartis


In [42]:
print('censo before',CENSO19.shape)
print('enem_school before', ENEM.shape)
CENSO_ENEM_19 = pd.merge(CENSO19, ENEM, on='CO_ESCOLA', how='inner')
print(CENSO_ENEM_19.shape)

CENSO_ENEM_19.to_csv('SCHOOL.csv', index=False)


censo before (196266, 30)
enem_school before (26506, 17)
(25867, 46)


In [43]:
CENSO_ENEM_19.head()

Unnamed: 0,CO_ESCOLA,CO_MUNICIPIO,CO_UF,CO_DEPENDENCIA_ADM,IN_AGUA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_LABORATORIO_INFORMATICA,IN_LABORATORIO_CIENCIAS,IN_SALA_ATENDIMENTO_ESPECIAL,IN_BIBLIOTECA,IN_SALA_LEITURA,IN_BANHEIRO_FORA_PREDIO,IN_BANHEIRO_PNE,IN_DEPENDENCIAS_PNE,QT_SALAS_UTILIZADAS,QT_EQUIP_TV,QT_EQUIP_DVD,QT_EQUIP_COPIADORA,QT_EQUIP_IMPRESSORA,QT_COM_ADMINISTRATIVO,QT_COMP_ALUNO,IN_INTERNET,IN_BANDA_LARGA,QT_FUNCIONARIOS,IN_ALIMENTACAO,IN_ESPECIALIZACAO,IN_MESTRADO,IN_DOUTORADO,TITULACAO,CO_ANO,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_ST_CONCLUSAO,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO,EDU_PAI,EDU_MAE,QT_PESSOAS_CASA,RENDA_MENSAL,NU_NOTA_GERAL,TARGET
0,12001945,1200336,12,2,0.0,0.0,0.0,1,1,1,1,0,0,1,0,7.0,1.0,1.0,1.0,1.0,4.0,15.0,0.0,0.0,62.0,1.0,1.0,0.0,0.0,0.091954,2010,17.764706,0.352941,3.117647,2.0,435.929412,470.094118,436.058824,409.111765,550.0,1.470588,1.882353,1.705882,1.176471,460.238824,0
1,12026476,1200401,12,3,0.0,0.0,0.0,0,0,0,0,0,1,0,0,4.0,1.0,1.0,1.0,1.0,1.0,,0.0,0.0,14.0,1.0,0.0,0.0,0.0,0.208333,2010,18.0,1.0,3.0,2.0,537.6,578.5,574.8,570.7,500.0,1.0,1.0,1.0,2.0,552.32,0
2,12000531,1200203,12,2,0.0,0.0,0.0,1,0,1,1,0,0,0,0,9.0,1.0,1.0,0.0,1.0,6.0,17.0,1.0,1.0,36.0,1.0,1.0,0.0,0.0,0.035088,2010,18.0,0.0,3.0,2.0,395.8,417.9,379.8,375.3,475.0,1.0,1.0,2.0,2.0,408.76,0
3,12001686,1200203,12,4,0.0,0.0,0.0,1,1,0,1,0,0,1,1,35.0,1.0,1.0,1.0,1.0,6.0,15.0,1.0,0.0,62.0,1.0,0.0,0.0,0.0,0.095238,2010,17.387097,0.387097,2.419355,2.0,518.503226,589.816129,565.109677,529.129032,702.419355,3.064516,3.612903,1.677419,3.064516,580.995484,1
4,12026530,1200401,12,4,0.0,0.0,0.0,1,1,0,1,0,1,1,1,12.0,1.0,1.0,1.0,1.0,13.0,49.0,1.0,1.0,53.0,1.0,1.0,0.0,0.0,0.142857,2010,17.33871,0.403226,2.435484,2.0,479.062903,531.237097,505.298387,482.86129,608.870968,1.612903,2.306452,1.709677,1.967742,521.466129,0


In [44]:
CENSO_ENEM_19.isnull().sum()

CO_ESCOLA                          0
CO_MUNICIPIO                       0
CO_UF                              0
CO_DEPENDENCIA_ADM                 0
IN_AGUA_INEXISTENTE                0
IN_ESGOTO_INEXISTENTE              0
IN_ENERGIA_INEXISTENTE             0
IN_LABORATORIO_INFORMATICA         0
IN_LABORATORIO_CIENCIAS            0
IN_SALA_ATENDIMENTO_ESPECIAL       0
IN_BIBLIOTECA                      0
IN_SALA_LEITURA                    0
IN_BANHEIRO_FORA_PREDIO            0
IN_BANHEIRO_PNE                    0
IN_DEPENDENCIAS_PNE                0
QT_SALAS_UTILIZADAS                0
QT_EQUIP_TV                        0
QT_EQUIP_DVD                       0
QT_EQUIP_COPIADORA                 0
QT_EQUIP_IMPRESSORA                0
QT_COM_ADMINISTRATIVO            596
QT_COMP_ALUNO                   2374
IN_INTERNET                        0
IN_BANDA_LARGA                     0
QT_FUNCIONARIOS                    0
IN_ALIMENTACAO                     0
IN_ESPECIALIZACAO                  0
I