In [1]:
import time
import pandas as pd
import numpy as np
import warnings
import feather
warnings.filterwarnings('ignore')
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
pd.set_option('display.max_columns', None)

There are many insignificant features to our work, like codes, PK, FK and out of context. 
Based on prior works and experience of especialists, will be taken off nonsense features. 
A study was carried out on the evolution and correspondence of these resources in the ENEM and Census
data for the period 2009-2018. See (featuresOrganisaton.xls)

In [2]:
teacher = ['CO_PESSOA_FISICA', 'CO_ENTIDADE','IN_ESPECIALIZACAO','IN_MESTRADO','IN_DOUTORADO']

school = ['NU_ANO_CENSO','CO_ENTIDADE', 'CO_MUNICIPIO','CO_UF','TP_DEPENDENCIA','IN_AGUA_INEXISTENTE',
                  'IN_ESGOTO_INEXISTENTE','IN_ENERGIA_INEXISTENTE',
                   'IN_LABORATORIO_INFORMATICA','IN_LABORATORIO_CIENCIAS','IN_SALA_ATENDIMENTO_ESPECIAL',
                   'IN_BIBLIOTECA','IN_SALA_LEITURA','IN_BANHEIRO_FORA_PREDIO',
                   'IN_BANHEIRO_PNE','IN_DEPENDENCIAS_PNE','NU_SALAS_UTILIZADAS','NU_EQUIP_TV',
                   'NU_EQUIP_DVD','NU_EQUIP_COPIADORA','NU_EQUIP_IMPRESSORA','NU_COMP_ADMINISTRATIVO',
                   'NU_COMP_ALUNO','IN_INTERNET','IN_BANDA_LARGA','NU_FUNCIONARIOS','IN_ALIMENTACAO']

enem = ['NU_ANO','CO_ESCOLA','TP_ENSINO','NU_IDADE','TP_SEXO','TP_COR_RACA','TP_ST_CONCLUSAO','NU_NOTA_CN',
        'NU_NOTA_CH','NU_NOTA_LC','NU_NOTA_MT','NU_NOTA_REDACAO','Q001','Q002','Q005','Q006'
]

In [3]:
#Students of ENEM
E = pd.read_csv('~/data/enem/2015/DADOS/MICRODADOS_ENEM_2015.csv', sep=',', encoding="iso-8859-2", usecols = enem)

In [4]:
#School census data
CE =    pd.read_csv("~/data/censo/2015/DADOS/ESCOLAS.CSV", sep='|', encoding="iso-8859-2", usecols = school)

In [5]:
#Teacher census data by Braazilian mesoregion
CD_CO = pd.read_csv("~/data/censo/2015/DADOS/DOCENTES_CO.CSV", sep='|', encoding="iso-8859-2", usecols = teacher)
CD_NE = pd.read_csv("~/data/censo/2015/DADOS/DOCENTES_NORDESTE.CSV", sep='|', encoding="iso-8859-2", usecols = teacher)
CD_N =  pd.read_csv("~/data/censo/2015/DADOS/DOCENTES_NORTE.CSV", sep='|', encoding="iso-8859-2", usecols = teacher)
CD_SE = pd.read_csv("~/data/censo/2015/DADOS/DOCENTES_SUDESTE.CSV", sep='|', encoding="iso-8859-2", usecols = teacher)
CD_S =  pd.read_csv("~/data/censo/2015/DADOS/DOCENTES_SUL.CSV", sep='|', encoding="iso-8859-2", usecols = teacher)

#All teacher census data
CT = pd.concat([CD_CO, CD_NE, CD_N, CD_SE, CD_S])

In [6]:
ENEM = E.copy()
C_SCHOOL = CE.copy()
C_TEACHER = CT.copy()

In [7]:
#ordering
ENEM = ENEM[enem]

newNames = [
'CO_ANO',
'CO_ESCOLA',
'TP_ENSINO',
'NU_IDADE',
'TP_SEXO',
'TP_COR_RACA',
'TP_ST_CONCLUSAO',
'NU_NOTA_CN',
'NU_NOTA_CH',
'NU_NOTA_LC',
'NU_NOTA_MT',
'NU_NOTA_REDACAO',
'EDU_PAI',
'EDU_MAE',
'QT_PESSOAS_CASA',
'RENDA_MENSAL'
]

#Rename
ENEM.columns = newNames

print('checking year',ENEM.CO_ANO.unique())
print('checking key',ENEM.CO_ESCOLA.dtypes)

checking year [2015]
checking key float64


In [8]:
C_SCHOOL = C_SCHOOL[school]

newNames = ['ANO_CENSO',
'CO_ESCOLA',
 'CO_MUNICIPIO',
 'CO_UF',
 'CO_DEPENDENCIA_ADM',
 'IN_AGUA_INEXISTENTE',
 'IN_ESGOTO_INEXISTENTE',
 'IN_ENERGIA_INEXISTENTE',
 'IN_LABORATORIO_INFORMATICA',
 'IN_LABORATORIO_CIENCIAS',
 'IN_SALA_ATENDIMENTO_ESPECIAL',
 'IN_BIBLIOTECA',
 'IN_SALA_LEITURA',
 'IN_BANHEIRO_FORA_PREDIO',
 'IN_BANHEIRO_PNE',
 'IN_DEPENDENCIAS_PNE',
 'QT_SALAS_UTILIZADAS',
 'QT_EQUIP_TV',
 'QT_EQUIP_DVD',
 'QT_EQUIP_COPIADORA',
 'QT_EQUIP_IMPRESSORA',
 'QT_COM_ADMINISTRATIVO',
 'QT_COMP_ALUNO',
 'IN_INTERNET',
 'IN_BANDA_LARGA',
 'QT_FUNCIONARIOS',
 'IN_ALIMENTACAO'
]

#Rename 
C_SCHOOL.columns =  newNames

print('Checking year of censo',C_SCHOOL.ANO_CENSO.unique())
C_SCHOOL.drop('ANO_CENSO', axis=1, inplace=True)
print('checking key',C_SCHOOL.CO_ESCOLA.dtypes)

Checking year of censo [2015]
checking key int64


In [9]:
C_TEACHER = C_TEACHER[teacher]

# New default column names for all Datasets
newNames = [
'CO_PROFESSOR',
'CO_ESCOLA',  
'IN_ESPECIALIZACAO',
'IN_MESTRADO',
'IN_DOUTORADO'
]
#It's necessary drop duplicates whose are teachers in the same school but in different classes. 
#This way, the teacher appear just one time in each school. 
C_TEACHER.columns = newNames
C_TEACHER.drop_duplicates(subset=['CO_PROFESSOR', 'CO_ESCOLA'], inplace=True)
C_TEACHER.drop('CO_PROFESSOR', axis=1, inplace = True)

print('checking key',C_TEACHER.CO_ESCOLA.dtypes)

checking key int64


### Minimum Scope Definition

In [10]:
ENEM.dropna(inplace = True, axis=0, subset = ['CO_ESCOLA'])

In [11]:
#should care about the nulls into scope filter features?
filters = (['TP_ST_CONCLUSAO', 'TP_ENSINO'])
ENEM[filters].isnull().sum()
#NO, IT'S LOWER

TP_ST_CONCLUSAO     0
TP_ENSINO          29
dtype: int64

In [12]:
print('drop out',((ENEM.TP_ENSINO!= 1)|(ENEM.TP_ENSINO!= 3)).sum(), 'who not attend regular and prof approachs schools')
ENEM = ENEM.loc[((ENEM.TP_ENSINO== 1)|(ENEM.TP_ENSINO== 3))]
ENEM.drop('TP_ENSINO', inplace = True, axis =1)


drop out 1649258 who not attend regular and prof approachs schools


In [13]:
fields = (['NU_NOTA_CN', 'NU_NOTA_CH','NU_NOTA_LC','NU_NOTA_MT', 'NU_NOTA_REDACAO'])
for i in fields:
    print ('null:', i, ENEM[i].isnull().sum())
    print ('zero:', i, (ENEM[i]== 0).sum())
    
    #, "and", ENEM.I==0)


null: NU_NOTA_CN 202748
zero: NU_NOTA_CN 416
null: NU_NOTA_CH 202748
zero: NU_NOTA_CH 171
null: NU_NOTA_LC 221025
zero: NU_NOTA_LC 670
null: NU_NOTA_MT 221025
zero: NU_NOTA_MT 1105
null: NU_NOTA_REDACAO 361
zero: NU_NOTA_REDACAO 237685


In [14]:
ENEM.dropna(inplace = True, axis=0, subset=fields)
ENEM = ENEM.loc[~(ENEM[fields] == 0).any(axis=1)]

In [15]:
#ONLY STUDENTS AT  17 -19 YEAR OLD. WE WANT TO GET ONLY REGULAR STUDENTS THAT ARE IN LAST YEAR OF SECONDARY SCHOOL. 
#THE MICRODADOS HAVE A FIELD TO CONTROL IT. BUT IT IS NOT CONFIDENCE,DUE A TURN OUT TEAACHERS THAT EXTEND ACADEMIC YEAR
print('null', ENEM.NU_IDADE.isnull().sum())
#firts, fill NA with the mean of student school
ENEM.NU_IDADE = ENEM.groupby(ENEM['CO_ESCOLA'])['NU_IDADE'].apply(lambda x: x.fillna(round(x.mean())))
ENEM.NU_IDADE = ENEM.NU_IDADE.astype('int32')
print('taking off',((ENEM.NU_IDADE <17)|(ENEM.NU_IDADE>19)).sum())
ENEM = ENEM.loc[(ENEM.NU_IDADE >16) &  (ENEM.NU_IDADE<20)]
ENEM.NU_IDADE.value_counts()

null 24
taking off 194763


17    675128
18    403958
19    123058
Name: NU_IDADE, dtype: int64

In [16]:
ENEM[fields].describe()

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO
count,1202144.0,1202144.0,1202144.0,1202144.0,1202144.0
mean,481.6616,559.3159,508.5818,480.1588,554.3546
std,74.86859,71.54767,71.44878,110.1492,130.0314
min,335.3,339.8,303.3,285.4,40.0
25%,425.5,513.2,461.3,399.9,480.0
50%,472.2,564.5,511.3,457.4,540.0
75%,527.9,609.0,559.1,534.9,620.0
max,869.0,850.6,825.8,1008.3,1000.0


### Transform features to default categories

In [17]:
fields = ['TP_SEXO', 'TP_COR_RACA', 'EDU_PAI', 'EDU_MAE','QT_PESSOAS_CASA', 'RENDA_MENSAL']
ENEM[fields].isnull().sum()

TP_SEXO              0
TP_COR_RACA          0
EDU_PAI            157
EDU_MAE            158
QT_PESSOAS_CASA    159
RENDA_MENSAL       159
dtype: int64

In [18]:
for i in fields:
    ENEM[i].fillna(ENEM[i].mode()[0], inplace=True)

In [19]:
for i in fields:
    print(i,ENEM[i].value_counts().index)

TP_SEXO Index(['F', 'M'], dtype='object')
TP_COR_RACA Int64Index([1, 3, 2, 4, 0, 5, 6], dtype='int64')
EDU_PAI Index(['E', 'B', 'C', 'D', 'H', 'F', 'G', 'A'], dtype='object')
EDU_MAE Index(['E', 'D', 'C', 'B', 'F', 'G', 'H', 'A'], dtype='object')
QT_PESSOAS_CASA Float64Index([ 4.0,  3.0,  5.0,  6.0,  2.0,  7.0,  8.0,  1.0,  9.0, 10.0, 11.0,
              12.0, 13.0, 14.0, 20.0, 15.0, 16.0, 18.0, 17.0, 19.0],
             dtype='float64')
RENDA_MENSAL Index(['C', 'B', 'D', 'F', 'E', 'G', 'H', 'I', 'J', 'K', 'A', 'Q', 'N', 'O',
       'L', 'P', 'M'],
      dtype='object')


In [20]:
#Order relation, is better for numeric categories.
categorical = ['TP_SEXO','EDU_PAI', 'EDU_MAE', 'RENDA_MENSAL']
for i in categorical:
    codes, uniques = pd.factorize(ENEM[i], sort=True)
    ENEM[i] = codes

In [21]:
ENEM['EDU_PAI_TEMP'] = ENEM['EDU_PAI']
ENEM['EDU_MAE_TEMP'] = ENEM['EDU_MAE']
ENEM['QT_PESSOAS_CASA_TEMP'] = ENEM['QT_PESSOAS_CASA']
ENEM['RENDA_MENSAL_TEMP'] = ENEM['RENDA_MENSAL']


In [22]:
# Paring with default categories

ENEM.loc[ENEM['EDU_PAI']==0, 'EDU_PAI_TEMP']=0
ENEM.loc[ENEM['EDU_PAI']==1, 'EDU_PAI_TEMP']=0
ENEM.loc[ENEM['EDU_PAI']==2, 'EDU_PAI_TEMP']=1
ENEM.loc[ENEM['EDU_PAI']==3, 'EDU_PAI_TEMP']=2
ENEM.loc[ENEM['EDU_PAI']==4, 'EDU_PAI_TEMP']=3
ENEM.loc[ENEM['EDU_PAI']==5, 'EDU_PAI_TEMP']=4
ENEM.loc[ENEM['EDU_PAI']==6, 'EDU_PAI_TEMP']=5
ENEM.loc[ENEM['EDU_PAI']==7, 'EDU_PAI_TEMP']=0

ENEM.loc[ENEM['EDU_MAE']==0, 'EDU_MAE_TEMP']=0
ENEM.loc[ENEM['EDU_MAE']==1, 'EDU_MAE_TEMP']=0
ENEM.loc[ENEM['EDU_MAE']==2, 'EDU_MAE_TEMP']=1
ENEM.loc[ENEM['EDU_MAE']==3, 'EDU_MAE_TEMP']=2
ENEM.loc[ENEM['EDU_MAE']==4, 'EDU_MAE_TEMP']=3
ENEM.loc[ENEM['EDU_MAE']==5, 'EDU_MAE_TEMP']=4
ENEM.loc[ENEM['EDU_MAE']==6, 'EDU_MAE_TEMP']=5
ENEM.loc[ENEM['EDU_MAE']==7, 'EDU_MAE_TEMP']=0



ENEM.loc[ENEM['QT_PESSOAS_CASA']==1, 'QT_PESSOAS_CASA_TEMP']=0
ENEM.loc[ENEM['QT_PESSOAS_CASA']==2, 'QT_PESSOAS_CASA_TEMP']=1
ENEM.loc[ENEM['QT_PESSOAS_CASA']==3, 'QT_PESSOAS_CASA_TEMP']=1
ENEM.loc[ENEM['QT_PESSOAS_CASA']==4, 'QT_PESSOAS_CASA_TEMP']=2
ENEM.loc[ENEM['QT_PESSOAS_CASA']==5, 'QT_PESSOAS_CASA_TEMP']=2
ENEM.loc[ENEM['QT_PESSOAS_CASA']==6, 'QT_PESSOAS_CASA_TEMP']=2
ENEM.loc[ENEM['QT_PESSOAS_CASA']==7, 'QT_PESSOAS_CASA_TEMP']=2
ENEM.loc[ENEM['QT_PESSOAS_CASA']>7, 'QT_PESSOAS_CASA_TEMP']=3

ENEM.loc[ENEM['RENDA_MENSAL']==0, 'RENDA_MENSAL_TEMP']=0
ENEM.loc[ENEM['RENDA_MENSAL']==1, 'RENDA_MENSAL_TEMP']=1
ENEM.loc[ENEM['RENDA_MENSAL']==2, 'RENDA_MENSAL_TEMP']=2
ENEM.loc[ENEM['RENDA_MENSAL']==3, 'RENDA_MENSAL_TEMP']=2
ENEM.loc[(ENEM['RENDA_MENSAL']>3) & (ENEM['RENDA_MENSAL']<8), 'RENDA_MENSAL_TEMP']=3
ENEM.loc[(ENEM['RENDA_MENSAL']>7) & (ENEM['RENDA_MENSAL']<13), 'RENDA_MENSAL_TEMP']=4
ENEM.loc[ENEM['RENDA_MENSAL']>12, 'RENDA_MENSAL_TEMP']=5


ENEM.loc[ENEM['TP_COR_RACA']==6, 'TP_COR_RACA']=0


In [23]:
#back to original columns
ENEM['EDU_PAI'] = ENEM['EDU_PAI_TEMP']
ENEM['EDU_MAE'] = ENEM['EDU_MAE_TEMP']
ENEM['QT_PESSOAS_CASA'] = ENEM['QT_PESSOAS_CASA_TEMP']
ENEM['RENDA_MENSAL'] = ENEM['RENDA_MENSAL_TEMP']

In [24]:
#drop temps

ENEM.drop(['EDU_PAI_TEMP', 'EDU_MAE_TEMP', 'QT_PESSOAS_CASA_TEMP','RENDA_MENSAL_TEMP'], axis=1, inplace=True)

In [25]:
print('Total Students at ENEM:', ENEM.shape[0], 'of', ENEM.drop_duplicates('CO_ESCOLA').shape[0], 'schools')
print('Total Schools at Census:', C_SCHOOL.shape[0])

            

#######################################
print('########### Only about Schools at ENEM:')

schoolsEnem = ENEM['CO_ESCOLA'].dropna().unique()
schoolsCenso = C_SCHOOL['CO_ESCOLA'].dropna().unique()
schoolsTeacher = C_TEACHER['CO_ESCOLA'].dropna().unique()
schoolsEnem2 = ENEM['CO_ESCOLA'].dropna()

enem_censo = np.setdiff1d(schoolsEnem ,schoolsCenso)
print('Schools at ENEM out of School Census', sum(np.isin(schoolsEnem, schoolsCenso, invert=True)))

print('Schools at ENEM out of Teacher Census', sum(np.isin(schoolsEnem, schoolsTeacher, invert=True)))
print('Schools at ENEM and School Census out of Teacher Census', sum(np.isin(enem_censo, schoolsTeacher)))

print('Students Wasted*****', sum(np.isin(schoolsEnem2, schoolsCenso, invert=True)))

Total Students at ENEM: 1202144 of 29628 schools
Total Schools at Census: 272996
########### Only about Schools at ENEM:
Schools at ENEM out of School Census 161
Schools at ENEM out of Teacher Census 745
Schools at ENEM and School Census out of Teacher Census 0
Students Wasted***** 2012


In [26]:
print('number of teacher who work at an school listed at ENEM and do not have study level assigned:',
      C_TEACHER['CO_ESCOLA'].isin(ENEM['CO_ESCOLA']).sum())

#lets fill with lower level

C_TEACHER['IN_ESPECIALIZACAO'].fillna(0, inplace = True)
C_TEACHER['IN_MESTRADO'].fillna(0, inplace = True)
C_TEACHER['IN_DOUTORADO'].fillna(0, inplace = True)

number of teacher who work at an school listed at ENEM and do not have study level assigned: 1056985


### New Features

ENEM

In [27]:
#Final Media, Target
ENEM['NU_NOTA_GERAL'] = (ENEM.NU_NOTA_CN + ENEM.NU_NOTA_CH + ENEM.NU_NOTA_LC + ENEM.NU_NOTA_MT + ENEM.NU_NOTA_REDACAO)/5

SCHOOL CENSUS - Itś better include after merge all datasets years.


TEACHER CENSUS

In [28]:
C_TEACHER.isnull().sum()

CO_ESCOLA            0
IN_ESPECIALIZACAO    0
IN_MESTRADO          0
IN_DOUTORADO         0
dtype: int64

In [29]:
print('number of teacher who work at an school listed at ENEM and do not have study level assigned:',
      C_TEACHER['CO_ESCOLA'].isin(ENEM['CO_ESCOLA']).sum())

#lets fill with lower level

C_TEACHER['IN_ESPECIALIZACAO'].fillna(0, inplace = True)
C_TEACHER['IN_MESTRADO'].fillna(0, inplace = True)
C_TEACHER['IN_DOUTORADO'].fillna(0, inplace = True)

number of teacher who work at an school listed at ENEM and do not have study level assigned: 1056985


In [30]:
C_TEACHER['TITULACAO'] = 0

C_TEACHER.loc[(C_TEACHER.IN_ESPECIALIZACAO==0.0) & (C_TEACHER.IN_MESTRADO==0.0)& 
             (C_TEACHER.IN_DOUTORADO ==0.0), 'TITULACAO'] = 0

C_TEACHER.loc[(C_TEACHER.IN_ESPECIALIZACAO==1.0) & (C_TEACHER.IN_MESTRADO==0.0)& 
             (C_TEACHER.IN_DOUTORADO ==0.0), 'TITULACAO'] = 1

C_TEACHER.loc[(C_TEACHER.IN_ESPECIALIZACAO==1.0) & (C_TEACHER.IN_MESTRADO==1.0)& 
             (C_TEACHER.IN_DOUTORADO ==0.0), 'TITULACAO'] = 2

C_TEACHER.loc[(C_TEACHER.IN_ESPECIALIZACAO==0.0) & (C_TEACHER.IN_MESTRADO==1.0)& 
             (C_TEACHER.IN_DOUTORADO ==0.0), 'TITULACAO'] = 2

C_TEACHER.loc[(C_TEACHER.IN_DOUTORADO ==1.0), 'TITULACAO'] = 3

#Rebuild level education information as indicator of unique level.
C_TEACHER['NU_GRADUACAO'] = 0
C_TEACHER['NU_ESPECIALIZACAO'] = 0
C_TEACHER['NU_MESTRADO'] = 0
C_TEACHER['NU_DOUTORADO'] = 0

C_TEACHER.loc[C_TEACHER.TITULACAO == 0, 'NU_GRADUACAO']= 1
C_TEACHER.loc[C_TEACHER.TITULACAO == 1, 'NU_ESPECIALIZACAO']= 1
C_TEACHER.loc[C_TEACHER.TITULACAO == 2, 'NU_MESTRADO']= 1
C_TEACHER.loc[C_TEACHER.TITULACAO == 3, 'NU_DOUTORADO']= 1


C_TEACHER['NU_GRADUACAO'] = C_TEACHER.groupby('CO_ESCOLA')['NU_GRADUACAO'].transform(sum)
C_TEACHER['NU_ESPECIALIZACAO'] = C_TEACHER.groupby('CO_ESCOLA')['NU_ESPECIALIZACAO'].transform(sum)
C_TEACHER['NU_MESTRADO'] = C_TEACHER.groupby('CO_ESCOLA')['NU_MESTRADO'].transform(sum)
C_TEACHER['NU_DOUTORADO'] = C_TEACHER.groupby('CO_ESCOLA')['NU_DOUTORADO'].transform(sum)



#Calculating weighted indice of teacher education
SUM= C_TEACHER.groupby('CO_ESCOLA')['TITULACAO'].transform(sum)
COUNT =  C_TEACHER.groupby('CO_ESCOLA')['TITULACAO'].transform('count')
C_TEACHER['TITULACAO'] = (SUM)/(COUNT*3)  

##arranjar isso dai



#transformin to school grain
C_TEACHER.drop_duplicates('CO_ESCOLA', inplace = True)

## STUDENT GRAIN 

In [31]:
# teacher to Schoool Grain
print('C_SCHOOL',C_SCHOOL.shape)
print('C_TEACHER',C_TEACHER.shape)
CENSO19 = pd.merge(C_SCHOOL, C_TEACHER, on = 'CO_ESCOLA', how = 'inner')
print('C_SCHOOL + C_TEACHER',CENSO19.shape)

#Now, all to Student Grain
print('ENEM',ENEM.shape)
ENEM_CENSO_19 = pd.merge(ENEM, CENSO19, on = 'CO_ESCOLA', how = 'inner')
print('FINAL',ENEM_CENSO_19.shape)

ENEM_CENSO_19.to_csv('STUDENT.csv')

C_SCHOOL (272996, 26)
C_TEACHER (188689, 9)
C_SCHOOL + C_TEACHER (188689, 34)
ENEM (1202144, 16)
FINAL (1197462, 49)


In [32]:
ENEM_CENSO_19.head()

Unnamed: 0,CO_ANO,CO_ESCOLA,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_ST_CONCLUSAO,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO,EDU_PAI,EDU_MAE,QT_PESSOAS_CASA,RENDA_MENSAL,NU_NOTA_GERAL,CO_MUNICIPIO,CO_UF,CO_DEPENDENCIA_ADM,IN_AGUA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_LABORATORIO_INFORMATICA,IN_LABORATORIO_CIENCIAS,IN_SALA_ATENDIMENTO_ESPECIAL,IN_BIBLIOTECA,IN_SALA_LEITURA,IN_BANHEIRO_FORA_PREDIO,IN_BANHEIRO_PNE,IN_DEPENDENCIAS_PNE,QT_SALAS_UTILIZADAS,QT_EQUIP_TV,QT_EQUIP_DVD,QT_EQUIP_COPIADORA,QT_EQUIP_IMPRESSORA,QT_COM_ADMINISTRATIVO,QT_COMP_ALUNO,IN_INTERNET,IN_BANDA_LARGA,QT_FUNCIONARIOS,IN_ALIMENTACAO,IN_ESPECIALIZACAO,IN_MESTRADO,IN_DOUTORADO,TITULACAO,NU_GRADUACAO,NU_ESPECIALIZACAO,NU_MESTRADO,NU_DOUTORADO
0,2015,35006373.0,17,0,1,2,652.1,671.9,609.3,685.5,820.0,4,4,1.0,4,687.76,3518800,35,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,14.0,5.0,0.0,3.0,4.0,7.0,27.0,1.0,1.0,66.0,1.0,0.0,0.0,0.0,0.148148,25,20,0,0
1,2015,35006373.0,17,0,3,2,418.6,579.6,489.8,537.8,680.0,3,4,2.0,2,541.16,3518800,35,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,14.0,5.0,0.0,3.0,4.0,7.0,27.0,1.0,1.0,66.0,1.0,0.0,0.0,0.0,0.148148,25,20,0,0
2,2015,35006373.0,17,0,2,2,488.9,488.2,418.8,477.3,560.0,5,1,2.0,3,486.64,3518800,35,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,14.0,5.0,0.0,3.0,4.0,7.0,27.0,1.0,1.0,66.0,1.0,0.0,0.0,0.0,0.148148,25,20,0,0
3,2015,35006373.0,17,0,1,2,377.6,606.9,459.6,464.0,640.0,3,3,1.0,2,509.62,3518800,35,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,14.0,5.0,0.0,3.0,4.0,7.0,27.0,1.0,1.0,66.0,1.0,0.0,0.0,0.0,0.148148,25,20,0,0
4,2015,35006373.0,17,0,1,2,482.7,569.4,584.6,429.5,820.0,3,3,2.0,3,577.24,3518800,35,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,14.0,5.0,0.0,3.0,4.0,7.0,27.0,1.0,1.0,66.0,1.0,0.0,0.0,0.0,0.148148,25,20,0,0


## SCHOOL GRAIN

In [33]:
fields = ['TP_SEXO', 'TP_COR_RACA', 'EDU_PAI', 'EDU_MAE','QT_PESSOAS_CASA', 'RENDA_MENSAL', 'NU_IDADE', 'NU_NOTA_CN',
             'NU_NOTA_CH','NU_NOTA_LC','NU_NOTA_MT','NU_NOTA_REDACAO','NU_NOTA_GERAL' ]

for i in fields:   
    
            ENEM[i] = ENEM.groupby('CO_ESCOLA')[i].transform('mean')

In [34]:
# SCHOOL GRAIN AND SHOW THE NEW NOTA DISTRIBUTION
ENEM.drop_duplicates('CO_ESCOLA', inplace=True)
#print((ENEM.TARGET==0).sum()/(ENEM.TARGET.count())*100, '% lowers quartis')

In [35]:
#recalculating target for new  Nota Geral distribution 
ENEM['TARGET'] = pd.qcut (ENEM.NU_NOTA_GERAL, 4, labels = [1,2,3,4]).map(lambda x : 0 if x!=4 else 1) 
print((ENEM.TARGET==0).sum()/(ENEM.TARGET.count())*100, '% lowers quartis')

75.0 % lowers quartis


In [36]:
print('censo before',CENSO19.shape)
print('enem_school before', ENEM.shape)
CENSO_ENEM_19 = pd.merge(CENSO19, ENEM, on='CO_ESCOLA', how='inner')
print(CENSO_ENEM_19.shape)

CENSO_ENEM_19.to_csv('SCHOOL.csv', index=False)

censo before (188689, 34)
enem_school before (29628, 17)
(28883, 50)


In [37]:
CENSO_ENEM_19.head()

Unnamed: 0,CO_ESCOLA,CO_MUNICIPIO,CO_UF,CO_DEPENDENCIA_ADM,IN_AGUA_INEXISTENTE,IN_ESGOTO_INEXISTENTE,IN_ENERGIA_INEXISTENTE,IN_LABORATORIO_INFORMATICA,IN_LABORATORIO_CIENCIAS,IN_SALA_ATENDIMENTO_ESPECIAL,IN_BIBLIOTECA,IN_SALA_LEITURA,IN_BANHEIRO_FORA_PREDIO,IN_BANHEIRO_PNE,IN_DEPENDENCIAS_PNE,QT_SALAS_UTILIZADAS,QT_EQUIP_TV,QT_EQUIP_DVD,QT_EQUIP_COPIADORA,QT_EQUIP_IMPRESSORA,QT_COM_ADMINISTRATIVO,QT_COMP_ALUNO,IN_INTERNET,IN_BANDA_LARGA,QT_FUNCIONARIOS,IN_ALIMENTACAO,IN_ESPECIALIZACAO,IN_MESTRADO,IN_DOUTORADO,TITULACAO,NU_GRADUACAO,NU_ESPECIALIZACAO,NU_MESTRADO,NU_DOUTORADO,CO_ANO,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_ST_CONCLUSAO,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO,EDU_PAI,EDU_MAE,QT_PESSOAS_CASA,RENDA_MENSAL,NU_NOTA_GERAL,TARGET
0,23181737,2310308,23,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,3,0,0,0,2015,17.0,0.0,3.0,2,349.9,405.2,466.0,473.7,420.0,0.0,0.0,2.0,1.0,422.96,0
1,33524211,3305554,33,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,34.0,1.0,1.0,0.0,0.0,0.027778,11,1,0,0,2015,17.0,1.0,2.0,2,523.9,496.1,338.6,387.0,420.0,4.0,2.0,2.0,3.0,433.12,0
2,11046740,1100049,11,2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,10.0,2.0,3.0,3.0,6.0,7.0,25.0,1.0,1.0,62.0,1.0,0.0,0.0,0.0,0.121212,14,8,0,0,2015,17.601852,0.398148,2.268519,2,470.165741,550.939815,486.316667,457.622222,509.296296,1.287037,1.990741,1.768519,2.12963,494.868148,0
3,26075687,2606002,26,2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,15.0,0.0,0.0,0.0,0.0,5.0,13.0,1.0,1.0,64.0,1.0,0.0,0.0,0.0,0.10101,23,10,0,0,2015,17.601695,0.381356,2.466102,2,453.822881,537.981356,493.455932,452.102542,518.983051,1.194915,1.788136,1.686441,1.415254,491.269153,0
4,43071570,4309555,43,2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,10.0,2.0,1.0,2.0,2.0,5.0,30.0,1.0,1.0,35.0,1.0,0.0,0.0,0.0,0.079365,16,5,0,0,2015,17.263158,0.368421,1.0,2,469.736842,559.784211,523.847368,512.363158,575.789474,1.526316,1.947368,1.473684,2.894737,528.304211,0


In [38]:
CENSO_ENEM_19.isnull().sum()

CO_ESCOLA                       0
CO_MUNICIPIO                    0
CO_UF                           0
CO_DEPENDENCIA_ADM              0
IN_AGUA_INEXISTENTE             0
IN_ESGOTO_INEXISTENTE           0
IN_ENERGIA_INEXISTENTE          0
IN_LABORATORIO_INFORMATICA      0
IN_LABORATORIO_CIENCIAS         0
IN_SALA_ATENDIMENTO_ESPECIAL    0
IN_BIBLIOTECA                   0
IN_SALA_LEITURA                 0
IN_BANHEIRO_FORA_PREDIO         0
IN_BANHEIRO_PNE                 0
IN_DEPENDENCIAS_PNE             0
QT_SALAS_UTILIZADAS             0
QT_EQUIP_TV                     0
QT_EQUIP_DVD                    0
QT_EQUIP_COPIADORA              0
QT_EQUIP_IMPRESSORA             0
QT_COM_ADMINISTRATIVO           0
QT_COMP_ALUNO                   0
IN_INTERNET                     0
IN_BANDA_LARGA                  0
QT_FUNCIONARIOS                 0
IN_ALIMENTACAO                  0
IN_ESPECIALIZACAO               0
IN_MESTRADO                     0
IN_DOUTORADO                    0
TITULACAO     