## Prediction of candidates who took the Enem 2016 as simulated 

### Importing libs

In [53]:
import pandas as pd
import numpy as np


from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

### Loading database

In [108]:

df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('train.csv')


### Exploratory data analysis

#### Do datasets have the same number of columns?

In [109]:
df_train.shape[1] == df_test.shape[1]

False

In [110]:
print(df_train.columns)
print(df_test.columns)

Index(['Unnamed: 0', 'NU_INSCRICAO', 'NU_ANO', 'CO_MUNICIPIO_RESIDENCIA',
       'NO_MUNICIPIO_RESIDENCIA', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA',
       'NU_IDADE', 'TP_SEXO', 'TP_ESTADO_CIVIL',
       ...
       'Q041', 'Q042', 'Q043', 'Q044', 'Q045', 'Q046', 'Q047', 'Q048', 'Q049',
       'Q050'],
      dtype='object', length=167)
Index(['NU_INSCRICAO', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO',
       'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO', 'TP_DEPENDENCIA_ADM_ESC',
       'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_SURDEZ', 'IN_DISLEXIA',
       'IN_DISCALCULIA', 'IN_SABATISTA', 'IN_GESTANTE', 'IN_IDOSO',
       'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC', 'TP_PRESENCA_MT',
       'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'TP_LINGUA',
       'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
       'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q001', 'Q002',
      

In [111]:
df_train = df_train[['IN_TREINEIRO'] + df_test.columns.to_list()]

In [112]:
df_train.columns

Index(['IN_TREINEIRO', 'NU_INSCRICAO', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA',
       'NU_IDADE', 'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE',
       'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO',
       'TP_DEPENDENCIA_ADM_ESC', 'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_SURDEZ',
       'IN_DISLEXIA', 'IN_DISCALCULIA', 'IN_SABATISTA', 'IN_GESTANTE',
       'IN_IDOSO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
       'TP_PRESENCA_MT', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'TP_LINGUA',
       'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
       'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q001', 'Q002',
       'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047'],
      dtype='object')

#### Checking data types

In [113]:
df_train.dtypes

IN_TREINEIRO                int64
NU_INSCRICAO               object
CO_UF_RESIDENCIA            int64
SG_UF_RESIDENCIA           object
NU_IDADE                    int64
TP_SEXO                    object
TP_COR_RACA                 int64
TP_NACIONALIDADE            int64
TP_ST_CONCLUSAO             int64
TP_ANO_CONCLUIU             int64
TP_ESCOLA                   int64
TP_ENSINO                 float64
TP_DEPENDENCIA_ADM_ESC    float64
IN_BAIXA_VISAO              int64
IN_CEGUEIRA                 int64
IN_SURDEZ                   int64
IN_DISLEXIA                 int64
IN_DISCALCULIA              int64
IN_SABATISTA                int64
IN_GESTANTE                 int64
IN_IDOSO                    int64
TP_PRESENCA_CN              int64
TP_PRESENCA_CH              int64
TP_PRESENCA_LC              int64
TP_PRESENCA_MT              int64
NU_NOTA_CN                float64
NU_NOTA_CH                float64
NU_NOTA_LC                float64
TP_LINGUA                   int64
TP_STATUS_REDA

##### Removing the registration number

In [114]:
df_train = df_train.drop(columns='NU_INSCRICAO')

#### Checking missing values in descending order

In [101]:
df_train.isna().sum().sort_values(ascending=False).head(17)

TP_DEPENDENCIA_ADM_ESC    9448
TP_ENSINO                 9448
Q027                      7373
NU_NOTA_COMP1             3597
NU_NOTA_COMP5             3597
TP_STATUS_REDACAO         3597
NU_NOTA_COMP2             3597
NU_NOTA_COMP3             3597
NU_NOTA_COMP4             3597
NU_NOTA_REDACAO           3597
NU_NOTA_LC                3597
NU_NOTA_CH                3389
NU_NOTA_CN                3389
TP_ESCOLA                    0
IN_BAIXA_VISAO               0
TP_ANO_CONCLUIU              0
IN_CEGUEIRA                  0
dtype: int64

In [102]:
def get_numeric_columns(df):
    
    """ Return int and float columns from pd.DataFrame """
    return df.select_dtypes('int64', 'float64').columns


# imputing missing values with the mean
df_train[get_numeric_columns(df_train)] = df_train[get_numeric_columns(df_train)].fillna(df_train[get_numeric_columns(df_train)].mean())

#### Checking the balance of classes

In [115]:
df_train.nunique()

IN_TREINEIRO                 2
CO_UF_RESIDENCIA            27
SG_UF_RESIDENCIA            27
NU_IDADE                    55
TP_SEXO                      2
TP_COR_RACA                  6
TP_NACIONALIDADE             5
TP_ST_CONCLUSAO              4
TP_ANO_CONCLUIU             11
TP_ESCOLA                    4
TP_ENSINO                    3
TP_DEPENDENCIA_ADM_ESC       4
IN_BAIXA_VISAO               2
IN_CEGUEIRA                  1
IN_SURDEZ                    2
IN_DISLEXIA                  2
IN_DISCALCULIA               2
IN_SABATISTA                 2
IN_GESTANTE                  2
IN_IDOSO                     2
TP_PRESENCA_CN               3
TP_PRESENCA_CH               3
TP_PRESENCA_LC               3
TP_PRESENCA_MT               3
NU_NOTA_CN                2692
NU_NOTA_CH                2978
NU_NOTA_LC                2774
TP_LINGUA                    2
TP_STATUS_REDACAO            9
NU_NOTA_COMP1               15
NU_NOTA_COMP2               13
NU_NOTA_COMP3               12
NU_NOTA_

In [118]:
df_train['IN_TREINEIRO'].value_counts()

0    11947
1     1783
Name: IN_TREINEIRO, dtype: int64

In [119]:
df_train['NU_IDADE'].value_counts().sort_values(ascending=False)

17    2630
18    2278
19    1455
20    1036
16    1018
21     757
22     600
23     496
24     387
25     342
15     272
26     241
27     235
29     193
28     177
30     147
31     143
32     136
34     116
35     115
33     114
36      87
37      84
40      72
38      69
39      67
41      47
42      44
44      36
48      36
45      34
47      34
43      34
46      31
51      21
50      21
14      21
54      17
52      17
53      12
49      11
57       8
56       8
61       5
65       4
58       4
59       4
62       3
66       3
55       3
64       1
60       1
13       1
63       1
67       1
Name: NU_IDADE, dtype: int64

## Feature Selection and Model Building

In [103]:
X = df_train[get_numeric_columns(df_train.drop(columns='IN_TREINEIRO'))]
y = df_train['IN_TREINEIRO']


In [104]:
# using model Random Forest Classifier
model = RandomForestClassifier(n_estimators = 200, random_state= 42, n_jobs = -1)

# Use RFE to select the most important features
rfe = RFE(estimator=model, n_features_to_select=2)
# create pipeline
pipeline = Pipeline(steps=[('s',rfe),('m',model)])

# evaluate model
# validator with different randomization in each repetition.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluating a score with F1-Score
n_scores = cross_val_score(pipeline, X, y, scoring='f1', cv=cv, n_jobs=-1, error_score='raise')
# fit the model on all available data
pipeline.fit(X, y)
# report performance
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))


Accuracy: 0.988 (0.004)


### Build model using unprocessed data for comparison

In [105]:
df_answer = pd.DataFrame()
df_answer[ID] = df_test['NU_INSCRICAO']



df_test = df_test[get_numeric_columns(df_test)].fillna(df_test[get_numeric_columns(df_test)].mean())

yhat = pipeline.predict(df_test)

df_answer['IN_TREINEIRO'] = list(yhat)

#df_answer.head()
# yhat

### Exporting results

In [107]:
df_answer.to_csv('answer.csv', index=False)