## Importando as bibliotecas

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np
pd.set_option("display.max_rows", 200)
pd.set_option('display.max_columns', 500)

### Leitura do arquivo de treino do dataset

In [2]:
train = pd.read_csv('train.csv')

### Verificando a correlação das variáveis com a target

In [3]:
train.corr().round(4)['NU_NOTA_MT']

Unnamed: 0                     0.0065
NU_ANO                            NaN
CO_MUNICIPIO_RESIDENCIA        0.1089
CO_UF_RESIDENCIA               0.1075
NU_IDADE                      -0.0615
TP_ESTADO_CIVIL               -0.0354
TP_COR_RACA                   -0.1574
TP_NACIONALIDADE              -0.0350
CO_MUNICIPIO_NASCIMENTO        0.1134
CO_UF_NASCIMENTO               0.1122
TP_ST_CONCLUSAO               -0.0879
TP_ANO_CONCLUIU                0.0290
TP_ESCOLA                      0.0693
TP_ENSINO                     -0.0864
IN_TREINEIRO                  -0.0223
CO_ESCOLA                      0.1021
CO_MUNICIPIO_ESC               0.1025
CO_UF_ESC                      0.1009
TP_DEPENDENCIA_ADM_ESC         0.3732
TP_LOCALIZACAO_ESC            -0.0530
TP_SIT_FUNC_ESC                0.0263
IN_BAIXA_VISAO                 0.0007
IN_CEGUEIRA                       NaN
IN_SURDEZ                     -0.0152
IN_DEFICIENCIA_AUDITIVA       -0.0148
IN_SURDO_CEGUEIRA                 NaN
IN_DEFICIENC

### Selecionando as variáveis para o modelo. A de menor correlação foi  NU_NOTA_COMP1 = 0.299

In [4]:
colunas = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
           'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'NU_NOTA_COMP1',
           'NU_NOTA_MT']

In [5]:
# dataframe somente com as variáveis escolhidas
train = train[colunas]

In [6]:
# verificando a correlação no novo dataframe
train.corr().round(4)

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,NU_NOTA_COMP1,NU_NOTA_MT
NU_NOTA_CN,1.0,0.6095,0.5457,0.4129,0.4294,0.4077,0.4163,0.4599,0.3577,0.5849
NU_NOTA_CH,0.6095,1.0,0.693,0.4881,0.5105,0.4888,0.4948,0.5506,0.4438,0.5296
NU_NOTA_LC,0.5457,0.693,1.0,0.4533,0.4754,0.4674,0.4417,0.5129,0.4306,0.4947
NU_NOTA_COMP2,0.4129,0.4881,0.4533,1.0,0.8754,0.808,0.6559,0.9183,0.7567,0.3356
NU_NOTA_COMP3,0.4294,0.5105,0.4754,0.8754,1.0,0.8272,0.6643,0.9258,0.7595,0.3503
NU_NOTA_COMP4,0.4077,0.4888,0.4674,0.808,0.8272,1.0,0.635,0.9145,0.8245,0.3423
NU_NOTA_COMP5,0.4163,0.4948,0.4417,0.6559,0.6643,0.635,1.0,0.8162,0.5586,0.3433
NU_NOTA_REDACAO,0.4599,0.5506,0.5129,0.9183,0.9258,0.9145,0.8162,1.0,0.8627,0.3794
NU_NOTA_COMP1,0.3577,0.4438,0.4306,0.7567,0.7595,0.8245,0.5586,0.8627,1.0,0.2994
NU_NOTA_MT,0.5849,0.5296,0.4947,0.3356,0.3503,0.3423,0.3433,0.3794,0.2994,1.0


#### Verificando mais informações do dataframe

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13730 entries, 0 to 13729
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   NU_NOTA_CN       10341 non-null  float64
 1   NU_NOTA_CH       10341 non-null  float64
 2   NU_NOTA_LC       10133 non-null  float64
 3   NU_NOTA_COMP2    10133 non-null  float64
 4   NU_NOTA_COMP3    10133 non-null  float64
 5   NU_NOTA_COMP4    10133 non-null  float64
 6   NU_NOTA_COMP5    10133 non-null  float64
 7   NU_NOTA_REDACAO  10133 non-null  float64
 8   NU_NOTA_COMP1    10133 non-null  float64
 9   NU_NOTA_MT       10133 non-null  float64
dtypes: float64(10)
memory usage: 1.0 MB


### Verificando estatísticas descritivas das variáveis

In [8]:
train.describe().round(2)

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,NU_NOTA_COMP1,NU_NOTA_MT
count,10341.0,10341.0,10133.0,10133.0,10133.0,10133.0,10133.0,10133.0,10133.0,10133.0
mean,473.5,529.66,516.47,111.92,106.82,113.12,77.37,529.05,119.81,482.5
std,71.09,73.73,68.69,33.74,34.34,33.48,43.07,154.29,29.85,99.83
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,419.9,480.4,468.1,100.0,80.0,100.0,40.0,440.0,120.0,408.9
50%,459.8,532.0,520.9,120.0,120.0,120.0,80.0,540.0,120.0,461.2
75%,514.5,581.2,564.9,120.0,120.0,120.0,100.0,600.0,140.0,537.6
max,806.4,807.0,763.6,200.0,200.0,200.0,200.0,1000.0,200.0,952.0


#### Verificando a quantidade de nulos nas variáveis

In [9]:
train.isnull().sum()

NU_NOTA_CN         3389
NU_NOTA_CH         3389
NU_NOTA_LC         3597
NU_NOTA_COMP2      3597
NU_NOTA_COMP3      3597
NU_NOTA_COMP4      3597
NU_NOTA_COMP5      3597
NU_NOTA_REDACAO    3597
NU_NOTA_COMP1      3597
NU_NOTA_MT         3597
dtype: int64

### Tratando os valores faltantes preenchendo com -1

In [10]:
train.fillna(-1, inplace=True)

#### Selecionando a target do modelo

In [11]:
y_train = train.NU_NOTA_MT
y_train

0        399.4
1        459.8
2         -1.0
3         -1.0
4         -1.0
         ...  
13725    403.2
13726    452.4
13727    398.0
13728    386.6
13729    428.9
Name: NU_NOTA_MT, Length: 13730, dtype: float64

#### Selecionando as variáveis independente

In [12]:
X_train = train
X_train.drop(columns=['NU_NOTA_MT'], inplace=True)

In [13]:
# Verificando quais são as variáveis independentes
X_train.columns

Index(['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
       'NU_NOTA_COMP1'],
      dtype='object')

In [14]:
colunas

['NU_NOTA_CN',
 'NU_NOTA_CH',
 'NU_NOTA_LC',
 'NU_NOTA_COMP2',
 'NU_NOTA_COMP3',
 'NU_NOTA_COMP4',
 'NU_NOTA_COMP5',
 'NU_NOTA_REDACAO',
 'NU_NOTA_COMP1',
 'NU_NOTA_MT']

### Criando lista com as colunas que foram usadas para a parte do treino, sabendo que tem que ser igual para o teste

In [15]:
# criando uma cópia da lista
colunas_test = colunas.copy()

# removendo a última variável da lista, neste caso é NU_NOTA_MT, que no caso é a target e não existe no teste
colunas_test.pop()

# adicionando o NU_INSCRICAO
colunas_test.append('NU_INSCRICAO')

### Leitura do arquivo de teste do dataset

In [16]:
test = pd.read_csv('test.csv', usecols=colunas_test)

#### Salvando a lista de identificação dos participantes do enem para montar o arquivo ao final do modelo

In [17]:
num_inscricao = test['NU_INSCRICAO']

#### Verificando a quantidade de nulos nas variáveis

In [18]:
test.isnull().sum()

NU_INSCRICAO          0
NU_NOTA_CN         1134
NU_NOTA_CH         1134
NU_NOTA_LC         1199
NU_NOTA_COMP1      1199
NU_NOTA_COMP2      1199
NU_NOTA_COMP3      1199
NU_NOTA_COMP4      1199
NU_NOTA_COMP5      1199
NU_NOTA_REDACAO    1199
dtype: int64

### Tratando os valores faltantes preenchendo com -1, igual ao treino

In [19]:
test.fillna(-1, inplace=True)
test

Unnamed: 0,NU_INSCRICAO,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,464.8,443.5,431.8,120.0,80.0,80.0,100.0,40.0,420.0
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,391.1,491.1,548.0,120.0,120.0,120.0,120.0,100.0,580.0
2,b38a03232f43b11c9d0788abaf060f7366053b6d,595.9,622.7,613.6,80.0,40.0,40.0,80.0,80.0,320.0
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,715494628a50142ce8cb17191cfe6d0f3cae0934,592.9,492.6,571.4,100.0,80.0,60.0,80.0,0.0,320.0
...,...,...,...,...,...,...,...,...,...,...
4571,dac0f22429c7f8e3931d0abaf5dfc8e5c772a48b,398.3,558.2,511.6,120.0,120.0,120.0,100.0,40.0,500.0
4572,a75fa8770257e7c9368d059fe53d9ef431f4bdef,427.6,579.7,471.1,100.0,100.0,100.0,120.0,100.0,520.0
4573,655fa6306720ff16e825903b5422a46608a77545,639.2,643.8,604.9,160.0,140.0,120.0,140.0,80.0,640.0
4574,1f4bc3e3d56212d500625bfe8ac78ccff4362293,427.1,467.9,540.2,140.0,80.0,80.0,140.0,80.0,520.0


#### Criando coluna target no dataframe de teste, valores nulos pois o modelo irá fazer a predição

In [20]:
test['NU_NOTA_MT'] = None

#### Selecionando a target do modelo

In [21]:
y_test = test['NU_NOTA_MT']
y_test

0       None
1       None
2       None
3       None
4       None
        ... 
4571    None
4572    None
4573    None
4574    None
4575    None
Name: NU_NOTA_MT, Length: 4576, dtype: object

#### Selecionando as variáveis independente

In [22]:
# excluindo NU_INSCRICAO, porque não faz sentido para o modelo
test.drop(columns=['NU_INSCRICAO'], inplace=True)
X_test = test

In [23]:
X_test.drop(columns=['NU_NOTA_MT'], inplace=True)
X_test

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO
0,464.8,443.5,431.8,120.0,80.0,80.0,100.0,40.0,420.0
1,391.1,491.1,548.0,120.0,120.0,120.0,120.0,100.0,580.0
2,595.9,622.7,613.6,80.0,40.0,40.0,80.0,80.0,320.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,592.9,492.6,571.4,100.0,80.0,60.0,80.0,0.0,320.0
...,...,...,...,...,...,...,...,...,...
4571,398.3,558.2,511.6,120.0,120.0,120.0,100.0,40.0,500.0
4572,427.6,579.7,471.1,100.0,100.0,100.0,120.0,100.0,520.0
4573,639.2,643.8,604.9,160.0,140.0,120.0,140.0,80.0,640.0
4574,427.1,467.9,540.2,140.0,80.0,80.0,140.0,80.0,520.0


## Instanciando modelo RandomForestRegressor

In [24]:
regr = RandomForestRegressor(n_estimators=2000, n_jobs=-1, max_depth=100, min_samples_leaf=4,
                             min_samples_split=10, random_state=0)

### Treinando modelo

In [25]:
%%time
regr.fit(X_train, y_train)

CPU times: user 1min 35s, sys: 353 ms, total: 1min 35s
Wall time: 15.7 s


RandomForestRegressor(max_depth=100, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=2000, n_jobs=-1, random_state=0)

### Verificando a métrica R²

#### A métrica R² varia entre 0 e 1, expressa a quantidade da variância dos dados que é explicada pelo modelo. Sendo assim, quanto maior o R², mais explicativo é o modelo, ou seja, melhor ele se ajusta à amostra.

In [26]:
print('R² = {}'.format(regr.score(X_train, y_train).round(2)))

R² = 0.96


### Prevendo a variável target para o teste

In [27]:
y_predict = regr.predict(X_test)
y_predict

array([429.19162362, 448.20404426, 601.5789247 , ..., 664.02614271,
       454.0928351 ,  -0.99995593])

In [28]:
# Tratando para ficar com apenas uma casa decimal
y_predict = y_predict.round(1)

# limitar resultados entre 0 e 1000
y_predict = np.clip(y_predict, 0, 1000)

y_predict

array([429.2, 448.2, 601.6, ..., 664. , 454.1,   0. ])

In [29]:
# Criando uma Serie para as notas previstas
y_predict = pd.Series(y_predict)
y_predict

0       429.2
1       448.2
2       601.6
3         0.0
4       525.6
        ...  
4571    452.3
4572    486.3
4573    664.0
4574    454.1
4575      0.0
Length: 4576, dtype: float64

In [30]:
# Criando dataframe com NUM_INSCRICAO e NU_NOTA_MT (prevista)
answer = pd.concat([num_inscricao, y_predict], axis=1)
# Renomeando coluna
answer.rename(columns={0:'NU_NOTA_MT'}, inplace=True)

In [31]:
answer

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,429.2
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,448.2
2,b38a03232f43b11c9d0788abaf060f7366053b6d,601.6
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,0.0
4,715494628a50142ce8cb17191cfe6d0f3cae0934,525.6
...,...,...
4571,dac0f22429c7f8e3931d0abaf5dfc8e5c772a48b,452.3
4572,a75fa8770257e7c9368d059fe53d9ef431f4bdef,486.3
4573,655fa6306720ff16e825903b5422a46608a77545,664.0
4574,1f4bc3e3d56212d500625bfe8ac78ccff4362293,454.1


## Criando arquivo para submissão do desafio

In [32]:
answer.to_csv('answer.csv',columns=['NU_INSCRICAO', 'NU_NOTA_MT'], sep=',', index=False)