In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [35]:
# Standardizer used later on the selected features (zero mean, unit variance);
# the arguments below are the library defaults, spelled out for the reader.
scaler = StandardScaler(with_mean=True, with_std=True)

In [36]:
# Load the training and test sets from the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Answer frame: starts with the registration id of every test row;
# the predicted target column is attached once the model has been trained.
answer = df_test[['NU_INSCRICAO']].copy()

In [37]:
# Check that every column of the test set also exists in the training set.
print(set(df_test.columns) <= set(df_train.columns))

True


In [38]:
# Keep only the columns present in the test set, plus the target column.
colunas = [*df_test.columns, 'IN_TREINEIRO']

In [39]:
# Restrict the training set to the chosen columns and show the resulting shape.
df_train = df_train.loc[:, colunas]
df_train.shape

(13730, 44)

In [40]:
# Helper frame for the analysis: one row per training column with its dtype,
# number of missing values, total row count and number of distinct values.
df_aux = pd.DataFrame(
    {
        'Type': df_train.dtypes,
        'Missing': df_train.isna().sum(),
        'Size': df_train.shape[0],
        'Unique': df_train.nunique(),
    }
).assign(**{'Missing_%': lambda t: t['Missing'] / t['Size'] * 100})

# Show the columns with the highest share of missing values first.
df_aux.sort_values(by='Missing_%', ascending=False)

Unnamed: 0,Type,Missing,Size,Unique,Missing_%
TP_DEPENDENCIA_ADM_ESC,float64,9448,13730,4,68.812819
TP_ENSINO,float64,9448,13730,3,68.812819
Q027,object,7373,13730,13,53.699927
NU_NOTA_REDACAO,float64,3597,13730,53,26.198106
NU_NOTA_LC,float64,3597,13730,2774,26.198106
TP_STATUS_REDACAO,float64,3597,13730,9,26.198106
NU_NOTA_COMP1,float64,3597,13730,15,26.198106
NU_NOTA_COMP2,float64,3597,13730,13,26.198106
NU_NOTA_COMP3,float64,3597,13730,12,26.198106
NU_NOTA_COMP4,float64,3597,13730,14,26.198106


In [41]:
# Drop the columns with too many missing values (more than half of the rows
# in the training set).
# The frames are reassigned instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so this cell can be
# re-run without raising a KeyError.
colunas_muito_faltantes = ['TP_DEPENDENCIA_ADM_ESC', 'TP_ENSINO', 'Q027']
df_train = df_train.drop(columns=colunas_muito_faltantes, errors='ignore')
df_test = df_test.drop(columns=colunas_muito_faltantes, errors='ignore')

In [42]:
# Remaining missing values per column, largest count first.
(
    df_train
    .isna()
    .sum()
    .sort_values(ascending=False)
)

NU_NOTA_COMP2        3597
NU_NOTA_LC           3597
NU_NOTA_REDACAO      3597
NU_NOTA_COMP5        3597
NU_NOTA_COMP4        3597
NU_NOTA_COMP3        3597
NU_NOTA_COMP1        3597
TP_STATUS_REDACAO    3597
NU_NOTA_CN           3389
NU_NOTA_CH           3389
IN_SURDEZ               0
IN_CEGUEIRA             0
IN_BAIXA_VISAO          0
TP_ESCOLA               0
TP_ANO_CONCLUIU         0
IN_TREINEIRO            0
TP_ST_CONCLUSAO         0
IN_DISCALCULIA          0
TP_NACIONALIDADE        0
TP_COR_RACA             0
TP_SEXO                 0
NU_IDADE                0
SG_UF_RESIDENCIA        0
CO_UF_RESIDENCIA        0
IN_DISLEXIA             0
TP_PRESENCA_LC          0
IN_SABATISTA            0
IN_GESTANTE             0
IN_IDOSO                0
TP_PRESENCA_CN          0
TP_PRESENCA_CH          0
Q047                    0
TP_PRESENCA_MT          0
TP_LINGUA               0
Q001                    0
Q002                    0
Q006                    0
Q024                    0
Q025        

In [43]:
# Fill the remaining missing values with zero.
# Train and test must be imputed with the same value, otherwise a filled-in
# feature would mean something different at prediction time than at fit time.
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [52]:
# Correlation of every numeric variable with the target, sorted ascending.
# Non-numeric columns (e.g. the NU_INSCRICAO id strings) are excluded
# explicitly: recent pandas versions no longer drop them silently inside
# DataFrame.corr() and raise on string data instead.
corr = (
    df_train
    .select_dtypes(include=['number', 'bool'])
    .corr()['IN_TREINEIRO']
    .sort_values()
)

NU_IDADE          -0.295091
TP_ANO_CONCLUIU   -0.257710
TP_ESCOLA         -0.244562
TP_ST_CONCLUSAO    0.533983
IN_TREINEIRO       1.000000
Name: IN_TREINEIRO, dtype: float64

In [45]:
# Selected features: columns whose correlation with the target is stronger
# than 0.2 in absolute value (the target itself is included, with corr = 1).
selecao = corr[corr.abs() > 0.2].index

In [46]:
# Keep only the selected columns (features plus target) in the training set.
df_train = df_train.loc[:, selecao]

In [47]:
# Split the training frame into the feature matrix and the target vector.
X_train = df_train.drop(columns=['IN_TREINEIRO'])
y_train = df_train['IN_TREINEIRO']

In [53]:
# Test features: the same columns, in the same order, as the training features.
X_test = df_test.loc[:, X_train.columns]

Unnamed: 0,NU_IDADE,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ST_CONCLUSAO
0,19,3,1,1
1,24,4,1,1
2,16,0,1,3
3,17,0,2,2
4,19,1,1,1
...,...,...,...,...
4565,17,0,3,2
4566,20,2,1,1
4567,22,5,1,1
4568,19,0,2,2


In [55]:
# Standardize the features: the scaler statistics are learned from the
# training set only and then applied to both the training and the test set.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [56]:
# Instantiate the Random Forest classifier (fixed seed for reproducibility).
rf_params = {'n_estimators': 100, 'random_state': 42}
classifier = RandomForestClassifier(**rf_params)

In [57]:
# Train the model on the standardized training features.
classifier.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [58]:
# Predict the target for every row of the test set.
y_pred_test = classifier.predict(X=X_test)

In [59]:
# Attach the predictions to the answer frame and display it.
answer = answer.assign(IN_TREINEIRO=y_pred_test)
answer

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
0,ba0cc30ba34e7a46764c09dfc38ed83d15828897,0
1,177f281c68fa032aedbd842a745da68490926cd2,0
2,6cf0d8b97597d7625cdedc7bdb6c0f052286c334,1
3,5c356d810fa57671402502cd0933e5601a2ebf1e,0
4,df47c07bd881c2db3f38c6048bf77c132ad0ceb3,0
...,...,...
4565,361b7fcd8867119550fe2af5aa729ffad89a7cf5,0
4566,d8a0e4c9e29494cc9bba2422bd79333931475ee1,0
4567,3f1c3388244df8d6521e983a809292d9f3bca643,0
4568,1778e9c4cef591beb6b986d191d15ed05de816b0,0


In [61]:
# Write the submission file: header row included (pandas default), no index column.
answer.to_csv('answer.csv', index=False)