In [76]:
import pandas as pd
import numpy as np

In [77]:
from settings import INITIAL_FEATURES, TARGET
from settings import CATEGORICAL_COLS, NUMERICAL_COLS, SELECTED_FEATURES

from sklearn.pipeline import Pipeline
from transformers import *

<div style="background-color:#ddeedd;padding:20px;">
<h2> Config </h2>
</div>

In [78]:
input_data_path = './data/input/'
train_raw_filename = 'train.csv'
test_raw_filename = 'test.csv'

SEED = 42

<div style="background-color:#ddeedd;padding:20px;">
<h2> Preprocessing Raw Data </h2>
</div>

In [79]:
train = pd.read_csv(input_data_path+train_raw_filename)
test = pd.read_csv(input_data_path+test_raw_filename)

In [80]:
train = train[pd.isnull(train[TARGET])==False]
X = train[SELECTED_FEATURES]
y = train[TARGET]

__Pipeline__

In [81]:
pp_pipe = Pipeline([
    ("1. Treat New Labels", TreatNewLabels(variables = CATEGORICAL_COLS)),
    ("2. Missing Numerical", NumericalNAImputerValue(variables=NUMERICAL_COLS,value=-99,add_column=False)),
    ("3. Missing Categorical",CategoricalMostFrequentImputer(variables=CATEGORICAL_COLS,add_column=False)),
    #("4. Numerical Transformations",PowerTransformations(variables=NUMERICAL_COLS)),
    ("5. Cat Encod - Target Mean",CategoricalTargetEncoder(variables = CATEGORICAL_COLS)),
    #("5. OHE Cat Enc", CategoricalEncoderOHE()),
    ("6. Scaling",AdjustedScaler())
])

In [82]:
pp_pipe.fit(X,y)
X_train = pp_pipe.transform(X)

In [83]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10133 entries, 0 to 10132
Data columns (total 45 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   SG_UF_RESIDENCIA        10133 non-null  float64
 1   TP_SEXO                 10133 non-null  float64
 2   TP_COR_RACA             10133 non-null  float64
 3   TP_NACIONALIDADE        10133 non-null  float64
 4   TP_ESCOLA               10133 non-null  float64
 5   TP_ENSINO               10133 non-null  float64
 6   IN_TREINEIRO            10133 non-null  float64
 7   TP_DEPENDENCIA_ADM_ESC  10133 non-null  float64
 8   TP_PRESENCA_CN          10133 non-null  float64
 9   TP_PRESENCA_CH          10133 non-null  float64
 10  TP_PRESENCA_LC          10133 non-null  float64
 11  CO_PROVA_CN             10133 non-null  float64
 12  CO_PROVA_CH             10133 non-null  float64
 13  CO_PROVA_LC             10133 non-null  float64
 14  CO_PROVA_MT             10133 non-null

In [103]:
X_test = pp_pipe.transform(test[SELECTED_FEATURES])

# GridSearch CV

In [99]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Initiating the Model
model = RandomForestRegressor(random_state=SEED)
# Defining the CV approach to control reproducibility
cv = KFold(5, random_state=SEED)

# Defining the Hyperparameters grid
grid_parameters = {
    "n_estimators":[300,400,500],
    "max_depth":[7,10,12],
    "min_samples_split":[2,4,6],
    "random_state":[SEED]}

# Initiating the Grid Search Object
grid = GridSearchCV(estimator=model,scoring='neg_root_mean_squared_error',
                    param_grid=grid_parameters,
                    cv = cv,verbose=2,
                    n_jobs= -1)
                    
# Train gridsearch 
grid.fit(X_train, y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  6.4min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=False),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbos

In [100]:
# best estimator 
grid.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [101]:
# best score --> avg CV score for best model
grid.best_score_

-72.58168578242275

In [102]:
# best parameters → from best model
grid.best_params_

{'max_depth': 10,
 'min_samples_split': 2,
 'n_estimators': 400,
 'random_state': 42}

# Test Prediction

In [104]:
final_model = grid.best_estimator_
final_model.fit(X_train,y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [105]:
test_predictions = final_model.predict(X_test)
test_predictions

array([429.14031018, 442.94655832, 592.91195371, ..., 674.09753155,
       456.3453116 ,   3.11021343])

__Creating Submission File__

In [106]:
sub_file = test[['NU_INSCRICAO']].copy()

In [110]:
sub_file['NU_NOTA_MT'] = test_predictions
sub_file['NU_NOTA_MT'] = sub_file['NU_NOTA_MT'].apply(lambda x: round(x,1))

In [112]:
sub_file.head()

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,429.1
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,442.9
2,b38a03232f43b11c9d0788abaf060f7366053b6d,592.9
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,2.4
4,715494628a50142ce8cb17191cfe6d0f3cae0934,538.6


In [114]:
sub_file.to_csv('answer.csv',index=False)

## XGBoost

In [133]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Initiating the Model
model = XGBRegressor(random_state=SEED)
# Defining the CV approach to control reproducibility
cv = KFold(5, random_state=SEED)

# Defining the Hyperparameters grid
grid_parameters = {
    "learning_rate":[0.07,0.05],
    "max_depth":[5,7,10],
    "n_estimators":[100,150,200], 
     "objective":['reg:squarederror'], 
    #'min_child_weight': 1,
    'subsample': [1],
    'colsample_bytree': [0.7,0.8,1],
    "random_state":[SEED]
}


# Initiating the Grid Search Object
grid = GridSearchCV(estimator=model,scoring='neg_root_mean_squared_error',
                    param_grid=grid_parameters,
                    cv = cv,verbose=2,
                    n_jobs= -1)
                    
# Train gridsearch 
grid.fit(X_train, y)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  3.2min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=False),
             error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,...
                                    validate_parameters=False, verbosity=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.7, 0.8, 1],
                         'learning_rate': [0.07, 0.05], 'max_depth': [5, 7, 10],
                         'n_estimators': [100, 150, 200],
                         'objec

In [134]:
# best parameters → from best model
grid.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.07,
 'max_depth': 5,
 'n_estimators': 100,
 'objective': 'reg:squarederror',
 'random_state': 42,
 'subsample': 1}

In [135]:
# best score --> avg CV score for best model
grid.best_score_

-72.00957310669064

__Predictions__

In [136]:
final_model = grid.best_estimator_
final_model.fit(X_train,y)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.07, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=42, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [137]:
test_predictions = final_model.predict(X_test)
test_predictions

array([437.4405  , 442.75388 , 587.1726  , ..., 666.36414 , 444.7955  ,
        53.711155], dtype=float32)

In [138]:
sub_file = test[['NU_INSCRICAO']].copy()
sub_file['NU_NOTA_MT'] = test_predictions
sub_file.to_csv('answer.csv',index=False)