In [1]:
#@title **Main**
from functions import *
from urllib.request import urlopen
import json
import numpy as np
import pandas as pd
import os
import copy
import math
import statistics
import sklearn.metrics as metrics
 
# Avoid truncating the data shown when frames are displayed in Jupyter
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Module-level constant holding the JSON dictionary with all the data
DATA = None

# Load the JSON data: use the local copy when it exists, otherwise download it.
# Context managers make sure the file handle / HTTP response is always closed.
if os.path.exists('./out/dataout.json'):
    with open('./out/dataout.json', 'r') as data_file:
        DATA = json.load(data_file)
else:
    with urlopen('http://nutriexcel.cl/UMDU/dataout_v2.json') as data_url:
        DATA = json.loads(data_url.read())
 
# Base labels of the columns.
# NOTE(review): '#' looks like a placeholder for the lab number and '$' for the
# question number (compare 'dis_lab1' and 'sum(qat$_lab1)' used in the data
# preparation cell) — confirm against functions.py.
# The trailing comments name the aggregation applied to that column group in
# the data preparation cell.
LABELS_BASE = {
    # Student parameters (Target)
    'p1':                            ['p1'],
    'p2':                            ['p2'],
    'np':                            ['np'],
    'p1p2':                          ['p1p2'], # Mean of p1p2 and p2p2
    'p2p2':                          ['p2p2'],
    
    # Lab parameters (Features)
    'grade':                         ['g_lab#'],
    'attempts':                      ['a_lab#'],
    'usedtime':                      ['ut_lab#'],
    'activetime':                    ['act_lab#'],
    'disconnections':                ['dis_lab#'],      # log
    'compilationtime':               ['ct_lab#'],
    'runtimedebuggingtime':          ['rt_lab#'],
    'compilationtimeratio':          ['ctr_lab#'],
    'runtimedebuggingtimeratio':     ['rtr_lab#'],
    'errorsreductionratio':          ['err_lab#'],
    'compilationerrorsratio':        ['cer_lab#'],
    'activequartiles':               ['actq1_lab#','actq2_lab#','actq3_lab#'],
    'questionsdifficulty':           ['qd$_lab#'],
    'questionsgrades':               ['qg$_lab#'],      # Mean
    'questionsattempts':             ['qat$_lab#'],     # Sum - Max   # log
    'questionsactivetime':           ['qact$_lab#'],    # Mean
    'questionsavgtime':              ['qavt$_lab#'],    # Mean
    'questionsmaxerrors':            ['qme$_lab#'],     # Max
    'questionsmaxconsecutiveerrors': ['qmce$_lab#'],    # Max
    'questionsmaxsimilarityratio':   ['qmsr$_lab#'],    # Mean
    'questionscorrectness':          ['qc$_lab#']       # Mean
}
 
 
# Number of questions per lab (keys are the lab numbers as strings)
LABS_LENGTHS = {
    '1': 7,
    '2': 6,
    '3': 6,
    '4': 5,
    '5': 3
}

In [2]:
#@title **Data preparation**

def _prepare_lab(lab):
    """Build the raw and the aggregated dataframe for a single lab.

    The same recipe used to be copy-pasted once per lab; it now lives here so
    the four labs cannot drift apart.

    Parameters
    ----------
    lab : int
        Lab number (1-4 in this notebook).

    Returns
    -------
    tuple
        ``(lab_all, lab_df)``: ``lab_all`` keeps every per-question column and
        only has p1/p2 averaged; ``lab_df`` has the question columns
        aggregated, the extra ``avgtime_lab<lab>`` column, and is rounded to
        4 decimals.

    Notes
    -----
    ``get_custom_dataframe``, ``remove_col``, ``apply`` and ``norm_log`` are
    presumably provided by ``from functions import *`` — their exact semantics
    are not visible here, so the calls and their order are kept exactly as in
    the original cells.
    """
    dis_col = 'dis_lab{}'.format(lab)
    act_col = 'act_lab{}'.format(lab)
    qat_sum_col = 'sum(qat$_lab{})'.format(lab)
    avgtime_col = 'avgtime_lab{}'.format(lab)

    # Get dataframe
    lab_all = get_custom_dataframe(DATA, [lab], ['p1','p2'], 'all', labels=True, index=None)
    lab_df = copy.deepcopy(lab_all)

    # Remove questionsdifficulty
    remove_col(lab_df, 'qd?')

    # Group columns
    lab_all = apply(lab_all, ['p1','p2'], statistics.mean)
    lab_df = apply(lab_df, ['p1','p2'], statistics.mean)
    lab_df = apply(lab_df, dis_col, norm_log)
    lab_df = apply(lab_df, 'qg?', statistics.mean)
    # Attempts: keep the sum, the log-normalised sum and the max
    lab_df = apply(lab_df, 'qat?', sum, replace=False)
    lab_df = apply(lab_df, qat_sum_col, norm_log, replace=False)
    lab_df = apply(lab_df, 'qat?', max)
    lab_df = apply(lab_df, 'qact?', statistics.mean)
    lab_df = apply(lab_df, 'qavt?', statistics.mean)
    lab_df = apply(lab_df, 'qme?', max)
    lab_df = apply(lab_df, 'qmce?', max)
    lab_df = apply(lab_df, 'qmsr?', statistics.mean)
    lab_df = apply(lab_df, 'qc?', statistics.mean)

    # Active time per attempt. Anything that is not strictly positive
    # (NaN from 0/0, zero, negative) becomes 0. Vectorised, so it no longer
    # depends on the frame having a 0..n-1 index like the old aux[i] loop did.
    ratio = lab_df[act_col] / lab_df[qat_sum_col]
    lab_df[avgtime_col] = ratio.where(ratio > 0, 0)

    return lab_all, lab_df.round(4)


datalab1_all, datalab1 = _prepare_lab(1)
datalab2_all, datalab2 = _prepare_lab(2)
datalab3_all, datalab3 = _prepare_lab(3)
datalab4_all, datalab4 = _prepare_lab(4)

In [3]:
#@title **Parameters**

# Objective vector: name of the column the models predict.
# NOTE(review): presumably the column produced by averaging p1 and p2 in the
# data preparation cell — confirm the naming in functions.py.
TARGET = 'mean(p$)'
# NOTE(review): not referenced in the visible cells; presumably a
# normalisation mode consumed elsewhere — confirm or remove.
NORM_TYPE = 'col'
# Intended number of features to keep per lab.
# NOTE(review): confirm the selector grids below agree with this value.
N_FEATURES = 5
 
 
# Import needed libraries ----------------------------------------
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
 
# Random state for train_test_split.
# NOTE(review): the visible train_test_split call does not pass this value.
random_state = None # Random state for train_test_split

In [4]:
#@title **Scale features**
 
# Scale features -------------------------------------------------
from sklearn.preprocessing import StandardScaler

def _standardize_features(lab_df, scaler):
    """Return ``lab_df`` with every column except TARGET standardized.

    The scaler is still fitted on the whole frame (target included), as
    before, so the fitted scaler objects keep the same column layout. The
    untouched TARGET column is placed first, followed by the scaled features
    in their original order.

    Unlike the previous positional ``columns[1:]`` slice, the target is
    dropped by name, and the scaled frame reuses ``lab_df``'s index so the
    join aligns rows even if the index is not 0..n-1.
    """
    scaled = pd.DataFrame(scaler.fit_transform(lab_df),
                          columns=lab_df.columns,
                          index=lab_df.index)
    return lab_df[[TARGET]].join(scaled.drop(columns=[TARGET]))


# One scaler per lab so each keeps its own fitted statistics
scaler1 = StandardScaler()
scaler2 = StandardScaler()
scaler3 = StandardScaler()
scaler4 = StandardScaler()
# NOTE(review): scaler_all is not used in the visible cells; kept in case a
# later cell relies on it.
scaler_all = StandardScaler()

datalab1 = _standardize_features(datalab1, scaler1)
datalab2 = _standardize_features(datalab2, scaler2)
datalab3 = _standardize_features(datalab3, scaler3)
datalab4 = _standardize_features(datalab4, scaler4)

In [5]:
#@title **Grid/Random-SearchCV process**   
 
def run_process(dataset, grid_cv, target=TARGET):
    """Fit a cross-validated search on ``dataset`` and report the kept features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the ``target`` column plus the feature columns.
    grid_cv : search object
        Object exposing ``fit``, ``cv_results_`` and ``best_estimator_``
        (the notebook passes ``GridSearchCV`` instances).
    target : str
        Name of the column to predict.

    Returns
    -------
    list of str
        Feature columns flagged by ``get_support()`` of the first step of the
        best pipeline. When the best estimator has no such selector step
        (e.g. a bare estimator), every feature column is returned.
    """
    X, y = dataset.drop(target, axis=1), np.array(dataset[target])

    grid_cv.fit(X, y)
    # Highest mean test score among all searched candidates
    print('R2:', max(grid_cv.cv_results_['mean_test_score']))

    try:
        selector = grid_cv.best_estimator_.steps[0][-1]
        selected_features = X.columns[selector.get_support()]
    except (AttributeError, IndexError, TypeError):
        # No pipeline / no selector step: fall back to all features. X is used
        # instead of dataset.columns[1:] so the result stays correct even when
        # the target is not the first column.
        return list(X.columns)

    return list(selected_features)
    
def run_process_obsolete(dataset, grid_cv, target=TARGET):
    """Older variant of ``run_process`` that also reports a hold-out evaluation.

    After fitting the search it re-fits ``grid_cv.best_estimator_`` in place on
    a fresh 67/33 train/test split and prints R2 and MSE on the test part,
    the best parameters and the selected features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the ``target`` column plus the feature columns.
    grid_cv : search object
        Object exposing ``fit``, ``cv_results_``, ``best_estimator_`` and
        ``best_params_``.
    target : str
        Name of the column to predict.

    Returns
    -------
    list of str
        Features kept by the first pipeline step, or every feature column if
        any part of the hold-out evaluation fails (best effort).
    """
    X, y = dataset.drop(target, axis=1), np.array(dataset[target])

    grid_cv.fit(X, y)

    try:
        print('R2-test-fit:', max(grid_cv.cv_results_['mean_test_score']))
    except Exception:
        # Reporting only: a missing score table must not abort the run.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
        pass

    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        grid_cv.best_estimator_.fit(X_train, y_train)
        print('R2-test', grid_cv.best_estimator_.score(X_test, y_test))
        print('MSE-test', metrics.mean_squared_error(y_test, grid_cv.best_estimator_.predict(X_test)))

        print('Best params:', grid_cv.best_params_)

        selected_features = X.columns[grid_cv.best_estimator_.steps[0][-1].get_support()]
        print('Selected features:', list(selected_features))

        return list(selected_features)
    except Exception:
        # Best-effort fallback: keep all features. X is used instead of
        # dataset.columns[1:] so the target is excluded wherever it sits.
        return list(X.columns)

In [6]:
#@title **SVR - Recursive Features Elimination**

from sklearn.svm import SVR

# Selector: recursive feature elimination ranked by a linear-kernel SVR
sel_estimator = SVR(kernel='linear')
selector = RFE(sel_estimator)
# Final regressor; its hyper-parameters are searched below
estimator = SVR()

pipe = Pipeline([
    ('sel', selector),
    ('est', estimator)
])

params = {
    # Use the shared constant instead of repeating the literal 5
    'sel__n_features_to_select' : [N_FEATURES],
    'sel__step'                 : [1,2],
    'est__C'                    : [0.01,0.1,1],
    'est__gamma'                : ['scale','auto'],
    'est__kernel'               : ['linear','poly','rbf']
}

# The splitter does not shuffle; the cells that call run_process shuffle the
# rows themselves before fitting.
grid_svr = GridSearchCV(estimator=pipe,
                        param_grid=params,
                        scoring='r2',
                        verbose=1,
                        n_jobs=-1,
                        return_train_score=True,
                        cv=KFold(n_splits=10, shuffle=False))

In [7]:
# Lab 1: reorder the rows reproducibly (the KFold splitter is built with
# shuffle=False), run the SVR search and display the returned features.
datalab1_shuffle = datalab1.sample(frac=1, random_state=1)
datalab1_shuffle = datalab1_shuffle.reset_index(drop=True)
selected_features_svr_1 = run_process(dataset=datalab1_shuffle, grid_cv=grid_svr)
selected_features_svr_1

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    7.3s


R2: 0.008046114472525257


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   12.3s finished


['g_lab1', 'actq2_lab1', 'actq3_lab1', 'mean(qg$_lab1)', 'mean(qmsr$_lab1)']

In [8]:
# Lab 2: seeded row shuffle, then the SVR search; the last line shows the
# returned feature list.
datalab2_shuffle = datalab2.sample(frac=1, random_state=1)
datalab2_shuffle = datalab2_shuffle.reset_index(drop=True)
selected_features_svr_2 = run_process(dataset=datalab2_shuffle, grid_cv=grid_svr)
selected_features_svr_2

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   31.8s finished


R2: 0.03677130801986146


['rt_lab2',
 'mean(qg$_lab2)',
 'mean(qact$_lab2)',
 'mean(qmsr$_lab2)',
 'sum(qat$_lab2)']

In [9]:
# Lab 3: seeded row shuffle, then the SVR search; the last line shows the
# returned feature list.
datalab3_shuffle = datalab3.sample(frac=1, random_state=1)
datalab3_shuffle = datalab3_shuffle.reset_index(drop=True)
selected_features_svr_3 = run_process(dataset=datalab3_shuffle, grid_cv=grid_svr)
selected_features_svr_3

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   30.9s finished


R2: 0.12918450879232526


['act_lab3',
 'cer_lab3',
 'mean(qg$_lab3)',
 'mean(qmsr$_lab3)',
 'norm_log(sum(qat$_lab3))']

In [10]:
# Lab 4: seeded row shuffle, then the SVR search; the last line shows the
# returned feature list.
datalab4_shuffle = datalab4.sample(frac=1, random_state=1)
datalab4_shuffle = datalab4_shuffle.reset_index(drop=True)
selected_features_svr_4 = run_process(dataset=datalab4_shuffle, grid_cv=grid_svr)
selected_features_svr_4

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   23.5s finished


R2: 0.2190664446977674


['g_lab4',
 'a_lab4',
 'cer_lab4',
 'mean(qmsr$_lab4)',
 'norm_log(sum(qat$_lab4))']

In [11]:
# Plain SVR without a selector step, searched over the same C / gamma / kernel
# values. Note that this rebinds `grid_svr`, replacing the pipeline-based
# search defined earlier.
estimator = SVR()

params = dict(
    C=[0.01, 0.1, 1],
    gamma=['scale', 'auto'],
    kernel=['linear', 'poly', 'rbf'],
)

grid_svr = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [12]:
# Combination of labs 1 and 2: target plus the SVR-selected features of each
# lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_svr_1]
dataset = dataset.join(datalab2[selected_features_svr_2])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s


R2: 0.08083328709952423


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.8s finished


['g_lab1',
 'actq2_lab1',
 'actq3_lab1',
 'mean(qg$_lab1)',
 'mean(qmsr$_lab1)',
 'rt_lab2',
 'mean(qg$_lab2)',
 'mean(qact$_lab2)',
 'mean(qmsr$_lab2)',
 'sum(qat$_lab2)']

In [16]:
# Combination of labs 1, 2 and 3: target plus the SVR-selected features of
# each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_svr_1]
dataset = dataset.join(datalab2[selected_features_svr_2])
dataset = dataset.join(datalab3[selected_features_svr_3])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s


R2: 0.15100361966348766


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.1s finished


['g_lab1',
 'actq2_lab1',
 'actq3_lab1',
 'mean(qg$_lab1)',
 'mean(qmsr$_lab1)',
 'rt_lab2',
 'mean(qg$_lab2)',
 'mean(qact$_lab2)',
 'mean(qmsr$_lab2)',
 'sum(qat$_lab2)',
 'act_lab3',
 'cer_lab3',
 'mean(qg$_lab3)',
 'mean(qmsr$_lab3)',
 'norm_log(sum(qat$_lab3))']

In [14]:
# Combination of labs 1, 2, 3 and 4: target plus the SVR-selected features of
# each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_svr_1]
dataset = dataset.join(datalab2[selected_features_svr_2])
dataset = dataset.join(datalab3[selected_features_svr_3])
dataset = dataset.join(datalab4[selected_features_svr_4])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s


R2: 0.26583993519783283


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.1s finished


['g_lab1',
 'actq2_lab1',
 'actq3_lab1',
 'mean(qg$_lab1)',
 'mean(qmsr$_lab1)',
 'rt_lab2',
 'mean(qg$_lab2)',
 'mean(qact$_lab2)',
 'mean(qmsr$_lab2)',
 'sum(qat$_lab2)',
 'act_lab3',
 'cer_lab3',
 'mean(qg$_lab3)',
 'mean(qmsr$_lab3)',
 'norm_log(sum(qat$_lab3))',
 'g_lab4',
 'a_lab4',
 'cer_lab4',
 'mean(qmsr$_lab4)',
 'norm_log(sum(qat$_lab4))']

In [7]:
#@title **Random Forest Regressor** 

from sklearn.ensemble import  RandomForestRegressor
 
# Selector estimator options: GradientBoostingRegressor / RandomForestRegressor / SVR(kernel='linear')
# The seed is pinned to 1 here rather than taken from the module-level `random_state`.
sel_estimator = GradientBoostingRegressor(random_state=1)
# Selector options: RFE / SelectFromModel
selector = RFE(sel_estimator)
# NOTE(review): both the forest and the grid search below request n_jobs=-1;
# check whether the nested parallelism is intended.
estimator = RandomForestRegressor(random_state=1, n_jobs=-1)

pipe = Pipeline([
    ('sel', selector),
    ('est', estimator)
])

# NOTE(review): 'mse' / 'mae' and max_features='auto' are option names from
# older scikit-learn releases; check them against the installed version
# before upgrading.
params = {
    'sel__estimator__learning_rate': [0.05,0.1,0.2],
    # Use the shared constant instead of repeating the literal 5
    'sel__n_features_to_select'    : [N_FEATURES],
    'est__n_estimators'            : [50,100,200,400],
    'est__criterion'               : ['mse','mae'],
    'est__max_features'            : ['auto','sqrt','log2']
}

# The splitter does not shuffle; the cells that call run_process shuffle the
# rows themselves before fitting.
grid_rfr = GridSearchCV(estimator=pipe,
                        param_grid=params,
                        scoring='r2',
                        verbose=1,
                        n_jobs=-1,
                        return_train_score=True,
                        cv=KFold(n_splits=10, shuffle=False))

In [8]:
# Lab 1: reorder the rows reproducibly (the KFold splitter is built with
# shuffle=False), run the random-forest search and display the returned features.
datalab1_shuffle = datalab1.sample(frac=1, random_state=1)
datalab1_shuffle = datalab1_shuffle.reset_index(drop=True)
selected_features_rfr_1 = run_process(dataset=datalab1_shuffle, grid_cv=grid_rfr)
selected_features_rfr_1

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 11.9min finished


R2: -0.08559909492717728


['act_lab1', 'rtr_lab1', 'actq1_lab1', 'actq3_lab1', 'mean(qc$_lab1)']

In [10]:
# Lab 2: seeded row shuffle, then the random-forest search; the last line
# shows the returned feature list.
datalab2_shuffle = datalab2.sample(frac=1, random_state=1)
datalab2_shuffle = datalab2_shuffle.reset_index(drop=True)
selected_features_rfr_2 = run_process(dataset=datalab2_shuffle, grid_cv=grid_rfr)
selected_features_rfr_2

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  8.6min finished


R2: -0.03735556481679305


['rtr_lab2', 'actq1_lab2', 'actq2_lab2', 'mean(qc$_lab2)', 'avgtime_lab2']

In [11]:
# Lab 3: seeded row shuffle, then the random-forest search; the last line
# shows the returned feature list.
datalab3_shuffle = datalab3.sample(frac=1, random_state=1)
datalab3_shuffle = datalab3_shuffle.reset_index(drop=True)
selected_features_rfr_3 = run_process(dataset=datalab3_shuffle, grid_cv=grid_rfr)
selected_features_rfr_3

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 11.0min finished


R2: 0.0970994300785136


['cer_lab3', 'actq1_lab3', 'actq2_lab3', 'max(qat$_lab3)', 'mean(qmsr$_lab3)']

In [13]:
# Lab 4: seeded row shuffle, then the random-forest search; the last line
# shows the returned feature list.
datalab4_shuffle = datalab4.sample(frac=1, random_state=1)
datalab4_shuffle = datalab4_shuffle.reset_index(drop=True)
selected_features_rfr_4 = run_process(dataset=datalab4_shuffle, grid_cv=grid_rfr)
selected_features_rfr_4

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  9.8min finished


R2: 0.15544919898639079


['ut_lab4', 'cer_lab4', 'actq2_lab4', 'mean(qg$_lab4)', 'mean(qact$_lab4)']

In [14]:
# Plain random forest without a selector step, searched over the same forest
# options. Note that this rebinds `grid_rfr`, replacing the pipeline-based
# search defined earlier.
estimator = RandomForestRegressor(random_state=1, n_jobs=-1)

params = dict(
    n_estimators=[50, 100, 200, 400],
    criterion=['mse', 'mae'],
    max_features=['auto', 'sqrt', 'log2'],
)

grid_rfr = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [15]:
# Combination of labs (1 and 2): target plus the random-forest-selected
# features of each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_rfr_1]
dataset = dataset.join(datalab2[selected_features_rfr_2])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   57.7s finished


R2: 0.060978454463522305


['act_lab1',
 'rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qc$_lab1)',
 'rtr_lab2',
 'actq1_lab2',
 'actq2_lab2',
 'mean(qc$_lab2)',
 'avgtime_lab2']

In [16]:
# Labs 1, 2 and 3 combined: target plus the random-forest-selected features
# of each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_rfr_1]
dataset = dataset.join(datalab2[selected_features_rfr_2])
dataset = dataset.join(datalab3[selected_features_rfr_3])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   59.0s finished


R2: 0.16840420101769527


['act_lab1',
 'rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qc$_lab1)',
 'rtr_lab2',
 'actq1_lab2',
 'actq2_lab2',
 'mean(qc$_lab2)',
 'avgtime_lab2',
 'cer_lab3',
 'actq1_lab3',
 'actq2_lab3',
 'max(qat$_lab3)',
 'mean(qmsr$_lab3)']

In [17]:
# Labs 1, 2, 3 and 4 combined: target plus the random-forest-selected
# features of each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_rfr_1]
dataset = dataset.join(datalab2[selected_features_rfr_2])
dataset = dataset.join(datalab3[selected_features_rfr_3])
dataset = dataset.join(datalab4[selected_features_rfr_4])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   59.4s finished


R2: 0.2619950206533569


['act_lab1',
 'rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qc$_lab1)',
 'rtr_lab2',
 'actq1_lab2',
 'actq2_lab2',
 'mean(qc$_lab2)',
 'avgtime_lab2',
 'cer_lab3',
 'actq1_lab3',
 'actq2_lab3',
 'max(qat$_lab3)',
 'mean(qmsr$_lab3)',
 'ut_lab4',
 'cer_lab4',
 'actq2_lab4',
 'mean(qg$_lab4)',
 'mean(qact$_lab4)']

In [17]:
#@title **Linear Regression**

from sklearn.linear_model import LinearRegression
 
# Selector estimator options: GradientBoostingRegressor / RandomForestRegressor / SVR(kernel='linear')
sel_estimator = GradientBoostingRegressor(random_state=1)

# Selector options: RFE / SelectFromModel
selector = RFE(sel_estimator)
estimator = LinearRegression()

pipe = Pipeline([
    ('sel', selector),
    ('est', estimator)
])

# A previous grid targeted selector options named sel__max_features and
# sel__prefit; the RFE step used here is configured through
# n_features_to_select and step instead.
params = {
    # Use the shared constant instead of repeating the literal 5
    'sel__n_features_to_select' : [N_FEATURES],
    'sel__step'                 : [1,2],
    'est__n_jobs'               : [-1],
}

# The splitter does not shuffle; the cells that call run_process shuffle the
# rows themselves before fitting.
grid_lr = GridSearchCV(estimator=pipe,
                       param_grid=params,
                       scoring='r2',
                       verbose=1,
                       n_jobs=-1,
                       return_train_score=True,
                       cv=KFold(n_splits=10, shuffle=False))

In [19]:
# Lab 1: reorder the rows reproducibly (the KFold splitter is built with
# shuffle=False), run the linear-regression search and display the returned features.
datalab1_shuffle = datalab1.sample(frac=1, random_state=1)
datalab1_shuffle = datalab1_shuffle.reset_index(drop=True)
selected_features_lr_1 = run_process(dataset=datalab1_shuffle, grid_cv=grid_lr)
selected_features_lr_1

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.8s finished


R2: -0.028609160426679003


['act_lab1', 'rtr_lab1', 'actq1_lab1', 'actq3_lab1', 'mean(qc$_lab1)']

In [20]:
# Lab 2: seeded row shuffle, then the linear-regression search; the last line
# shows the returned feature list.
datalab2_shuffle = datalab2.sample(frac=1, random_state=1)
datalab2_shuffle = datalab2_shuffle.reset_index(drop=True)
selected_features_lr_2 = run_process(dataset=datalab2_shuffle, grid_cv=grid_lr)
selected_features_lr_2

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   11.6s finished


R2: 0.04318641891908598


['rtr_lab2', 'actq1_lab2', 'actq2_lab2', 'mean(qc$_lab2)', 'avgtime_lab2']

In [22]:
# Lab 3: seeded row shuffle, then the linear-regression search; the last line
# shows the returned feature list.
datalab3_shuffle = datalab3.sample(frac=1, random_state=1)
datalab3_shuffle = datalab3_shuffle.reset_index(drop=True)
selected_features_lr_3 = run_process(dataset=datalab3_shuffle, grid_cv=grid_lr)
selected_features_lr_3

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.1s finished


R2: -0.00904676912959267


['cer_lab3', 'actq1_lab3', 'actq2_lab3', 'max(qat$_lab3)', 'mean(qmsr$_lab3)']

In [23]:
# Lab 4: seeded row shuffle, then the linear-regression search; the last line
# shows the returned feature list.
datalab4_shuffle = datalab4.sample(frac=1, random_state=1)
datalab4_shuffle = datalab4_shuffle.reset_index(drop=True)
selected_features_lr_4 = run_process(dataset=datalab4_shuffle, grid_cv=grid_lr)
selected_features_lr_4

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.7s finished


R2: 0.1168873333934279


['ut_lab4', 'cer_lab4', 'actq2_lab4', 'mean(qg$_lab4)', 'mean(qact$_lab4)']

In [24]:
# Plain linear regression without a selector step. Note that this rebinds
# `grid_lr`, replacing the pipeline-based search defined earlier.
estimator = LinearRegression()

params = dict(
    n_jobs=[-1],
)

grid_lr = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [25]:
# Combination of labs 1 and 2: target plus the features selected for the
# linear-regression pipeline in each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_lr_1]
dataset = dataset.join(datalab2[selected_features_lr_2])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
R2: 0.029536203962807196


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


['act_lab1',
 'rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qc$_lab1)',
 'rtr_lab2',
 'actq1_lab2',
 'actq2_lab2',
 'mean(qc$_lab2)',
 'avgtime_lab2']

In [26]:
# Combination of labs 1, 2 and 3: target plus the features selected for the
# linear-regression pipeline in each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_lr_1]
dataset = dataset.join(datalab2[selected_features_lr_2])
dataset = dataset.join(datalab3[selected_features_lr_3])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
R2: 0.03890986982868912


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


['act_lab1',
 'rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qc$_lab1)',
 'rtr_lab2',
 'actq1_lab2',
 'actq2_lab2',
 'mean(qc$_lab2)',
 'avgtime_lab2',
 'cer_lab3',
 'actq1_lab3',
 'actq2_lab3',
 'max(qat$_lab3)',
 'mean(qmsr$_lab3)']

In [27]:
# Combination of labs 1, 2, 3 and 4: target plus the features selected for
# the linear-regression pipeline in each lab, joined on the row index.
dataset = datalab1[[TARGET] + selected_features_lr_1]
dataset = dataset.join(datalab2[selected_features_lr_2])
dataset = dataset.join(datalab3[selected_features_lr_3])
dataset = dataset.join(datalab4[selected_features_lr_4])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset=dataset_shuffle, grid_cv=grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
R2: 0.14070597606982785


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


['act_lab1',
 'rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qc$_lab1)',
 'rtr_lab2',
 'actq1_lab2',
 'actq2_lab2',
 'mean(qc$_lab2)',
 'avgtime_lab2',
 'cer_lab3',
 'actq1_lab3',
 'actq2_lab3',
 'max(qat$_lab3)',
 'mean(qmsr$_lab3)',
 'ut_lab4',
 'cer_lab4',
 'actq2_lab4',
 'mean(qg$_lab4)',
 'mean(qact$_lab4)']