In [3]:
#@title **Main**
from functions import *
from urllib.request import urlopen
import json
import numpy as np
import pandas as pd
import os
import copy
import math
import statistics
import sklearn.metrics as metrics
 
# Avoid truncating the data shown in the notebook: setting both display limits
# to None makes pandas render every row and every column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)
 
# Module-level constant that holds the JSON dictionary with all the data
DATA = None

# Load the JSON data: use the local file when it exists, otherwise download it.
# Context managers make sure the file handle / HTTP response is closed once the
# payload has been parsed (the previous version left both open).
if os.path.exists('./out/dataout.json'):
    with open('./out/dataout.json', 'r') as data_file:
        DATA = json.load(data_file)
else:
    with urlopen('http://nutriexcel.cl/UMDU/dataout_v2.json') as data_url:
        DATA = json.loads(data_url.read())
 
# Base labels of the columns, keyed by parameter name.
# In the label templates '#' stands for the lab number (e.g. 'g_lab4');
# '$' is presumably the question number — NOTE(review): confirm against the
# helpers in `functions`.

# Student parameters (targets): each label is identical to its key.
# 'p1p2' and 'p2p2' are the two columns that get averaged into the target.
_TARGET_LABELS = {name: [name] for name in ('p1', 'p2', 'np', 'p1p2', 'p2p2')}

# Lab parameters (features). The trailing notes record how each group is
# transformed / aggregated during data preparation.
_FEATURE_LABELS = {
    'grade': ['g_lab#'],
    'attempts': ['a_lab#'],
    'usedtime': ['ut_lab#'],
    'activetime': ['act_lab#'],
    'disconnections': ['dis_lab#'],                   # log
    'compilationtime': ['ct_lab#'],
    'runtimedebuggingtime': ['rt_lab#'],
    'compilationtimeratio': ['ctr_lab#'],
    'runtimedebuggingtimeratio': ['rtr_lab#'],
    'errorsreductionratio': ['err_lab#'],
    'compilationerrorsratio': ['cer_lab#'],
    'activequartiles': ['actq1_lab#', 'actq2_lab#', 'actq3_lab#'],
    'questionsdifficulty': ['qd$_lab#'],
    'questionsgrades': ['qg$_lab#'],                  # mean
    'questionsattempts': ['qat$_lab#'],               # sum and max; log of the sum
    'questionsactivetime': ['qact$_lab#'],            # mean
    'questionsavgtime': ['qavt$_lab#'],               # mean
    'questionsmaxerrors': ['qme$_lab#'],              # max
    'questionsmaxconsecutiveerrors': ['qmce$_lab#'],  # max
    'questionsmaxsimilarityratio': ['qmsr$_lab#'],    # mean
    'questionscorrectness': ['qc$_lab#'],             # mean
}

# Targets first, then features — same key order as a single literal would give.
LABELS_BASE = {**_TARGET_LABELS, **_FEATURE_LABELS}
 
 
# Number of questions per lab, keyed by the lab number as a string ('1'..'5')
LABS_LENGTHS = {
    str(lab_number): question_count
    for lab_number, question_count in enumerate((7, 6, 6, 5, 3), start=1)
}

In [13]:
#@title **Data preparation**

def _prepare_lab_dataframes(lab):
    """Build the raw and the aggregated DataFrames for a single lab.

    Replaces four copy-pasted per-lab sections; the sequence of helper calls
    (``get_custom_dataframe``, ``remove_col``, ``apply``) is unchanged.

    Parameters
    ----------
    lab : int
        Lab number. It is passed to ``get_custom_dataframe`` and embedded in the
        ``..._lab<N>`` column names.

    Returns
    -------
    tuple
        ``(lab_all, lab_features)``. ``lab_all`` only has the p1p2/p2p2 targets
        averaged. ``lab_features`` additionally has the question-difficulty
        columns removed, the per-question columns aggregated, the
        ``avgtime_lab<N>`` ratio added and all values rounded to 4 decimals.
    """
    target_cols = ('p1p2', 'p2p2')
    sum_attempts_col = 'sum(qat$_lab{})'.format(lab)

    # Get dataframe (a fresh list is passed on every call, as in the original cells)
    lab_all = get_custom_dataframe(DATA, [lab], list(target_cols), 'all', labels=True, index=None)
    lab_features = copy.deepcopy(lab_all)

    # Remove questionsdifficulty
    remove_col(lab_features, 'qd?')

    # Group columns: average the two targets in both frames
    lab_all = apply(lab_all, list(target_cols), statistics.mean)
    lab_features = apply(lab_features, list(target_cols), statistics.mean)

    # Log-normalise the disconnections
    lab_features = apply(lab_features, 'dis_lab{}'.format(lab), norm_log)

    # Per-question columns -> one aggregated column per group
    lab_features = apply(lab_features, 'qg?', statistics.mean)
    # replace=False keeps the source columns, so the attempts can be summed,
    # the sum log-normalised, and the attempts still reduced with max afterwards
    lab_features = apply(lab_features, 'qat?', sum, replace=False)
    lab_features = apply(lab_features, sum_attempts_col, norm_log, replace=False)
    lab_features = apply(lab_features, 'qat?', max)
    lab_features = apply(lab_features, 'qact?', statistics.mean)
    lab_features = apply(lab_features, 'qavt?', statistics.mean)
    lab_features = apply(lab_features, 'qme?', max)
    lab_features = apply(lab_features, 'qmce?', max)
    lab_features = apply(lab_features, 'qmsr?', statistics.mean)
    lab_features = apply(lab_features, 'qc?', statistics.mean)

    # Active time per attempt. Every value that is not strictly positive
    # (zero, negative, NaN from a division by zero) is replaced by 0. The
    # vectorized form does not depend on the index labels being 0..n-1.
    avg_time = lab_features['act_lab{}'.format(lab)] / lab_features[sum_attempts_col]
    lab_features['avgtime_lab{}'.format(lab)] = avg_time.where(avg_time > 0, 0)

    return lab_all, lab_features.round(4)


datalab1_all, datalab1 = _prepare_lab_dataframes(1)
datalab2_all, datalab2 = _prepare_lab_dataframes(2)
datalab3_all, datalab3 = _prepare_lab_dataframes(3)
datalab4_all, datalab4 = _prepare_lab_dataframes(4)
datalab4

Unnamed: 0,mean(p$p2),g_lab4,a_lab4,ut_lab4,act_lab4,norm_log(dis_lab4),ct_lab4,rt_lab4,ctr_lab4,rtr_lab4,err_lab4,cer_lab4,actq1_lab4,actq2_lab4,actq3_lab4,mean(qg$_lab4),max(qat$_lab4),mean(qact$_lab4),mean(qavt$_lab4),max(qme$_lab4),max(qmce$_lab4),mean(qmsr$_lab4),mean(qc$_lab4),sum(qat$_lab4),norm_log(sum(qat$_lab4)),avgtime_lab4
0,2.0,5.5,1.0,480581.0,9555.0,1.9459,263.0,7172.0,0.0275,0.7506,0.5938,0.3095,0.1103,0.4155,0.4188,1.0,45.0,1632.6,62.1927,9.0,7.0,0.6286,0.288,84.0,4.4427,113.75
1,2.25,5.0,1.0,129692.0,5827.0,2.3026,0.0,4361.0,0.0,0.7484,0.5217,0.2133,0.2693,0.2929,0.3051,0.8,33.0,1031.2,51.3671,4.0,4.0,0.4358,0.3171,75.0,4.3307,77.6933
2,1.25,6.0,1.0,531739.0,5088.0,2.8904,280.0,2461.0,0.055,0.4837,0.6667,0.4107,0.4177,0.9825,0.9838,1.0,22.0,661.8,82.6714,5.0,6.0,0.7251,0.2951,56.0,4.0431,90.8571
3,3.0,6.0,1.0,651203.0,1928.0,2.0794,87.0,1109.0,0.0451,0.5752,0.6,0.5294,0.7351,0.7358,0.7394,1.0,23.0,362.8,44.2824,8.0,6.0,0.604,0.3103,34.0,3.5553,56.7059
4,2.0,3.0,1.0,9382.0,2560.0,1.6094,74.0,1930.0,0.0289,0.7539,0.8,0.1905,0.3499,0.3508,0.3519,0.6,15.0,432.0,79.3909,1.0,1.0,0.4246,0.2509,21.0,3.091,121.9048
5,3.0,5.5,1.0,519819.0,4304.0,1.6094,88.0,2753.0,0.0204,0.6396,0.6364,0.303,0.2889,0.2909,0.2929,1.0,17.0,615.0,62.3983,6.0,3.0,0.9523,0.3211,33.0,3.5264,130.4242
6,3.5,6.0,1.0,720357.0,3071.0,2.3979,73.0,2850.0,0.0238,0.928,0.6923,0.2703,0.9855,0.9862,0.9891,1.0,21.0,624.2,123.325,2.0,2.0,0.848,0.3277,37.0,3.6376,83.0
7,1.0,5.0,1.0,261064.0,7204.0,2.0794,715.0,3978.0,0.0993,0.5522,0.4754,0.5714,0.1795,0.1839,0.3481,1.0,31.0,1453.2,83.0922,9.0,8.0,0.5352,0.2657,91.0,4.5218,79.1648
8,3.5,6.0,1.0,520822.0,4520.0,2.0794,263.0,3313.0,0.0582,0.733,0.6667,0.2381,0.2597,0.261,0.3451,1.0,33.0,723.0,57.8357,22.0,3.0,0.578,0.2821,42.0,3.7612,107.619
9,2.5,6.0,1.0,269117.0,3306.0,2.3979,554.0,1759.0,0.1676,0.5321,0.5385,0.3721,0.293,0.3763,0.6023,1.0,22.0,474.4,85.8549,9.0,8.0,0.5915,0.3066,43.0,3.7842,76.8837


In [5]:
#@title **Parameters**

# Number of features — NOTE(review): presumably the per-lab feature-selection
# budget; confirm where it is consumed.
N_FEATURES = 5
# Normalisation mode — NOTE(review): meaning of 'col' is not visible here
NORM_TYPE = 'col'
# Objective vector: name of the target column in the prepared lab DataFrames
TARGET = 'mean(p$p2)'
 
 
# Import needed libraries ----------------------------------------
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
 
# Random state (seed) intended for train_test_split; None leaves the split unseeded
random_state = None

In [14]:
#@title **Scale features**
 
# Scale features -------------------------------------------------
from sklearn.preprocessing import StandardScaler

# One independent StandardScaler per lab; scaler_all is created as well but is
# not fitted in this cell.
scaler1, scaler2, scaler3, scaler4, scaler_all = (StandardScaler() for _ in range(5))


def _standardize_keeping_target(frame, scaler):
    """Return ``frame`` with every column except the first one standardised.

    The scaler is fitted on all columns of ``frame`` (target included). The
    scaled copy of the first column is then dropped and the original, unscaled
    TARGET column is placed in front, so TARGET is expected to be the first
    column. Rows are matched through an index join.
    """
    scaled = pd.DataFrame(scaler.fit_transform(frame), columns=frame.columns)
    feature_columns = frame.columns[1:]
    return frame[[TARGET]].join(scaled[feature_columns])


datalab1 = _standardize_keeping_target(datalab1, scaler1)
datalab2 = _standardize_keeping_target(datalab2, scaler2)
datalab3 = _standardize_keeping_target(datalab3, scaler3)
datalab4 = _standardize_keeping_target(datalab4, scaler4)

In [7]:
#@title **Grid/Random-SearchCV process**   
 
def run_process(dataset, grid_cv, target=TARGET):
    """Fit a search-CV object on ``dataset`` and return the selected features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the ``target`` column plus the feature columns.
    grid_cv : GridSearchCV-like
        Search object exposing ``fit``, ``cv_results_`` and ``best_estimator_``;
        it is fitted in place on the whole ``dataset``.
    target : str
        Name of the column to predict.

    Returns
    -------
    list of str
        Feature names kept by the first step of the best pipeline, or every
        feature column when the best estimator exposes no feature selector.
    """
    X, y = dataset.drop(target, axis=1), np.array(dataset[target])

    grid_cv.fit(X, y)
    # Best mean cross-validated test score among all candidates
    print('R2:', max(grid_cv.cv_results_['mean_test_score']))

    try:
        first_step = grid_cv.best_estimator_.steps[0][-1]
        return list(X.columns[first_step.get_support()])
    except (AttributeError, IndexError, TypeError):
        # The best estimator is not a pipeline, or its first step is not a
        # selector: fall back to all feature columns. X already excludes the
        # target, so this does not rely on the target being the first column.
        return list(X.columns)
    
def run_process_obsolete(dataset, grid_cv, target=TARGET):
    """Earlier variant of ``run_process`` that also prints a hold-out evaluation.

    After fitting ``grid_cv`` on the whole ``dataset`` it refits the best
    estimator on a 67/33 train/test split and prints the test R2, the test MSE,
    the best parameters and the selected features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the ``target`` column plus the feature columns.
    grid_cv : GridSearchCV-like
        Search object; it is fitted in place.
    target : str
        Name of the column to predict.

    Returns
    -------
    list of str
        Features kept by the first pipeline step of the best estimator, or all
        feature columns if the hold-out evaluation or the selector lookup fails.
    """
    X, y = dataset.drop(target, axis=1), np.array(dataset[target])

    grid_cv.fit(X, y)

    try:
        print('R2-test-fit:', max(grid_cv.cv_results_['mean_test_score']))
    except (AttributeError, KeyError, TypeError, ValueError):
        # Reporting the CV score is best-effort; keep going without it
        pass

    try:
        # Honour the notebook-wide seed (None keeps the split unseeded)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=random_state)
        best_estimator = grid_cv.best_estimator_
        best_estimator.fit(X_train, y_train)
        print('R2-test', best_estimator.score(X_test, y_test))
        print('MSE-test', metrics.mean_squared_error(y_test, best_estimator.predict(X_test)))

        print('Best params:', grid_cv.best_params_)

        selected_features = X.columns[best_estimator.steps[0][-1].get_support()]
        print('Selected features:', list(selected_features))

        return list(selected_features)
    except Exception:
        # Deliberate best-effort fallback (KeyboardInterrupt/SystemExit are no
        # longer swallowed). X already excludes the target, so this does not
        # rely on the target being the first column.
        return list(X.columns)

In [6]:
#@title **SVR - Recursive Features Elimination**

from sklearn.svm import SVR

# Feature selection: recursive feature elimination driven by a linear-kernel SVR
sel_estimator = SVR(kernel='linear')
selector = RFE(sel_estimator)
# Final regressor; its hyper-parameters are searched by the grid below
estimator = SVR()

pipe = Pipeline([
    ('sel', selector),
    ('est', estimator)
])

params = {
    # Use the shared config constant instead of a hard-coded 5
    'sel__n_features_to_select' : [N_FEATURES],
    'sel__step'                 : [1,2],
    'est__C'                    : [0.01,0.1,1],
    'est__gamma'                : ['scale','auto'],
    'est__kernel'               : ['linear','poly','rbf']
}

# 10 contiguous folds (shuffle=False): rows must be shuffled before fitting
grid_svr = GridSearchCV(estimator=pipe,
                        param_grid=params,
                        scoring='r2',
                        verbose=1,
                        n_jobs=-1,
                        return_train_score=True,
                        cv=KFold(n_splits=10, shuffle=False))

In [7]:
# Lab 1: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_svr and display the features it selected.
datalab1_shuffle = datalab1.sample(frac=1, random_state=1)
datalab1_shuffle = datalab1_shuffle.reset_index(drop=True)
selected_features_svr_1 = run_process(datalab1_shuffle, grid_svr)
selected_features_svr_1

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.0s


R2: -0.057243701195589546


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    9.8s finished


['ct_lab1', 'actq1_lab1', 'actq3_lab1', 'mean(qg$_lab1)', 'mean(qmsr$_lab1)']

In [13]:
# Lab 2: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_svr and display the features it selected.
datalab2_shuffle = datalab2.sample(frac=1, random_state=1)
datalab2_shuffle = datalab2_shuffle.reset_index(drop=True)
selected_features_svr_2 = run_process(datalab2_shuffle, grid_svr)
selected_features_svr_2

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.5s


R2: -0.03501747436365442


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    8.8s finished


['g_lab2', 'act_lab2', 'max(qat$_lab2)', 'mean(qact$_lab2)', 'sum(qat$_lab2)']

In [16]:
# Lab 3: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_svr and display the features it selected.
datalab3_shuffle = datalab3.sample(frac=1, random_state=1)
datalab3_shuffle = datalab3_shuffle.reset_index(drop=True)
selected_features_svr_3 = run_process(datalab3_shuffle, grid_svr)
selected_features_svr_3

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.5s


R2: 0.10863697870548439


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    9.0s finished


['g_lab3',
 'rt_lab3',
 'mean(qact$_lab3)',
 'mean(qmsr$_lab3)',
 'norm_log(sum(qat$_lab3))']

In [18]:
# Lab 4: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_svr and display the features it selected.
datalab4_shuffle = datalab4.sample(frac=1, random_state=1)
datalab4_shuffle = datalab4_shuffle.reset_index(drop=True)
selected_features_svr_4 = run_process(datalab4_shuffle, grid_svr)
selected_features_svr_4

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.5s


R2: 0.19666919930390753


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    8.6s finished


['act_lab4',
 'cer_lab4',
 'mean(qg$_lab4)',
 'max(qat$_lab4)',
 'norm_log(sum(qat$_lab4))']

In [19]:
# Plain SVR (no feature-selection step): grid_svr is rebound to a search over
# the regressor's own hyper-parameters only.
estimator = SVR()

params = dict(
    C=[0.01, 0.1, 1],
    gamma=['scale', 'auto'],
    kernel=['linear', 'poly', 'rbf'],
)

grid_svr = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=KFold(n_splits=10, shuffle=False),  # contiguous folds, no shuffling
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [22]:
# Labs 1-2: target plus the SVR-selected features of each lab, joined on the
# row index, shuffled with a fixed seed and evaluated with grid_svr.
dataset = datalab1[[TARGET, *selected_features_svr_1]]
dataset = dataset.join(datalab2[selected_features_svr_2])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


R2: 0.002951403041078926


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.3s finished


['ct_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qg$_lab1)',
 'mean(qmsr$_lab1)',
 'g_lab2',
 'act_lab2',
 'max(qat$_lab2)',
 'mean(qact$_lab2)',
 'sum(qat$_lab2)']

In [25]:
# Labs 1-3: target plus the SVR-selected features of each lab, joined on the
# row index, shuffled with a fixed seed and evaluated with grid_svr.
dataset = datalab1[[TARGET, *selected_features_svr_1]]
dataset = dataset.join(datalab2[selected_features_svr_2])
dataset = dataset.join(datalab3[selected_features_svr_3])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    0.1s


R2: 0.07500996812845116


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.3s finished


['ct_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qg$_lab1)',
 'mean(qmsr$_lab1)',
 'g_lab2',
 'act_lab2',
 'max(qat$_lab2)',
 'mean(qact$_lab2)',
 'sum(qat$_lab2)',
 'g_lab3',
 'rt_lab3',
 'mean(qact$_lab3)',
 'mean(qmsr$_lab3)',
 'norm_log(sum(qat$_lab3))']

In [27]:
# Labs 1-4: target plus the SVR-selected features of each lab, joined on the
# row index, shuffled with a fixed seed and evaluated with grid_svr.
dataset = datalab1[[TARGET, *selected_features_svr_1]]
dataset = dataset.join(datalab2[selected_features_svr_2])
dataset = dataset.join(datalab3[selected_features_svr_3])
dataset = dataset.join(datalab4[selected_features_svr_4])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


R2: 0.23667974020054966


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.3s finished


['ct_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qg$_lab1)',
 'mean(qmsr$_lab1)',
 'g_lab2',
 'act_lab2',
 'max(qat$_lab2)',
 'mean(qact$_lab2)',
 'sum(qat$_lab2)',
 'g_lab3',
 'rt_lab3',
 'mean(qact$_lab3)',
 'mean(qmsr$_lab3)',
 'norm_log(sum(qat$_lab3))',
 'act_lab4',
 'cer_lab4',
 'mean(qg$_lab4)',
 'max(qat$_lab4)',
 'norm_log(sum(qat$_lab4))']

In [8]:
#@title **Random Forest Regressor** 

from sklearn.ensemble import  RandomForestRegressor
 
# Selector estimator options: GradientBoostingRegressor / RandomForestRegressor /
# SVR(kernel='linear'). Seeds are fixed to 1 for repeatable runs.
sel_estimator = GradientBoostingRegressor(random_state=1)
# Selector options: RFE / SelectFromModel
selector = RFE(sel_estimator)
estimator = RandomForestRegressor(random_state=1, n_jobs=-1)

pipe = Pipeline([
    ('sel', selector),
    ('est', estimator)
])

# NOTE(review): recent scikit-learn releases renamed the 'mse'/'mae' criteria to
# 'squared_error'/'absolute_error' and dropped max_features='auto'; adjust the
# grid when upgrading.
params = {
    'sel__estimator__learning_rate': [0.05,0.1,0.2],
    # Use the shared config constant instead of a hard-coded 5
    'sel__n_features_to_select'    : [N_FEATURES],
    'est__n_estimators'            : [50,100,200,400],
    'est__criterion'               : ['mse','mae'],
    'est__max_features'            : ['auto','sqrt','log2']
}

# 10 contiguous folds (shuffle=False): rows must be shuffled before fitting
grid_rfr = GridSearchCV(estimator=pipe,
                        param_grid=params,
                        scoring='r2',
                        verbose=1,
                        n_jobs=-1,
                        return_train_score=True,
                        cv=KFold(n_splits=10, shuffle=False))

In [9]:
# Lab 1: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_rfr and display the features it selected.
datalab1_shuffle = datalab1.sample(frac=1, random_state=1)
datalab1_shuffle = datalab1_shuffle.reset_index(drop=True)
selected_features_rfr_1 = run_process(datalab1_shuffle, grid_rfr)
selected_features_rfr_1

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.5min finished


R2: 0.07774716837836562


['cer_lab1', 'actq1_lab1', 'actq3_lab1', 'mean(qmsr$_lab1)', 'mean(qc$_lab1)']

In [10]:
# Lab 2: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_rfr and display the features it selected.
datalab2_shuffle = datalab2.sample(frac=1, random_state=1)
datalab2_shuffle = datalab2_shuffle.reset_index(drop=True)
selected_features_rfr_2 = run_process(datalab2_shuffle, grid_rfr)
selected_features_rfr_2

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  7.3min finished


R2: -0.0858482273258781


['act_lab2', 'actq1_lab2', 'actq3_lab2', 'mean(qavt$_lab2)', 'avgtime_lab2']

In [11]:
# Lab 3: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_rfr and display the features it selected.
datalab3_shuffle = datalab3.sample(frac=1, random_state=1)
datalab3_shuffle = datalab3_shuffle.reset_index(drop=True)
selected_features_rfr_3 = run_process(datalab3_shuffle, grid_rfr)
selected_features_rfr_3

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  9.1min finished


R2: 0.18346837688730658


['g_lab3', 'ut_lab3', 'rt_lab3', 'actq2_lab3', 'mean(qmsr$_lab3)']

In [16]:
# Lab 4: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_rfr and display the features it selected.
datalab4_shuffle = datalab4.sample(frac=1, random_state=1)
datalab4_shuffle = datalab4_shuffle.reset_index(drop=True)
selected_features_rfr_4 = run_process(datalab4_shuffle, grid_rfr)
selected_features_rfr_4

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 11.5min finished


R2: 0.19748201032301424


['rt_lab4', 'rtr_lab4', 'cer_lab4', 'actq1_lab4', 'mean(qc$_lab4)']

In [17]:
# Plain random forest (no feature-selection step): grid_rfr is rebound to a
# search over the regressor's own hyper-parameters only.
estimator = RandomForestRegressor(random_state=1, n_jobs=-1)

# NOTE(review): recent scikit-learn releases renamed the 'mse'/'mae' criteria to
# 'squared_error'/'absolute_error' and dropped max_features='auto'.
params = dict(
    n_estimators=[50, 100, 200, 400],
    criterion=['mse', 'mae'],
    max_features=['auto', 'sqrt', 'log2'],
)

grid_rfr = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=KFold(n_splits=10, shuffle=False),  # contiguous folds, no shuffling
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [18]:
# Combination of labs 1 and 2: target plus the features selected for each lab,
# joined on the row index, shuffled with a fixed seed and evaluated with grid_rfr.
dataset = datalab1[[TARGET, *selected_features_rfr_1]]
dataset = dataset.join(datalab2[selected_features_rfr_2])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   58.9s finished


R2: 0.05908803995428012


['cer_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'mean(qc$_lab1)',
 'act_lab2',
 'actq1_lab2',
 'actq3_lab2',
 'mean(qavt$_lab2)',
 'avgtime_lab2']

In [19]:
# Combination of labs 1, 2 and 3: target plus the features selected for each lab,
# joined on the row index, shuffled with a fixed seed and evaluated with grid_rfr.
dataset = datalab1[[TARGET, *selected_features_rfr_1]]
dataset = dataset.join(datalab2[selected_features_rfr_2])
dataset = dataset.join(datalab3[selected_features_rfr_3])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   57.6s finished


R2: 0.1988647448387974


['cer_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'mean(qc$_lab1)',
 'act_lab2',
 'actq1_lab2',
 'actq3_lab2',
 'mean(qavt$_lab2)',
 'avgtime_lab2',
 'g_lab3',
 'ut_lab3',
 'rt_lab3',
 'actq2_lab3',
 'mean(qmsr$_lab3)']

In [22]:
# Combination of labs 1, 2, 3 and 4: target plus the features selected for each lab,
# joined on the row index, shuffled with a fixed seed and evaluated with grid_rfr.
dataset = datalab1[[TARGET, *selected_features_rfr_1]]
dataset = dataset.join(datalab2[selected_features_rfr_2])
dataset = dataset.join(datalab3[selected_features_rfr_3])
dataset = dataset.join(datalab4[selected_features_rfr_4])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.0min finished


R2: 0.24071212480947582


['cer_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'mean(qc$_lab1)',
 'act_lab2',
 'actq1_lab2',
 'actq3_lab2',
 'mean(qavt$_lab2)',
 'avgtime_lab2',
 'g_lab3',
 'ut_lab3',
 'rt_lab3',
 'actq2_lab3',
 'mean(qmsr$_lab3)',
 'rt_lab4',
 'rtr_lab4',
 'cer_lab4',
 'actq1_lab4',
 'mean(qc$_lab4)']

In [29]:
#@title **Linear Regression**

from sklearn.linear_model import LinearRegression
 
# Selector estimator options: GradientBoostingRegressor / RandomForestRegressor /
# SVR(kernel='linear'). The seed is fixed to 1 for repeatable runs.
sel_estimator = GradientBoostingRegressor(random_state=1)

# Selector options: RFE / SelectFromModel
selector = RFE(sel_estimator)
estimator = LinearRegression()

pipe = Pipeline([
    ('sel', selector),
    ('est', estimator)
])

params = {
    # Use the shared config constant instead of a hard-coded 5
    'sel__n_features_to_select' : [N_FEATURES],
    'sel__step'                 : [1,2],
    'est__n_jobs'               : [-1],
}

# 10 contiguous folds (shuffle=False): rows must be shuffled before fitting
grid_lr = GridSearchCV(estimator=pipe,
                       param_grid=params,
                       scoring='r2',
                       verbose=1,
                       n_jobs=-1,
                       return_train_score=True,
                       cv=KFold(n_splits=10, shuffle=False))

In [32]:
# Lab 1: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_lr and display the features it selected.
datalab1_shuffle = datalab1.sample(frac=1, random_state=1)
datalab1_shuffle = datalab1_shuffle.reset_index(drop=True)
selected_features_lr_1 = run_process(datalab1_shuffle, grid_lr)
selected_features_lr_1

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.7s finished


R2: -0.06225467326652258


['rtr_lab1', 'actq1_lab1', 'actq3_lab1', 'mean(qmsr$_lab1)', 'mean(qc$_lab1)']

In [34]:
# Lab 2: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_lr and display the features it selected.
datalab2_shuffle = datalab2.sample(frac=1, random_state=1)
datalab2_shuffle = datalab2_shuffle.reset_index(drop=True)
selected_features_lr_2 = run_process(datalab2_shuffle, grid_lr)
selected_features_lr_2

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.9s finished


R2: -0.05669070586466388


['act_lab2', 'actq1_lab2', 'actq3_lab2', 'mean(qmsr$_lab2)', 'avgtime_lab2']

In [36]:
# Lab 3: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_lr and display the features it selected.
datalab3_shuffle = datalab3.sample(frac=1, random_state=1)
datalab3_shuffle = datalab3_shuffle.reset_index(drop=True)
selected_features_lr_3 = run_process(datalab3_shuffle, grid_lr)
selected_features_lr_3

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.5s finished


R2: 0.004734992195396836


['ut_lab3', 'act_lab3', 'rt_lab3', 'cer_lab3', 'actq2_lab3']

In [38]:
# Lab 4: shuffle the rows with a fixed seed, renumber them, then run the
# search bound to grid_lr and display the features it selected.
datalab4_shuffle = datalab4.sample(frac=1, random_state=1)
datalab4_shuffle = datalab4_shuffle.reset_index(drop=True)
selected_features_lr_4 = run_process(datalab4_shuffle, grid_lr)
selected_features_lr_4

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.4s finished


R2: 0.1224315930882384


['rt_lab4', 'rtr_lab4', 'cer_lab4', 'actq1_lab4', 'mean(qc$_lab4)']

In [39]:
# Plain linear regression (no feature-selection step): grid_lr is rebound to a
# single-candidate search that only cross-validates the model.
estimator = LinearRegression()

params = dict(n_jobs=[-1])

grid_lr = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=KFold(n_splits=10, shuffle=False),  # contiguous folds, no shuffling
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [42]:
# Combination of labs 1 and 2: target plus the features selected for each lab,
# joined on the row index, shuffled with a fixed seed and evaluated with grid_lr.
dataset = datalab1[[TARGET, *selected_features_lr_1]]
dataset = dataset.join(datalab2[selected_features_lr_2])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
R2: -0.015205415433621916


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


['rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'mean(qc$_lab1)',
 'act_lab2',
 'actq1_lab2',
 'actq3_lab2',
 'mean(qmsr$_lab2)',
 'avgtime_lab2']

In [43]:
# Combination of labs 1, 2 and 3: target plus the features selected for each lab,
# joined on the row index, shuffled with a fixed seed and evaluated with grid_lr.
dataset = datalab1[[TARGET, *selected_features_lr_1]]
dataset = dataset.join(datalab2[selected_features_lr_2])
dataset = dataset.join(datalab3[selected_features_lr_3])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
R2: -0.00807851993786859


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


['rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'mean(qc$_lab1)',
 'act_lab2',
 'actq1_lab2',
 'actq3_lab2',
 'mean(qmsr$_lab2)',
 'avgtime_lab2',
 'ut_lab3',
 'act_lab3',
 'rt_lab3',
 'cer_lab3',
 'actq2_lab3']

In [46]:
# Combination of labs 1, 2, 3 and 4: target plus the features selected for each lab,
# joined on the row index, shuffled with a fixed seed and evaluated with grid_lr.
dataset = datalab1[[TARGET, *selected_features_lr_1]]
dataset = dataset.join(datalab2[selected_features_lr_2])
dataset = dataset.join(datalab3[selected_features_lr_3])
dataset = dataset.join(datalab4[selected_features_lr_4])
dataset_shuffle = dataset.sample(frac=1, random_state=1)
dataset_shuffle = dataset_shuffle.reset_index(drop=True)
run_process(dataset_shuffle, grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
R2: 0.034085800629088114


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


['rtr_lab1',
 'actq1_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'mean(qc$_lab1)',
 'act_lab2',
 'actq1_lab2',
 'actq3_lab2',
 'mean(qmsr$_lab2)',
 'avgtime_lab2',
 'ut_lab3',
 'act_lab3',
 'rt_lab3',
 'cer_lab3',
 'actq2_lab3',
 'rt_lab4',
 'rtr_lab4',
 'cer_lab4',
 'actq1_lab4',
 'mean(qc$_lab4)']