In [1]:
from b1 import *
from urllib.request import urlopen
import json
import numpy as np
import pandas as pd
import os
import copy
import math
import statistics
import sklearn.metrics as metrics
 
# Show every row and column when a DataFrame is displayed in the notebook
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Module-level holder for the JSON dictionary with the whole dataset
DATA = None

# Load the JSON data: use the local copy when it exists, otherwise download it.
# Both sources are opened with `with` so the file / HTTP response is always closed.
if os.path.exists('./out/dataout.json'):
    with open('./out/dataout.json', 'r') as data_file:
        DATA = json.load(data_file)
else:
    with urlopen('http://nutriexcel.cl/UMDU/dataout_v2.json') as data_url:
        DATA = json.loads(data_url.read())
 
# Base column labels, keyed by the parameter name used in the JSON data.
# NOTE(review): '#' and '$' look like placeholders for the lab number and the
# question number respectively — confirm against b1.
# Trailing comments name the aggregation later applied to that group of columns.
LABELS_BASE = {
    # Student parameters (targets)
    'p1': ['p1'],
    'p2': ['p2'],
    'np': ['np'],
    'p1p2': ['p1p2'],  # mean of p1p2 and p2p2
    'p2p2': ['p2p2'],

    # Lab parameters (features)
    'grade': ['g_lab#'],
    'attempts': ['a_lab#'],
    'usedtime': ['ut_lab#'],
    'activetime': ['act_lab#'],
    'disconnections': ['dis_lab#'],  # log
    'compilationtime': ['ct_lab#'],
    'runtimedebuggingtime': ['rt_lab#'],
    'compilationtimeratio': ['ctr_lab#'],
    'runtimedebuggingtimeratio': ['rtr_lab#'],
    'errorsreductionratio': ['err_lab#'],
    'compilationerrorsratio': ['cer_lab#'],
    'activequartiles': [
        'actq1_lab#',
        'actq2_lab#',
        'actq3_lab#',
    ],
    'questionsdifficulty': ['qd$_lab#'],
    'questionsgrades': ['qg$_lab#'],  # mean
    'questionsattempts': ['qat$_lab#'],  # sum - max; log
    'questionsactivetime': ['qact$_lab#'],  # mean
    'questionsavgtime': ['qavt$_lab#'],  # mean
    'questionsmaxerrors': ['qme$_lab#'],  # max
    'questionsmaxconsecutiveerrors': ['qmce$_lab#'],  # max
    'questionsmaxsimilarityratio': ['qmsr$_lab#'],  # mean
    'questionscorrectness': ['qc$_lab#'],  # mean
}


# Number of questions in each lab, keyed by the lab number as a string
LABS_LENGTHS = {'1': 7, '2': 6, '3': 6, '4': 5, '5': 3}

In [2]:
# Print how many students each course has, followed by the grand total.
# The loop variable is named course_id so the builtin `id` is not shadowed
# for the rest of the notebook.
total = 0
for course_id, course in DATA["courses"].items():
    students = len(course["students"])
    total += students
    print("curso ", course_id, ":", students)
print("total:", total)

curso  7 : 55
curso  13 : 22
curso  19 : 54
curso  24 : 28
curso  30 : 53
curso  36 : 41
total: 253


In [3]:
#@title **Parameters**

# Objective vector: name of the target column read from every per-course frame.
# Presumably this is the column produced by collapsing 'p1p2' and 'p2p2' with
# statistics.mean in the data-preparation cells — confirm against b1.apply.
TARGET = 'mean(p$p2)'
# NOTE(review): presumably selects column-wise normalization; not referenced in
# the visible cells — confirm where it is consumed.
NORM_TYPE = 'col'
# NOTE(review): presumably the number of features to keep during feature
# selection; not referenced in the visible cells — confirm.
N_FEATURES = 5
 
 
# Import needed libraries ----------------------------------------
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
 
# NOTE(review): with None the split is presumably unseeded, so results may differ
# between runs — consider a fixed integer seed if reproducibility is required.
random_state = None # Random state for train_test_split

In [4]:
# LAB 1: build the raw dataframe; cursoData holds the course of each student row
datalab1_all, cursoData = get_custom_dataframe(DATA, [1], ['p1p2', 'p2p2'], 'all', labels=True, index=None)
#@title **Data preparation**

# Work on a deep copy so the column removal and aggregations below do not
# touch datalab1_all
datalab1 = copy.deepcopy(datalab1_all)

# Remove questionsdifficulty
remove_col(datalab1, 'qd?')

# Collapse the two target columns into their mean (in both frames)
datalab1_all = apply(datalab1_all, ['p1p2', 'p2p2'], statistics.mean)
datalab1 = apply(datalab1, ['p1p2', 'p2p2'], statistics.mean)

# Lab-level and per-question aggregations.
# NOTE(review): `apply`, `remove_col` and `norm_log` come from b1; presumably '?'
# matches every question column of the group and replace=False keeps the source
# columns next to the new one — confirm against b1.
datalab1 = apply(datalab1, 'dis_lab1', norm_log)
datalab1 = apply(datalab1, 'qg?', statistics.mean)
datalab1 = apply(datalab1, 'qat?', sum, replace=False)
datalab1 = apply(datalab1, 'sum(qat$_lab1)', norm_log, replace=False)
datalab1 = apply(datalab1, 'qat?', max)
datalab1 = apply(datalab1, 'qact?', statistics.mean)
datalab1 = apply(datalab1, 'qavt?', statistics.mean)
datalab1 = apply(datalab1, 'qme?', max)
datalab1 = apply(datalab1, 'qmce?', max)
datalab1 = apply(datalab1, 'qmsr?', statistics.mean)
datalab1 = apply(datalab1, 'qc?', statistics.mean)

# Active time divided by the summed question attempts. Any ratio that is not
# strictly positive (zero, negative, or NaN such as 0/0) is replaced by 0.
# Series.where is vectorized and does not depend on the index being 0..n-1.
aux = datalab1['act_lab1'] / datalab1['sum(qat$_lab1)']
aux = aux.where(aux > 0, 0)
datalab1['avgtime_lab1'] = aux
datalab1 = datalab1.round(4)

7
13
19
24
30
36


In [5]:
# Wrap the per-row course identifiers of lab 1 in a one-column dataframe
cursoDF = pd.DataFrame(cursoData, columns=['curso'])

# Put the prepared lab 1 frame and the course column side by side
dfFinlab1 = pd.concat((datalab1, cursoDF), axis='columns')


In [6]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 1 that belong to course 7 ('curso' holds the course id as a string)
dfLab1Curso7 = dfFinlab1.loc[dfFinlab1['curso'] == '7']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab1Curso7[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab1Curso7.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab1Curso7[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab1Normc7 = pd.concat([promCurso, DFnormalizado], axis=1)
datalab1Normc7

# Sentencia anterior
#datalab1Normc7 = dfLab1Curso7[['mean(p$p2)']].join(pd.DataFrame(scaler1.fit_transform(dfLab1Curso7), columns=dfLab1Curso7.columns)[dfLab1Curso7.columns[1:26]])

Unnamed: 0,mean(p$p2),g_lab1,a_lab1,ut_lab1,act_lab1,norm_log(dis_lab1),ct_lab1,rt_lab1,ctr_lab1,rtr_lab1,err_lab1,cer_lab1,actq1_lab1,actq2_lab1,actq3_lab1,mean(qg$_lab1),max(qat$_lab1),mean(qact$_lab1),mean(qavt$_lab1),max(qme$_lab1),max(qmce$_lab1),mean(qmsr$_lab1),mean(qc$_lab1),sum(qat$_lab1),norm_log(sum(qat$_lab1)),avgtime_lab1
0,2.0,0.217428,0.136083,-0.455061,-0.761629,-0.123236,-0.049073,-0.890975,0.786036,-0.87495,-0.637584,0.383103,-0.514311,-0.665163,-0.709985,0.217431,-0.4896,-0.852848,-0.743363,-0.148155,0.017702,0.475535,0.930445,-0.554857,-0.231774,-0.390801
1,2.25,0.217428,0.136083,-0.81762,-0.125169,-0.664505,-0.603989,-0.385012,-0.690473,-0.336087,0.553319,-0.195346,-1.19697,-1.260748,-1.289504,0.217431,-0.350438,-0.503732,0.468175,-0.805294,-0.523207,-0.975358,0.456763,-0.370575,-0.038347,0.34893
2,1.25,0.217428,0.136083,1.01624,2.141457,1.727227,-0.647569,0.618115,-1.033409,-0.37436,1.189007,-0.304597,0.559647,0.83854,0.771844,0.217431,0.275796,0.43611,0.508278,1.95469,-0.523207,-0.20731,-0.793488,0.366554,0.533396,1.6687
3,3.0,0.217428,0.136083,-0.911666,-0.877879,-1.58967,-0.537167,0.54222,-0.117179,3.331465,-0.191185,-0.49671,-1.108546,-1.255053,-1.286561,0.217431,-0.837508,-0.257596,1.066704,-0.673866,-0.631388,0.653245,0.519023,-0.73914,-0.459359,-0.220591
4,2.0,0.217428,0.136083,-0.88597,-0.428801,0.260792,0.25308,-0.496104,0.812065,-0.275336,-0.042385,0.19574,1.020738,0.854618,0.777731,0.217431,-0.350438,-0.519575,-0.314631,-0.148155,0.017702,0.578996,0.505561,-0.481144,-0.15091,0.074525
5,3.0,0.217428,0.136083,-0.223061,0.834033,0.802062,1.102885,-0.5478,0.596023,-0.98977,-0.457405,0.725633,-0.13724,-0.287981,0.11318,0.217431,0.48454,0.449124,0.280275,-0.148155,0.017702,-0.127584,-0.066559,0.440267,0.57868,0.190098
6,3.5,0.217428,0.136083,0.525164,1.814468,1.834012,3.475076,0.554319,1.62548,-0.308141,-0.973142,2.074644,-0.264286,-0.396178,0.761379,0.217431,4.311521,2.609458,0.865472,0.246129,4.344975,0.193146,0.29943,3.241359,1.662132,-0.970884
7,1.0,0.217428,0.136083,0.259429,0.711943,1.007758,0.5102,-0.317917,0.142463,-0.698772,-0.682122,1.538945,-0.123689,-0.268217,-0.29039,0.217431,1.528262,1.082286,-0.103934,-0.016727,2.722248,0.455451,0.113491,1.435392,1.073311,-0.787474
8,3.5,0.217428,0.136083,-0.911666,-0.877879,-1.58967,-0.470344,-0.636893,0.038346,-0.051771,1.214819,-0.448682,-0.939152,-1.083546,-1.122386,0.217431,-0.698345,-0.886232,-0.533004,-0.016727,-0.631388,-0.011951,0.042817,-0.849709,-0.617982,0.102401
9,2.5,0.217428,0.136083,0.963889,2.519935,1.185957,-0.329437,3.794682,-0.881138,1.618888,-0.240785,0.22424,-0.280886,0.404077,0.386915,0.217431,1.180355,2.582864,0.545034,1.034696,-0.090479,-0.273038,-0.788439,1.361679,1.042388,0.534182


In [7]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 1 that belong to course 13 ('curso' holds the course id as a string)
dfLab1Curso13 = dfFinlab1.loc[dfFinlab1['curso'] == '13']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab1Curso13[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab1Curso13.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab1Curso13[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab1Normc13 = pd.concat([promCurso, DFnormalizado], axis=1)


# Sentencia anterior
#datalab1Normc13 = dfLab1Curso13[['mean(p$p2)']].join(pd.DataFrame(scaler2.fit_transform(dfLab1Curso13), columns=dfLab1Curso13.columns)[dfLab1Curso13.columns[1:26]]) 

In [8]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 1 that belong to course 19 ('curso' holds the course id as a string)
dfLab1Curso19 = dfFinlab1.loc[dfFinlab1['curso'] == '19']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab1Curso19[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab1Curso19.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab1Curso19[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab1Normc19 = pd.concat([promCurso, DFnormalizado], axis=1)


In [9]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 1 that belong to course 24 ('curso' holds the course id as a string)
dfLab1Curso24 = dfFinlab1.loc[dfFinlab1['curso'] == '24']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab1Curso24[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab1Curso24.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab1Curso24[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab1Normc24 = pd.concat([promCurso, DFnormalizado], axis=1)

In [10]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 1 that belong to course 30 ('curso' holds the course id as a string)
dfLab1Curso30 = dfFinlab1.loc[dfFinlab1['curso'] == '30']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab1Curso30[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab1Curso30.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab1Curso30[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab1Normc30 = pd.concat([promCurso, DFnormalizado], axis=1)

In [11]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 1 that belong to course 36 ('curso' holds the course id as a string)
dfLab1Curso36 = dfFinlab1.loc[dfFinlab1['curso'] == '36']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab1Curso36[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab1Curso36.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab1Curso36[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab1Normc36 = pd.concat([promCurso, DFnormalizado], axis=1)

In [12]:
# Stack the per-course normalized frames of lab 1, in course order, into a
# single frame whose index is renumbered 0..n-1
lab1_course_frames = [
    datalab1Normc7,
    datalab1Normc13,
    datalab1Normc19,
    datalab1Normc24,
    datalab1Normc30,
    datalab1Normc36,
]
datalab1_norm = pd.concat(lab1_course_frames, axis=0, ignore_index=True)
datalab1_norm

Unnamed: 0,mean(p$p2),g_lab1,a_lab1,ut_lab1,act_lab1,norm_log(dis_lab1),ct_lab1,rt_lab1,ctr_lab1,rtr_lab1,err_lab1,cer_lab1,actq1_lab1,actq2_lab1,actq3_lab1,mean(qg$_lab1),max(qat$_lab1),mean(qact$_lab1),mean(qavt$_lab1),max(qme$_lab1),max(qmce$_lab1),mean(qmsr$_lab1),mean(qc$_lab1),sum(qat$_lab1),norm_log(sum(qat$_lab1)),avgtime_lab1
0,2.0,0.217428,0.136083,-0.455061,-0.761629,-0.123236,-0.049073,-0.890975,0.786036,-0.87495,-0.637584,0.383103,-0.514311,-0.665163,-0.709985,0.217431,-0.4896,-0.852848,-0.743363,-0.148155,0.017702,0.475535,0.930445,-0.554857,-0.231774,-0.390801
1,2.25,0.217428,0.136083,-0.81762,-0.125169,-0.664505,-0.603989,-0.385012,-0.690473,-0.336087,0.553319,-0.195346,-1.19697,-1.260748,-1.289504,0.217431,-0.350438,-0.503732,0.468175,-0.805294,-0.523207,-0.975358,0.456763,-0.370575,-0.038347,0.34893
2,1.25,0.217428,0.136083,1.01624,2.141457,1.727227,-0.647569,0.618115,-1.033409,-0.37436,1.189007,-0.304597,0.559647,0.83854,0.771844,0.217431,0.275796,0.43611,0.508278,1.95469,-0.523207,-0.20731,-0.793488,0.366554,0.533396,1.6687
3,3.0,0.217428,0.136083,-0.911666,-0.877879,-1.58967,-0.537167,0.54222,-0.117179,3.331465,-0.191185,-0.49671,-1.108546,-1.255053,-1.286561,0.217431,-0.837508,-0.257596,1.066704,-0.673866,-0.631388,0.653245,0.519023,-0.73914,-0.459359,-0.220591
4,2.0,0.217428,0.136083,-0.88597,-0.428801,0.260792,0.25308,-0.496104,0.812065,-0.275336,-0.042385,0.19574,1.020738,0.854618,0.777731,0.217431,-0.350438,-0.519575,-0.314631,-0.148155,0.017702,0.578996,0.505561,-0.481144,-0.15091,0.074525
5,3.0,0.217428,0.136083,-0.223061,0.834033,0.802062,1.102885,-0.5478,0.596023,-0.98977,-0.457405,0.725633,-0.13724,-0.287981,0.11318,0.217431,0.48454,0.449124,0.280275,-0.148155,0.017702,-0.127584,-0.066559,0.440267,0.57868,0.190098
6,3.5,0.217428,0.136083,0.525164,1.814468,1.834012,3.475076,0.554319,1.62548,-0.308141,-0.973142,2.074644,-0.264286,-0.396178,0.761379,0.217431,4.311521,2.609458,0.865472,0.246129,4.344975,0.193146,0.29943,3.241359,1.662132,-0.970884
7,1.0,0.217428,0.136083,0.259429,0.711943,1.007758,0.5102,-0.317917,0.142463,-0.698772,-0.682122,1.538945,-0.123689,-0.268217,-0.29039,0.217431,1.528262,1.082286,-0.103934,-0.016727,2.722248,0.455451,0.113491,1.435392,1.073311,-0.787474
8,3.5,0.217428,0.136083,-0.911666,-0.877879,-1.58967,-0.470344,-0.636893,0.038346,-0.051771,1.214819,-0.448682,-0.939152,-1.083546,-1.122386,0.217431,-0.698345,-0.886232,-0.533004,-0.016727,-0.631388,-0.011951,0.042817,-0.849709,-0.617982,0.102401
9,2.5,0.217428,0.136083,0.963889,2.519935,1.185957,-0.329437,3.794682,-0.881138,1.618888,-0.240785,0.22424,-0.280886,0.404077,0.386915,0.217431,1.180355,2.582864,0.545034,1.034696,-0.090479,-0.273038,-0.788439,1.361679,1.042388,0.534182


In [13]:
# LAB 2: build the raw dataframe; cursoData holds the course of each student row
datalab2_all, cursoData = get_custom_dataframe(DATA, [2], ['p1p2', 'p2p2'], 'all', labels=True, index=None)

# Work on a deep copy so the column removal and aggregations below do not
# touch datalab2_all
datalab2 = copy.deepcopy(datalab2_all)

# Remove questionsdifficulty
remove_col(datalab2, 'qd?')

# Collapse the two target columns into their mean (in both frames)
datalab2_all = apply(datalab2_all, ['p1p2', 'p2p2'], statistics.mean)
datalab2 = apply(datalab2, ['p1p2', 'p2p2'], statistics.mean)

# Lab-level and per-question aggregations.
# NOTE(review): `apply`, `remove_col` and `norm_log` come from b1; presumably '?'
# matches every question column of the group and replace=False keeps the source
# columns next to the new one — confirm against b1.
datalab2 = apply(datalab2, 'dis_lab2', norm_log)
datalab2 = apply(datalab2, 'qg?', statistics.mean)
datalab2 = apply(datalab2, 'qat?', sum, replace=False)
datalab2 = apply(datalab2, 'sum(qat$_lab2)', norm_log, replace=False)
datalab2 = apply(datalab2, 'qat?', max)
datalab2 = apply(datalab2, 'qact?', statistics.mean)
datalab2 = apply(datalab2, 'qavt?', statistics.mean)
datalab2 = apply(datalab2, 'qme?', max)
datalab2 = apply(datalab2, 'qmce?', max)
datalab2 = apply(datalab2, 'qmsr?', statistics.mean)
datalab2 = apply(datalab2, 'qc?', statistics.mean)

# Active time divided by the summed question attempts. Any ratio that is not
# strictly positive (zero, negative, or NaN such as 0/0) is replaced by 0.
# Series.where is vectorized and does not depend on the index being 0..n-1.
aux = datalab2['act_lab2'] / datalab2['sum(qat$_lab2)']
aux = aux.where(aux > 0, 0)
datalab2['avgtime_lab2'] = aux
datalab2 = datalab2.round(4)

# Wrap the per-row course identifiers of lab 2 in a one-column dataframe
cursoDF = pd.DataFrame(cursoData, columns=['curso'])

# Put the prepared lab 2 frame and the course column side by side
dfFinlab2 = pd.concat([datalab2, cursoDF], axis=1)

7
13
19
24
30
36


In [14]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 2 that belong to course 7 ('curso' holds the course id as a string)
dfLab2Curso7 = dfFinlab2.loc[dfFinlab2['curso'] == '7']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab2Curso7[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab2Curso7.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab2Curso7[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab2Normc7 = pd.concat([promCurso, DFnormalizado], axis=1)

In [15]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 2 that belong to course 13 ('curso' holds the course id as a string)
dfLab2Curso13 = dfFinlab2.loc[dfFinlab2['curso'] == '13']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab2Curso13[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab2Curso13.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab2Curso13[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab2Normc13 = pd.concat([promCurso, DFnormalizado], axis=1)

In [16]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 2 that belong to course 19 ('curso' holds the course id as a string)
dfLab2Curso19 = dfFinlab2.loc[dfFinlab2['curso'] == '19']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab2Curso19[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab2Curso19.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab2Curso19[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab2Normc19 = pd.concat([promCurso, DFnormalizado], axis=1)

In [17]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 2 that belong to course 24 ('curso' holds the course id as a string)
dfLab2Curso24 = dfFinlab2.loc[dfFinlab2['curso'] == '24']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab2Curso24[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab2Curso24.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab2Curso24[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab2Normc24 = pd.concat([promCurso, DFnormalizado], axis=1)

In [18]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 2 that belong to course 30 ('curso' holds the course id as a string)
dfLab2Curso30 = dfFinlab2.loc[dfFinlab2['curso'] == '30']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab2Curso30[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab2Curso30.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab2Curso30[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab2Normc30 = pd.concat([promCurso, DFnormalizado], axis=1)

In [19]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 2 that belong to course 36 ('curso' holds the course id as a string)
dfLab2Curso36 = dfFinlab2.loc[dfFinlab2['curso'] == '36']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab2Curso36[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab2Curso36.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab2Curso36[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab2Normc36 = pd.concat([promCurso, DFnormalizado], axis=1)

In [20]:
# Stack the per-course normalized frames of lab 2, in course order, into a
# single frame whose index is renumbered 0..n-1
lab2_course_frames = [
    datalab2Normc7,
    datalab2Normc13,
    datalab2Normc19,
    datalab2Normc24,
    datalab2Normc30,
    datalab2Normc36,
]
datalab2_norm = pd.concat(lab2_course_frames, axis=0, ignore_index=True)
datalab2_norm

Unnamed: 0,mean(p$p2),g_lab2,a_lab2,ut_lab2,act_lab2,norm_log(dis_lab2),ct_lab2,rt_lab2,ctr_lab2,rtr_lab2,err_lab2,cer_lab2,actq1_lab2,actq2_lab2,actq3_lab2,mean(qg$_lab2),max(qat$_lab2),mean(qact$_lab2),mean(qavt$_lab2),max(qme$_lab2),max(qmce$_lab2),mean(qmsr$_lab2),mean(qc$_lab2),sum(qat$_lab2),norm_log(sum(qat$_lab2)),avgtime_lab2
0,2.0,0.354039,0.194257,-0.566624,0.919896,-0.037085,-0.488607,1.322492,-0.691188,1.139796,0.285747,-0.362555,-1.130648,-1.288007,-1.293935,0.354054,0.48408,1.06365,0.711179,0.05729,-0.606029,-0.669601,1.52445,0.351821,0.544793,0.973342
1,2.25,0.354039,0.194257,2.019516,-1.155864,-0.037085,-0.361603,-1.072133,1.372176,-0.910946,-0.306817,1.297616,-1.127128,-1.292755,-1.615346,0.354054,-0.821037,-1.055218,-0.571231,-0.84298,-0.14309,1.491518,0.030181,-0.964384,-0.689227,-0.959506
2,1.25,0.354039,0.194257,1.64149,-0.267852,1.110783,-0.078672,0.236426,-0.102318,1.57332,0.162319,0.291697,0.995748,0.867389,1.383997,0.354054,-0.592642,0.146819,2.043876,2.758098,-0.606029,0.243305,0.250319,-0.453617,-0.005147,0.670137
3,3.0,0.354039,0.194257,-1.086316,0.716773,-0.037085,4.717318,-0.187177,1.914678,-0.614089,-0.35764,0.396215,-0.992982,-1.136057,-1.453711,0.354054,0.451452,0.731399,0.358005,0.207335,0.474162,-0.663843,-0.113243,0.410756,0.574461,0.546682
4,2.0,-3.251917,0.194257,-1.281882,-1.418622,-2.14717,-0.887225,-1.170016,4.252643,-1.946026,0.839216,2.21634,0.587834,0.432511,0.003601,-3.251745,-0.951549,-1.303003,-1.698634,-0.692935,-0.760343,-2.326651,-2.72488,-1.317991,-2.069596,-2.264376
5,3.0,0.354039,0.194257,-1.123397,-1.375133,-0.706795,-0.947583,-1.160328,-0.899328,-1.442302,-2.884276,-2.114541,1.939075,1.79927,1.287017,0.354054,-1.016804,-1.308081,-1.641681,-1.293114,-1.223282,1.541571,0.094665,-1.219767,-1.442591,-1.83399
6,3.5,0.354039,0.194257,0.116554,1.014426,0.87177,0.865688,0.913255,-0.107985,0.533386,0.25112,0.114419,-0.74268,-0.618477,-0.967321,0.354054,1.169266,0.974907,-0.292432,1.257649,0.165536,0.025377,0.834018,0.842943,0.766488,0.283322
7,1.0,-0.367152,0.194257,0.948552,-0.504829,0.4041,-0.190587,-0.290071,-0.027615,0.819041,0.236599,0.98637,0.99653,0.854331,0.413447,-0.367279,-0.592642,-0.350889,0.168094,0.507425,-0.297403,0.033793,-0.205523,-0.335748,0.099323,-0.314234
8,3.5,0.354039,0.194257,-0.878146,0.19282,-0.329592,-0.537648,0.245445,-0.62988,0.59089,0.923549,-0.478623,-1.478725,-1.645327,-1.637641,0.354054,-0.201107,0.005419,0.167915,0.35738,-0.297403,-2.164534,-0.785887,-0.217879,0.19303,1.063667
9,2.5,0.354039,0.194257,0.639548,-0.233217,0.87177,-0.614354,-0.159115,-0.602574,0.510235,-0.650292,0.420468,-0.746591,-0.289648,0.006945,0.354054,0.386196,-0.13117,2.093045,-0.54289,-0.14309,-0.732056,0.941863,0.214307,0.471528,-0.668015


In [21]:
#from sklearn.preprocessing import StandardScaler

#scaler1 = StandardScaler()
#datalab2Normc7 = dfLab2Curso7[['mean(p$p2)']].join(pd.DataFrame(scaler1.fit_transform(dfLab2Curso7), columns=dfLab2Curso7.columns)[dfLab2Curso7.columns[1:26]]) 
#datalab2Normc7

In [22]:
# LAB 3: build the raw dataframe; cursoData holds the course of each student row
datalab3_all, cursoData = get_custom_dataframe(DATA, [3], ['p1p2', 'p2p2'], 'all', labels=True, index=None)

# Work on a deep copy so the column removal and aggregations below do not
# touch datalab3_all
datalab3 = copy.deepcopy(datalab3_all)

# Remove questionsdifficulty
remove_col(datalab3, 'qd?')

# Collapse the two target columns into their mean (in both frames)
datalab3_all = apply(datalab3_all, ['p1p2', 'p2p2'], statistics.mean)
datalab3 = apply(datalab3, ['p1p2', 'p2p2'], statistics.mean)

# Lab-level and per-question aggregations.
# NOTE(review): `apply`, `remove_col` and `norm_log` come from b1; presumably '?'
# matches every question column of the group and replace=False keeps the source
# columns next to the new one — confirm against b1.
datalab3 = apply(datalab3, 'dis_lab3', norm_log)
datalab3 = apply(datalab3, 'qg?', statistics.mean)
datalab3 = apply(datalab3, 'qat?', sum, replace=False)
datalab3 = apply(datalab3, 'sum(qat$_lab3)', norm_log, replace=False)
datalab3 = apply(datalab3, 'qat?', max)
datalab3 = apply(datalab3, 'qact?', statistics.mean)
datalab3 = apply(datalab3, 'qavt?', statistics.mean)
datalab3 = apply(datalab3, 'qme?', max)
datalab3 = apply(datalab3, 'qmce?', max)
datalab3 = apply(datalab3, 'qmsr?', statistics.mean)
datalab3 = apply(datalab3, 'qc?', statistics.mean)

# Active time divided by the summed question attempts. Any ratio that is not
# strictly positive (zero, negative, or NaN such as 0/0) is replaced by 0.
# Series.where is vectorized and does not depend on the index being 0..n-1.
aux = datalab3['act_lab3'] / datalab3['sum(qat$_lab3)']
aux = aux.where(aux > 0, 0)
datalab3['avgtime_lab3'] = aux
datalab3 = datalab3.round(4)

# Wrap the per-row course identifiers of lab 3 in a one-column dataframe
cursoDF = pd.DataFrame(cursoData, columns=['curso'])

# Put the prepared lab 3 frame and the course column side by side
dfFinlab3 = pd.concat([datalab3, cursoDF], axis=1)



7
13
19
24
30
36


In [23]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 3 that belong to course 7 ('curso' holds the course id as a string)
dfLab3Curso7 = dfFinlab3.loc[dfFinlab3['curso'] == '7']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab3Curso7[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab3Curso7.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab3Curso7[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab3Normc7 = pd.concat([promCurso, DFnormalizado], axis=1)

In [24]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 3 that belong to course 13 ('curso' holds the course id as a string)
dfLab3Curso13 = dfFinlab3.loc[dfFinlab3['curso'] == '13']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab3Curso13[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab3Curso13.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab3Curso13[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab3Normc13 = pd.concat([promCurso, DFnormalizado], axis=1)

In [25]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 3 that belong to course 19 ('curso' holds the course id as a string)
dfLab3Curso19 = dfFinlab3.loc[dfFinlab3['curso'] == '19']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab3Curso19[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab3Curso19.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab3Curso19[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab3Normc19 = pd.concat([promCurso, DFnormalizado], axis=1)

In [27]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 3 that belong to course 24 ('curso' holds the course id as a string)
dfLab3Curso24 = dfFinlab3.loc[dfFinlab3['curso'] == '24']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab3Curso24[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab3Curso24.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab3Curso24[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab3Normc24 = pd.concat([promCurso, DFnormalizado], axis=1)

In [28]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 3 that belong to course 30 ('curso' holds the course id as a string)
dfLab3Curso30 = dfFinlab3.loc[dfFinlab3['curso'] == '30']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab3Curso30[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab3Curso30.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab3Curso30[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab3Normc30 = pd.concat([promCurso, DFnormalizado], axis=1)

In [29]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 3 that belong to course 36 ('curso' holds the course id as a string)
dfLab3Curso36 = dfFinlab3.loc[dfFinlab3['curso'] == '36']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab3Curso36[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab3Curso36.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab3Curso36[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab3Normc36 = pd.concat([promCurso, DFnormalizado], axis=1)

In [30]:
# Stack the per-course normalized frames of lab 3, in course order, into a
# single frame whose index is renumbered 0..n-1
lab3_course_frames = [
    datalab3Normc7,
    datalab3Normc13,
    datalab3Normc19,
    datalab3Normc24,
    datalab3Normc30,
    datalab3Normc36,
]
datalab3_norm = pd.concat(lab3_course_frames, axis=0, ignore_index=True)
datalab3_norm

Unnamed: 0,mean(p$p2),g_lab3,a_lab3,ut_lab3,act_lab3,norm_log(dis_lab3),ct_lab3,rt_lab3,ctr_lab3,rtr_lab3,err_lab3,cer_lab3,actq1_lab3,actq2_lab3,actq3_lab3,mean(qg$_lab3),max(qat$_lab3),mean(qact$_lab3),mean(qavt$_lab3),max(qme$_lab3),max(qmce$_lab3),mean(qmsr$_lab3),mean(qc$_lab3),sum(qat$_lab3),norm_log(sum(qat$_lab3)),avgtime_lab3
0,2.0,0.336406,0.136083,0.538057,-0.907582,-0.540732,-0.135743,-0.805039,0.294662,-0.007907,0.153463,0.598155,-0.708866,-1.120637,-1.41954,0.316612,-0.412167,-0.876817,-0.355058,0.701304,-0.402502,-0.41551,-0.779308,-0.679386,-0.322886,-0.396148
1,2.25,0.336406,0.136083,1.438208,0.549781,0.62876,-0.339444,0.751187,-0.544855,0.646843,-0.181433,-1.284445,0.130312,0.309305,0.787157,0.316612,0.058881,0.324684,-0.489575,-1.315518,-0.535062,-1.524515,-0.346083,0.465232,0.572035,-0.062468
2,1.25,-0.336406,0.136083,0.117388,1.817015,1.722759,-0.544351,0.893754,-0.741093,-0.496397,-0.251303,-0.141869,0.706384,0.211178,0.771483,0.316612,-0.02944,1.145388,1.855827,-0.307107,-0.402502,1.114413,0.411085,0.391385,0.532353,1.721464
3,3.0,0.336406,0.136083,1.629656,-0.11793,0.62876,0.759821,-0.197927,1.007821,-0.05069,-0.181433,0.28819,-1.061336,-1.428882,-1.662494,0.316612,-0.117762,-0.174645,-0.045212,-0.811312,0.392861,-0.288479,-0.065072,-0.088616,0.229578,-0.091872
4,2.0,-1.682028,0.136083,-0.168003,-0.866274,-0.270199,0.28733,-1.068148,1.273939,-1.228049,-0.611497,1.635397,0.06396,0.195535,-0.161491,-1.303073,-0.500488,-0.828595,-0.374046,0.701304,0.12774,-0.754261,-0.927619,-0.864002,-0.577362,0.494269
5,3.0,0.336406,0.136083,-0.188313,-1.010284,-0.540732,-0.212884,-0.954415,0.209466,-0.339885,1.969491,-0.634995,-1.047912,-1.435992,-1.724852,0.316612,-0.853774,-1.116681,1.378249,-0.55921,-0.800184,1.403258,1.265826,-1.122464,-1.083122,1.84366
6,3.5,0.336406,0.136083,-0.152281,-1.208617,-0.04164,-0.021236,-1.185486,1.268196,-0.935064,1.25874,1.343547,-0.270867,-0.716755,0.046026,0.316612,-0.736012,-1.212814,-1.121617,0.449201,-0.535062,1.01309,1.139631,-0.84554,-0.548957,-0.868912
7,1.0,0.336406,0.136083,-0.164078,1.180993,-0.270199,-0.384042,1.423977,-0.620479,0.729703,-0.734373,0.168767,-1.06824,-1.438126,-0.170009,0.316612,-0.117762,1.090944,-0.056129,-0.055004,0.657982,0.282659,0.856018,0.44677,0.562247,0.773625
8,3.5,0.336406,0.136083,-0.184906,-0.823835,-0.270199,-0.37922,-0.471848,-0.358189,1.044893,1.735184,-0.991924,-1.070541,0.205134,-0.150587,0.316612,-0.61825,-0.628241,-0.339209,-0.055004,-0.932744,-1.200384,-0.784512,-0.771694,-0.442464,0.20562
9,2.5,0.336406,0.136083,0.07478,1.766937,1.274392,0.255991,1.535308,-0.23566,0.228756,-0.073616,-0.212986,0.728629,1.186397,1.186516,0.316612,0.500488,1.483563,0.042599,0.701304,0.923103,0.461108,-0.098898,1.222157,0.911726,0.238699


In [31]:
# LAB 4: build the raw dataframe; cursoData holds the course of each student row
datalab4_all, cursoData = get_custom_dataframe(DATA, [4], ['p1p2', 'p2p2'], 'all', labels=True, index=None)

# Work on a deep copy so the column removal and aggregations below do not
# touch datalab4_all
datalab4 = copy.deepcopy(datalab4_all)

# Remove questionsdifficulty
remove_col(datalab4, 'qd?')

# Collapse the two target columns into their mean (in both frames)
datalab4_all = apply(datalab4_all, ['p1p2', 'p2p2'], statistics.mean)
datalab4 = apply(datalab4, ['p1p2', 'p2p2'], statistics.mean)

# Lab-level and per-question aggregations.
# NOTE(review): `apply`, `remove_col` and `norm_log` come from b1; presumably '?'
# matches every question column of the group and replace=False keeps the source
# columns next to the new one — confirm against b1.
datalab4 = apply(datalab4, 'dis_lab4', norm_log)
datalab4 = apply(datalab4, 'qg?', statistics.mean)
datalab4 = apply(datalab4, 'qat?', sum, replace=False)
datalab4 = apply(datalab4, 'sum(qat$_lab4)', norm_log, replace=False)
datalab4 = apply(datalab4, 'qat?', max)
datalab4 = apply(datalab4, 'qact?', statistics.mean)
datalab4 = apply(datalab4, 'qavt?', statistics.mean)
datalab4 = apply(datalab4, 'qme?', max)
datalab4 = apply(datalab4, 'qmce?', max)
datalab4 = apply(datalab4, 'qmsr?', statistics.mean)
datalab4 = apply(datalab4, 'qc?', statistics.mean)

# Active time divided by the summed question attempts. Any ratio that is not
# strictly positive (zero, negative, or NaN such as 0/0) is replaced by 0.
# Series.where is vectorized and does not depend on the index being 0..n-1.
aux = datalab4['act_lab4'] / datalab4['sum(qat$_lab4)']
aux = aux.where(aux > 0, 0)
datalab4['avgtime_lab4'] = aux
datalab4 = datalab4.round(4)

# Wrap the per-row course identifiers of lab 4 in a one-column dataframe
cursoDF = pd.DataFrame(cursoData, columns=['curso'])

# Put the prepared lab 4 frame and the course column side by side
dfFinlab4 = pd.concat([datalab4, cursoDF], axis=1)


7
13
19
24
30
36


In [32]:
from sklearn.preprocessing import StandardScaler

# Rows of lab 4 that belong to course 7 ('curso' holds the course id as a string)
dfLab4Curso7 = dfFinlab4.loc[dfFinlab4['curso'] == '7']
scaler1 = StandardScaler()

# Raw (un-normalized) target column, re-indexed from 0 so it lines up row by
# row with the scaled features below
promCurso = dfLab4Curso7[[TARGET]].reset_index(drop=True)

# Standardize the feature columns using only this course's rows. Columns are
# picked by name (everything except the target and 'curso'), so neither of
# those two is ever passed to the scaler.
feature_cols = [col for col in dfLab4Curso7.columns if col not in (TARGET, 'curso')]
DFnormalizado = pd.DataFrame(scaler1.fit_transform(dfLab4Curso7[feature_cols]), columns=feature_cols)

# Target first, then the standardized features
datalab4Normc7 = pd.concat([promCurso, DFnormalizado], axis=1)
datalab4Normc7

Unnamed: 0,mean(p$p2),g_lab4,a_lab4,ut_lab4,act_lab4,norm_log(dis_lab4),ct_lab4,rt_lab4,ctr_lab4,rtr_lab4,err_lab4,cer_lab4,actq1_lab4,actq2_lab4,actq3_lab4,mean(qg$_lab4),max(qat$_lab4),mean(qact$_lab4),mean(qavt$_lab4),max(qme$_lab4),max(qmce$_lab4),mean(qmsr$_lab4),mean(qc$_lab4),sum(qat$_lab4),norm_log(sum(qat$_lab4)),avgtime_lab4
0,2.0,0.088201,0.136083,0.408492,1.683413,-0.038106,-0.046612,1.743906,-0.503147,0.5618,-0.217085,0.012206,-0.741941,-0.039689,-0.331099,0.397935,0.877253,1.475752,-0.108147,0.839012,0.626422,0.09139,-0.294974,0.649945,0.817722,1.573402
1,2.25,-0.346225,0.136083,-1.225869,0.527938,0.556104,-0.785921,0.614574,-0.722142,0.552181,-0.616514,-0.65706,-0.141786,-0.431238,-0.689458,-0.719661,0.318385,0.502721,-0.469624,-0.394829,-0.207124,-0.832709,-0.000111,0.447093,0.694042,0.132
2,1.25,0.522628,0.136083,0.646774,0.298888,1.535293,0.001176,-0.148759,-0.284151,-0.605148,0.186776,0.716258,0.418358,1.771145,1.449665,0.397935,-0.19391,-0.094947,0.575657,-0.148061,0.348574,0.553919,-0.223032,0.018851,0.376451,0.658236
3,3.0,0.522628,0.136083,1.20321,-0.680538,0.184286,-0.541359,-0.69193,-0.36299,-0.205089,-0.182737,1.542057,1.616403,0.983257,0.679366,0.397935,-0.147338,-0.578712,-0.70619,0.592244,0.348574,-0.026519,-0.069013,-0.477009,-0.162217,-0.706991
4,2.0,-2.083932,0.136083,-1.786246,-0.484653,-0.598666,-0.577902,-0.36209,-0.491998,0.576228,0.925249,-0.81568,0.162443,-0.246322,-0.541954,-1.837257,-0.519916,-0.46675,0.466118,-1.135134,-1.04067,-0.886391,-0.6709,-0.770017,-0.674934,1.899398
5,3.0,0.088201,0.136083,0.591253,0.055892,-0.598666,-0.538548,-0.031446,-0.559688,0.076482,0.018916,-0.033014,-0.067805,-0.437626,-0.72791,0.397935,-0.426772,-0.170666,-0.101282,0.098707,-0.484972,1.642899,0.040421,-0.499548,-0.19413,2.239969
6,3.5,0.522628,0.136083,1.525314,-0.326271,0.71486,-0.580713,0.007524,-0.532612,1.337433,0.328599,-0.260509,2.561552,1.782962,1.466369,0.397935,-0.240483,-0.155781,1.933121,-0.888366,-0.762821,1.142984,0.107297,-0.409392,-0.071334,0.344141
7,1.0,-0.346225,0.136083,-0.613969,0.954732,0.184286,1.223986,0.460703,0.068631,-0.305651,-0.873013,1.834252,-0.480741,-0.779353,-0.553931,0.397935,0.225241,1.185493,0.589708,0.839012,0.904271,-0.35628,-0.520935,0.807719,0.90507,0.190825
8,3.5,0.522628,0.136083,0.595925,0.12284,0.184286,-0.046612,0.193536,-0.258668,0.484848,0.186776,-0.484525,-0.178022,-0.533117,-0.563386,0.397935,0.318385,0.004071,-0.253632,4.047,-0.484972,-0.151138,-0.354758,-0.296696,0.065155,1.328309
9,2.5,0.522628,0.136083,-0.576459,-0.253434,0.71486,0.771405,-0.43079,0.612536,-0.393532,-0.523443,0.447716,-0.052329,-0.164883,0.247255,0.397935,-0.19391,-0.398149,0.681957,0.839012,0.904271,-0.086432,-0.106505,-0.274157,0.090553,0.099636


In [33]:
from sklearn.preprocessing import StandardScaler

# Restrict the lab-4 frame to the rows of course 13
dfLab4Curso13 = dfFinlab4[dfFinlab4['curso'] == '13']

# Target column of that course, taken before scaling and re-indexed from 0
# so it lines up row-by-row with the scaled frame built below
promCurso = pd.DataFrame(dfLab4Curso13.reset_index()[TARGET])

# Standardise every column within the course, then keep the columns at positions 1..25
scaler1 = StandardScaler()
DFnormalizado = pd.DataFrame(
    scaler1.fit_transform(dfLab4Curso13),
    columns=dfLab4Curso13.columns,
).loc[:, dfLab4Curso13.columns[1:26]]

# Un-scaled target first, followed by the standardised columns
datalab4Normc13 = pd.concat([promCurso, DFnormalizado], axis=1)

In [34]:
from sklearn.preprocessing import StandardScaler

# Restrict the lab-4 frame to the rows of course 19
dfLab4Curso19 = dfFinlab4[dfFinlab4['curso'] == '19']

# Target column of that course, taken before scaling and re-indexed from 0
# so it lines up row-by-row with the scaled frame built below
promCurso = pd.DataFrame(dfLab4Curso19.reset_index()[TARGET])

# Standardise every column within the course, then keep the columns at positions 1..25
scaler1 = StandardScaler()
DFnormalizado = pd.DataFrame(
    scaler1.fit_transform(dfLab4Curso19),
    columns=dfLab4Curso19.columns,
).loc[:, dfLab4Curso19.columns[1:26]]

# Un-scaled target first, followed by the standardised columns
datalab4Normc19 = pd.concat([promCurso, DFnormalizado], axis=1)

In [35]:
from sklearn.preprocessing import StandardScaler

# Restrict the lab-4 frame to the rows of course 24
dfLab4Curso24 = dfFinlab4[dfFinlab4['curso'] == '24']

# Target column of that course, taken before scaling and re-indexed from 0
# so it lines up row-by-row with the scaled frame built below
promCurso = pd.DataFrame(dfLab4Curso24.reset_index()[TARGET])

# Standardise every column within the course, then keep the columns at positions 1..25
scaler1 = StandardScaler()
DFnormalizado = pd.DataFrame(
    scaler1.fit_transform(dfLab4Curso24),
    columns=dfLab4Curso24.columns,
).loc[:, dfLab4Curso24.columns[1:26]]

# Un-scaled target first, followed by the standardised columns
datalab4Normc24 = pd.concat([promCurso, DFnormalizado], axis=1)


In [36]:
from sklearn.preprocessing import StandardScaler

# Restrict the lab-4 frame to the rows of course 30
dfLab4Curso30 = dfFinlab4[dfFinlab4['curso'] == '30']

# Target column of that course, taken before scaling and re-indexed from 0
# so it lines up row-by-row with the scaled frame built below
promCurso = pd.DataFrame(dfLab4Curso30.reset_index()[TARGET])

# Standardise every column within the course, then keep the columns at positions 1..25
scaler1 = StandardScaler()
DFnormalizado = pd.DataFrame(
    scaler1.fit_transform(dfLab4Curso30),
    columns=dfLab4Curso30.columns,
).loc[:, dfLab4Curso30.columns[1:26]]

# Un-scaled target first, followed by the standardised columns
datalab4Normc30 = pd.concat([promCurso, DFnormalizado], axis=1)

In [37]:
from sklearn.preprocessing import StandardScaler

# Restrict the lab-4 frame to the rows of course 36
dfLab4Curso36 = dfFinlab4[dfFinlab4['curso'] == '36']

# Target column of that course, taken before scaling and re-indexed from 0
# so it lines up row-by-row with the scaled frame built below
promCurso = pd.DataFrame(dfLab4Curso36.reset_index()[TARGET])

# Standardise every column within the course, then keep the columns at positions 1..25
scaler1 = StandardScaler()
DFnormalizado = pd.DataFrame(
    scaler1.fit_transform(dfLab4Curso36),
    columns=dfLab4Curso36.columns,
).loc[:, dfLab4Curso36.columns[1:26]]

# Un-scaled target first, followed by the standardised columns
datalab4Normc36 = pd.concat([promCurso, DFnormalizado], axis=1)

In [38]:
# Stack the per-course lab-4 frames on top of each other and renumber the rows from 0
datalab4_norm = pd.concat(
    [
        datalab4Normc7,
        datalab4Normc13,
        datalab4Normc19,
        datalab4Normc24,
        datalab4Normc30,
        datalab4Normc36,
    ],
    axis=0,
    ignore_index=True,
)
datalab4_norm

Unnamed: 0,mean(p$p2),g_lab4,a_lab4,ut_lab4,act_lab4,norm_log(dis_lab4),ct_lab4,rt_lab4,ctr_lab4,rtr_lab4,err_lab4,cer_lab4,actq1_lab4,actq2_lab4,actq3_lab4,mean(qg$_lab4),max(qat$_lab4),mean(qact$_lab4),mean(qavt$_lab4),max(qme$_lab4),max(qmce$_lab4),mean(qmsr$_lab4),mean(qc$_lab4),sum(qat$_lab4),norm_log(sum(qat$_lab4)),avgtime_lab4
0,2.0,0.088201,0.136083,0.408492,1.683413,-0.038106,-0.046612,1.743906,-0.503147,0.5618,-0.217085,0.012206,-0.741941,-0.039689,-0.331099,0.397935,0.877253,1.475752,-0.108147,0.839012,0.626422,0.09139,-0.294974,0.649945,0.817722,1.573402
1,2.25,-0.346225,0.136083,-1.225869,0.527938,0.556104,-0.785921,0.614574,-0.722142,0.552181,-0.616514,-0.65706,-0.141786,-0.431238,-0.689458,-0.719661,0.318385,0.502721,-0.469624,-0.394829,-0.207124,-0.832709,-0.000111,0.447093,0.694042,0.132
2,1.25,0.522628,0.136083,0.646774,0.298888,1.535293,0.001176,-0.148759,-0.284151,-0.605148,0.186776,0.716258,0.418358,1.771145,1.449665,0.397935,-0.19391,-0.094947,0.575657,-0.148061,0.348574,0.553919,-0.223032,0.018851,0.376451,0.658236
3,3.0,0.522628,0.136083,1.20321,-0.680538,0.184286,-0.541359,-0.69193,-0.36299,-0.205089,-0.182737,1.542057,1.616403,0.983257,0.679366,0.397935,-0.147338,-0.578712,-0.70619,0.592244,0.348574,-0.026519,-0.069013,-0.477009,-0.162217,-0.706991
4,2.0,-2.083932,0.136083,-1.786246,-0.484653,-0.598666,-0.577902,-0.36209,-0.491998,0.576228,0.925249,-0.81568,0.162443,-0.246322,-0.541954,-1.837257,-0.519916,-0.46675,0.466118,-1.135134,-1.04067,-0.886391,-0.6709,-0.770017,-0.674934,1.899398
5,3.0,0.088201,0.136083,0.591253,0.055892,-0.598666,-0.538548,-0.031446,-0.559688,0.076482,0.018916,-0.033014,-0.067805,-0.437626,-0.72791,0.397935,-0.426772,-0.170666,-0.101282,0.098707,-0.484972,1.642899,0.040421,-0.499548,-0.19413,2.239969
6,3.5,0.522628,0.136083,1.525314,-0.326271,0.71486,-0.580713,0.007524,-0.532612,1.337433,0.328599,-0.260509,2.561552,1.782962,1.466369,0.397935,-0.240483,-0.155781,1.933121,-0.888366,-0.762821,1.142984,0.107297,-0.409392,-0.071334,0.344141
7,1.0,-0.346225,0.136083,-0.613969,0.954732,0.184286,1.223986,0.460703,0.068631,-0.305651,-0.873013,1.834252,-0.480741,-0.779353,-0.553931,0.397935,0.225241,1.185493,0.589708,0.839012,0.904271,-0.35628,-0.520935,0.807719,0.90507,0.190825
8,3.5,0.522628,0.136083,0.595925,0.12284,0.184286,-0.046612,0.193536,-0.258668,0.484848,0.186776,-0.484525,-0.178022,-0.533117,-0.563386,0.397935,0.318385,0.004071,-0.253632,4.047,-0.484972,-0.151138,-0.354758,-0.296696,0.065155,1.328309
9,2.5,0.522628,0.136083,-0.576459,-0.253434,0.71486,0.771405,-0.43079,0.612536,-0.393532,-0.523443,0.447716,-0.052329,-0.164883,0.247255,0.397935,-0.19391,-0.398149,0.681957,0.839012,0.904271,-0.086432,-0.106505,-0.274157,0.090553,0.099636


In [39]:
#@title **Grid/Random-SearchCV process**   
 
def run_process(dataset, grid_cv, target=TARGET):
    """Fit a hyper-parameter search on ``dataset`` and report the features it kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the ``target`` column plus every candidate feature column.
    grid_cv : search object
        Anything exposing ``fit(X, y)``, ``cv_results_['mean_test_score']`` and
        ``best_estimator_`` (the notebook passes ``GridSearchCV`` and
        ``RandomizedSearchCV`` instances).
    target : label
        Name of the column of ``dataset`` to predict.

    Returns
    -------
    list
        The columns flagged by ``get_support()`` of the first pipeline step of
        the best estimator.  When the best estimator is not a pipeline, or its
        first step is not a selector, every feature column (``dataset`` minus
        ``target``) is returned instead.

    Notes
    -----
    The best mean test score is printed with the label ``R2`` whatever
    ``scoring`` the search was configured with.
    """
    X, y = dataset.drop(target, axis=1), np.array(dataset[target])

    grid_cv.fit(X, y)
    # nanmax instead of max: a NaN mean score (e.g. from a candidate whose fit
    # failed) must not hide the best valid score.
    print('R2:', np.nanmax(grid_cv.cv_results_['mean_test_score']))

    try:
        selected_features = X.columns[grid_cv.best_estimator_.steps[0][-1].get_support()]
    except (AttributeError, IndexError):
        # No `steps` (plain estimator) or no `get_support` (first step is not a
        # selector): fall back to all features.  X.columns is used rather than
        # dataset.columns[1:], which is only right when the target is column 0.
        return list(X.columns)

    return list(selected_features)
    
def run_process_obsolete(dataset, grid_cv, target=TARGET):
    """Older variant of ``run_process`` that also scores the best model on a hold-out split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the ``target`` column plus every candidate feature column.
    grid_cv : search object
        Anything exposing ``fit``, ``cv_results_``, ``best_estimator_`` and
        ``best_params_``.
    target : label
        Name of the column of ``dataset`` to predict.

    Returns
    -------
    list
        The columns flagged by ``get_support()`` of the first pipeline step of
        the best estimator, or every feature column when the hold-out
        evaluation or the selector lookup fails.

    Notes
    -----
    ``train_test_split`` must already be in scope (it is not imported in the
    visible cells; presumably it comes from the ``b1`` star-import).  The split
    is not seeded, and ``best_estimator_`` is re-fitted in place on the
    training part of the split.
    """
    X, y = dataset.drop(target, axis=1), np.array(dataset[target])

    grid_cv.fit(X, y)

    try:
        # nanmax: a NaN mean score must not hide the best valid score
        print('R2-test-fit:', np.nanmax(grid_cv.cv_results_['mean_test_score']))
    except Exception:
        # Reporting is best-effort, but KeyboardInterrupt/SystemExit still propagate
        pass

    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        best_model = grid_cv.best_estimator_
        best_model.fit(X_train, y_train)
        print('R2-test', best_model.score(X_test, y_test))
        print('MSE-test', metrics.mean_squared_error(y_test, best_model.predict(X_test)))

        print('Best params:', grid_cv.best_params_)

        selected_features = X.columns[best_model.steps[0][-1].get_support()]
        print('Selected features:', list(selected_features))

        return list(selected_features)
    except Exception:
        # Fall back to all features; X.columns rather than dataset.columns[1:],
        # which is only right when the target is column 0.
        return list(X.columns)

In [40]:
#@title **SVR - Recursive Features Elimination**

from sklearn.svm import SVR

# Selector: recursive feature elimination driven by a linear-kernel SVR
sel_estimator = SVR(kernel='linear')
selector = RFE(estimator=sel_estimator)

# Final regressor fitted on the columns the selector keeps
estimator = SVR()

pipe = Pipeline(steps=[('sel', selector), ('est', estimator)])

# 1 x 2 x 3 x 2 x 3 = 36 candidate settings
params = dict(
    sel__n_features_to_select=[5],
    sel__step=[1, 2],
    est__C=[0.01, 0.1, 1],
    est__gamma=['scale', 'auto'],
    est__kernel=['linear', 'poly', 'rbf'],
)

# Exhaustive search scored by R2 over 10 consecutive (unshuffled) folds
grid_svr = GridSearchCV(
    pipe,
    params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [41]:
# Shuffle the lab-1 rows with a fixed seed, run the SVR search on them and
# show the columns returned by run_process.
datalab1_shuffle = datalab1_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_svr_1 = run_process(datalab1_shuffle,grid_svr)
selected_features_svr_1

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    7.3s


R2: -0.04747534963116049


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   11.9s finished


['act_lab1', 'norm_log(dis_lab1)', 'rt_lab1', 'err_lab1', 'mean(qact$_lab1)']

In [42]:
# Shuffle the lab-2 rows with a fixed seed, run the SVR search on them and
# show the columns returned by run_process.
datalab2_shuffle = datalab2_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_svr_2 = run_process(datalab2_shuffle,grid_svr)
selected_features_svr_2

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.6s


R2: -0.018899532010106544


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   10.2s finished


['g_lab2',
 'act_lab2',
 'rt_lab2',
 'mean(qact$_lab2)',
 'norm_log(sum(qat$_lab2))']

In [69]:
# Shuffle the lab-3 rows with a fixed seed, run the SVR search on them and
# show the columns returned by run_process.
datalab3_shuffle = datalab3_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_svr_3 = run_process(datalab3_shuffle,grid_svr)
selected_features_svr_3

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.5s


R2: 0.050022917363454586


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    8.8s finished


['act_lab3', 'rt_lab3', 'actq1_lab3', 'mean(qg$_lab3)', 'mean(qmsr$_lab3)']

In [70]:
# Shuffle the lab-4 rows with a fixed seed, run the SVR search on them and
# show the columns returned by run_process.
datalab4_shuffle = datalab4_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_svr_4 = run_process(datalab4_shuffle,grid_svr)
selected_features_svr_4

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.2s


R2: 0.12836350255649293


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    9.8s finished


['act_lab4',
 'rt_lab4',
 'cer_lab4',
 'mean(qact$_lab4)',
 'norm_log(sum(qat$_lab4))']

In [71]:
# Second stage: SVR search without a selector step, used on the already-selected
# columns.  `grid_svr` is rebound here, so from this cell on it no longer
# contains the RFE step.
estimator = SVR()

# 3 x 2 x 3 = 18 candidate settings
params = dict(
    C=[0.01, 0.1, 1],
    gamma=['scale', 'auto'],
    kernel=['linear', 'poly', 'rbf'],
)

# Exhaustive search scored by R2 over 10 consecutive (unshuffled) folds
grid_svr = GridSearchCV(
    estimator,
    params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [72]:
# Labs 1 + 2: target plus the SVR-selected columns of each lab, joined on the row
# index, shuffled with a fixed seed and scored with the current grid_svr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_svr_1].join(datalab2_norm[selected_features_svr_2])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s


R2: -0.028746121318383222


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.2s finished


['act_lab1',
 'norm_log(dis_lab1)',
 'rt_lab1',
 'err_lab1',
 'mean(qact$_lab1)',
 'g_lab2',
 'act_lab2',
 'rt_lab2',
 'mean(qact$_lab2)',
 'norm_log(sum(qat$_lab2))']

In [50]:
# Labs 1 + 2 + 3: target plus the SVR-selected columns of each lab, joined on the
# row index, shuffled with a fixed seed and scored with the current grid_svr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_svr_1].join(datalab2_norm[selected_features_svr_2]).join(datalab3_norm[selected_features_svr_3])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s


R2: 0.031366296237040826


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.3s finished


['act_lab1',
 'norm_log(dis_lab1)',
 'rt_lab1',
 'err_lab1',
 'mean(qact$_lab1)',
 'g_lab2',
 'act_lab2',
 'rt_lab2',
 'mean(qact$_lab2)',
 'norm_log(sum(qat$_lab2))',
 'act_lab3',
 'rt_lab3',
 'actq1_lab3',
 'mean(qg$_lab3)',
 'mean(qmsr$_lab3)']

In [51]:
# Labs 1 + 2 + 3 + 4: target plus the SVR-selected columns of each lab, joined on
# the row index, shuffled with a fixed seed and scored with the current grid_svr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_svr_1].join(datalab2_norm[selected_features_svr_2]).join(datalab3_norm[selected_features_svr_3]).join(datalab4_norm[selected_features_svr_4])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_svr)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s


R2: 0.09571823127077915


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.4s finished


['act_lab1',
 'norm_log(dis_lab1)',
 'rt_lab1',
 'err_lab1',
 'mean(qact$_lab1)',
 'g_lab2',
 'act_lab2',
 'rt_lab2',
 'mean(qact$_lab2)',
 'norm_log(sum(qat$_lab2))',
 'act_lab3',
 'rt_lab3',
 'actq1_lab3',
 'mean(qg$_lab3)',
 'mean(qmsr$_lab3)',
 'act_lab4',
 'rt_lab4',
 'cer_lab4',
 'mean(qact$_lab4)',
 'norm_log(sum(qat$_lab4))']

In [43]:
#@title **Random Forest Regressor** 

from sklearn.ensemble import  RandomForestRegressor
 
# Model that ranks the features for the selector
# (other options: RandomForestRegressor, SVR(kernel='linear'))
sel_estimator = GradientBoostingRegressor(random_state=1)

# Selector wrapped around it (other option: SelectFromModel)
selector = RFE(estimator=sel_estimator)

# Final regressor fitted on the columns the selector keeps
estimator = RandomForestRegressor(n_jobs=-1, random_state=1)

pipe = Pipeline(steps=[('sel', selector), ('est', estimator)])

# 3 x 1 x 4 x 2 x 3 = 72 candidate settings
params = dict(
    sel__estimator__learning_rate=[0.05, 0.1, 0.2],
    sel__n_features_to_select=[5],
    est__n_estimators=[50, 100, 200, 400],
    est__criterion=['mse', 'mae'],
    est__max_features=['auto', 'sqrt', 'log2'],
)

# Exhaustive search scored by R2 over 10 consecutive (unshuffled) folds
grid_rfr = GridSearchCV(
    pipe,
    params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [44]:
# Shuffle the lab-1 rows with a fixed seed, run the random-forest search on them
# and show the columns returned by run_process.
datalab1_shuffle = datalab1_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_rfr_1 = run_process(datalab1_shuffle,grid_rfr)
selected_features_rfr_1
# Values noted by hand (presumably scores from earlier runs - confirm):
#0.021447284761679487
#0.021447284761679463

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  6.0min finished


R2: 0.04219674900503384


['a_lab1', 'ct_lab1', 'rtr_lab1', 'actq3_lab1', 'mean(qmsr$_lab1)']

In [45]:
# Shuffle the lab-2 rows with a fixed seed, run the random-forest search on them
# and show the columns returned by run_process.
datalab2_shuffle = datalab2_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_rfr_2 = run_process(datalab2_shuffle,grid_rfr)
selected_features_rfr_2

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.9min finished


R2: 0.06287864195413237


['a_lab2',
 'ut_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'norm_log(sum(qat$_lab2))']

In [46]:
# Shuffle the lab-3 rows with a fixed seed, run the random-forest search on them
# and show the columns returned by run_process.
datalab3_shuffle = datalab3_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_rfr_3 = run_process(datalab3_shuffle,grid_rfr)
selected_features_rfr_3

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.8min finished


R2: 0.1916559819672034


['a_lab3', 'rtr_lab3', 'cer_lab3', 'actq1_lab3', 'avgtime_lab3']

In [47]:
# Shuffle the lab-4 rows with a fixed seed, run the random-forest search on them
# and show the columns returned by run_process.
datalab4_shuffle = datalab4_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_rfr_4 = run_process(datalab4_shuffle,grid_rfr)
selected_features_rfr_4

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.6min finished


R2: 0.1362108342417335


['g_lab4', 'ut_lab4', 'cer_lab4', 'mean(qavt$_lab4)', 'mean(qc$_lab4)']

In [48]:
# Second stage: random-forest search without a selector step, used on the
# already-selected columns.  `grid_rfr` is rebound here, so from this cell on it
# no longer contains the RFE step.
estimator = RandomForestRegressor(n_jobs=-1, random_state=1)

# 4 x 2 x 3 = 24 candidate settings
params = dict(
    n_estimators=[50, 100, 200, 400],
    criterion=['mse', 'mae'],
    max_features=['auto', 'sqrt', 'log2'],
)

# Exhaustive search scored by R2 over 10 consecutive (unshuffled) folds
grid_rfr = GridSearchCV(
    estimator,
    params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [49]:
# Combination of labs 1 + 2: target plus the random-forest-selected columns of
# each lab, joined on the row index, shuffled with a fixed seed and scored with
# the current grid_rfr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_rfr_1].join(datalab2_norm[selected_features_rfr_2])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   30.5s finished


R2: 0.17548652282761198


['a_lab1',
 'ct_lab1',
 'rtr_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'a_lab2',
 'ut_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'norm_log(sum(qat$_lab2))']

In [53]:
# Combination of labs 1 + 2 + 3: target plus the random-forest-selected columns
# of each lab, joined on the row index, shuffled with a fixed seed and scored
# with the current grid_rfr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_rfr_1].join(datalab2_norm[selected_features_rfr_2]).join(datalab3_norm[selected_features_rfr_3])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   38.9s finished


R2: 0.2747326610079729


['a_lab1',
 'ct_lab1',
 'rtr_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'a_lab2',
 'ut_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'norm_log(sum(qat$_lab2))',
 'a_lab3',
 'rtr_lab3',
 'cer_lab3',
 'actq1_lab3',
 'avgtime_lab3']

In [54]:
# Combination of labs 1 + 2 + 3 + 4: target plus the random-forest-selected
# columns of each lab, joined on the row index, shuffled with a fixed seed and
# scored with the current grid_rfr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_rfr_1].join(datalab2_norm[selected_features_rfr_2]).join(datalab3_norm[selected_features_rfr_3]).join(datalab4_norm[selected_features_rfr_4])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_rfr)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   43.2s finished


R2: 0.3043904535874595


['a_lab1',
 'ct_lab1',
 'rtr_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'a_lab2',
 'ut_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'norm_log(sum(qat$_lab2))',
 'a_lab3',
 'rtr_lab3',
 'cer_lab3',
 'actq1_lab3',
 'avgtime_lab3',
 'g_lab4',
 'ut_lab4',
 'cer_lab4',
 'mean(qavt$_lab4)',
 'mean(qc$_lab4)']

In [104]:
#@title **Linear Regression**

from sklearn.linear_model import LinearRegression
 
# Model that ranks the features for the selector
# (other options: RandomForestRegressor, SVR(kernel='linear'))
sel_estimator = GradientBoostingRegressor(random_state=1)

# Selector wrapped around it (other option: SelectFromModel)
selector = RFE(estimator=sel_estimator)

# Final regressor fitted on the columns the selector keeps
estimator = LinearRegression()

pipe = Pipeline(steps=[('sel', selector), ('est', estimator)])

# Earlier parameter grid, kept for reference:
# params = {'est__n_jobs': [-1],
#           'est__normalize': [True],
#           'sel__estimator__learning_rate': [0.1],
#           'sel__estimator__n_estimators': [100],
#           'sel__max_features': [10],
#           'sel__prefit': [False]}

# 1 x 2 x 1 = 2 candidate settings
params = dict(
    sel__n_features_to_select=[5],
    sel__step=[1, 2],
    est__n_jobs=[-1],
)

# Exhaustive search scored by R2 over 10 consecutive (unshuffled) folds
grid_lr = GridSearchCV(
    pipe,
    params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [105]:
# Shuffle the lab-1 rows with a fixed seed, run the linear-regression search on
# them and show the columns returned by run_process.
datalab1_shuffle = datalab1_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_lr_1 = run_process(datalab1_shuffle,grid_lr)
selected_features_lr_1
# Score noted by hand; it matches the R2 printed by this run:
#-0.055914852011188496

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    6.6s finished


R2: -0.055914852011188496


['a_lab1', 'ct_lab1', 'rtr_lab1', 'actq3_lab1', 'mean(qmsr$_lab1)']

In [106]:
# Shuffle the lab-2 rows with a fixed seed, run the linear-regression search on
# them and show the columns returned by run_process.
datalab2_shuffle = datalab2_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_lr_2 = run_process(datalab2_shuffle,grid_lr)
selected_features_lr_2

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.1s finished


R2: -0.0455034088365188


['a_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'max(qat$_lab2)',
 'norm_log(sum(qat$_lab2))']

In [107]:
# Shuffle the lab-3 rows with a fixed seed, run the linear-regression search on
# them and show the columns returned by run_process.
datalab3_shuffle = datalab3_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_lr_3 = run_process(datalab3_shuffle,grid_lr)
selected_features_lr_3

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    6.6s finished


R2: 0.03910249363376296


['a_lab3', 'rtr_lab3', 'cer_lab3', 'actq1_lab3', 'avgtime_lab3']

In [108]:
# Shuffle the lab-4 rows with a fixed seed, run the linear-regression search on
# them and show the columns returned by run_process.
datalab4_shuffle = datalab4_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_lr_4 = run_process(datalab4_shuffle,grid_lr)
selected_features_lr_4

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.9s finished


R2: 0.04712278555605519


['g_lab4', 'ut_lab4', 'cer_lab4', 'mean(qavt$_lab4)', 'mean(qc$_lab4)']

In [119]:
# Second stage: linear regression without a selector step, used on the
# already-selected columns.  `grid_lr` is rebound here, so from this cell on it
# no longer contains the RFE step.
estimator = LinearRegression()

# Single candidate: the search only serves to cross-validate the model
params = dict(n_jobs=[-1])

# Scored by R2 over 10 consecutive (unshuffled) folds
grid_lr = GridSearchCV(
    estimator,
    params,
    cv=KFold(n_splits=10, shuffle=False),
    scoring='r2',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [124]:
# Combination of labs 1 and 2: target plus the columns selected for each lab,
# joined on the row index, shuffled with a fixed seed and scored with the
# current grid_lr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_lr_1].join(datalab2_norm[selected_features_lr_2])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


R2: -0.03989069322352695


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.6s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.6s finished


['a_lab1',
 'ct_lab1',
 'rtr_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'a_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'max(qat$_lab2)',
 'norm_log(sum(qat$_lab2))']

In [126]:
# Combination of labs 1, 2 and 3: target plus the columns selected for each lab,
# joined on the row index, shuffled with a fixed seed and scored with the
# current grid_lr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_lr_1].join(datalab2_norm[selected_features_lr_2]).join(datalab3_norm[selected_features_lr_3])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


R2: 0.0038621477088947144


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.4s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.5s finished


['a_lab1',
 'ct_lab1',
 'rtr_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'a_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'max(qat$_lab2)',
 'norm_log(sum(qat$_lab2))',
 'a_lab3',
 'rtr_lab3',
 'cer_lab3',
 'actq1_lab3',
 'avgtime_lab3']

In [130]:
# Combination of labs 1, 2, 3 and 4: target plus the columns selected for each
# lab, joined on the row index, shuffled with a fixed seed and scored with the
# current grid_lr.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in every datalabN_norm frame - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_lr_1].join(datalab2_norm[selected_features_lr_2]).join(datalab3_norm[selected_features_lr_3]).join(datalab4_norm[selected_features_lr_4])
dataset_shuffle = dataset.sample(frac=1,random_state=1).reset_index(drop=True)
run_process(dataset_shuffle,grid_lr)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


R2: 0.037398936382905944


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.5s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.5s finished


['a_lab1',
 'ct_lab1',
 'rtr_lab1',
 'actq3_lab1',
 'mean(qmsr$_lab1)',
 'a_lab2',
 'norm_log(dis_lab2)',
 'actq2_lab2',
 'max(qat$_lab2)',
 'norm_log(sum(qat$_lab2))',
 'a_lab3',
 'rtr_lab3',
 'cer_lab3',
 'actq1_lab3',
 'avgtime_lab3',
 'g_lab4',
 'ut_lab4',
 'cer_lab4',
 'mean(qavt$_lab4)',
 'mean(qc$_lab4)']

In [None]:
#@title **ANN**

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import SGD
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV

def create_model(nl1=1, nl2=1, nl3=1,
                 nn1=1000, nn2=500, nn3=200, lr=0.01, decay=0., l1=0.01, l2=0.01,
                 act='relu', dropout=0, input_shape=25, output_shape=1,
                 output_activation='linear'):
    """Build and compile a dense feed-forward regression network.

    Model-generating function used so that a scikit-learn search can vary the
    network's hyper-parameters and architecture.

    Parameters
    ----------
    nl1, nl2, nl3 : int
        Number of hidden layers in the first, second and third group
        (0 skips the group).
    nn1, nn2, nn3 : int
        Units per layer in the corresponding group.
    lr, decay : float
        Learning rate and decay handed to the SGD optimizer.
    l1, l2 : float
        L1 / L2 kernel-regularisation factors applied to every hidden layer.
    act : str
        Activation of the hidden layers.
    dropout : float
        Dropout rate added after every hidden layer; 0 adds no Dropout layer.
    input_shape : int
        Number of input features (passed as ``input_dim`` to the first layer).
    output_shape : int
        Number of output units.
    output_activation : str
        Activation of the output layer.  Defaults to ``'linear'`` because the
        network is trained with an MSE loss on an unscaled target; pass
        ``'sigmoid'`` to get the previous, (0, 1)-bounded output.

    Returns
    -------
    A compiled ``Sequential`` model (loss ``'mse'``, metric ``'mse'``).
    """
    # Build the optimizer object so that `lr` and `decay` actually take effect;
    # the bare string 'SGD' always used the optimizer defaults and silently
    # ignored both arguments.
    opt = SGD(lr=lr, decay=decay)
    reg = keras.regularizers.l1_l2(l1=l1, l2=l2)

    model = Sequential()

    # Only the very first layer of the network declares the input width.
    first = True

    for n_layers, n_units in ((nl1, nn1), (nl2, nn2), (nl3, nn3)):
        for _ in range(n_layers):
            if first:
                model.add(Dense(n_units, input_dim=input_shape, activation=act, kernel_regularizer=reg))
                first = False
            else:
                model.add(Dense(n_units, activation=act, kernel_regularizer=reg))
            if dropout != 0:
                model.add(Dropout(dropout))

    if first:
        # Every group was empty (nl1 = nl2 = nl3 = 0): the output layer is the
        # first layer, so it has to carry the input width itself.
        model.add(Dense(output_shape, input_dim=input_shape, activation=output_activation))
    else:
        model.add(Dense(output_shape, activation=output_activation))

    model.compile(loss='mse', optimizer=opt, metrics=['mse'])
    return model

# scikit-learn compatible wrapper around create_model, so the network can be used
# as the estimator of a randomized search (20 epochs, batches of 20 samples)
model = KerasRegressor(build_fn=create_model, epochs=20, batch_size=20, verbose=1)

# Candidate values searched as `lr` / `decay` in the parameter grids below
lr=[1e-2, 1e-3, 1e-4]
decay=[1e-6,1e-9,0]

# Candidate hidden-layer activations
activation=['relu', 'sigmoid']

# Candidate number of layers in each of the three groups (0 skips the group)
nl1 = [0,1,2,3]
nl2 = [0,1,2,3]
nl3 = [0,1,2,3]

# Candidate number of neurons per layer in each group
nn1=[20,50,100]
nn2=[10,40,70]
nn3=[5,15,30]
# Larger alternatives, currently disabled:
# nn1=[300,700,1400,2100]
# nn2=[100,400,800]
# nn3=[50,150,300]

# Candidate dropout rates and L1 / L2 regularisation factors
dropout = [0, 0.1, 0.2, 0.3]
l1 = [0, 0.01, 0.003, 0.001, 0.0001]
l2 = [0, 0.01, 0.003, 0.001, 0.0001]

In [None]:
# Model that ranks the features for the selector
# (other options: RandomForestRegressor, SVR(kernel='linear'))
# NOTE(review): `random_state` is not defined in the visible part of the notebook,
# and the other selector cells pass the literal 1 - confirm it exists.
sel_estimator = GradientBoostingRegressor(random_state=random_state)

# Selector wrapped around it (other option: SelectFromModel)
selector = RFE(estimator=sel_estimator)

# Final regressor: the Keras network wrapper
estimator = model

pipe = Pipeline(steps=[('sel', selector), ('est', estimator)])

# Distributions sampled by the randomized search.  est__input_shape is 5 to match
# the 5 columns the selector keeps.
param_grid = dict(
    sel__estimator__learning_rate=[0.05, 0.1, 0.2],
    sel__n_features_to_select=[5],
    est__nl1=nl1,
    est__nl2=nl2,
    est__nl3=nl3,
    est__nn1=nn1,
    est__nn2=nn2,
    est__nn3=nn3,
    est__act=activation,
    est__l1=l1,
    est__l2=l2,
    est__lr=lr,
    est__decay=decay,
    est__dropout=dropout,
    est__input_shape=[5],
    est__output_shape=[1],
)

# 10 sampled settings, scored by negative MSE over 5 shuffled folds
rand_cv = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=10,
    cv=KFold(n_splits=5, shuffle=True),
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=100,
)

In [None]:
# Run the selection process on lab 1: shuffle the rows with a fixed seed, run the
# randomized ANN search on them and show the columns returned by run_process.
datalab1_shuffle = datalab1_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_ann_1= run_process(datalab1_shuffle, rand_cv)
selected_features_ann_1

In [None]:
# Run the selection process on lab 2: shuffle the rows with a fixed seed, run the
# randomized ANN search on them and show the columns returned by run_process.
# The result goes into selected_features_ann_2: storing it in
# selected_features_ann_1 overwrote the lab-1 selection and left the name read by
# the lab-combination cell undefined.
datalab2_shuffle = datalab2_norm.sample(frac=1,random_state=1).reset_index(drop=True)
selected_features_ann_2 = run_process(datalab2_shuffle, rand_cv)
selected_features_ann_2

In [None]:
# Second stage: randomized search over the network alone (no selector step),
# used on the already-selected columns.  `rand_cv` is rebound here.
estimator = model

# NOTE(review): input_shape must equal the number of feature columns handed to
# the network.  20 corresponds to four labs x 5 selected columns; a two-lab
# combination has only 10 - confirm.
param_grid = dict(
    nl1=nl1,
    nl2=nl2,
    nl3=nl3,
    nn1=nn1,
    nn2=nn2,
    nn3=nn3,
    act=activation,
    l1=l1,
    l2=l2,
    lr=lr,
    decay=decay,
    dropout=dropout,
    input_shape=[20],
    output_shape=[1],
)

# 10 sampled settings, scored by negative MSE over 5 shuffled folds
rand_cv = RandomizedSearchCV(
    estimator,
    param_grid,
    n_iter=10,
    cv=KFold(n_splits=5, shuffle=True),
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
    verbose=100,
)

In [None]:
# Combination of labs: target plus the ANN-selected columns of labs 1 and 2,
# joined on the row index and scored with the current rand_cv.
# The columns are read from the normalised frames (datalabN_norm), the same
# frames the features were selected on and that every other combination cell
# uses.  No explicit shuffle is needed because rand_cv's KFold already shuffles.
# NOTE(review): .join aligns on the index, so this presumes row i is the same
# student in both frames - confirm.
dataset = datalab1_norm[[TARGET] + selected_features_ann_1].join(datalab2_norm[selected_features_ann_2])
run_process(dataset,rand_cv)