# Machine Learning con PySpark

#### Autor: Rodrigo Accurso

## 1. Análisis exploratorio

#### 1.1 Import de las librerias y lectura del dataset

In [104]:
import pyspark
from pyspark.sql.functions import isnan, when, count, countDistinct
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes, LinearSVC, RandomForestClassifier
from pyspark.ml.feature import ChiSqSelector, PCA
import numpy as np

sc = pyspark.SparkContext(appName="PracticaFinal")
sql = pyspark.SQLContext(sc)

In [105]:
df = sql.read.csv('HR.Employee.Attrition.csv', sep=",", inferSchema=True, header=True)
print(df.count())
print(len(df.columns))
print(df.take(2))

1470
35
[Row(Age=41, Attrition='Yes', BusinessTravel='Travel_Rarely', DailyRate=1102, Department='Sales', DistanceFromHome=1, Education=2, EducationField='Life Sciences', EmployeeCount=1, EmployeeNumber=1, EnvironmentSatisfaction=2, Gender='Female', HourlyRate=94, JobInvolvement=3, JobLevel=2, JobRole='Sales Executive', JobSatisfaction=4, MaritalStatus='Single', MonthlyIncome=5993, MonthlyRate=19479, NumCompaniesWorked=8, Over18='Y', OverTime='Yes', PercentSalaryHike=11, PerformanceRating=3, RelationshipSatisfaction=1, StandardHours=80, StockOptionLevel=0, TotalWorkingYears=8, TrainingTimesLastYear=0, WorkLifeBalance=1, YearsAtCompany=6, YearsInCurrentRole=4, YearsSinceLastPromotion=0, YearsWithCurrManager=5), Row(Age=49, Attrition='No', BusinessTravel='Travel_Frequently', DailyRate=279, Department='Research & Development', DistanceFromHome=8, Education=1, EducationField='Life Sciences', EmployeeCount=1, EmployeeNumber=2, EnvironmentSatisfaction=3, Gender='Male', HourlyRate=61, JobInvo

#### 1.2 Obtengo los tipos de variables

In [106]:
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string 

#### 1.3 Verifico la presencia de NAs

In [107]:
df_na = df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).collect()
print(df_na)

[Row(Age=0, Attrition=0, BusinessTravel=0, DailyRate=0, Department=0, DistanceFromHome=0, Education=0, EducationField=0, EmployeeCount=0, EmployeeNumber=0, EnvironmentSatisfaction=0, Gender=0, HourlyRate=0, JobInvolvement=0, JobLevel=0, JobRole=0, JobSatisfaction=0, MaritalStatus=0, MonthlyIncome=0, MonthlyRate=0, NumCompaniesWorked=0, Over18=0, OverTime=0, PercentSalaryHike=0, PerformanceRating=0, RelationshipSatisfaction=0, StandardHours=0, StockOptionLevel=0, TotalWorkingYears=0, TrainingTimesLastYear=0, WorkLifeBalance=0, YearsAtCompany=0, YearsInCurrentRole=0, YearsSinceLastPromotion=0, YearsWithCurrManager=0)]


## 2. Ingeniería de variables

#### 2.1 Elimino las columnas con valores diferentes en todas las filas

In [4]:
df_unique = df.select([when(countDistinct(column) == df.count(), 'T').otherwise('F').alias(column) for column in df.columns]) \
                .collect()
print(df_unique)

[Row(Age='F', Attrition='F', BusinessTravel='F', DailyRate='F', Department='F', DistanceFromHome='F', Education='F', EducationField='F', EmployeeCount='F', EmployeeNumber='T', EnvironmentSatisfaction='F', Gender='F', HourlyRate='F', JobInvolvement='F', JobLevel='F', JobRole='F', JobSatisfaction='F', MaritalStatus='F', MonthlyIncome='F', MonthlyRate='F', NumCompaniesWorked='F', Over18='F', OverTime='F', PercentSalaryHike='F', PerformanceRating='F', RelationshipSatisfaction='F', StandardHours='F', StockOptionLevel='F', TotalWorkingYears='F', TrainingTimesLastYear='F', WorkLifeBalance='F', YearsAtCompany='F', YearsInCurrentRole='F', YearsSinceLastPromotion='F', YearsWithCurrManager='F')]


In [108]:
df = df.drop('EmployeeNumber')

#### 2.2 Elimino las columnas con valor igual en todas las filas

In [6]:
df_same = df.select([when(countDistinct(column) == 1, 'T').otherwise('F').alias(column) for column in df.columns]) \
                .collect()
print(df_same)

[Row(Age='F', Attrition='F', BusinessTravel='F', DailyRate='F', Department='F', DistanceFromHome='F', Education='F', EducationField='F', EmployeeCount='T', EnvironmentSatisfaction='F', Gender='F', HourlyRate='F', JobInvolvement='F', JobLevel='F', JobRole='F', JobSatisfaction='F', MaritalStatus='F', MonthlyIncome='F', MonthlyRate='F', NumCompaniesWorked='F', Over18='T', OverTime='F', PercentSalaryHike='F', PerformanceRating='F', RelationshipSatisfaction='F', StandardHours='T', StockOptionLevel='F', TotalWorkingYears='F', TrainingTimesLastYear='F', WorkLifeBalance='F', YearsAtCompany='F', YearsInCurrentRole='F', YearsSinceLastPromotion='F', YearsWithCurrManager='F')]


In [109]:
df = df.drop('EmployeeCount')
df = df.drop('Over18')
df = df.drop('StandardHours')

#### 2.3 Convierto las variables alfanumericas en numericas

In [110]:
main_stages = []
string_cols = [x[0] for x in df.dtypes if (x[1] == 'string') & (x[0] != 'Attrition')]
string_cols

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [111]:
numeric_cols = [x[0] for x in df.dtypes if x[1] != 'string']
numeric_cols

['Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [112]:
for col in string_cols:
    indexer = StringIndexer(inputCol = col, outputCol = col + 'Index')
    main_stages += [indexer]

In [113]:
# Transformo la variable target Attrition separadamente porque no debe estar en el pipeline
indexer = StringIndexer(inputCol = 'Attrition', outputCol = 'label')
indexer = indexer.fit(df)
df = indexer.transform(df)

#### 2.4 Aplico el One Hot Encoding en las variables categoricas

In [114]:
cat_cols = ['Department', 'EducationField','JobRole','MaritalStatus']

In [115]:
for col in cat_cols:
    encoder = OneHotEncoderEstimator(inputCols = [col + 'Index'], outputCols = [col + 'Vec'])
    main_stages += [encoder]

#### 2.5 Genero el vector necesario para entrenar los modelos de ML

In [116]:
# Variables numericas
assemblerInputs = numeric_cols
# Variables alfanumericas a las que no aplique el one hot encoding
assemblerInputs = assemblerInputs + [col + 'Index' for col in (set(string_cols) - set(cat_cols))]
# Variables alfanumericas a las que aplique el one hot encoding
assemblerInputs = assemblerInputs + [col + 'Vec' for col in cat_cols]

In [117]:
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
main_stages += [assembler]

#### 2.6 Normalizacion de las variables

In [118]:
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
main_stages += [scaler]

#### 2.7 Selección de variables por Chi-cuadrado

Selección con p-valor <= 0.05

In [119]:
chisq_selector_05 = ChiSqSelector(fpr=0.05, selectorType='fpr',featuresCol='scaledFeatures',
                         outputCol='selectedFeatures', labelCol='label')
chisq_stages_05 = main_stages[:]
chisq_stages_05 += [chisq_selector_05]

Selección con p-valor <= 0.01

In [120]:
chisq_selector_01 = ChiSqSelector(fpr=0.01, selectorType='fpr',featuresCol='scaledFeatures',
                         outputCol='selectedFeatures', labelCol='label')
chisq_stages_01 = main_stages[:]
chisq_stages_01 += [chisq_selector_01]

#### 2.8 Extracción de variables con PCA

Extracción con k = 5

In [121]:
pca_5 = PCA(k=5, inputCol='scaledFeatures', outputCol='pcaFeatures')
pca_stages_5 = main_stages[:]
pca_stages_5 += [pca_5]

Extracción con k = 10

In [122]:
pca_10 = PCA(k=10, inputCol='scaledFeatures', outputCol='pcaFeatures')
pca_stages_10 = main_stages[:]
pca_stages_10 += [pca_10]

## 3. Bayes Ingénuo

#### 3.1 Creo diccionarios con la información necesaria para ejecutar todos los casos por cada aloritmo

In [146]:
all_stages = {'BASE': main_stages,
              'CHI-CUADRADO-05': chisq_stages_05,
              'CHI-CUADRADO-01': chisq_stages_01}

feature_field = {'BASE': 'scaledFeatures',
                 'CHI-CUADRADO-05': 'selectedFeatures',
                 'CHI-CUADRADO-01': 'selectedFeatures'}

Nota: No puedo utilizar PCA porque el Bayes Ingénuo no acepta números negativos

#### 3.2 Creo una función que evalúa el algoritmo con los hiper-parámetros en entrada

In [147]:
def evualua_modelo(input_smoothing):
    for sel_stages in all_stages:
        
        print('CASO ' + sel_stages)
        print('------------------------')
        
        # Creo el algoritmo de clasificación
        nb = NaiveBayes(featuresCol=feature_field.get(sel_stages), labelCol='label')    
        print('Features: ' + feature_field.get(sel_stages))
        
        # Construyo el pipeline completo
        nb_stages = all_stages.get(sel_stages)[:]
        nb_stages += [nb]
        pipeline = Pipeline(stages=nb_stages)

        # Creo el grid de hiper-parámetros
        paramGrid = (ParamGridBuilder()
                     .addGrid(nb.smoothing, input_smoothing.get(sel_stages))
                     .addGrid(nb.modelType, ['multinomial'])
                     .build())  

        # Ejecuto la validación cruzada con los hiperparámetros seleccionados
        cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, 
                            evaluator=evaluator, numFolds=5)
        pipelineModel = cv.fit(df)    
        
        # Muestro los resultados
        print('Hiper-parámetros óptimos:')
        print('smoothing = ' + str(pipelineModel.bestModel.stages[-1]._java_obj.getSmoothing()))
        print('ROC-AUC = ' + str(np.mean(pipelineModel.avgMetrics)))
        print('')    

#### 3.3 Evalúo Bayes Ingénuo con lista amplia de hiper-parámetros 

In [148]:
param_smoothing = {'BASE': [0., 0.5, 1.0],
                   'CHI-CUADRADO-05': [0., 0.5, 1.0],
                   'CHI-CUADRADO-01':  [0., 0.5, 1.0]}                   
    
evualua_modelo(param_smoothing)

CASO BASE
------------------------
Features: scaledFeatures
Hiper-parámetros óptimos:
smoothing = 0.0
ROC-AUC = 0.6305925693344068

CASO CHI-CUADRADO-05
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
smoothing = 0.0
ROC-AUC = 0.6314127561352433

CASO CHI-CUADRADO-01
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
smoothing = 0.0
ROC-AUC = 0.6334527955217494



#### 3.4 Evalúo Bayes Ingénuo con fine-tuning de hiper-parámetros 

In [150]:
param_smoothing = {'BASE': [0., 0.05, .1],
                   'CHI-CUADRADO-05': [0., 0.05, .1],
                   'CHI-CUADRADO-01':  [0., 0.05, .1]}                   
    
evualua_modelo(param_smoothing)

CASO BASE
------------------------
Features: scaledFeatures
Hiper-parámetros óptimos:
smoothing = 0.0
ROC-AUC = 0.6310011695464576

CASO CHI-CUADRADO-05
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
smoothing = 0.0
ROC-AUC = 0.6320403657626575

CASO CHI-CUADRADO-01
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
smoothing = 0.0
ROC-AUC = 0.6338857821851124



#### 3.5 Resultados

El único hiper-parámetro configurable es el Smoothing, que es la suavización de probabilidades a través del Estimador de Laplace. El valor óptimo es 0 porque en el dataset no existe el caso en que el valor de una variable categórica no se esté en una de las dos clases.

La ejecución con selección de variables con p-valor <= 0.01 obtuvo el mejor resultado. Supongo que el motivo es la no independencia de variables que existe en el dataset, mientras que Bayes Ingénuo asume una completa independencia.

ROC-AUC = 0.6339

## 4. Support Vector Machines

#### 4.1 Creo diccionarios con la información necesaria para ejecutar todos los casos por cada aloritmo

In [168]:
all_stages = {'BASE': main_stages,
              'CHI-CUADRADO-05': chisq_stages_05,
              'CHI-CUADRADO-01': chisq_stages_01,
              'PCA-k5': pca_stages_5,
              'PCA-k10': pca_stages_10}

feature_field = {'BASE': 'scaledFeatures',
                 'CHI-CUADRADO-05': 'selectedFeatures',
                 'CHI-CUADRADO-01': 'selectedFeatures',
                 'PCA-k5': 'pcaFeatures',
                 'PCA-k10': 'pcaFeatures'}

#### 4.2 Creo una función que evalúa el algoritmo con los hiper-parámetros en entrada

In [158]:
def evualua_modelo_SVM(input_regParam, input_maxIter):
    for sel_stages in all_stages:
        
        print('CASO ' + sel_stages)
        print('------------------------')
        
        # Creo el algoritmo de clasificación
        svc = LinearSVC(featuresCol=feature_field.get(sel_stages), labelCol='label')    
        print('Features: ' + feature_field.get(sel_stages))
        
        # Construyo el pipeline completo
        svc_stages = all_stages.get(sel_stages)[:]
        svc_stages += [svc]
        pipeline = Pipeline(stages=svc_stages)

        # Creo el grid de hiper-parámetros
        paramGrid = (ParamGridBuilder()
                     .addGrid(svc.regParam, input_regParam.get(sel_stages))
                     .addGrid(svc.maxIter, input_maxIter.get(sel_stages))                  
                     .build())  

        # Ejecuto la validación cruzada con los hiperparámetros seleccionados
        cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, 
                            evaluator=evaluator, numFolds=5)
        pipelineModel = cv.fit(df)    
        
        # Muestro los resultados
        print('Hiper-parámetros óptimos:')
        print('regParam = ' + str(pipelineModel.bestModel.stages[-1]._java_obj.getRegParam()))
        print('maxIter = ' + str(pipelineModel.bestModel.stages[-1]._java_obj.getMaxIter()))
        print('ROC-AUC = ' + str(np.mean(pipelineModel.avgMetrics)))
        print('')    

#### 4.3 Evalúo SVM con lista amplia de hiper-parámetros 

In [169]:
param_regParam = {'BASE': [0., 0.5, 1.0],
                  'CHI-CUADRADO-05': [0., 0.5, 1.0],
                  'CHI-CUADRADO-01':  [0., 0.5, 1.0],
                  'PCA-k5': [0., 0.5, 1.0],
                  'PCA-k10': [0., 0.5, 1.0]}

param_maxIter = {'BASE': [10, 50, 100],
                 'CHI-CUADRADO-05': [10, 50, 100],
                 'CHI-CUADRADO-01': [10, 50, 100],
                 'PCA-k5': [10, 50, 100],
                 'PCA-k10': [10, 50, 100]}
    
evualua_modelo_SVM(param_regParam, param_maxIter)

CASO BASE
------------------------
Features: scaledFeatures
Hiper-parámetros óptimos:
regParam = 0.5
maxIter = 100
ROC-AUC = 0.8143945767212432

CASO CHI-CUADRADO-05
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
regParam = 1.0
maxIter = 50
ROC-AUC = 0.8002146337229715

CASO CHI-CUADRADO-01
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
regParam = 0.5
maxIter = 100
ROC-AUC = 0.798068156553817

CASO PCA-k5
------------------------
Features: pcaFeatures
Hiper-parámetros óptimos:
regParam = 0.0
maxIter = 10
ROC-AUC = 0.654768298041164

CASO PCA-k10
------------------------
Features: pcaFeatures
Hiper-parámetros óptimos:
regParam = 0.0
maxIter = 10
ROC-AUC = 0.7626341477018037



#### 4.4 Evalúo SVM con fine-tuning de hiper-parámetros 

In [172]:
param_regParam = {'BASE': [0.4, 0.5, 0.6],
                  'CHI-CUADRADO-05': [0.8, 0.9, 1.0],
                  'CHI-CUADRADO-01':  [0.4, 0.5, 0.6],
                  'PCA-k5': [0., 0.05, 0.1],
                  'PCA-k10': [0., 0.05, 0.1]}

param_maxIter = {'BASE': [80, 100, 120],
                 'CHI-CUADRADO-05': [30, 50, 70],
                 'CHI-CUADRADO-01': [80, 100, 120],
                 'PCA-k5': [10, 20, 30],
                 'PCA-k10': [10, 20, 30]}
    
evualua_modelo_SVM(param_regParam, param_maxIter)

CASO BASE
------------------------
Features: scaledFeatures
Hiper-parámetros óptimos:
regParam = 0.6
maxIter = 100
ROC-AUC = 0.8293923941436913

CASO CHI-CUADRADO-05
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
regParam = 1.0
maxIter = 50
ROC-AUC = 0.8022375679279371

CASO CHI-CUADRADO-01
------------------------
Features: selectedFeatures
Hiper-parámetros óptimos:
regParam = 0.5
maxIter = 80
ROC-AUC = 0.8046145747715915

CASO PCA-k5
------------------------
Features: pcaFeatures
Hiper-parámetros óptimos:
regParam = 0.0
maxIter = 10
ROC-AUC = 0.6437187105127199

CASO PCA-k10
------------------------
Features: pcaFeatures
Hiper-parámetros óptimos:
regParam = 0.1
maxIter = 10
ROC-AUC = 0.7625266621007054



#### 4.5 Resultados

El parámetro de regularización en SVM sirve a penalizar las clasificaciones erradas. Más alto su valor, mayor será el precio a pagar en la función de evaluación. El valor óptimo encontrado es 0.01.
El parámetro maxIter, en cambio, limita el número de iteraciones para el entrenamiento. El valor óptimo encontrado es 120.

Entre los 5 escenarios probados, obtuve mejor resultado sin aplicar selección o extracción de variables.

ROC-AUC = 0.8294

## 5. Random Forest

#### 5.1 Creo diccionarios con la información necesaria para ejecutar todos los casos por cada aloritmo

In [175]:
all_stages = {'BASE': main_stages}#,
              #'CHI-CUADRADO-05': chisq_stages_05,
              #'CHI-CUADRADO-01': chisq_stages_01,
              #'PCA-k5': pca_stages_5,
              #'PCA-k10': pca_stages_10}

feature_field = {'BASE': 'scaledFeatures',
                 'CHI-CUADRADO-05': 'selectedFeatures',
                 'CHI-CUADRADO-01': 'selectedFeatures',
                 'PCA-k5': 'pcaFeatures',
                 'PCA-k10': 'pcaFeatures'}

#### 5.2 Creo una función que evalúa el algoritmo con los hiper-parámetros en entrada

In [176]:
def evualua_modelo_RF(input_numTrees, input_maxDepth):
    for sel_stages in all_stages:
        
        print('CASO ' + sel_stages)
        print('------------------------')
        
        # Creo el algoritmo de clasificación
        rf = RandomForestClassifier(featuresCol=feature_field.get(sel_stages), labelCol='label')
        print('Features: ' + feature_field.get(sel_stages))
        
        # Construyo el pipeline completo
        rf_stages = all_stages.get(sel_stages)[:]
        rf_stages += [rf]
        pipeline = Pipeline(stages=rf_stages)

        # Creo el grid de hiper-parámetros
        paramGrid = (ParamGridBuilder()
                     .addGrid(rf.numTrees, input_numTrees.get(sel_stages))
                     .addGrid(rf.maxDepth, input_maxDepth.get(sel_stages))                  
                     .build())  

        # Ejecuto la validación cruzada con los hiperparámetros seleccionados
        cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, 
                            evaluator=evaluator, numFolds=5)
        pipelineModel = cv.fit(df)    
        
        # Muestro los resultados
        print('Hiper-parámetros óptimos:')
        print('numTrees = ' + str(pipelineModel.bestModel.stages[-1]._java_obj.getNumTrees()))
        print('maxDepth = ' + str(pipelineModel.bestModel.stages[-1]._java_obj.getMaxDepth()))
        print('ROC-AUC = ' + str(np.mean(pipelineModel.avgMetrics)))
        print('')    

#### 5.3 Evalúo Random Forest con lista amplia de hiper-parámetros 

In [182]:
param_numTrees = {'BASE': [50, 200, 300],
                  'CHI-CUADRADO-05': [50, 200, 300],
                  'CHI-CUADRADO-01': [50, 200, 300],
                  'PCA-k5': [50, 200, 300],
                  'PCA-k10': [50, 200, 300]}

param_maxDepth = {'BASE': [5, 10, 15],
                 'CHI-CUADRADO-05': [5, 10, 15],
                 'CHI-CUADRADO-01': [5, 10, 15],
                 'PCA-k5': [5, 10, 15],
                 'PCA-k10': [5, 10, 15]}
    
evualua_modelo_RF(param_numTrees, param_maxDepth)

CASO BASE
------------------------
Features: scaledFeatures
Hiper-parámetros óptimos:
numTrees = 200
maxDepth = 10
ROC-AUC = 0.8074997580402385



#### 5.4 Evalúo Random Forest con fine-tuning de hiper-parámetros 

In [None]:
param_numTrees = {'BASE': [10, 40, 80],
                  'CHI-CUADRADO-05': [10, 40, 80],
                  'CHI-CUADRADO-01':  [10, 40, 80],
                  'PCA-k5': [10, 40, 80],
                  'PCA-k10': [10, 40, 80]}

param_maxDepth = {'BASE': [5, 10, 15],
                 'CHI-CUADRADO-05': [5, 10, 15],
                 'CHI-CUADRADO-01': [5, 10, 15],
                 'PCA-k5': [5, 10, 15],
                 'PCA-k10': [5, 10, 15]}
    
evualua_modelo_RF(param_numTrees, param_maxDepth)

#### 5.5 Resultados

El hiper-parámetro numTrees es el número de árboles que serán creados. Un valor alto puede provocas overfitting, mientras que uno bajo puede dar lungar al underfitting. El valor óptimo encontrado es XXXX.

El hiper-parámetro maxDepth representa la profundidad máxima de cada árbol. También en este caso, un valor muy alto causa overfitting. El valor óptimo encontrado es XXXX.

La mejor eficacia la obtuve con el caso sin selección ni extracción de variables (BASE). Esto tiene sentido, ya que los algoritmos basados en árboles de decisiones tienen integrada la selección de las variables más predictivas.

ROC-AUC = 0.8294

In [103]:
sc.stop()