In [5]:
SANDBOX_NAME = '' # Sandbox Name
DATA_PATH = "/data/sandboxes/"+SANDBOX_NAME+"/data/"



# Ejemplo de Param Grid

Para encontrar los mejores hiperaparámetros para un modelo, se puede definir un conjunto de posibles valores para cada hiperparámetro, y crear un programa que entrene modelos con cada combinación posible de ellos, y almacene el mejor modelo dada una metrica. Además se puede mejorar junto con la técnica de Validación Cruzada.

In [6]:
# Respuesta

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

spark = SparkSession.builder.getOrCreate()



Primer paso, cargar algunos datos de prueba e inspeccionar.

In [7]:
# Respuesta

df = spark.read.csv(DATA_PATH+'data/telecom_customer_churn.csv', sep=',', header=True, inferSchema=True)

df = df.select([F.col(c).alias(c.lower().replace('\. ', '_').replace(' ', '_')) for c in df.columns])

df.printSchema()

root
 |-- rev_mean: double (nullable = true)
 |-- mou_mean: double (nullable = true)
 |-- totmrc_mean: double (nullable = true)
 |-- da_mean: double (nullable = true)
 |-- ovrmou_mean: double (nullable = true)
 |-- ovrrev_mean: double (nullable = true)
 |-- vceovr_mean: double (nullable = true)
 |-- datovr_mean: double (nullable = true)
 |-- roam_mean: double (nullable = true)
 |-- change_mou: double (nullable = true)
 |-- change_rev: double (nullable = true)
 |-- drop_vce_mean: double (nullable = true)
 |-- drop_dat_mean: double (nullable = true)
 |-- blck_vce_mean: double (nullable = true)
 |-- blck_dat_mean: double (nullable = true)
 |-- unan_vce_mean: double (nullable = true)
 |-- unan_dat_mean: double (nullable = true)
 |-- plcd_vce_mean: double (nullable = true)
 |-- plcd_dat_mean: double (nullable = true)
 |-- recv_vce_mean: double (nullable = true)
 |-- recv_sms_mean: double (nullable = true)
 |-- comp_vce_mean: double (nullable = true)
 |-- comp_dat_mean: double (nullable = tr

In [8]:
# Respuesta

df.show(5)

+--------+--------+-----------+-------+-----------+-----------+-----------+-----------+---------+----------+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-----------+-------------+-------------+-------------+-------------+-------------+---------------+---------------+---------------+---------------+-------------+-------------+-------------+-------------+------------+------------+-------------+-------------+-------------+------------+-------------+-------------+-------------+-----+------+--------+--------+--------+--------+--------+--------+-----------+-------+-------+--------+------+------+------+------+-------+-------+-------+-------+-------+-------+----------------+--------------------+--------+----------+-----------+------+------+----------+-----+---+-------+---+--------+-------+------+--------+------+--------+--------+-----



Busquemos valores nulos en todas las columnas y descartemos filas que tengan nulos en ellas. Ya vimos anteriormente cómo trabajar con valores nulos.

In [9]:
# Respuesta

for column in df.columns:
    print("Looking for nulls at " +column)
    num_nulls = df.where(F.col(column).isNull()).count()
    if  num_nulls != 0:
        print("There are null values in the column {}".format(column))
        df = df.where(F.col(column).isNotNull())
        if num_nulls == 0:
            print("The column {} is free from null values".format(column))
    else:
        print("-> None null found.")

Looking for nulls at rev_mean
There are null values in the column rev_mean
Looking for nulls at mou_mean
-> None null found.
Looking for nulls at totmrc_mean
-> None null found.
Looking for nulls at da_mean
-> None null found.
Looking for nulls at ovrmou_mean
-> None null found.
Looking for nulls at ovrrev_mean
-> None null found.
Looking for nulls at vceovr_mean
-> None null found.
Looking for nulls at datovr_mean
-> None null found.
Looking for nulls at roam_mean
-> None null found.
Looking for nulls at change_mou
There are null values in the column change_mou
Looking for nulls at change_rev
-> None null found.
Looking for nulls at drop_vce_mean
-> None null found.
Looking for nulls at drop_dat_mean
-> None null found.
Looking for nulls at blck_vce_mean
-> None null found.
Looking for nulls at blck_dat_mean
-> None null found.
Looking for nulls at unan_vce_mean
-> None null found.
Looking for nulls at unan_dat_mean
-> None null found.
Looking for nulls at plcd_vce_mean
-> None null f



Tras limpiar el dataset de nulos, podemos continuar preparando las variables y entrenando un modelo.
Para mantenerlo sencillo, apuntaremos las columnas binarias para evitar aplicarles onehot.

- Aplicamos el string indexer a todas las columnas tipo string (estamos asumiendo aquí que todas las columnas tipo string son categóricas)
- Aplicamos one hot encoder a todas las variables string no binarias
- Tras el string indexer Y el one hot encoder en las variables no binarias, removemos la columna resultado del string indexer para quedarnos sólo con la salida del one hot encoder. Y le cambiamos el nombre a la salida del one hot encoder a *_encoded*. Así sólo tendremos una variable *_encoded* para cada variable transformada en lugar de tener en el dataset el resultado del string indexer Y el del one hot encoder.

In [10]:
# Respuesta

binary_columns = ['asl_flag', 'refurb_new', 'ownrent',  'dwlltype', 'kid0_2', 'kid3_5',  'kid6_10',  'kid11_15',  'kid16_17', 'creditcd'] 

original_numerical_columns = [item[0] for item in df.dtypes if item[1] in ['double', 'float', 'int'] and 
                             item[0]!='churn']

string_columns = [item[0] for item in df.dtypes if item[1]=='string']    

In [11]:
# Respuesta

# making a copy of our dataset
df_many_steps = df

for col in string_columns:
    print(col)
    string_indexer = StringIndexer(inputCol=col, outputCol=col+"_encoded")
    string_indexer_model = string_indexer.fit(df_many_steps)
    df_many_steps = string_indexer_model.transform(df_many_steps)
    
    if col not in binary_columns:
        onehotencoder = OneHotEncoder(dropLast=False, inputCol= string_indexer.getOutputCol(), outputCol=col+"_encoded_tmp")
        
        df_many_steps = onehotencoder.transform(df_many_steps)
        df_many_steps = df_many_steps.drop(string_indexer.getOutputCol())
        df_many_steps = df_many_steps.withColumnRenamed(onehotencoder.getOutputCol(),string_indexer.getOutputCol())
    

new_cell
crclscod
asl_flag
prizm_social_one
area
dualband
refurb_new
hnd_webcap
ownrent
dwlltype
marital
infobase
hhstatin
dwllsize
ethnic
kid0_2
kid3_5
kid6_10
kid11_15
kid16_17
creditcd


In [12]:
# Respuesta

df_many_steps.show(5)

+--------+--------+-----------+-------+-----------+-----------+-----------+-----------+---------+----------+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-----------+-------------+-------------+-------------+-------------+-------------+---------------+---------------+---------------+---------------+-------------+-------------+-------------+-------------+------------+------------+-------------+-------------+-------------+------------+-------------+-------------+-------------+-----+------+--------+--------+--------+--------+--------+--------+-----------+-------+-------+--------+------+------+-------+------+-------+-------+-------+-------+-------+-------+----------------+--------------------+--------+----------+-----------+------+------+----------+-----+---+-------+---+--------+-------+------+--------+------+--------+--------+----



Listamos columnas de entrenamiento y seleccionamos columna de target

In [13]:
# Respuesta

target_column = "churn"
numeric_columns = [element[0] for element in df_many_steps.dtypes if element[1] != 'string' and element[0]!='customer_id']
columns_for_model = [c for c in numeric_columns if c!=target_column]

In [14]:
# Respuesta

columns_for_model

['rev_mean',
 'mou_mean',
 'totmrc_mean',
 'da_mean',
 'ovrmou_mean',
 'ovrrev_mean',
 'vceovr_mean',
 'datovr_mean',
 'roam_mean',
 'change_mou',
 'change_rev',
 'drop_vce_mean',
 'drop_dat_mean',
 'blck_vce_mean',
 'blck_dat_mean',
 'unan_vce_mean',
 'unan_dat_mean',
 'plcd_vce_mean',
 'plcd_dat_mean',
 'recv_vce_mean',
 'recv_sms_mean',
 'comp_vce_mean',
 'comp_dat_mean',
 'custcare_mean',
 'ccrndmou_mean',
 'cc_mou_mean',
 'inonemin_mean',
 'threeway_mean',
 'mou_cvce_mean',
 'mou_cdat_mean',
 'mou_rvce_mean',
 'owylis_vce_mean',
 'mouowylisv_mean',
 'iwylis_vce_mean',
 'mouiwylisv_mean',
 'peak_vce_mean',
 'peak_dat_mean',
 'mou_peav_mean',
 'mou_pead_mean',
 'opk_vce_mean',
 'opk_dat_mean',
 'mou_opkv_mean',
 'mou_opkd_mean',
 'drop_blk_mean',
 'attempt_mean',
 'complete_mean',
 'callfwdv_mean',
 'callwait_mean',
 'months',
 'uniqsubs',
 'actvsubs',
 'totcalls',
 'totmou',
 'totrev',
 'adjrev',
 'adjmou',
 'adjqty',
 'avgrev',
 'avgmou',
 'avgqty',
 'avg3mou',
 'avg3qty',
 'avg3r



Son demasiadas variables para introducir al modelo. Calculamos la correlación con las numéricas originales (que no son categóricas indexadas)

In [15]:
# Respuesta

df_corrs = df_many_steps.select([F.round(F.corr(F.col('churn'), F.col(c)), 2).alias(c) for c in original_numerical_columns])
    
[(x, df_many_steps.columns[i]) for i, x in enumerate(df_corrs.first()) if x>0.4]

# The result is not conclusive, since any of the features is too correlated with our target to include it

[]



Hacemos un random forest para sacar la importancia de las variables con todas las disponibles:

In [16]:
# Respuesta

vectorassembler = VectorAssembler(inputCols=columns_for_model, outputCol='for_feature_relevance_assembled')
df_many_steps = vectorassembler.transform(df_many_steps)

In [17]:
# Respuesta

from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol='for_feature_relevance_assembled', labelCol='churn')

rf_model = rf.fit(df_many_steps)

In [18]:
# Respuesta

feature_importances = [x for x in zip(columns_for_model, rf_model.featureImportances.toArray().tolist())]
feature_importances.sort(key=lambda x: x[1], reverse=True)
feature_importances

[('months', 0.19714504125473276),
 ('hnd_price', 0.07575455882922477),
 ('eqpdays', 0.05357906172201724),
 ('totcalls', 0.05001250995016435),
 ('mou_mean', 0.0355266145408016),
 ('avg3mou', 0.028839542039690292),
 ('totmrc_mean', 0.021585782702271208),
 ('change_mou', 0.0207012986012553),
 ('mou_cvce_mean', 0.019144052513190256),
 ('ovrmou_mean', 0.01727186043264665),
 ('lor', 0.016386890819140035),
 ('totmou', 0.016170383807543813),
 ('iwylis_vce_mean', 0.015583457504144315),
 ('avg3qty', 0.014135811066364976),
 ('adjrev', 0.014048330975594265),
 ('mou_opkv_mean', 0.013764017725986417),
 ('adjqty', 0.012453512284686535),
 ('mouiwylisv_mean', 0.012433833016508614),
 ('vceovr_mean', 0.01042596190542066),
 ('opk_vce_mean', 0.010249712941547855),
 ('totrev', 0.009485604345871123),
 ('mou_peav_mean', 0.008818422458762585),
 ('avgqty', 0.00859510402168404),
 ('adjmou', 0.008216079726430594),
 ('avg6qty', 0.00792190722928188),
 ('custcare_mean', 0.007814044217889931),
 ('comp_vce_mean', 0.00

In [19]:
# Respuesta

columns_for_model=['eqpdays', 'months', 'change_mou', 'hnd_price', 'adjrev', 'totmrc_mean', 
                   'avg3qty', 'mou_cvce_mean', 'avg3mou', 'mou_mean']



Primera aproximación para crear un modelo: crear obligatorio VectorAssembler, dividir los datos en train y test y entrenar la primera versión con hipermarámetros por defecto. Seleccionaremos las 10 variables más importantes. Después evaluaremos nuestro modelo (usaremos accuracy)

In [20]:
# Respuesta

vector_assembler = VectorAssembler(inputCols=columns_for_model, outputCol='assembled_features')
df_many_steps = vector_assembler.transform(df_many_steps)

df_many_steps_train, df_many_steps_test = df_many_steps.randomSplit([0.8,0.2])

rf = RandomForestClassifier(featuresCol=vector_assembler.getOutputCol(), labelCol=target_column)
rf_model = rf.fit(df_many_steps_train)
df_many_steps_prediction = rf_model.transform(df_many_steps_test)

In [21]:
# Respuesta

evaluator = MulticlassClassificationEvaluator(predictionCol=rf.getPredictionCol(),
                                              labelCol=rf.getLabelCol(),
                                             metricName="accuracy")

In [22]:
# Respuesta

evaluator.evaluate(df_many_steps_prediction)

0.6073427724728351



Podemos ver el número de árboles de nuestro random forest usando el atributo getNumTrees

In [23]:
# Respuesta

rf_model.getNumTrees

20



Pero, ¿podemos afirmar que este modelo fue entrenado con los mejores hiperparámetros? Se podrían listar muchas combinaciones de ellos, lo que significa mucho trabajo manual. Pero no hay que preocuparse, para manejar esta situación Spark incorpora **ParamGrid**. 

Sólo hay que definir los valores de cada hiperparámetro que queremos probar en ParamGridBuilder y utilizar el objeto resultante en el parámetro `estimatorParamMaps` cuando se define el CrossValidator.

En este caso, probaremos todas las combinaciones de:

- Number of trees: [10,50,100,200]
- Max depth: [3, 5, 7]

In [24]:
# Respuesta

grid = ParamGridBuilder().addGrid(rf.numTrees, [10,50,100,200]) \
                                .addGrid(rf.maxDepth, [3,5,7]) \
                                .addGrid(rf.impurity, ['gini', 'entropy']) \
                                .build()
rf_cv = CrossValidator(estimator=rf, 
                       estimatorParamMaps=grid, 
                       evaluator=evaluator, 
                       numFolds=3)
rf_cv_model = rf_cv.fit(df_many_steps_train)

# In the attribute bestModel we have the best model after trying all the possible combinations of 
# hyperparameter values in a random forest, using accuracy as our metric and doing cross validation with 3 folds
bestModel = rf_cv_model.bestModel

In [25]:
# Respuesta

bestModel.getNumTrees

10

In [26]:
# Respuesta

evaluator.evaluate(df_many_steps_prediction)

0.6073427724728351



# Ejercicio 1

Dado el siguiente DataFrame:

In [27]:
# Respuesta

data = spark.read.csv(DATA_PATH+'data/creditcard.csv', sep=',', header=True, inferSchema=True)

data.printSchema()

root
 |-- Time: decimal(10,0) (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double



1) Calcula las dimensiones del Data Frame

In [28]:
# Respuesta

N = data.count()
D = len(data.columns)
print(N)
print(D)

284807
31




2) Determina las clases de la base de datos, y su proporción

In [29]:
# Respuesta

data.groupBy('Class').count()\
    .withColumn('Proportion',F.round(100*F.col('count')/N,2)).show()

+-----+------+----------+
|Class| count|Proportion|
+-----+------+----------+
|    1|   492|      0.17|
|    0|284315|     99.83|
+-----+------+----------+



In [30]:
# Respuesta

one_df = data.filter(F.col('Class')==1)
zeros_df = data.filter(F.col('Class')==0).limit(one_df.count())

data = one_df.union(zeros_df)
data.groupBy('Class').count().show()

+-----+-----+
|Class|count|
+-----+-----+
|    1|  492|
|    0|  492|
+-----+-----+





3) Calcula el numero de valores no nulos, media, desviacion estandar, minimo y maximo de cada variable.

In [31]:
# Respuesta

data.describe().toPandas()

Unnamed: 0,summary,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,count,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,...,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0
1,mean,40460.3913,-2.4674030372100706,1.905303596823136,-3.083884202829431,2.456780057740528,-1.5617259373325365,-0.5725839910410212,-2.730903338343168,0.2610818513880641,...,0.3548982757919289,-0.0444814921140579,-0.0365289425895097,-0.0473804301134352,0.0875705455321788,0.0261204601057549,0.0961816565001866,0.0278653037584263,96.22459349593504,0.5
2,stddev,52607.97621895258,5.407122314226479,3.5961094277406067,6.4359049253853895,3.0427216170397475,4.202691637741724,1.80365716680006,5.863241960076915,4.850081053008372,...,2.787267047849961,1.145079823805901,1.1489601018179971,0.5866834793500018,0.6404192414977026,0.468299112195734,1.0037324673667465,0.4429545316584083,240.1423970706583,0.5002542588519275
3,min,0.0,-30.552380043581,-12.1142127363483,-31.1036848245812,-4.51582435488105,-22.105531524316,-6.40626663445964,-43.5572415712451,-41.0442609210741,...,-22.7976039055519,-8.88701714094871,-19.2543276173719,-2.02802422921896,-4.78160552206407,-1.24392415371264,-7.26348214633855,-2.73388711897575,0.0,0.0
4,max,170348.0,2.13238602134104,22.0577289904909,3.77285685226266,12.1146718424589,11.0950886001596,6.47411462748849,5.80253735302589,20.0072083651213,...,27.2028391573154,8.36198519168435,5.46622995370963,1.21527882183022,2.20820917836653,3.06557569653728,3.05235768679424,1.77936385243205,3828.04,1.0




4) Convierte la columna "Class" a double

In [32]:
# Respuesta

data = data.withColumn('Class',F.col('Class').astype('double'))



5) Divide el dataset en train y test, y comprueba si se mantiene la proporcion de las clases.

In [33]:
# Respuesta

data_train, data_test = data.randomSplit([0.7,0.3],seed=0)
print("train :")
data_train.groupBy('Class').count()\
    .withColumn('Proportion',F.round(100*F.col('count')/data_train.count(),2)).show()
print("test : ")
data_test.groupBy('Class').count()\
    .withColumn('Proportion',F.round(100*F.col('count')/data_test.count(),2)).show()

train :
+-----+-----+----------+
|Class|count|Proportion|
+-----+-----+----------+
|  0.0|  349|      49.5|
|  1.0|  356|      50.5|
+-----+-----+----------+

test : 
+-----+-----+----------+
|Class|count|Proportion|
+-----+-----+----------+
|  0.0|  143|     51.25|
|  1.0|  136|     48.75|
+-----+-----+----------+





6) Determina las columnas a utilizar como entradas del modelo

In [35]:
# Respuesta

vectorassembler = VectorAssembler(inputCols=[x for x in data.columns if x!='Class'], outputCol='for_feature_relevance_assembled')
data = vectorassembler.transform(data)

from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol='for_feature_relevance_assembled', labelCol='Class')

rf_model = rf.fit(data)

In [36]:
# Respuesta

feature_importances = [x for x in zip(data.columns, rf_model.featureImportances.toArray().tolist())]
feature_importances.sort(key=lambda x: x[1], reverse=True)
feature_importances

[('Time', 0.46020842119743854),
 ('V12', 0.19002351140284945),
 ('V3', 0.05985659536567158),
 ('V11', 0.04851105989642065),
 ('V14', 0.0438213545008186),
 ('V4', 0.04261377093342171),
 ('V1', 0.03320995766363637),
 ('V2', 0.029447540044507642),
 ('V17', 0.020695199098059447),
 ('V21', 0.017221337935920657),
 ('V10', 0.01660435391236873),
 ('V7', 0.006497692380353545),
 ('V6', 0.006468735859061245),
 ('V16', 0.0048992331899589585),
 ('Amount', 0.0035837509838148617),
 ('V28', 0.0028329948700722037),
 ('V22', 0.0021869523270229368),
 ('V20', 0.002068634488723442),
 ('V25', 0.0019551124438291372),
 ('V19', 0.0013965528077369252),
 ('V26', 0.0013708544251635028),
 ('V9', 0.0013564620961224823),
 ('V23', 0.0012712421651141011),
 ('V8', 0.000892459791711719),
 ('V27', 0.0004128378044335263),
 ('V13', 0.00036063804930590814),
 ('V24', 0.00014317423660036173),
 ('V5', 8.957012986170879e-05),
 ('V15', 0.0),
 ('V18', 0.0)]

In [41]:
# Respuesta

features = ['Time', 'V14', 'V12', 'V17', 'V11', 'V3', 'V10', 'V16', 'V2', 'V21', 'V4', 'Amount', 'V20', 'V6']



7) Entrena una regresión logistica usando Pipeline y evalua el modelo resultante calculando la AUC, f1, la tasa de acierto y la matriz de confusión  para train y test.

In [39]:
# Respuesta

from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline

assembler = VectorAssembler(inputCols=features,outputCol='features')
standard = StandardScaler(inputCol='features',outputCol='standar') 
lr = LogisticRegression(featuresCol='standar',labelCol='Class')

pipeline_lr = Pipeline(stages=[assembler, standard, lr])

model_lr = pipeline_lr.fit(data_train)

In [42]:
# Respuesta

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


auc = BinaryClassificationEvaluator(labelCol='Class')
acc = MulticlassClassificationEvaluator(labelCol='Class',metricName="accuracy")
f1  = MulticlassClassificationEvaluator(labelCol='Class',metricName="f1")

data_train_LR = model_lr.transform(data_train)
data_test_LR = model_lr.transform(data_test)

print("train auc : ", auc.evaluate(data_train_LR))
print("test auc  : ", auc.evaluate(data_test_LR))
print("#####################################")
print("train f1  : " , f1.evaluate(data_train_LR))
print("test f1   : " , f1.evaluate(data_test_LR))
print("#####################################")
print("train acc : " , 100 * acc.evaluate(data_train_LR))
print("test acc  : " , 100 * acc.evaluate(data_test_LR))

print("train Confusion matrix :")
data_train_LR.groupBy('Class').pivot('prediction').count().show()
print("test Confusion matrix  :")
data_test_LR.groupBy('Class').pivot('prediction').count().show()

train auc :  1.0
test auc  :  0.9976347182229536
#####################################
train f1  :  1.0
test f1   :  0.9928324624249172
#####################################
train acc :  100.0
test acc  :  99.2831541218638
train Confusion matrix :
+-----+----+----+
|Class| 0.0| 1.0|
+-----+----+----+
|  0.0| 349|null|
|  1.0|null| 356|
+-----+----+----+

test Confusion matrix  :
+-----+----+---+
|Class| 0.0|1.0|
+-----+----+---+
|  0.0| 141|  2|
|  1.0|null|136|
+-----+----+---+





8) Entrena un RandomForestClassifier usando Pipeline y evalua el modelo resultante calculando la AUC, la tasa de acierto y la matriz de confusión  para train y test.

In [43]:
# Respuesta

rf = RandomForestClassifier(featuresCol=assembler.getOutputCol(),labelCol='Class', maxDepth=2, numTrees=5)

pipeline_rf = Pipeline(stages=[assembler, rf])
model_rf = pipeline_rf.fit(data_train)

In [44]:
# Respuesta

data_tr02 = model_rf.transform(data_train)
data_ts02 = model_rf.transform(data_test)

print("train auc : ", auc.evaluate(data_tr02))
print("test auc  : ", auc.evaluate(data_ts02))
print("#####################################")
print("train f1  : " , f1.evaluate(data_tr02))
print("test f1   : " , f1.evaluate(data_ts02))
print("#####################################")
print("train acc : " , 100*acc.evaluate(data_tr02))
print("test acc  : " , 100*acc.evaluate(data_ts02))

print("train Confusion matrix :")
data_tr02.groupBy('Class').pivot('prediction').count().show()
print("test Confusion matrix  :")
data_ts02.groupBy('Class').pivot('prediction').count().show()

train auc :  0.999903415859116
test auc  :  0.9998971616618676
#####################################
train f1  :  0.997163177647445
test f1   :  0.9928298818531793
#####################################
train acc :  99.71631205673759
test acc  :  99.2831541218638
train Confusion matrix :
+-----+---+----+
|Class|0.0| 1.0|
+-----+---+----+
|  0.0|349|null|
|  1.0|  2| 354|
+-----+---+----+

test Confusion matrix  :
+-----+---+----+
|Class|0.0| 1.0|
+-----+---+----+
|  0.0|143|null|
|  1.0|  2| 134|
+-----+---+----+





9) Utiliza una CV 5-fold con grid search para determinar los mejores parametros del RandomForestClassifier utilizando como metrica la f1. Prueba los parametros _maxDepth_ y _numTrees_.

In [45]:
# Respuesta

paramGrid = ParamGridBuilder().addGrid(rf.maxDepth,[2,3])\
                              .addGrid(rf.numTrees,[2,3])\
                              .build()

crossval_rf = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid,
                          numFolds=5,evaluator = f1,seed = 0) 

model_cv_rf = crossval_rf.fit(data_train)

In [46]:
# Respuesta

data_tr03 = model_cv_rf.transform(data_train)
data_ts03 = model_cv_rf.transform(data_test)

print("train auc : ", auc.evaluate(data_tr03))
print("test auc  : ", auc.evaluate(data_ts03))
print("#####################################")
print("train f1  : " , f1.evaluate(data_tr03))
print("test f1   : " , f1.evaluate(data_ts03))
print("#####################################")
print("train acc : " , 100*acc.evaluate(data_tr03))
print("test acc  : " , 100*acc.evaluate(data_ts03))

print("train Confusion matrix :")
data_tr03.groupBy(target).pivot('prediction').count().show()
print("test Confusion matrix  :")
data_ts03.groupBy(target).pivot('prediction').count().show()

train auc :  0.9999839026431859
test auc  :  0.9964006581653639
#####################################
train f1  :  0.9971631205673759
test f1   :  0.992831541218638
#####################################
train acc :  99.71631205673759
test acc  :  99.2831541218638
train Confusion matrix :
+-----+---+---+
|Class|0.0|1.0|
+-----+---+---+
|  0.0|348|  1|
|  1.0|  1|355|
+-----+---+---+

test Confusion matrix  :
+-----+---+---+
|Class|0.0|1.0|
+-----+---+---+
|  0.0|142|  1|
|  1.0|  1|135|
+-----+---+---+





10) Cuales son los valores de los parametros usados en el grid search

In [47]:
# Respuesta

best_rf_Model = model_cv_rf.bestModel.stages[-1]
print("Max Depth        : ", best_rf_Model._java_obj.getMaxDepth())
print("Num Trees        : ", best_rf_Model._java_obj.getNumTrees())
print("Subsampling Rate : ", best_rf_Model._java_obj.getSubsamplingRate())

Max Depth        :  2
Num Trees        :  2
Subsampling Rate :  1.0
