### Criar um modelo para classificar clientes do Banco União, a partir das variáveis de entrada, classificando-os como possíveis bons ou maus pagadores, a fim de uma aprovação ou não para pedidos de empréstimo.

In [1]:
# Findspark
import findspark
findspark.init()

In [2]:
# Imports
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#### Carga dos Dados

In [3]:
# Spark Context
sc = SparkContext(appName = "EmprestimoBancario")
sc.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/30 14:31:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Spark Session
spSession = SparkSession.builder.master("local").getOrCreate()

In [5]:
# Carga dos dados em RDD
bankRDD = sc.textFile("dados/dataset3.csv")
bankRDD.cache()

dados/dataset3.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [6]:
bankRDD.count()

                                                                                

542

In [7]:
bankRDD.take(5)

['"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 '30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"',
 '35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"yes"',
 '30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"yes"']

In [8]:
# Remove cabeçalho
firstline = bankRDD.first()
bankRDD2 = bankRDD.filter(lambda x: x != firstline)
bankRDD2.count()

541

#### Limpeza e Transformação

In [9]:
# Encoding
def transformToNumeric(inputStr) :
    
    #substituição do enter e split
    attList = inputStr.replace("\"","").split(";")
    
    # Int. para Float
    age = float(attList[0])
    balance = float(attList[5])
    
    # One-Hot Enconding com var. dummy
    single = 1.0 if attList[2] == 'single' else 0.0
    married = 1.0 if attList[2] == 'married' else 0.0
    divorced = 1.0 if attList[2] == 'divorced' else 0.0
    primary = 1.0 if attList[3] == 'primary' else 0.0
    secondary = 1.0 if attList[3] == 'secondary' else 0.0
    tertiary = 1.0 if attList[3] == 'tertiary' else 0.0
    
    # Label Encoding
    default = 0.0 if attList[4] == "no" else 1.0
    loan = 0.0 if attList[7] == "no" else 1.0
    outcome = 0.0 if attList[16] == "no" else 1.0
    
    # Linhas com os att. transformados
    linhas = Row(OUTCOME = outcome, 
                 AGE = age, 
                 SINGLE = single, 
                 MARRIED = married, 
                 DIVORCED = divorced,
                 PRIMARY = primary, 
                 SECONDARY = secondary, 
                 TERTIARY = tertiary, 
                 DEFAULT = default, 
                 BALANCE = balance,
                 LOAN = loan)
    return linhas

In [10]:
# Aplicando a função aos dados
bankRDD3 = bankRDD2.map(transformToNumeric)

In [11]:
bankRDD3.collect()[:5]

[Row(OUTCOME=0.0, AGE=30.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=1.0, SECONDARY=0.0, TERTIARY=0.0, DEFAULT=0.0, BALANCE=1787.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=33.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=1.0, TERTIARY=0.0, DEFAULT=0.0, BALANCE=4789.0, LOAN=1.0),
 Row(OUTCOME=1.0, AGE=35.0, SINGLE=1.0, MARRIED=0.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=0.0, TERTIARY=1.0, DEFAULT=0.0, BALANCE=1350.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=30.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=0.0, TERTIARY=1.0, DEFAULT=0.0, BALANCE=1476.0, LOAN=1.0),
 Row(OUTCOME=0.0, AGE=59.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=1.0, TERTIARY=0.0, DEFAULT=0.0, BALANCE=0.0, LOAN=0.0)]

#### Análise Exploratória

In [12]:
# Transforma em DF
bankDF = spSession.createDataFrame(bankRDD3)

In [13]:
# Correlação da variável OUTCOME as demais
for i in bankDF.columns:
    if not ( isinstance(bankDF.select(i).take(1)[0][0], str)) :
        print("Correlação da variável OUTCOME com:", i, bankDF.stat.corr('OUTCOME',i))

Correlação da variável OUTCOME com: OUTCOME 1.0
Correlação da variável OUTCOME com: AGE -0.18232104327365253
Correlação da variável OUTCOME com: SINGLE 0.46323284934360515
Correlação da variável OUTCOME com: MARRIED -0.3753241299133561
Correlação da variável OUTCOME com: DIVORCED -0.07812659940926987
Correlação da variável OUTCOME com: PRIMARY -0.12561548832677985
Correlação da variável OUTCOME com: SECONDARY 0.026392774894072976
Correlação da variável OUTCOME com: TERTIARY 0.08494840766635618
Correlação da variável OUTCOME com: DEFAULT -0.04536965206737378
Correlação da variável OUTCOME com: BALANCE 0.03657486611997681
Correlação da variável OUTCOME com: LOAN -0.03042058611271732


#### Pré-Processamento dos Dados

In [14]:
# Criando LabelPoint
def transformaVar(row):
    obj = (row["OUTCOME"], Vectors.dense([row["AGE"], 
                                          row["BALANCE"], 
                                          row["DEFAULT"], 
                                          row["DIVORCED"], 
                                          row["LOAN"], 
                                          row["MARRIED"], 
                                          row["PRIMARY"], 
                                          row["SECONDARY"], 
                                          row["SINGLE"], 
                                          row["TERTIARY"]]))
    return obj

In [15]:
# Aplica função
bankRDD4 = bankDF.rdd.map(transformaVar)

In [16]:
bankRDD4.collect()

                                                                                

[(0.0, DenseVector([30.0, 1787.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0])),
 (1.0, DenseVector([33.0, 4789.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (1.0, DenseVector([35.0, 1350.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0])),
 (1.0, DenseVector([30.0, 1476.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (0.0, DenseVector([59.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (1.0, DenseVector([35.0, 747.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0])),
 (1.0, DenseVector([36.0, 307.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (0.0, DenseVector([39.0, 147.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (0.0, DenseVector([41.0, 221.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (1.0, DenseVector([43.0, -88.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0])),
 (0.0, DenseVector([39.0, 9374.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (0.0, DenseVector([43.0, 264.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (0.0, DenseVector([36.0, 1109.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (1.0, D

In [17]:
# Converte RDD em DF
bankDF = spSession.createDataFrame(bankRDD4, ["label", "features"])

[Stage 51:>                                                         (0 + 1) / 1]                                                                                

In [18]:
bankDF.select('features', 'label').show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[30.0,1787.0,0.0,...|  0.0|
|[33.0,4789.0,0.0,...|  1.0|
|[35.0,1350.0,0.0,...|  1.0|
|[30.0,1476.0,0.0,...|  1.0|
|[59.0,0.0,0.0,0.0...|  0.0|
+--------------------+-----+
only showing top 5 rows



#### Redução de dimensionalidade 

In [19]:
# Cria PCA de 3 componentes
bankPCA = PCA(k = 3, inputCol = "features", outputCol = "pcaFeatures")

In [20]:
#Treina
pcaModel = bankPCA.fit(bankDF)

                                                                                

In [21]:
# Redução
pcaResult = pcaModel.transform(bankDF).select("label", "pcaFeatures")

In [22]:
pcaResult.show(5, truncate = False)

+-----+------------------------------------------------------------+
|label|pcaFeatures                                                 |
+-----+------------------------------------------------------------+
|0.0  |[-1787.018897197381,28.86209683775529,-0.06459982604876241] |
|1.0  |[-4789.020177138492,29.922562636341947,-0.9830243513096373] |
|1.0  |[-1350.022213163262,34.10110809796688,0.8951427168301704]   |
|1.0  |[-1476.0189517184556,29.051333993596703,0.3952723868021948] |
|0.0  |[-0.037889185366442445,58.9897182000177,-0.7290792383661886]|
+-----+------------------------------------------------------------+
only showing top 5 rows



In [23]:
# Indexação do label para Decision Trees
stringIndexer = StringIndexer(inputCol = "label", outputCol = "label_indexed")
si_model = stringIndexer.fit(pcaResult)
obj_final = si_model.transform(pcaResult)
obj_final.collect()

                                                                                

[Row(label=0.0, pcaFeatures=DenseVector([-1787.0189, 28.8621, -0.0646]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-4789.0202, 29.9226, -0.983]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1350.0222, 34.1011, 0.8951]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1476.019, 29.0513, 0.3953]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-0.0379, 58.9897, -0.7291]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-747.0223, 34.4883, 0.9046]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-307.0231, 35.7999, 0.5171]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-147.025, 38.9011, -0.807]), label_indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-221.0263, 40.8536, 0.5373]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([87.9724, 43.0627, -0.067]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-9374.0231, 32.9765, -0.9511]), label_indexed=0.0

#### Machine Learning

In [24]:
# Split em treino e teste (75% para treino e 25% para teste)
(dados_treino, dados_teste) = obj_final.randomSplit([0.75, 0.25])

In [25]:
dados_treino.count()

414

In [26]:
dados_teste.count()

127

In [27]:
# Cria objeto
rfClassifer = RandomForestClassifier(labelCol = "label_indexed", featuresCol = "pcaFeatures")

In [28]:
# Treina objeto e cria modelo
modelo = rfClassifer.fit(dados_treino)

In [29]:
# Previsões com teste
predictions = modelo.transform(dados_teste)

In [30]:
predictions

DataFrame[label: double, pcaFeatures: vector, label_indexed: double, rawPrediction: vector, probability: vector, prediction: double]

In [31]:
predictions.select("label", "label_indexed", "pcaFeatures", "prediction").collect()

[Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-8104.0336, 49.7873, -0.8708]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-7190.0255, 37.3733, 0.7344]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-7082.0351, 52.4544, -0.0453]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-5996.0302, 45.1426, -0.8606]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-5426.0252, 37.5115, 0.456]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-4073.0351, 53.3753, -0.8041]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-3762.0275, 41.5791, 0.4933]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-1831.0215, 32.8212, -0.8522]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-1787.0189, 28.8621, -0.0646]), prediction=0.0),
 Row(label=0.0, label_i

In [32]:
# Avaliando acurácia
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", 
                                              labelCol = "label_indexed", 
                                              metricName = "accuracy")

In [33]:
evaluator.evaluate(predictions)

0.6299212598425197

In [34]:
# Confusion Matrix
predictions.groupBy("label_indexed", "prediction").count().show()

+-------------+----------+-----+
|label_indexed|prediction|count|
+-------------+----------+-----+
|          1.0|       1.0|   18|
|          0.0|       1.0|   17|
|          1.0|       0.0|   30|
|          0.0|       0.0|   62|
+-------------+----------+-----+



### Modelo 2 - PCA k = 4, e utilizando somente Label Encoding

In [35]:
# Dicionário para o encoding das variáveis.
marital_status_mapping = {"single": 0, "married": 1, "divorced": 2}
education_mapping = {"primary": 0, "secondary": 1, "tertiary": 2}

In [36]:
# Função para transformação dos dados
def transformToNumeric2(inputStr) :
    
    #substituição do enter e split
    attList = inputStr.replace("\"","").split(";")
    
    # Inteiro para Float
    age = float(attList[0])
    balance = float(attList[5])
    

    # Label Encoding
    marital_status = marital_status_mapping.get(attList[2], -1)
    default = 0.0 if attList[4] == "no" else 1.0
    education = education_mapping.get(attList[3], -1) 
    loan = 1.0 if attList[7] == "yes" else 0.0
    outcome = 1.0 if attList[16] == "yes" else 0.0

    # Linhas com os att. transformados
    linhas = Row(
        OUTCOME=outcome,
        AGE=age,
        MARITAL_STATUS=marital_status,
        EDUCATION=education,
        DEFAULT=default,
        BALANCE=balance,
        LOAN=loan
    )
    return linhas

In [37]:
# Aplicando a função aos dados
bank2RDD = bankRDD2.map(transformToNumeric2)

In [38]:
bank2RDD.collect()[:10]

[Row(OUTCOME=0.0, AGE=30.0, MARITAL_STATUS=1, EDUCATION=0, DEFAULT=0.0, BALANCE=1787.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=33.0, MARITAL_STATUS=1, EDUCATION=1, DEFAULT=0.0, BALANCE=4789.0, LOAN=1.0),
 Row(OUTCOME=1.0, AGE=35.0, MARITAL_STATUS=0, EDUCATION=2, DEFAULT=0.0, BALANCE=1350.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=30.0, MARITAL_STATUS=1, EDUCATION=2, DEFAULT=0.0, BALANCE=1476.0, LOAN=1.0),
 Row(OUTCOME=0.0, AGE=59.0, MARITAL_STATUS=1, EDUCATION=1, DEFAULT=0.0, BALANCE=0.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=35.0, MARITAL_STATUS=0, EDUCATION=2, DEFAULT=0.0, BALANCE=747.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=36.0, MARITAL_STATUS=1, EDUCATION=2, DEFAULT=0.0, BALANCE=307.0, LOAN=0.0),
 Row(OUTCOME=0.0, AGE=39.0, MARITAL_STATUS=1, EDUCATION=1, DEFAULT=0.0, BALANCE=147.0, LOAN=0.0),
 Row(OUTCOME=0.0, AGE=41.0, MARITAL_STATUS=1, EDUCATION=2, DEFAULT=0.0, BALANCE=221.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=43.0, MARITAL_STATUS=1, EDUCATION=0, DEFAULT=0.0, BALANCE=-88.0, LOAN=1.0)]

In [39]:
# Transforma em DF
bank2DF = spSession.createDataFrame(bank2RDD)

#### Pré-Processamento dos Dados

In [40]:
# Criando LabelPoint
def transformaVar2(row):
    obj = (row["OUTCOME"], Vectors.dense([row["AGE"], 
                                          row["BALANCE"], 
                                          row["DEFAULT"], 
                                          row["LOAN"], 
                                          row["MARITAL_STATUS"], 
                                          row["EDUCATION"]]))
    return obj

In [41]:
# Aplica função transformaVar
bank2RDD2 = bank2DF.rdd.map(transformaVar2)

In [42]:
bank2RDD2.collect()

                                                                                

[(0.0, DenseVector([30.0, 1787.0, 0.0, 0.0, 1.0, 0.0])),
 (1.0, DenseVector([33.0, 4789.0, 0.0, 1.0, 1.0, 1.0])),
 (1.0, DenseVector([35.0, 1350.0, 0.0, 0.0, 0.0, 2.0])),
 (1.0, DenseVector([30.0, 1476.0, 0.0, 1.0, 1.0, 2.0])),
 (0.0, DenseVector([59.0, 0.0, 0.0, 0.0, 1.0, 1.0])),
 (1.0, DenseVector([35.0, 747.0, 0.0, 0.0, 0.0, 2.0])),
 (1.0, DenseVector([36.0, 307.0, 0.0, 0.0, 1.0, 2.0])),
 (0.0, DenseVector([39.0, 147.0, 0.0, 0.0, 1.0, 1.0])),
 (0.0, DenseVector([41.0, 221.0, 0.0, 0.0, 1.0, 2.0])),
 (1.0, DenseVector([43.0, -88.0, 0.0, 1.0, 1.0, 0.0])),
 (0.0, DenseVector([39.0, 9374.0, 0.0, 0.0, 1.0, 1.0])),
 (0.0, DenseVector([43.0, 264.0, 0.0, 0.0, 1.0, 1.0])),
 (0.0, DenseVector([36.0, 1109.0, 0.0, 0.0, 1.0, 2.0])),
 (1.0, DenseVector([20.0, 502.0, 0.0, 0.0, 0.0, 1.0])),
 (1.0, DenseVector([31.0, 360.0, 0.0, 1.0, 1.0, 1.0])),
 (0.0, DenseVector([40.0, 194.0, 0.0, 1.0, 1.0, 2.0])),
 (0.0, DenseVector([56.0, 4073.0, 0.0, 0.0, 1.0, 1.0])),
 (1.0, DenseVector([37.0, 2317.0, 0.0, 0.0,

In [43]:
# Converte RDD em DF
bank2DF = spSession.createDataFrame(bank2RDD2, ["label", "features"])

In [44]:
bank2DF.select('features', 'label').show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[30.0,1787.0,0.0,...|  0.0|
|[33.0,4789.0,0.0,...|  1.0|
|[35.0,1350.0,0.0,...|  1.0|
+--------------------+-----+
only showing top 3 rows



#### Redução de dimensionalidade 

In [45]:
# Cria PCA de 4 componentes
bank2PCA = PCA(k = 4, inputCol = "features", outputCol = "pcaFeatures")

In [46]:
#Treina
pcaModel2 = bank2PCA.fit(bank2DF)

                                                                                

In [47]:
# Redução
pcaResult2 = pcaModel2.transform(bank2DF).select("label", "pcaFeatures")

In [48]:
pcaResult2.show(5, truncate = False)

+-----+-------------------------------------------------------------------------------+
|label|pcaFeatures                                                                    |
+-----+-------------------------------------------------------------------------------+
|0.0  |[-1787.0189013456968,28.86465735936177,0.42720870292758745,0.3016100110286005] |
|1.0  |[-4789.020200453407,29.921189981776095,1.368325854165402,0.19434042149159386]  |
|1.0  |[-1350.0222313116572,34.087439977346605,2.521215563988013,-0.8139875298027369] |
|1.0  |[-1476.0189724478373,29.033089082983494,2.4036636564627702,0.2256740119710345] |
|0.0  |[-0.03791261698648324,58.98382682694944,1.9329018663001152,-0.4130336855783044]|
+-----+-------------------------------------------------------------------------------+
only showing top 5 rows



In [49]:
# Indexação do label para Decision Trees
stringIndexer2 = StringIndexer(inputCol = "label", outputCol = "label_indexed")
si_model2 = stringIndexer2.fit(pcaResult2)
obj_final2 = si_model2.transform(pcaResult2)
obj_final2.collect()

                                                                                

[Row(label=0.0, pcaFeatures=DenseVector([-1787.0189, 28.8647, 0.4272, 0.3016]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-4789.0202, 29.9212, 1.3683, 0.1943]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1350.0222, 34.0874, 2.5212, -0.814]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1476.019, 29.0331, 2.4037, 0.2257]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-0.0379, 58.9838, 1.9329, -0.413]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-747.0224, 34.4745, 2.5364, -0.8219]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-307.0231, 35.7805, 2.5587, 0.1446]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-147.025, 38.8978, 1.6111, 0.0677]), label_indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-221.0263, 40.8337, 2.6404, 0.0238]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([87.9724, 43.0631, 0.6512, -0.1122]), label_indexed=1.0),
 Row(label

#### Machine Learning

In [50]:
# Split em treino e teste
(dados_treino, dados_teste) = obj_final2.randomSplit([0.75, 0.25])

In [51]:
dados_treino.count()

388

In [52]:
dados_teste.count()

153

In [53]:
# Cria objeto
rfClassifer = RandomForestClassifier(labelCol = "label_indexed", featuresCol = "pcaFeatures")

In [54]:
# Treina objeto e criar modelo
modelo2 = rfClassifer.fit(dados_treino)

In [55]:
# Previsões com teste
predictions2 = modelo2.transform(dados_teste)

In [56]:
predictions2

DataFrame[label: double, pcaFeatures: vector, label_indexed: double, rawPrediction: vector, probability: vector, prediction: double]

In [57]:
predictions2.select("label", "label_indexed", "pcaFeatures", "prediction").collect()

[Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-16873.0325, 45.1552, 1.4589, -0.1195]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-14093.0337, 47.9391, 1.5451, -0.1799]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-11494.0343, 49.6073, 1.6107, -0.2141]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-9009.0251, 36.1927, 2.4343, 0.1154]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-7082.0352, 52.4549, 0.7228, -0.2751]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-4073.0351, 53.3708, 1.7823, -0.2877]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-4030.0229, 34.4063, 1.4812, 0.1666]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-3571.025, 37.6839, 2.5399, 0.0918]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-3096

In [58]:
# Avaliando acurácia
evaluator.evaluate(predictions2)

0.7581699346405228

In [59]:
# Confusion Matrix
predictions2.groupBy("label_indexed", "prediction").count().show()

+-------------+----------+-----+
|label_indexed|prediction|count|
+-------------+----------+-----+
|          1.0|       1.0|   29|
|          0.0|       1.0|   14|
|          1.0|       0.0|   23|
|          0.0|       0.0|   87|
+-------------+----------+-----+

