# ランダムフォレストによる分類予測
- bank-fullのy列（定期預金を申し込んだか否か）について分類予測する
- 特徴量は簡単のため、数値の列と文字列の"default"列のみ使用する
- パイプラインは使わないでやってみる

In [16]:
import pandas as pd
import numpy as np

In [17]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("random_forest").getOrCreate()

In [18]:
filename = "./data/bank/bank-full.csv"
data = spark.read.csv(filename, header=True, inferSchema=True, sep=";")
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [19]:
# string -> index
from pyspark.ml.feature import StringIndexer
default_index = StringIndexer(inputCol="default", outputCol="default_index")
data1 = default_index.fit(data).transform(data)

In [20]:
data1.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-------------+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|default_index|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-------------+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|          0.0|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|          0.0|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|          0.0|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|

In [21]:
#目的変数
from pyspark.sql.functions import lit, when, col
data2 = data1.withColumn("y1", when(col("y")=="yes", lit(1.0)).otherwise(lit(0.0)))
data2.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-------------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|default_index| y1|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+-------------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|          0.0|0.0|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|          0.0|0.0|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|          0.0|0.0|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -

In [22]:
use_df = data2.select(["age","balance","day","duration","campaign","pdays","previous","default_index", "y1"])

In [23]:
target = "y1"
features = use_df.columns

In [24]:
features.remove(target)

In [25]:
#データ作成（assembler）
from pyspark.ml.feature import VectorAssembler
assemble = VectorAssembler(inputCols=features, outputCol="features")
df = assemble.transform(use_df)

In [26]:
df.show()

+---+-------+---+--------+--------+-----+--------+-------------+---+--------------------+
|age|balance|day|duration|campaign|pdays|previous|default_index| y1|            features|
+---+-------+---+--------+--------+-----+--------+-------------+---+--------------------+
| 58|   2143|  5|     261|       1|   -1|       0|          0.0|0.0|[58.0,2143.0,5.0,...|
| 44|     29|  5|     151|       1|   -1|       0|          0.0|0.0|[44.0,29.0,5.0,15...|
| 33|      2|  5|      76|       1|   -1|       0|          0.0|0.0|[33.0,2.0,5.0,76....|
| 47|   1506|  5|      92|       1|   -1|       0|          0.0|0.0|[47.0,1506.0,5.0,...|
| 33|      1|  5|     198|       1|   -1|       0|          0.0|0.0|[33.0,1.0,5.0,198...|
| 35|    231|  5|     139|       1|   -1|       0|          0.0|0.0|[35.0,231.0,5.0,1...|
| 28|    447|  5|     217|       1|   -1|       0|          0.0|0.0|[28.0,447.0,5.0,2...|
| 42|      2|  5|     380|       1|   -1|       0|          1.0|0.0|[42.0,2.0,5.0,380...|
| 58|    1

In [27]:
train_df, test_df = df.randomSplit([0.7, 0.3], seed=123)

In [28]:
from pyspark.ml.classification import RandomForestClassifier
clf = RandomForestClassifier(featuresCol="features", labelCol="y1", impurity="gini")

In [29]:
clf_model = clf.fit(train_df)

In [35]:
list(zip(features, clf_model.featureImportances))

[('age', 0.06962682586399632),
 ('balance', 0.011939607313434016),
 ('day', 0.026348412955006107),
 ('duration', 0.7481027752368549),
 ('campaign', 0.006599319634942584),
 ('pdays', 0.07542313839299836),
 ('previous', 0.061596603227567034),
 ('default_index', 0.0003633173752006348)]

In [36]:
pred_train = clf_model.transform(train_df)

DataFrame[age: int, balance: int, day: int, duration: int, campaign: int, pdays: int, previous: int, default_index: double, y1: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [37]:
pred_train.show()

+---+-------+---+--------+--------+-----+--------+-------------+---+--------------------+--------------------+--------------------+----------+
|age|balance|day|duration|campaign|pdays|previous|default_index| y1|            features|       rawPrediction|         probability|prediction|
+---+-------+---+--------+--------+-----+--------+-------------+---+--------------------+--------------------+--------------------+----------+
| 18|      3| 25|     130|       2|   -1|       0|          0.0|1.0|[18.0,3.0,25.0,13...|[18.9762000701442...|[0.94881000350721...|       0.0|
| 18|      5| 24|     143|       2|   -1|       0|          0.0|0.0|[18.0,5.0,24.0,14...|[18.9762000701442...|[0.94881000350721...|       0.0|
| 18|    108|  8|     169|       1|   -1|       0|          0.0|1.0|[18.0,108.0,8.0,1...|[18.8479630419347...|[0.94239815209673...|       0.0|
| 18|    108|  9|      92|       1|  183|       1|          0.0|1.0|[18.0,108.0,9.0,9...|[17.9775575061588...|[0.89887787530794...|       0.0|

In [38]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="y1")
AUC = evaluator.evaluate(pred_train)
AUC

0.8512642771091619

In [39]:
pred_test = clf_model.transform(test_df)
pred_test.show()

+---+-------+---+--------+--------+-----+--------+-------------+---+--------------------+--------------------+--------------------+----------+
|age|balance|day|duration|campaign|pdays|previous|default_index| y1|            features|       rawPrediction|         probability|prediction|
+---+-------+---+--------+--------+-----+--------+-------------+---+--------------------+--------------------+--------------------+----------+
| 18|     35| 21|     104|       2|   -1|       0|          0.0|0.0|[18.0,35.0,21.0,1...|[18.9762000701442...|[0.94881000350721...|       0.0|
| 18|    156|  4|     298|       2|   82|       4|          0.0|0.0|[18.0,156.0,4.0,2...|[14.7161035947629...|[0.73580517973814...|       0.0|
| 18|    608| 12|     267|       1|   -1|       0|          0.0|1.0|[18.0,608.0,12.0,...|[18.5152297653201...|[0.92576148826600...|       0.0|
| 19|      0|  4|      72|       4|   -1|       0|          0.0|0.0|[19.0,0.0,4.0,72....|[19.1177254581166...|[0.95588627290583...|       0.0|

In [40]:
pred_test.count()

13676

In [41]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="y1")
AUC = evaluator.evaluate(pred_test)
AUC

0.8492651774775294