In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine's directory layout; fall back to the original install path
# when the variable is unset or empty.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'
findspark.init(SPARK_HOME)

# Kept after findspark.init(), as in the original cell order.
import pyspark
from pyspark.sql import SparkSession

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = SparkSession.builder.appName('722-model').getOrCreate()

In [2]:
# Read the CSV with its first row as column names and let Spark infer the
# column types, then print the resulting schema.
csv_path = "./data_japan.csv"
data1 = spark.read.csv(csv_path, inferSchema=True, header=True)
data1.printSchema()

root
 |-- adm0_id: integer (nullable = true)
 |-- adm0_name: string (nullable = true)
 |-- adm1_id: integer (nullable = true)
 |-- adm1_name: string (nullable = true)
 |-- mkt_id: integer (nullable = true)
 |-- mkt_name: string (nullable = true)
 |-- cm_id: integer (nullable = true)
 |-- cm_name: string (nullable = true)
 |-- cur_id: integer (nullable = true)
 |-- cur_name: string (nullable = true)
 |-- pt_id: integer (nullable = true)
 |-- pt_name: string (nullable = true)
 |-- um_id: integer (nullable = true)
 |-- um_name: string (nullable = true)
 |-- mp_month: integer (nullable = true)
 |-- mp_year: integer (nullable = true)
 |-- mp_price_RMB: double (nullable = true)



In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Wrap the single numeric predictor (price) in the vector column that the
# Spark ML estimators read their features from.
assembler = VectorAssembler(inputCols=['mp_price_RMB'], outputCol="features")
output = assembler.transform(data1)

# Encode the market name as a numeric label column named PrivateIndex.
indexer = StringIndexer(inputCol="mkt_name", outputCol="PrivateIndex")
indexer_model = indexer.fit(output)
output_fixed = indexer_model.transform(output)

# Keep only the two columns the classifier needs.
final_data = output_fixed.select("features", 'PrivateIndex')

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

# Fixed seed for the train/test split. Without it every run draws a different
# sample, so the fitted tree and all metrics reported below change on each
# Restart & Run All.
SPLIT_SEED = 42

# Single decision tree predicting the indexed market label from the price feature.
dtc = DecisionTreeClassifier(labelCol='PrivateIndex',featuresCol='features')

# 70% of the rows for training, 30% held out for evaluation.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=SPLIT_SEED)

dtc_model = dtc.fit(train_data)
# Adds the rawPrediction / probability / prediction columns to the held-out rows.
dtc_predictions = dtc_model.transform(test_data)

In [7]:
# Binary-classification evaluator for the tree's predictions.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The label column is PrivateIndex rather than the default "label", so it has
# to be named explicitly. The other two arguments spell out the evaluator's
# defaults so it is visible which score gets reported.
my_binary_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='PrivateIndex',
    metricName='areaUnderROC',
)

In [8]:
# Score the held-out predictions with the binary evaluator, then report the
# value under a "DTC" heading.
dtc_binary_score = my_binary_eval.evaluate(dtc_predictions)
print("DTC")
print(dtc_binary_score)

DTC
0.5531400966183574


In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Preview the first rows of the scored hold-out set.
dtc_predictions.show()

# Accuracy = share of rows whose prediction equals the PrivateIndex label.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="PrivateIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
dtc_acc = acc_evaluator.evaluate(dtc_predictions)

separator = '-'*40
accuracy_pct = dtc_acc*100
print(separator)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(accuracy_pct))


+--------------------+------------+-------------+-----------------+----------+
|            features|PrivateIndex|rawPrediction|      probability|prediction|
+--------------------+------------+-------------+-----------------+----------+
|            [7.2372]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
| [8.867199999999999]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
|            [8.9324]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
| [8.997599999999998]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
| [8.997599999999998]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
|            [9.0628]|         1.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
|            [9.3888]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
| [9.453999999999999]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
|            [9.5192]|         0.0|  [65.0,95.0]|[0.40625,0.59375]|       1.0|
| [9.714799999999999]|         0.0|  [65.0,95.0]|[0.

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
# NOTE(review): the two wildcard imports below inject many names into the
# notebook namespace and can shadow Python builtins; their relative order is
# kept as in the original because the later import wins on name clashes.
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Preview the model input, then list the distinct raw market names and the
# distinct label values so the two can be compared.
final_data.show()

market_names = data1.select('mkt_name').distinct()
market_names.show()

label_values = final_data.select('PrivateIndex').distinct()
label_values.show()

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|          [139.4628]|         1.0|
|           [136.268]|         1.0|
|          [137.9632]|         1.0|
|[134.70319999999998]|         1.0|
|[136.85479999999998]|         1.0|
|[134.70319999999998]|         1.0|
|          [147.2216]|         1.0|
|[146.11319999999998]|         1.0|
|[149.89479999999998]|         1.0|
|          [148.7864]|         1.0|
|           [147.678]|         1.0|
|[147.80839999999998]|         1.0|
|[149.95999999999998]|         1.0|
|          [150.4816]|         1.0|
|          [153.0896]|         1.0|
|          [154.7848]|         1.0|
|[155.82799999999997]|         1.0|
|[155.37159999999997]|         1.0|
|          [159.5444]|         1.0|
|[160.13119999999998]|         1.0|
+--------------------+------------+
only showing top 20 rows

+--------+
|mkt_name|
+--------+
|   Tokyo|
|   Osaka|
+--------+

+------------+
|PrivateIndex|
+------------

In [11]:
# Dump the fitted tree's split rules as text.
tree_description = dtc_model.toDebugString
print(tree_description)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4b1e88d39177eb03b3b4) of depth 5 with 23 nodes
  If (feature 0 <= 160.19639999999998)
   If (feature 0 <= 134.70319999999998)
    If (feature 0 <= 37.9464)
     If (feature 0 <= 23.0156)
      If (feature 0 <= 12.713999999999999)
       Predict: 1.0
      Else (feature 0 > 12.713999999999999)
       Predict: 0.0
     Else (feature 0 > 23.0156)
      If (feature 0 <= 37.163999999999994)
       Predict: 1.0
      Else (feature 0 > 37.163999999999994)
       Predict: 0.0
    Else (feature 0 > 37.9464)
     If (feature 0 <= 39.9676)
      If (feature 0 <= 38.859199999999994)
       Predict: 0.0
      Else (feature 0 > 38.859199999999994)
       Predict: 0.0
     Else (feature 0 > 39.9676)
      Predict: 0.0
   Else (feature 0 > 134.70319999999998)
    If (feature 0 <= 147.678)
     Predict: 1.0
    Else (feature 0 > 147.678)
     If (feature 0 <= 152.56799999999998)
      Predict: 1.0
     Else (feature 0 > 152.56799999999998)
   