In [83]:
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report

from IPython.display import display

In [93]:
# Load the iris dataset as a dictionary-like object backed by pandas objects.
data_iris_all = load_iris(as_frame=True)

# Pull out the pieces used later: the feature frame, the target series,
# the class names and the combined features + target frame.
data_iris = data_iris_all.data
target_iris = data_iris_all.target
target_label_iris = data_iris_all.target_names
frame_iris = data_iris_all.frame

# Quick sanity checks: available keys, first rows, shape and class balance.
print(data_iris_all.keys())
display(data_iris.head())
print(data_iris.shape)
print(target_iris.value_counts())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


(150, 4)
0    50
1    50
2    50
Name: target, dtype: int64


In [94]:
# Preview the combined frame (features + target).
display(frame_iris.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would additionally emit a stray "None".
frame_iris.info()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


None

### Sparkのセッションを起動
PySparkの実行にはJava（JDK/JRE）のインストールが必要です。

In [95]:
from pyspark.sql import SparkSession

# Start a local Spark session using every available core ("local[*]").
# getOrCreate() returns the already-running session when this cell is
# re-executed, so no explicit stop/restart handling is needed here.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("local[*]")
    .getOrCreate()
)
spark

In [96]:
# Report the Python interpreter and Spark versions used by this session,
# framed by a banner line above and below.
print(
    '****************',
    f'Python version: {sys.version}',
    f'Spark version: {spark.version}',
    '****************',
    sep='\n',
)


****************
Python version: 3.8.5 (default, Jul 21 2020, 10:48:26) 
[Clang 11.0.3 (clang-1103.0.32.62)]
Spark version: 3.1.1
****************


In [129]:
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler, VectorIndexer

In [99]:
# Load the data: convert the pandas iris frame into a Spark DataFrame.
spark_data = spark.createDataFrame(data=frame_iris)

# Column names of the Spark DataFrame, reused by later cells.
columns = spark_data.columns

In [106]:
# Summarise the Spark DataFrame: column names, row count, then the first 10 rows.
print("columns : ", spark_data.columns)
print("rows : ", spark_data.count())
spark_data.show(n=10)

columns :  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
rows :  150
+-----------------+----------------+-----------------+----------------+------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
+-----------------+----------------+-----------------+----------------+------+
|              5.1|             3.5|              1.4|             0.2|     0|
|              4.9|             3.0|              1.4|             0.2|     0|
|              4.7|             3.2|              1.3|             0.2|     0|
|              4.6|             3.1|              1.5|             0.2|     0|
|              5.0|             3.6|              1.4|             0.2|     0|
|              5.4|             3.9|              1.7|             0.4|     0|
|              4.6|             3.4|              1.4|             0.3|     0|
|              5.0|             3.4|              1.5|             0.2|     0|
|              

In [131]:
"""
特徴量をまとめる
"""
# Use every column except the label as a model feature. Selecting by name
# (instead of slicing off the last column) stays correct even if the column
# order of the source frame changes.
feature_cols = [name for name in columns if name != "target"]

# Combine the individual feature columns into one vector column "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
spark_data_2 = assembler.transform(spark_data)

# Summarise the result: column names, row count, then the first 10 rows.
print("columns : ", spark_data_2.columns)
print("rows : ", spark_data_2.count())
spark_data_2.show(10)

columns :  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target', 'features']
rows :  150
+-----------------+----------------+-----------------+----------------+------+-----------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|         features|
+-----------------+----------------+-----------------+----------------+------+-----------------+
|              5.1|             3.5|              1.4|             0.2|     0|[5.1,3.5,1.4,0.2]|
|              4.9|             3.0|              1.4|             0.2|     0|[4.9,3.0,1.4,0.2]|
|              4.7|             3.2|              1.3|             0.2|     0|[4.7,3.2,1.3,0.2]|
|              4.6|             3.1|              1.5|             0.2|     0|[4.6,3.1,1.5,0.2]|
|              5.0|             3.6|              1.4|             0.2|     0|[5.0,3.6,1.4,0.2]|
|              5.4|             3.9|              1.7|             0.4|     0|[5.4,3.9,1.7,0.4]|

In [136]:
# Fit a StringIndexer on the "target" column of the assembled frame, then
# append its encoded output as the "indexedLabel" column.
labelIndexer = StringIndexer(
    inputCol="target",
    outputCol="indexedLabel",
).fit(spark_data_2)
spark_data_3 = labelIndexer.transform(spark_data_2)

In [137]:
# Show the first 10 rows, including the new "indexedLabel" column.
spark_data_3.show(n=10)

+-----------------+----------------+-----------------+----------------+------+-----------------+------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|         features|indexedLabel|
+-----------------+----------------+-----------------+----------------+------+-----------------+------------+
|              5.1|             3.5|              1.4|             0.2|     0|[5.1,3.5,1.4,0.2]|         0.0|
|              4.9|             3.0|              1.4|             0.2|     0|[4.9,3.0,1.4,0.2]|         0.0|
|              4.7|             3.2|              1.3|             0.2|     0|[4.7,3.2,1.3,0.2]|         0.0|
|              4.6|             3.1|              1.5|             0.2|     0|[4.6,3.1,1.5,0.2]|         0.0|
|              5.0|             3.6|              1.4|             0.2|     0|[5.0,3.6,1.4,0.2]|         0.0|
|              5.4|             3.9|              1.7|             0.4|     0|[5.4,3.9,1.7,0.4]|         0.0|
|         

In [132]:
"""
学習用データの作成
pysparkのlabelColのデフォルトが'label'なので変更しておく
"""
# Keep only what the estimator needs: the feature vector and the label.
# "target" is renamed to "label" to match the default labelCol name in pyspark.
learning_data = spark_data_2.select(
    F.col("features"),
    F.col("target").alias("label"),
)
learning_data.show(n=10)

# 80/20 train/test split with a fixed seed.
train, test = learning_data.randomSplit(weights=[0.8, 0.2], seed=10)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|    0|
|[4.9,3.0,1.4,0.2]|    0|
|[4.7,3.2,1.3,0.2]|    0|
|[4.6,3.1,1.5,0.2]|    0|
|[5.0,3.6,1.4,0.2]|    0|
|[5.4,3.9,1.7,0.4]|    0|
|[4.6,3.4,1.4,0.3]|    0|
|[5.0,3.4,1.5,0.2]|    0|
|[4.4,2.9,1.4,0.2]|    0|
|[4.9,3.1,1.5,0.1]|    0|
+-----------------+-----+
only showing top 10 rows



In [143]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [138]:
# Gradient-boosted tree regressor with default hyperparameters for now.
# NOTE(review): "label" holds the three iris class ids (0/1/2), so this fits a
# regression on class ids — confirm that this is intended.
gbtr = GBTRegressor(featuresCol="features", labelCol="label")

# Fit on the training split.
model = gbtr.fit(train)

In [144]:
# Score the held-out split; the result carries a "prediction" column next to
# the original features and label.
predictions = model.transform(test)
predictions.show(n=10)

+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.4,2.9,1.4,0.2]|    0|       0.0|
|[4.6,3.4,1.4,0.3]|    0|       0.0|
|[4.7,3.2,1.6,0.2]|    0|       0.0|
|[4.8,3.4,1.9,0.2]|    0|       0.0|
|[4.9,3.1,1.5,0.2]|    0|       0.0|
|[5.0,3.4,1.5,0.2]|    0|       0.0|
|[5.5,3.5,1.3,0.2]|    0|       0.0|
|[5.7,4.4,1.5,0.4]|    0|       0.0|
|[4.9,2.4,3.3,1.0]|    1|       2.0|
|[4.9,3.6,1.4,0.1]|    0|       0.0|
+-----------------+-----+----------+
only showing top 10 rows



In [142]:

# Accuracy on the held-out predictions. The label column must be "label":
# `predictions` only carries features/label/prediction, whereas "indexedLabel"
# exists solely on spark_data_3, which is not the frame that was scored.
# NOTE(review): the predictions come from a regressor, so only values exactly
# equal to a class id count as correct — confirm this is the intended metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.4,2.9,1.4,0.2]|    0|       0.0|
|[4.6,3.4,1.4,0.3]|    0|       0.0|
|[4.7,3.2,1.6,0.2]|    0|       0.0|
|[4.8,3.4,1.9,0.2]|    0|       0.0|
|[4.9,3.1,1.5,0.2]|    0|       0.0|
|[5.0,3.4,1.5,0.2]|    0|       0.0|
|[5.5,3.5,1.3,0.2]|    0|       0.0|
|[5.7,4.4,1.5,0.4]|    0|       0.0|
|[4.9,2.4,3.3,1.0]|    1|       2.0|
|[4.9,3.6,1.4,0.1]|    0|       0.0|
+-----------------+-----+----------+
only showing top 10 rows

