## 学習に必要な列を抽出して集計
ここではデバイスと期間(Period)ごとに、各センサーの平均値と**RUL**(Remaining Useful Life、残存耐用期間)の関係を表すデータセットを作成します。

In [16]:
# Aggregate per (DeviceId, Period): the maximum Cycle is exposed as RUL and
# each of the four sensors is reduced to its mean rounded to two decimals.
# Only rows with endofperiod = 1 are considered.
rul_query = """
    SELECT
        DeviceId,
        Period,
        max(Cycle) AS RUL,
        round(avg(Sensor11),2) AS Sensor11,
        round(avg(Sensor14),2) AS Sensor14,
        round(avg(Sensor15),2) AS Sensor15,
        round(avg(Sensor9),2) AS Sensor9
    FROM 
        sensortablespark
    WHERE
        endofperiod = 1 
    GROUP BY 
        DeviceId,
        Period
    """
pydf = spark.sql(rul_query)

# Preview the first 10 aggregated rows.
pydf.show(10)

+---------+------+---+------------------+--------+--------+-------+
| DeviceId|Period|RUL|          Sensor11|Sensor14|Sensor15|Sensor9|
+---------+------+---+------------------+--------+--------+-------+
|N1172FJ-2|    16|172|               0.0| 8121.61|    8.78|8782.62|
|N3172FJ-1|     5|164|               0.0|  8079.9|    9.27|8748.87|
|N1172FJ-1|    52|149|               0.0| 8122.38|    8.75| 8778.9|
|N1172FJ-1|    35|134|               0.0| 8106.67|    8.72| 8764.4|
|N3172FJ-1|    24|203|               0.0| 8068.05|    9.21|8728.73|
|N4172FJ-1|     6|168|               0.0| 8101.64|    9.32|8338.59|
|N1172FJ-1|    25|228|2.4906038810512774|  8105.7|    8.91| 8720.3|
|N4172FJ-2|    15|177|               0.0| 8102.25|    9.43|8346.38|
|N1172FJ-2|    46|277|               0.0| 8080.73|    9.47|8331.11|
|N3172FJ-2|    48|242|               0.0| 8156.91|    9.37|8396.64|
+---------+------+---+------------------+--------+--------+-------+
only showing top 10 rows

In [13]:
# Print the DataFrame shape as a (row count, column count) tuple.
row_count = pydf.count()
column_count = len(pydf.columns)
print((row_count, column_count))

(436, 7)

## 特徴量変換
ベクター形式に変換します

In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
# Feature conversion.
# Four-sensor variant, kept for reference:
# vectorAssembler = VectorAssembler(inputCols = ['Sensor11','Sensor14','Sensor15','Sensor9'], outputCol = 'features')
# Reduced-feature version, used to keep the T-SQL PREDICT call simple.
feature_columns = ['Sensor11']
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Apply the assembler to the aggregated DataFrame; the vector is written to
# the 'features' output column.
vdf = vectorAssembler.transform(pydf)

## データセット分割


In [20]:
# Split weights and the seed handed to randomSplit.
trainingFraction = 0.7
testingFraction = 1 - trainingFraction
seed = 42

# Randomly partition the vectorized DataFrame into training and test sets.
split_weights = [trainingFraction, testingFraction]
df_train, df_test = vdf.randomSplit(split_weights, seed=seed)

## モデル学習
線形回帰を利用しています。
実運用では、時系列データに適したアルゴリズムを選定してください。

In [21]:
# Build the model: a linear regression that predicts the 'RUL' label from
# the 'features' vector column.
lin_reg = LinearRegression(
    featuresCol='features',
    labelCol='RUL',
    maxIter=10,
    regParam=0.3,
)
model = lin_reg.fit(df_train)

# Report the fitted parameters.
print(f"Coefficients: {model.coefficients!s}")
print(f"Intercept: {model.intercept!s}")

Coefficients: [33.57542095562193,0.7705913215264765,122.99461308737763,0.0852144001079971]
Intercept: -7931.400028834169

## 推論結果の確認


In [22]:
# Run inference on the test dataset
prediction = model.transform(df_test)

In [23]:
# Inspect the prediction results
display(prediction)

## ONNX形式で保存


In [48]:
from onnxmltools import convert_sparkml
from onnxmltools.convert.common.data_types import FloatTensorType

# Declare the ONNX graph input: a float tensor named "features" with shape
# [1, number of features in the fitted model].
feature_shape = [1, model.numFeatures]
initial_types = [("features", FloatTensorType(feature_shape))]


In [49]:
# Convert the fitted Spark ML model to ONNX; the second argument becomes the
# name of the ONNX graph.
graph_name = 'sparkml GeneralizedLinearRegression'
model_onnx = convert_sparkml(model, graph_name, initial_types)
# Bare expression so the notebook renders the converted model.
model_onnx

ir_version: 6
producer_name: "OnnxMLTools"
producer_version: "1.5.5"
domain: "onnxconverter-common"
model_version: 0
doc_string: ""
graph {
  node {
    input: "features"
    output: "prediction"
    name: "LinearRegressor"
    op_type: "LinearRegressor"
    attribute {
      name: "coefficients"
      floats: 26.632780075073242
      type: FLOATS
    }
    attribute {
      name: "intercepts"
      floats: -972.1617431640625
      type: FLOATS
    }
    domain: "ai.onnx.ml"
  }
  name: "sparkml GeneralizedLinearRegression"
  input {
    name: "features"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
  output {
    name: "prediction"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 1
          }
        }
      }
    }
  }
}

In [40]:
# Serialize the ONNX model and write the bytes to a local file.
with open("model.onnx", "wb") as onnx_file:
    serialized_model = model_onnx.SerializeToString()
    onnx_file.write(serialized_model)

278

## データレイクに保存
Blob StorageのAPI(`azure.storage.blob`)を利用するため、事前にSpark Poolへreference.txtを読み込ませて、必要なライブラリをインストールしておきます。

In [41]:
# Connection string for the destination storage account; the <...> parts are
# placeholders to be replaced with real values before running.
# NOTE(review): this puts the account key in the notebook in plain text —
# avoid saving or sharing a filled-in copy; consider loading it from a secret store.
connection_string = "DefaultEndpointsProtocol=https;AccountName=<ストレージアカウント名>;AccountKey=<key>;EndpointSuffix=core.windows.net"

from azure.storage.blob import BlobClient

# Client bound to the destination blob inside the "datalake" container.
blob = BlobClient.from_connection_string(
    conn_str=connection_string,
    container_name="datalake",
    blob_name="Models/onnx/rul_model.onnx",
)

# Upload the local ONNX file; overwrite=True replaces an existing blob.
with open("./model.onnx", "rb") as model_file:
    blob.upload_blob(model_file, overwrite=True)

{'etag': '"0x8D7EA9E6A159DF9"', 'last_modified': datetime.datetime(2020, 4, 27, 11, 30, 40, tzinfo=datetime.timezone.utc), 'content_md5': bytearray(b'\x04\xa7\xd7\x08\xe2)\xac\xb7\x91r\xb9m\x1f\x93\xc3\x8d'), 'client_request_id': '85d7252a-887a-11ea-a981-000d3a07d930', 'request_id': '8975ef9d-401e-0027-0487-1c347d000000', 'version': '2019-07-07', 'date': datetime.datetime(2020, 4, 27, 11, 30, 39, tzinfo=datetime.timezone.utc), 'request_server_encrypted': True, 'encryption_key_sha256': None, 'encryption_scope': None, 'error_code': None}