In [6]:
# Install the pinned MLeap and PySpark releases into the notebook environment
# via shell escapes; both versions are fixed explicitly.
!pip install mleap==0.8.1
!pip install pyspark==2.2.2

Collecting mleap==0.8.1
  Downloading mleap-0.8.1.tar.gz (22 kB)
Collecting argparse>=1.1
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting nose-exclude>=0.5.0
  Downloading nose-exclude-0.5.0.tar.gz (7.4 kB)
Building wheels for collected packages: mleap, nose-exclude
  Building wheel for mleap (setup.py) ... [?25ldone
[?25h  Created wheel for mleap: filename=mleap-0.8.1-py3-none-any.whl size=36866 sha256=9d8a13b4e9d512109fcf94095642be2a293b20869cbfddb3a57c4c633bd05eb8
  Stored in directory: /home/ec2-user/.cache/pip/wheels/71/5b/16/5c34a1af380b5145ef1820807ca6824721d4f9d9b04b8d5e9b
  Building wheel for nose-exclude (setup.py) ... [?25ldone
[?25h  Created wheel for nose-exclude: filename=nose_exclude-0.5.0-py3-none-any.whl size=4392 sha256=c6b4011bed010781bfb81ec02c50cc298d81b6a83305d7ec7231c8ce967821ff
  Stored in directory: /home/ec2-user/.cache/pip/wheels/5a/7d/fd/7c4c7fc5907b1717fd54e0ac8175188ffb6089b2299d0ece7a
Successfully built mleap nose-exclude
Installi

In [7]:
import mleap
import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer
from pyspark import keyword_only
from pyspark.sql.functions import to_timestamp,hour,minute,when,col,current_timestamp,date_format,lit,unix_timestamp,expr,abs,to_date,rank,datediff
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator,TrainValidationSplit
from pyspark.ml.regression import LinearRegression,RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os
import boto3

# from pyspark.ml import Pipeline, Transformer
# from pyspark.sql import DataFrame
# from pyspark import keyword_only
# from pyspark.ml import Transformer
# from pyspark.ml.param.shared import HasOutputCols, Param, Params
# from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
# from pyspark.sql.functions import lit # for the dummy _transform

In [8]:
# !wget https://s3-us-west-2.amazonaws.com/sparkml-mleap/0.9.6/jar/mleap_spark_assembly.jar

In [9]:
import pyspark
# Echo the installed PySpark version as the cell output.
pyspark.__version__

'2.2.2'

In [10]:
# !python -v

In [11]:
# !cp ./mleap_spark_assembly.jar /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyspark/jars

In [12]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import sagemaker_pyspark
import botocore.session

# botocore session handle; the credential lookup below is left disabled.
session = botocore.session.get_session()
# credentials = session.get_credentials()

# Put every jar reported by sagemaker_pyspark on the Spark driver classpath.
driver_classpath = ":".join(sagemaker_pyspark.classpath_jars())
conf = SparkConf().set("spark.driver.extraClassPath", driver_classpath)

In [13]:
# NOTE(review): the original comment said to run this only after
# findspark.init(); no findspark call is visible in this notebook -- confirm.
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the session named "test" using the classpath settings in
# `conf`. May take a while when run locally.
session_builder = SparkSession.builder.appName("test").config(conf=conf)
spark = session_builder.getOrCreate()
spark

In [14]:
# Echo the Spark version reported by the running session.
spark.version

'2.2.2'

In [15]:
# Root of the project bucket (s3a scheme). Stated once so the three prefixes
# below cannot drift apart if the bucket is ever changed.
S3_ROOT = 's3a://solarpowerbackend'

# Prefixes for raw input, processed datasets and model artifacts.
rawdata_s3 = f'{S3_ROOT}/rawdata/'
processed_s3 = f'{S3_ROOT}/processed/'
model_s3 = f'{S3_ROOT}/model/'

In [16]:
# Load the processed RUL dataset, derive a HH:mm:ss TIME string from
# DATE_TIME, and keep only readings between 06:00:00 and 18:15:00 (inclusive).
raw_df = (
    spark.read.parquet(f'{processed_s3}rul_dataset')
    .withColumn('TIME', date_format(col('DATE_TIME'), 'HH:mm:ss'))
    .filter(col('TIME').between('06:00:00', '18:15:00'))
)

In [17]:
# Add a DATE column holding the calendar date of DATE_TIME.
raw_df = raw_df.withColumn('DATE',to_date(col('DATE_TIME')))

In [18]:
# Rows where the fault flag is set, narrowed to the columns needed to locate
# each fault (timestamp, flag, inverter key, date).
fault_columns = ['DATE_TIME', 'FAULT_FLAG', 'SOURCE_KEY', 'DATE']
only_fault_df = raw_df.select(*fault_columns).where(col('Fault_Flag') == 1)

In [19]:
# Renders the DataFrame's schema repr (see the recorded output), not its rows.
display(only_fault_df)

DataFrame[DATE_TIME: timestamp, FAULT_FLAG: int, SOURCE_KEY: string, DATE: date]

In [20]:
# rank_window = Window.partitionBy(only_fault_df['SOURCE_KEY'],only_fault_df['DATE']).orderBy(only_fault_df['DATE_TIME'])
# fault_rank_on_day = rank().over(rank_window)

In [21]:
# Keep only each fault row's key and timestamp, renamed with a FAULT_ prefix.
fault_rank_df = only_fault_df.select(col('SOURCE_KEY').alias('FAULT_SOURCE_KEY'),col('DATE_TIME').alias('FAULT_DATE_TIME'))

In [22]:
# Join predicate: same SOURCE_KEY, and the fault timestamp is at or after the
# reading's timestamp.
rul_oncondition = (fault_rank_df['FAULT_DATE_TIME']>=raw_df['DATE_TIME']) & (fault_rank_df['FAULT_SOURCE_KEY'] == raw_df['SOURCE_KEY'])

In [23]:
# Inner join: pairs each reading in raw_df with every fault row that satisfies
# rul_oncondition, so one reading can appear once per matching fault.
rul_df = fault_rank_df.join(raw_df,on = rul_oncondition,how = 'inner')

In [24]:
# Minutes between the reading and the paired fault (epoch seconds / 60).
minutes_to_fault = (col('FAULT_DATE_TIME').cast('long') - col('DATE_TIME').cast('long')) / 60
# Number of calendar-day boundaries between the reading and the fault.
days_spanned = datediff(col('FAULT_DATE_TIME'), col('DATE_TIME'))
# 47*15 = 705 minutes is removed per day boundary.
# NOTE(review): presumably this is the 18:15 -> 06:00 gap excluded by the
# daytime filter applied when raw_df is loaded -- confirm.
rul_df = rul_df.withColumn('RUL', minutes_to_fault - days_spanned * 47 * 15)

In [25]:
# Renders the DataFrame's schema repr (see the recorded output), not its rows.
display(rul_df)

DataFrame[FAULT_SOURCE_KEY: string, FAULT_DATE_TIME: timestamp, DATE_TIME: timestamp, AMBIENT_TEMPERATURE: double, MODULE_TEMPERATURE: double, IRRADIATION: double, PLANT_ID: int, SOURCE_KEY: string, DC_POWER: double, AC_POWER: double, DAILY_YIELD: double, TOTAL_YIELD: double, Fault_Flag: int, TIME: string, DATE: date, RUL: double]

In [26]:
# One row per reading: group on the reading's own columns and keep the
# smallest RUL across all faults paired with it, exposed as `label`.
reading_columns = [
    'SOURCE_KEY', 'DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE',
    'IRRADIATION', 'PLANT_ID', 'AC_POWER', 'DC_POWER', 'DAILY_YIELD',
    'TOTAL_YIELD',
]
rul_df = (
    rul_df.groupby(*reading_columns)
    .min('RUL')
    .withColumnRenamed('min(RUL)', 'label')
)

# rul_df = rul_df.filter(col('RUL')>=1440)
# rul_df = rul_df.withColumnRenamed('RUL','')

In [27]:
# rul_df = rul_df.withColumn('AC_POWER',col('AC_POWER') - col('AC_POWER')/(col('RUL')+1)*2)
# Narrow to the columns carried into modelling: inverter key, sensor/power
# readings, yields and the label.
model_columns = [
    'SOURCE_KEY', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION',
    'AC_POWER', 'DC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD', 'label',
]
rul_df = rul_df.select(*model_columns)

In [28]:
# acdc_calc = ACDC_Calulator(outputCols=["a", "b"], value=123.0)
# Index the SOURCE_KEY strings, then one-hot encode the resulting index.
str_ind = StringIndexer(inputCol='SOURCE_KEY', outputCol='SOURCE_KEY_NUM')
ohe = OneHotEncoder(inputCol='SOURCE_KEY_NUM', outputCol='encoded_Source_Key')

In [29]:
# Columns assembled into the single `features` vector fed to the regressors.
independent_features = [
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'AC_POWER',
    'DC_POWER',
    'DAILY_YIELD',
    'encoded_Source_Key',
]
model_input_vector = VectorAssembler(inputCols=independent_features, outputCol='features')

# rul_indexed_df = rul_indexed_df.withColumnRenamed('RUL','label')

In [30]:
# Two candidate regressors over the same features/label columns, each writing
# to its own prediction column.
lr_model = LinearRegression(
    featuresCol='features', labelCol='label', predictionCol='LR_prediction'
)
rf_model = RandomForestRegressor(
    featuresCol='features', labelCol='label', predictionCol='RF_prediction'
)
# Last expression: show the random-forest estimator as the cell output.
rf_model


RandomForestRegressor_4d6ab63ffbe8c48c49f8

In [31]:
# acdc_calc = ACDC_Calulator()
# Stage order: index key -> one-hot encode -> assemble vector -> random forest.
pipeline_stages = [str_ind, ohe, model_input_vector, rf_model]
rf_pipeline = Pipeline(stages=pipeline_stages)

In [32]:
# Fit the four-stage pipeline on rul_df, then score the same rows.
# NOTE(review): this rebinds `rf_model` -- previously the RandomForestRegressor
# estimator passed into rf_pipeline -- to the fitted pipeline model, so
# re-running the pipeline-construction cell afterwards picks up the fitted
# model as a stage instead of the estimator.
rf_model = rf_pipeline.fit(rul_df)
rul_trained_df = rf_model.transform(rul_df)

In [40]:
# Serialize the fitted pipeline to an MLeap bundle at /tmp/rfmodel.zip; the
# transformed DataFrame is passed as the third argument.
SimpleSparkSerializer().serializeToBundle(
    rf_model, "jar:file:/tmp/rfmodel.zip", rul_trained_df
)

In [41]:
import tarfile
import zipfile

# Unpack the MLeap zip bundle into /tmp/rfmodel ...
with zipfile.ZipFile("/tmp/rfmodel.zip") as bundle_zip:
    bundle_zip.extractall("/tmp/rfmodel")

# ... and repack its two top-level entries as a gzip-compressed tarball, with
# archive names relative to the bundle root.
bundle_members = (
    ("/tmp/rfmodel/bundle.json", "bundle.json"),
    ("/tmp/rfmodel/root", "root"),
)
with tarfile.open("/tmp/rfmodel.tar.gz", "w:gz") as archive:
    for source_path, archive_name in bundle_members:
        archive.add(source_path, arcname=archive_name)

In [33]:
# Show the list of feature columns fed to the VectorAssembler.
independent_features

['AMBIENT_TEMPERATURE',
 'MODULE_TEMPERATURE',
 'IRRADIATION',
 'AC_POWER',
 'DC_POWER',
 'DAILY_YIELD',
 'encoded_Source_Key']

In [34]:
# Build one sample payload from a single row of rul_df.
#
# The row is fetched once with first(): collecting a whole column per feature
# is wasteful and, with no ordering guaranteed, could mix values from
# different rows. The loop variable is also no longer named `col`, which
# rebound pyspark.sql.functions.col at notebook scope and broke any earlier
# cell that was re-run afterwards.
sample_row = rul_df.first()
if sample_row is None:
    raise ValueError("rul_df is empty; cannot build a sample feature payload")

# The model consumes the raw SOURCE_KEY string, so the encoded column name is
# mapped back to its source column.
feature_list = [
    str(sample_row['SOURCE_KEY' if feature_name == 'encoded_Source_Key' else feature_name])
    for feature_name in independent_features
]

In [35]:
# Comma-join the sample values into a single CSV line.
feature_string = ','.join(feature_list)

In [36]:
# Show the CSV payload line.
feature_string

'34.40285233333333,46.7442812,0.5651963973333333,637.7,6510.0,5016.142857,1BY6WEcLGh8j5v7'

In [53]:
# Map a running integer to each distinct SOURCE_KEY value.
# NOTE(review): the order of distinct().collect() is presumably not guaranteed,
# and these integers are not taken from the fitted StringIndexer -- confirm
# before treating them as model indices.
distinct_key_rows = rul_df.select('SOURCE_KEY').distinct().collect()
source_key_dict = {
    position: key_row[0] for position, key_row in enumerate(distinct_key_rows)
}

In [None]:
# Show the integer -> SOURCE_KEY mapping. The original cell held the truncated
# identifier `source_ke`, which would raise NameError if executed.
source_key_dict

In [54]:
# Overwrite the payload with a hand-written line whose last field is '0'
# rather than a SOURCE_KEY string.
# NOTE(review): presumably an experiment with an index-encoded key -- confirm
# it is still wanted.
feature_string = '34.40285233333333,46.7442812,0.5651963973333333,637.7,6510.0,5016.142857,0'

In [74]:
import json
import urllib.parse
import boto3
from datetime import datetime
print('Loading function')
# Low-level S3 client.
# NOTE(review): a later cell rebinds `s3` to boto3.resource("s3").
s3 = boto3.client('s3')

Loading function


10/15/13/
RAWDATA2021/10/15/13/KDS-S3-x8Hfx-2-2021-10-15-13-05-53-4d72421d-b831-4817-8352-d3c81453a21b


In [84]:
# Print `text` one line per output row.
# NOTE(review): `text` is not assigned in any cell visible here -- confirm
# where it is created.
print(*text.split('\n'), sep='\n')

1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,46,34y8
1,2,3,

In [77]:
# NOTE(review): `result` is not assigned in any cell visible here; the recorded
# output looks like an S3 object response -- confirm where it is created.
result

{'ResponseMetadata': {'RequestId': 'GGET73XRH5NTHN9N',
  'HostId': 'nrt+Vlg6N8pEgsF64D56j/Q+eysTUq+soSPXWUEXsWosMpAD/5t1koGMiuktrYg/U8tQTRKRc4o=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'nrt+Vlg6N8pEgsF64D56j/Q+eysTUq+soSPXWUEXsWosMpAD/5t1koGMiuktrYg/U8tQTRKRc4o=',
   'x-amz-request-id': 'GGET73XRH5NTHN9N',
   'date': 'Fri, 15 Oct 2021 13:21:16 GMT',
   'last-modified': 'Fri, 15 Oct 2021 13:05:32 GMT',
   'etag': '"b95cbdc261908e7682a6fc0686351707"',
   'accept-ranges': 'bytes',
   'content-type': 'application/octet-stream',
   'server': 'AmazonS3',
   'content-length': '13300'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2021, 10, 15, 13, 5, 32, tzinfo=tzutc()),
 'ContentLength': 13300,
 'ETag': '"b95cbdc261908e7682a6fc0686351707"',
 'ContentType': 'application/octet-stream',
 'Metadata': {},
 'Body': <botocore.response.StreamingBody at 0x7ff9e7843278>}

In [67]:
# Show the CSV payload line currently bound to feature_string.
feature_string

'34.40285233333333,46.7442812,0.5651963973333333,637.7,6510.0,5016.142857,1BY6WEcLGh8j5v7'

In [None]:
# feature_string = feature_string[1:]

In [48]:
# Please replace the bucket name with your bucket name where you want to upload the model
s3 = boto3.resource("s3")
file_name = os.path.join("model/RF_RUL_MODEL", "rfmodel.tar.gz")
s3.Bucket("solarpowerbackend").upload_file("/tmp/rfmodel.tar.gz", file_name)

In [None]:

# from mleap.pyspark.spark_support import SimpleSparkSerializer

# SimpleSparkSerializer().serializeToBundle(
#     rul_rf_model, "model.zip", rul_trained_df
# )

In [1]:
import json

# Numeric model inputs, in the order the CSV payload supplies them.
numeric_input_names = [
    "AMBIENT_TEMPERATURE",
    "MODULE_TEMPERATURE",
    "IRRADIATION",
    "AC_POWER",
    "DC_POWER",
    "DAILY_YIELD",
]

# Every numeric input is a double; the inverter key comes last as a string.
input_fields = [{"name": field_name, "type": "double"} for field_name in numeric_input_names]
input_fields.append({"name": "SOURCE_KEY", "type": "string"})

# Input/output schema handed to the serving container as JSON text.
schema = {
    "input": input_fields,
    "output": {"name": "RF_prediction", "type": "double"},
}
schema_json = json.dumps(schema)
print(schema_json)

{"input": [{"name": "AMBIENT_TEMPERATURE", "type": "double"}, {"name": "MODULE_TEMPERATURE", "type": "double"}, {"name": "IRRADIATION", "type": "double"}, {"name": "AC_POWER", "type": "double"}, {"name": "DC_POWER", "type": "double"}, {"name": "DAILY_YIELD", "type": "double"}, {"name": "SOURCE_KEY", "type": "string"}], "output": {"name": "RF_prediction", "type": "double"}}


In [2]:
import time
from time import gmtime, strftime

import sagemaker
from sagemaker import get_execution_role
from sagemaker.sparkml.model import SparkMLModel

# UTC timestamp used to make the model and endpoint names unique.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

sess = sagemaker.Session()
role = get_execution_role()

# S3 location of the trained, serialized SparkML model tarball.
sparkml_data = "s3://{}/{}/{}".format(
    "solarpowerbackend", "model/RF_RUL_MODEL", "rfmodel.tar.gz"
)
model_name = "sparkml-solarpower-" + timestamp_prefix
endpoint_name = "sparkml-solarpower-rf" + timestamp_prefix

# The input/output schema is passed to the container through the
# SAGEMAKER_SPARKML_SCHEMA environment variable.
serving_env = {"SAGEMAKER_SPARKML_SCHEMA": schema_json}
sparkml_model = SparkMLModel(
    model_data=sparkml_data,
    role=role,
    sagemaker_session=sess,
    name=model_name,
    env=serving_env,
)

# Deploy a single-instance endpoint; the returned predictor is the cell output.
sparkml_model.deploy(
    initial_instance_count=1, instance_type="ml.c4.xlarge", endpoint_name=endpoint_name
)

-----!

<sagemaker.sparkml.model.SparkMLPredictor at 0x7ff9ed903e48>

In [3]:
# Show the name of the endpoint that was deployed.
endpoint_name

'sparkml-solarpower-rf2021-10-15-11-36-41'

In [57]:
# Legacy aliases kept importable in case other cells still reference them;
# the call below no longer uses them.
from sagemaker.predictor import (
    json_serializer,
    csv_serializer,
    json_deserializer,
    RealTimePredictor,
)
# sagemaker>=2 names: the recorded output of this cell shows deprecation
# notices for RealTimePredictor and csv_serializer, so use their replacements.
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
# from sagemaker.content_types import CONTENT_TYPE_CSV

# One CSV line of feature values, in the order declared in the serving schema.
payload = feature_string
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=CSVSerializer(),
)
# Send `payload` (previously assigned but never used) and print the raw
# response body returned by the endpoint.
print(predictor.predict(payload))

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


b'1977.2963658942524'
