In [1]:
# !pip install mleap==0.8.1
# !pip install pyspark==2.2.2

In [5]:
import mleap
import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer
from pyspark import keyword_only
from pyspark.sql.functions import to_timestamp,hour,minute,when,col,current_timestamp,date_format,lit,unix_timestamp,expr,abs,to_date,rank,datediff
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator,TrainValidationSplit
from pyspark.ml.regression import LinearRegression,RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os
import boto3

# from pyspark.ml import Pipeline, Transformer
# from pyspark.sql import DataFrame
# from pyspark import keyword_only
# from pyspark.ml import Transformer
# from pyspark.ml.param.shared import HasOutputCols, Param, Params
# from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
# from pyspark.sql.functions import lit # for the dummy _transform

In [6]:
# !wget https://s3-us-west-2.amazonaws.com/sparkml-mleap/0.9.6/jar/mleap_spark_assembly.jar

In [7]:
import pyspark
# Echo the installed PySpark version; the pin in the first cell targets 2.2.2.
pyspark.__version__

'2.2.2'

In [8]:
# !python -v

In [10]:
# !cp ./mleap_spark_assembly.jar /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyspark/jars

In [11]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import sagemaker_pyspark
import botocore.session

# Botocore session; only the commented-out credentials lookup below would use it.
session = botocore.session.get_session()
# credentials = session.get_credentials()

# Put the jars shipped with sagemaker_pyspark on the Spark driver classpath.
driver_classpath = ":".join(sagemaker_pyspark.classpath_jars())
conf = SparkConf().set("spark.driver.extraClassPath", driver_classpath)

In [12]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session using the SparkConf built in the previous
# cell. Start-up may take a while on a local machine.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("test")
    .getOrCreate()
)
spark

In [13]:
# Echo the version reported by the running Spark session.
spark.version

'2.2.2'

In [14]:
# S3 prefixes (s3a scheme) inside the solarpowerbackend bucket. Each ends with
# "/" so a dataset name can be appended directly, e.g. f'{processed_s3}rul_dataset'.
rawdata_s3 = 's3a://solarpowerbackend/rawdata/'
processed_s3 = 's3a://solarpowerbackend/processed/'
model_s3 = 's3a://solarpowerbackend/model/'

In [15]:
# Load the processed RUL dataset, derive a HH:mm:ss time-of-day string from
# DATE_TIME, and keep only rows whose time lies in 06:00:00..18:15:00 (inclusive).
raw_df = (
    spark.read.parquet(f'{processed_s3}rul_dataset')
    .withColumn('TIME', date_format(col('DATE_TIME'), 'HH:mm:ss'))
    .filter(col('TIME').between('06:00:00', '18:15:00'))
)

In [16]:
# Add a DATE column holding the date part of DATE_TIME.
raw_df = raw_df.withColumn('DATE',to_date(col('DATE_TIME')))

In [17]:
# Keep only the rows flagged as faults (flag == 1), restricted to the columns
# needed to identify when and where each fault happened.
fault_columns = ['DATE_TIME', 'FAULT_FLAG', 'SOURCE_KEY', 'DATE']
only_fault_df = raw_df.select(*fault_columns).where(col('Fault_Flag') == 1)

In [18]:
# Show the fault-only DataFrame; in this run the rendered output is its schema repr.
display(only_fault_df)

DataFrame[DATE_TIME: timestamp, FAULT_FLAG: int, SOURCE_KEY: string, DATE: date]

In [19]:
# rank_window = Window.partitionBy(only_fault_df['SOURCE_KEY'],only_fault_df['DATE']).orderBy(only_fault_df['DATE_TIME'])
# fault_rank_on_day = rank().over(rank_window)

In [20]:
# Rename the fault columns so they do not clash with the same-named columns of
# raw_df once the two frames are joined.
fault_rank_df = only_fault_df.select(
    col('SOURCE_KEY').alias('FAULT_SOURCE_KEY'),
    col('DATE_TIME').alias('FAULT_DATE_TIME'),
)

In [21]:
# Join condition: a fault matches a reading when it belongs to the same
# SOURCE_KEY and occurs at or after the reading's timestamp.
fault_not_earlier = fault_rank_df['FAULT_DATE_TIME'] >= raw_df['DATE_TIME']
same_source = fault_rank_df['FAULT_SOURCE_KEY'] == raw_df['SOURCE_KEY']
rul_oncondition = fault_not_earlier & same_source

In [22]:
# Inner join: pair each reading in raw_df with every fault row that satisfies
# rul_oncondition, so a reading can appear once per matching fault.
rul_df = fault_rank_df.join(raw_df,on = rul_oncondition,how = 'inner')

In [23]:
# RUL = minutes from the reading to the fault, minus 47*15 (= 705) minutes for
# every calendar day boundary crossed.
# NOTE(review): 705 equals 24h minus the 06:00-18:15 window kept when raw_df was
# loaded, so this presumably removes the unobserved overnight gap - confirm.
minutes_to_fault = (col('FAULT_DATE_TIME').cast('long') - col('DATE_TIME').cast('long')) / 60
days_to_fault = datediff(col('FAULT_DATE_TIME'), col('DATE_TIME'))
rul_df = rul_df.withColumn('RUL', minutes_to_fault - days_to_fault * 47 * 15)

In [24]:
# Show the joined DataFrame; in this run the rendered output is its schema repr.
display(rul_df)

DataFrame[FAULT_SOURCE_KEY: string, FAULT_DATE_TIME: timestamp, DATE_TIME: timestamp, AMBIENT_TEMPERATURE: double, MODULE_TEMPERATURE: double, IRRADIATION: double, PLANT_ID: int, SOURCE_KEY: string, DC_POWER: double, AC_POWER: double, DAILY_YIELD: double, TOTAL_YIELD: double, Fault_Flag: int, TIME: string, DATE: date, RUL: double]

In [25]:
# Collapse the one-row-per-(reading, fault) join result back to one row per
# reading, keeping the smallest RUL as the regression target named 'label'.
reading_columns = [
    'SOURCE_KEY', 'DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE',
    'IRRADIATION', 'PLANT_ID', 'AC_POWER', 'DC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD',
]
rul_df = (
    rul_df.groupby(*reading_columns)
    .min('RUL')
    .withColumnRenamed('min(RUL)', 'label')
)

# rul_df = rul_df.filter(col('RUL')>=1440)
# rul_df = rul_df.withColumnRenamed('RUL','')

In [26]:
# rul_df = rul_df.withColumn('AC_POWER',col('AC_POWER') - col('AC_POWER')/(col('RUL')+1)*2)
# Narrow the frame to the columns kept for modelling plus the label.
model_columns = [
    'SOURCE_KEY',
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'AC_POWER',
    'DC_POWER',
    'DAILY_YIELD',
    'TOTAL_YIELD',
    'label',
]
rul_df = rul_df.select(*model_columns)

In [27]:
# from pyspark import keyword_only
# from pyspark.ml import Transformer
# from pyspark.ml.param.shared import HasOutputCols, Param, Params
# from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
# from pyspark.sql.functions import lit # for the dummy _transform

# class ACDC_Calulator(
#     Transformer, HasOutputCols, DefaultParamsReadable, DefaultParamsWritable,
# ):
#     value = Param(
#         Params._dummy(),
#         "value",
#         "value to fill",
#     )

#     @keyword_only
#     def __init__(self, outputCols=None, value=0.0):
#         super(ACDC_Calulator, self).__init__()
#         self._setDefault(value=0.0)
#         kwargs = self._input_kwargs
#         self._set(**kwargs)

#     @keyword_only
#     def setParams(self, outputCols=None, value=0.0):
#         """
#         setParams(self, outputCols=None, value=0.0)
#         Sets params for this SetValueTransformer.
#         """
#         kwargs = self._input_kwargs
#         return self._set(**kwargs)

#     def setValue(self, value):
#         """
#         Sets the value of :py:attr:`value`.
#         """
#         return self._set(value=value)

#     def getValue(self):
#         """
#         Gets the value of :py:attr:`value` or its default value.
#         """
#         return self.getOrDefault(self.value)

#     def _transform(self, dataset):
#         dataset = dataset.withColumn('ACDC',col('AC_POWER')/(col('DC_POWER')+1))
#         return dataset

In [28]:
# acdc_calc = ACDC_Calulator(outputCols=["a", "b"], value=123.0)
# Encode SOURCE_KEY in two steps: string -> numeric index -> one-hot vector.
str_ind = StringIndexer(inputCol='SOURCE_KEY', outputCol='SOURCE_KEY_NUM')
ohe = OneHotEncoder(inputCol='SOURCE_KEY_NUM', outputCol='encoded_Source_Key')

In [29]:
# Columns assembled into the single 'features' vector consumed by the regressors.
independent_features = [
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'AC_POWER',
    'DC_POWER',
    'DAILY_YIELD',
    'encoded_Source_Key',
]
model_input_vector = VectorAssembler(inputCols=independent_features, outputCol='features')

# rul_indexed_df = rul_indexed_df.withColumnRenamed('RUL','label')

In [30]:
# Candidate regressors. Both read the assembled 'features' vector and the
# 'label' column, and write to distinct prediction columns so their outputs
# could coexist in one DataFrame.
lr_model = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='LR_prediction',
)
# A fixed seed keeps the random forest's sampling, and therefore the trained
# model, reproducible across kernel restarts.
rf_model = RandomForestRegressor(
    featuresCol='features',
    labelCol='label',
    predictionCol='RF_prediction',
    seed=42,
)
# Keep the estimator as the cell's displayed value, as before.
rf_model


RandomForestRegressor_4a36af26463d1b4e4514

In [33]:
# acdc_calc = ACDC_Calulator()
# Stage order matters: index SOURCE_KEY, one-hot encode it, assemble the feature
# vector, then fit the random forest on the result.
pipeline_stages = [str_ind, ohe, model_input_vector, rf_model]
rf_pipeline = Pipeline(stages=pipeline_stages)

In [34]:
# Fit the whole pipeline on rul_df, then score that same frame with it.
# NOTE(review): this rebinds rf_model from the RandomForestRegressor estimator to
# the fitted pipeline model, so re-running the Pipeline(...) cell afterwards
# would pick up the fitted pipeline as its last stage. Later cells rely on this
# name, so it is left unchanged here.
rf_model = rf_pipeline.fit(rul_df)
rul_trained_df = rf_model.transform(rul_df)

In [35]:
# rfcv = CrossValidator(estimator = rf_model,
#                       estimatorParamMaps = rfparamGrid,
#                       evaluator = rfevaluator,
#                       numFolds = 5)

In [36]:
# rfcvModel = rfcv.fit(rul_trained_df)
# print(rfcvModel)

# # Use test set here so we can measure the accuracy of our model on new data
# rfpredictions = rfcvModel.transform(rul_trained_df)

# # cvModel uses the best model found from the Cross Validation
# # Evaluate best model
# print('RMSE:', rfevaluator.evaluate(rfpredictions))

In [37]:


# rul_lr_model = lr_model.fit(rul_trained_df)
# rul_trained_df = rul_lr_model.transform(rul_trained_df)
# print('Linear Regression Error: ' ,str(rul_lr_model.summary.meanAbsoluteError))

In [38]:
# rul_trained_df.head()

In [39]:
# rul_rf_model = rf_model.fit(rul_trained_df)
# rul_trained_df = rul_rf_model.transform(rul_trained_df)
# rul_rf_model.write().overwrite().save(f'{model_s3}RF_RUL_Model')
# print('Randomforest Regression Error: ' ,str(rul_rf_model.stages[-1].summary.meanAbsoluteError))

In [40]:
# Serialize the fitted pipeline as an MLeap bundle written to /tmp/rfmodel.zip;
# the transformed DataFrame is passed alongside the model.
SimpleSparkSerializer().serializeToBundle(
    rf_model, "jar:file:/tmp/rfmodel.zip", rul_trained_df
)

In [41]:
import shutil
import tarfile
import zipfile

# Start from an empty extraction directory. Otherwise files left behind by an
# earlier run (for example stage folders of a previously serialized pipeline)
# would stay under root/ and be packed into the new tarball.
shutil.rmtree("/tmp/rfmodel", ignore_errors=True)

# Unpack the MLeap bundle produced by the serialization step.
with zipfile.ZipFile("/tmp/rfmodel.zip") as zf:
    zf.extractall("/tmp/rfmodel")

# Re-pack bundle.json and root/ at the top level of a gzip-compressed tarball.
with tarfile.open("/tmp/rfmodel.tar.gz", "w:gz") as tar:
    tar.add("/tmp/rfmodel/bundle.json", arcname="bundle.json")
    tar.add("/tmp/rfmodel/root", arcname="root")

In [43]:
# Display the feature column list for reference.
independent_features

['AMBIENT_TEMPERATURE',
 'MODULE_TEMPERATURE',
 'IRRADIATION',
 'AC_POWER',
 'DC_POWER',
 'DAILY_YIELD',
 'encoded_Source_Key']

In [44]:
# Build one sample inference record from the first row of rul_df.
# The model consumes 'encoded_Source_Key', but a request carries the raw
# 'SOURCE_KEY' string, so that feature is mapped back to its source column.
# The comprehension variables are deliberately not named `col`: a loop variable
# with that name would overwrite pyspark.sql.functions.col in the notebook
# namespace and break any re-run of the earlier cells.
raw_feature_columns = [
    'SOURCE_KEY' if feature == 'encoded_Source_Key' else feature
    for feature in independent_features
]
# Fetch a single row once instead of collecting the full DataFrame per feature.
sample_row = rul_df.select(*raw_feature_columns).first()
if sample_row is None:
    # Same exception type the previous collect()[0] raised on an empty frame.
    raise IndexError('rul_df is empty; cannot build a sample feature record')
feature_list = [str(sample_row[name]) for name in raw_feature_columns]

In [47]:
# Join the sample values into one comma-separated line; it is sent to the
# endpoint as the CSV payload in the final cell.
feature_string = ','.join(feature_list)

In [49]:
# Display the feature column list again; its order is the order of the values
# in feature_string.
independent_features

['AMBIENT_TEMPERATURE',
 'MODULE_TEMPERATURE',
 'IRRADIATION',
 'AC_POWER',
 'DC_POWER',
 'DAILY_YIELD',
 'encoded_Source_Key']

In [None]:
# feature_string = feature_string[1:]

In [48]:
# Please replace the bucket name with the bucket you want to upload the model to.
s3 = boto3.resource("s3")
# S3 object keys always use "/" as the separator, so build the key explicitly
# rather than with os.path.join, which follows the local OS path separator.
file_name = "/".join(["model/RF_RUL_MODEL", "rfmodel.tar.gz"])
s3.Bucket("solarpowerbackend").upload_file("/tmp/rfmodel.tar.gz", file_name)

In [None]:

# from mleap.pyspark.spark_support import SimpleSparkSerializer

# SimpleSparkSerializer().serializeToBundle(
#     rul_rf_model, "model.zip", rul_trained_df
# )

In [53]:
import json

# Numeric inputs, listed in the same order as the values of the CSV sample
# record; the raw SOURCE_KEY string comes last.
numeric_input_names = [
    "AMBIENT_TEMPERATURE",
    "MODULE_TEMPERATURE",
    "IRRADIATION",
    "AC_POWER",
    "DC_POWER",
    "DAILY_YIELD",
]
schema_inputs = [{"name": field, "type": "double"} for field in numeric_input_names]
schema_inputs.append({"name": "SOURCE_KEY", "type": "string"})

# Schema describing the request fields and the single prediction column returned.
schema = {
    "input": schema_inputs,
    "output": {"name": "RF_prediction", "type": "double"},
}
schema_json = json.dumps(schema)
print(schema_json)

{"input": [{"name": "AMBIENT_TEMPERATURE", "type": "double"}, {"name": "MODULE_TEMPERATURE", "type": "double"}, {"name": "IRRADIATION", "type": "double"}, {"name": "AC_POWER", "type": "double"}, {"name": "DC_POWER", "type": "double"}, {"name": "DAILY_YIELD", "type": "double"}, {"name": "SOURCE_KEY", "type": "string"}], "output": {"name": "RF_prediction", "type": "double"}}


In [None]:
from time import gmtime, strftime
import time

import sagemaker
from sagemaker import get_execution_role
from sagemaker.sparkml.model import SparkMLModel

# UTC timestamp appended to the model and endpoint names.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

sess = sagemaker.Session()
role = get_execution_role()

# S3 location of the trained and serialized SparkML model uploaded earlier.
model_location = ("solarpowerbackend", "model/RF_RUL_MODEL", "rfmodel.tar.gz")
sparkml_data = "s3://{}/{}/{}".format(*model_location)

model_name = "sparkml-solarpower-" + timestamp_prefix
endpoint_name = "sparkml-solarpower-rf" + timestamp_prefix

# The schema defined above is passed through an environment variable that
# sagemaker-sparkml-serving understands.
serving_env = {"SAGEMAKER_SPARKML_SCHEMA": schema_json}
sparkml_model = SparkMLModel(
    model_data=sparkml_data,
    role=role,
    sagemaker_session=sess,
    name=model_name,
    env=serving_env,
)

# Deploy the model behind a single-instance endpoint.
sparkml_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c4.xlarge",
    endpoint_name=endpoint_name,
)

--

In [None]:
from sagemaker.predictor import (
    RealTimePredictor,
    csv_serializer,
    json_deserializer,
    json_serializer,
)
# from sagemaker.content_types import CONTENT_TYPE_CSV

# Send the comma-separated sample record to the deployed endpoint and print
# whatever the endpoint returns.
payload = feature_string
predictor = RealTimePredictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=csv_serializer,
)
prediction = predictor.predict(payload)
print(prediction)