In [0]:
from pyspark.sql.functions import to_timestamp,hour,minute,when,col,current_timestamp,date_format,lit,unix_timestamp,expr,abs,to_date,rank,datediff
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator,TrainValidationSplit
from pyspark.ml.regression import LinearRegression,RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# display(raw_df)

In [0]:
# Load the source table, derive a wall-clock TIME string from DATE_TIME, and keep
# only the readings whose time of day lies in 06:00:00-18:15:00 (inclusive).
raw_df = (
    spark.table('solarpowerData.RUL_Dataset')
    .withColumn('TIME', date_format(col('DATE_TIME'), 'HH:mm:ss'))
    .filter(col('TIME').between('06:00:00', '18:15:00'))
)

In [0]:
# Add the calendar date of each reading as a separate DATE column.
raw_df = raw_df.withColumn(
    'DATE',
    to_date(col('DATE_TIME')),
)

In [0]:
# Narrow to the identifying columns and keep only the rows whose fault flag is 1.
only_fault_df = (
    raw_df
    .select('DATE_TIME', 'FAULT_FLAG', 'SOURCE_KEY', 'DATE')
    .where(col('Fault_Flag') == 1)
)

In [0]:
# Inspect the fault-only rows.
display(only_fault_df)

DATE_TIME,FAULT_FLAG,SOURCE_KEY,DATE
2020-05-25T10:15:00.000+0000,1,ih0vzX44oOqAx2f,2020-05-25
2020-06-07T12:15:00.000+0000,1,1BY6WEcLGh8j5v7,2020-06-07
2020-06-07T12:15:00.000+0000,1,bvBOhCH3iADSZry,2020-06-07
2020-06-07T12:30:00.000+0000,1,1BY6WEcLGh8j5v7,2020-06-07
2020-06-07T12:30:00.000+0000,1,bvBOhCH3iADSZry,2020-06-07
2020-06-07T12:30:00.000+0000,1,wCURE6d3bPkepu2,2020-06-07
2020-06-07T12:30:00.000+0000,1,z9Y9gH1T5YWrNuG,2020-06-07
2020-06-07T12:45:00.000+0000,1,1BY6WEcLGh8j5v7,2020-06-07
2020-06-07T12:45:00.000+0000,1,bvBOhCH3iADSZry,2020-06-07
2020-06-07T12:45:00.000+0000,1,sjndEbLyjtCKgGv,2020-06-07


In [0]:
# rank_window = Window.partitionBy(only_fault_df['SOURCE_KEY'],only_fault_df['DATE']).orderBy(only_fault_df['DATE_TIME'])
# fault_rank_on_day = rank().over(rank_window)

In [0]:
# Fault events with FAULT_-prefixed column names, so they stay distinguishable
# from the SOURCE_KEY / DATE_TIME columns of raw_df after a join.
fault_rank_df = only_fault_df.select(
    col('SOURCE_KEY').alias('FAULT_SOURCE_KEY'),
    col('DATE_TIME').alias('FAULT_DATE_TIME'),
)

In [0]:
# Join predicate: a reading matches a fault event when the fault happens at or
# after the reading and both rows carry the same SOURCE_KEY.
rul_oncondition = (
    (fault_rank_df['FAULT_DATE_TIME'] >= raw_df['DATE_TIME'])
    & (fault_rank_df['FAULT_SOURCE_KEY'] == raw_df['SOURCE_KEY'])
)

In [0]:
# Inner join: every reading is paired with each fault event that satisfies
# rul_oncondition; readings with no matching fault event are dropped.
rul_df = fault_rank_df.join(raw_df, rul_oncondition, 'inner')

In [0]:
# Inspect the joined frame (fault-event columns followed by the raw_df columns).
display(rul_df)

FAULT_SOURCE_KEY,FAULT_DATE_TIME,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,PLANT_ID,SOURCE_KEY,AC_POWER,DC_POWER,DAILY_YIELD,TOTAL_YIELD,Fault_Flag,TIME,DATE
McdE0feGgRqW7Ca,2020-06-07T13:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,McdE0feGgRqW7Ca,5.25,54.375,0.0,7158964.0,0,06:00:00,2020-05-15
McdE0feGgRqW7Ca,2020-06-07T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,McdE0feGgRqW7Ca,5.25,54.375,0.0,7158964.0,0,06:00:00,2020-05-15
McdE0feGgRqW7Ca,2020-06-07T13:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,McdE0feGgRqW7Ca,5.25,54.375,0.0,7158964.0,0,06:00:00,2020-05-15
bvBOhCH3iADSZry,2020-06-14T14:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15
bvBOhCH3iADSZry,2020-06-14T13:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15
bvBOhCH3iADSZry,2020-06-14T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15
bvBOhCH3iADSZry,2020-06-07T13:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15
bvBOhCH3iADSZry,2020-06-07T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15
bvBOhCH3iADSZry,2020-06-07T13:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15
bvBOhCH3iADSZry,2020-06-07T12:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15


In [0]:
# RUL: time from each reading to its paired fault event, in minutes.
#   * Both timestamps are cast to long and subtracted; dividing by 60 turns the
#     difference (seconds, for Spark timestamps) into minutes.
#   * datediff(...) * 47 * 15 then removes 705 minutes for every calendar day
#     between the reading's date and the fault's date.
# NOTE(review): 705 min equals 24 h minus the 735-minute 06:00-18:15 window kept
# by the TIME filter on raw_df, so this presumably discounts the filtered-out
# overnight gap -- confirm, and update the constant if that window changes.
rul_df = rul_df.withColumn('RUL',(col('FAULT_DATE_TIME').cast('long') - col('DATE_TIME').cast('long'))/60 - datediff(col('FAULT_DATE_TIME'),col('DATE_TIME'))*47*15)

In [0]:
# Inspect rul_df again, now including the RUL column.
display(rul_df)

FAULT_SOURCE_KEY,FAULT_DATE_TIME,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,PLANT_ID,SOURCE_KEY,AC_POWER,DC_POWER,DAILY_YIELD,TOTAL_YIELD,Fault_Flag,TIME,DATE,RUL
McdE0feGgRqW7Ca,2020-06-07T13:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,McdE0feGgRqW7Ca,5.25,54.375,0.0,7158964.0,0,06:00:00,2020-05-15,17355.0
McdE0feGgRqW7Ca,2020-06-07T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,McdE0feGgRqW7Ca,5.25,54.375,0.0,7158964.0,0,06:00:00,2020-05-15,17340.0
McdE0feGgRqW7Ca,2020-06-07T13:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,McdE0feGgRqW7Ca,5.25,54.375,0.0,7158964.0,0,06:00:00,2020-05-15,17325.0
bvBOhCH3iADSZry,2020-06-14T14:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15,22530.0
bvBOhCH3iADSZry,2020-06-14T13:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15,22515.0
bvBOhCH3iADSZry,2020-06-14T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15,22485.0
bvBOhCH3iADSZry,2020-06-07T13:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15,17355.0
bvBOhCH3iADSZry,2020-06-07T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15,17340.0
bvBOhCH3iADSZry,2020-06-07T13:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15,17325.0
bvBOhCH3iADSZry,2020-06-07T12:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,bvBOhCH3iADSZry,3.571428571,37.0,0.0,6316803.0,0,06:00:00,2020-05-15,17310.0


In [0]:
# Collapse the one-row-per-(reading, fault) frame back to one row per reading,
# keeping the smallest RUL among the paired fault events. The aggregate column
# comes out named 'min(RUL)' and is renamed back to 'RUL'.
rul_df = (
    rul_df
    .groupby(
        'SOURCE_KEY',
        'DATE_TIME',
        'AMBIENT_TEMPERATURE',
        'MODULE_TEMPERATURE',
        'IRRADIATION',
        'PLANT_ID',
        'AC_POWER',
        'DC_POWER',
        'DAILY_YIELD',
        'TOTAL_YIELD',
    )
    .min('RUL')
    .withColumnRenamed('min(RUL)', 'RUL')
)
# Disabled filter kept for reference:
# rul_df = rul_df.filter(col('RUL')>=1440)

In [0]:
# Overwrite AC_POWER with AC_POWER * (1 - 2 / (RUL + 1)).
# NOTE(review): after this line AC_POWER is a function of RUL. RUL is later
# renamed to 'label' and used as the regression target while AC_POWER (and the
# ACDC ratio derived from it) are model inputs, so the target leaks into the
# features -- confirm this synthetic adjustment is intended.
# NOTE(review): at RUL == 0 the factor is -1, which flips the sign of AC_POWER.
# The cell is also not idempotent: re-running it rescales the already-rescaled
# column.
rul_df = rul_df.withColumn('AC_POWER',col('AC_POWER') - col('AC_POWER')/(col('RUL')+1)*2)

In [0]:
# AC-to-DC ratio feature. The denominator is DC_POWER + 1 -- presumably so the
# ratio stays defined when DC_POWER is 0.
rul_df = rul_df.withColumn(
    'ACDC',
    col('AC_POWER') / (col('DC_POWER') + 1),
)

In [0]:
# Show the readings with RUL below 100, ordered by SOURCE_KEY and then time.
display(
    rul_df
    .sort(['SOURCE_KEY', 'DATE_TIME'])
    .filter(col('RUL') < 100)
)

SOURCE_KEY,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,PLANT_ID,AC_POWER,DC_POWER,DAILY_YIELD,TOTAL_YIELD,RUL,ACDC
1BY6WEcLGh8j5v7,2020-06-07T10:45:00.000+0000,26.655895933333333,52.52440386666667,1.0379223366666668,4135001,1189.987283725275,12471.85714,2026.428571,6420508.429,90.0,0.0954061503606121
1BY6WEcLGh8j5v7,2020-06-07T11:00:00.000+0000,27.075844133333334,53.64908433333333,0.8485740693333335,4135001,1047.599013157895,11020.625,2310.375,6420792.375,75.0,0.0950494154135977
1BY6WEcLGh8j5v7,2020-06-07T11:15:00.000+0000,27.4672506,58.268178266666666,1.093312369333333,4135001,1213.479391377049,12864.28571,2608.142857,6421090.143,60.0,0.0943219932095117
1BY6WEcLGh8j5v7,2020-06-07T11:30:00.000+0000,27.8593812,57.1712628,0.855925206,4135001,1017.6434782608696,10891.5,2902.375,6421384.375,45.0,0.0934260709902106
1BY6WEcLGh8j5v7,2020-06-07T11:45:00.000+0000,27.78503,53.52336233333334,0.8263966746666666,4135001,967.4506916451612,10586.57143,3159.0,6421641.0,30.0,0.0913760722221792
1BY6WEcLGh8j5v7,2020-06-07T12:00:00.000+0000,28.042243,56.312655,1.017537535,4135001,1062.6109375,12455.625,3448.125,6421930.125,15.0,0.085304882943815
1BY6WEcLGh8j5v7,2020-06-07T12:15:00.000+0000,28.7483338,59.98777086666667,1.0242290946666668,4135001,-167.2142857,1715.0,3633.285714,6422115.286,0.0,-0.0974442224358974
1BY6WEcLGh8j5v7,2020-06-07T12:30:00.000+0000,28.884908,59.14914326666666,1.00650392,4135001,0.0,0.0,3634.0,6422116.0,0.0,0.0
1BY6WEcLGh8j5v7,2020-06-07T12:45:00.000+0000,29.4468058,57.7418068,0.9985397593333332,4135001,0.0,0.0,3634.0,6422116.0,0.0,0.0
1BY6WEcLGh8j5v7,2020-06-07T13:00:00.000+0000,30.082822666666665,56.89508453333334,0.9434325606666668,4135001,0.0,0.0,3634.0,6422116.0,0.0,0.0


In [0]:

# Categorical encoding of SOURCE_KEY: string -> numeric index -> one-hot vector.
str_ind = StringIndexer(inputCol='SOURCE_KEY', outputCol='SOURCE_KEY_NUM')
ohe = OneHotEncoder(inputCol='SOURCE_KEY_NUM', outputCol='encoded_Source_Key')

In [0]:
# Run the indexer and the encoder as one pipeline; it is fitted on, and applied
# to, the full rul_df.
index_stages = [str_ind, ohe]
ohe_pipeline = Pipeline().setStages(index_stages)
rul_indexed_df = (
    ohe_pipeline
    .fit(rul_df)
    .transform(rul_df)
)

In [0]:
# Columns that feed the regressors; the one-hot encoded source key comes last.
independent_features = [
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'AC_POWER',
    'ACDC',
    'DC_POWER',
    'DAILY_YIELD',
    'encoded_Source_Key',
]

# Packs the columns above, in that order, into a single 'features' vector.
model_input_vector = VectorAssembler(
    inputCols=independent_features,
    outputCol='features',
)

# The regression target is RUL, exposed under the name 'label'.
rul_indexed_df = rul_indexed_df.withColumnRenamed('RUL', 'label')

In [0]:
# 80/20 train/test split. A fixed seed is passed so that re-running the notebook
# on the same input reproduces the same split (and therefore comparable metrics)
# instead of drawing a fresh random partition each time.
rul_indexed_train_df, rul_indexed_test_df = rul_indexed_df.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Two regressors over the same 'features' / 'label' columns. Each writes to its
# own prediction column so both outputs can live side by side in one frame.
lr_model = LinearRegression(featuresCol='features', labelCol='label')
lr_model.setPredictionCol('LR_prediction')
rf_model = RandomForestRegressor(featuresCol='features', labelCol='label')
rf_model.setPredictionCol('RF_prediction')


In [0]:
# Pipeline with the vector assembler as its only stage; applied to the training
# split it yields the frame (with a 'features' column) that the models are fit on.
model_training_stages = [model_input_vector]
rul_pipeline = Pipeline(stages=model_training_stages)
rul_trained_df = (
    rul_pipeline
    .fit(rul_indexed_train_df)
    .transform(rul_indexed_train_df)
)

In [0]:
# Random-forest search grid: 5 depths x 5 bin counts x 5 forest sizes = 125
# candidate parameter maps.
# Commented-out smaller alternative kept for reference:
#   maxDepth [2, 5, 10], maxBins [5, 10, 20], numTrees [5, 20, 50]
rfparamGrid = (
    ParamGridBuilder()
    .addGrid(rf_model.maxDepth, [2, 5, 10, 20, 30])
    .addGrid(rf_model.maxBins, [10, 20, 40, 80, 100])
    .addGrid(rf_model.numTrees, [5, 20, 50, 100, 500])
    .build()
)

# Candidates are compared by RMSE between 'RF_prediction' and 'label'.
rfevaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="RF_prediction",
    metricName="rmse",
)

In [0]:
# 5-fold cross-validation of rf_model over rfparamGrid, scored by rfevaluator.
rfcv = CrossValidator(
    numFolds=5,
    evaluator=rfevaluator,
    estimatorParamMaps=rfparamGrid,
    estimator=rf_model,
)

In [0]:
# Run the cross-validated search on the assembled training frame.
rfcvModel = rfcv.fit(rul_trained_df)
print(rfcvModel)

# Measure accuracy on the held-out test split, i.e. on rows the model has not
# seen. The test rows first need the same 'features' vector as the training
# rows; VectorAssembler is a plain transformer, so it can be applied directly.
# (Previously the training frame itself was scored here, which reports an
# optimistic in-sample error rather than performance on new data.)
rul_test_features_df = model_input_vector.transform(rul_indexed_test_df)
rfpredictions = rfcvModel.transform(rul_test_features_df)

# rfcvModel.transform uses the best model found by the cross-validation.
print('RMSE:', rfevaluator.evaluate(rfpredictions))

In [0]:


# Fit the linear regression on the training frame and append its predictions
# ('LR_prediction') to that same frame.
rul_lr_model = lr_model.fit(rul_trained_df)
rul_trained_df = rul_lr_model.transform(rul_trained_df)

# The value reported is the mean absolute error from the model's training summary.
print(
    'Linear Regression Error: ',
    str(rul_lr_model.summary.meanAbsoluteError),
)

In [0]:
# Fit rf_model itself on the training frame (a separate fit from the
# cross-validated rfcvModel) and append its predictions ('RF_prediction').
rul_rf_model = rf_model.fit(
    rul_trained_df
)
rul_trained_df = rul_rf_model.transform(
    rul_trained_df
)
# Disabled error report kept for reference:
# print('Randomforest Regression Error: ' ,str(rul_rf_model.stages[-1].summary.meanAbsoluteError))

In [0]:
# featureImportances holds one entry per slot of the assembled 'features'
# vector, not one per input column. The one-hot 'encoded_Source_Key' column is
# listed last in independent_features and expands to several slots, so zipping
# names against the raw vector would pair it with only its first slot and
# silently drop the rest.
# Assumes every column before the last contributes exactly one slot.
importances = rul_rf_model.featureImportances.toArray()
n_scalar = len(independent_features) - 1

# Single-slot columns: report name/importance pairs one-to-one.
for i in zip(independent_features[:n_scalar], importances[:n_scalar].tolist()):
  print(i)

# One-hot column: its importance is the sum over all of its slots.
print((independent_features[-1], float(importances[n_scalar:].sum())))

In [0]:
# List the distinct SOURCE_KEY values present in the training frame;
# collect() returns them to the driver as a list of Row objects.
rul_trained_df.select('SOURCE_KEY').distinct().collect()

In [0]:
# Compare both models' predictions with the true label for a single source key.
display(
    rul_trained_df
    .filter(col('SOURCE_KEY') == 'bvBOhCH3iADSZry')
    .select(
        'SOURCE_KEY',
        'DATE_TIME',
        'RF_Prediction',
        'LR_Prediction',
        'label',
    )
)

SOURCE_KEY,DATE_TIME,RF_Prediction,LR_Prediction,label
bvBOhCH3iADSZry,2020-05-15T06:30:00.000+0000,7829.081043901271,9776.27762688249,17250.0
bvBOhCH3iADSZry,2020-05-15T06:45:00.000+0000,8112.226501604467,9731.541000203004,17235.0
bvBOhCH3iADSZry,2020-05-15T07:00:00.000+0000,9173.064519411626,9550.06084372016,17220.0
bvBOhCH3iADSZry,2020-05-15T07:15:00.000+0000,8844.50308616581,9590.56044110706,17205.0
bvBOhCH3iADSZry,2020-05-15T07:30:00.000+0000,8844.50308616581,9722.830332895166,17190.0
bvBOhCH3iADSZry,2020-05-15T07:45:00.000+0000,9119.137166248976,10123.941412294473,17175.0
bvBOhCH3iADSZry,2020-05-15T08:15:00.000+0000,8171.687324871156,9275.427776340344,17145.0
bvBOhCH3iADSZry,2020-05-15T08:30:00.000+0000,8171.687324871156,8828.332465581552,17130.0
bvBOhCH3iADSZry,2020-05-15T09:00:00.000+0000,9351.828885430505,8920.708123845827,17100.0
bvBOhCH3iADSZry,2020-05-15T09:15:00.000+0000,9361.980849884643,9945.91189697948,17085.0


In [0]:
# Compute DC_POWER / AC_POWER and store it under the name 'acdc'.
# NOTE(review): the frame already carries an 'ACDC' column, defined earlier as
# AC_POWER / (DC_POWER + 1) and used as a model feature. Under Spark's default
# case-insensitive column resolution, withColumn('acdc', ...) would replace that
# column instead of adding a second one, so later reads of 'ACDC' would see the
# inverse ratio -- confirm this is intended or pick a distinct name.
rul_trained_df = rul_trained_df.withColumn('acdc',col('DC_POWER')/col('AC_POWER'))

In [0]:
# Show the timestamp, every input feature except the last entry of
# independent_features (the one-hot vector), and the label for one source key.
display(
    rul_trained_df
    .filter(col('SOURCE_KEY') == '1BY6WEcLGh8j5v7')
    .select('DATE_TIME', *independent_features[:-1], 'label')
)