In [0]:
from pyspark.sql.functions import to_timestamp,hour,minute,when,col,current_timestamp,date_format,lit,unix_timestamp,expr,abs,to_date,rank,datediff
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder,StringIndexer,VectorAssembler
from pyspark.ml.regression import LinearRegression,RandomForestRegressor

In [0]:
# display(raw_df)

In [0]:
# Load the source table, derive a 'HH:mm:ss' TIME string from DATE_TIME, and keep only
# rows whose TIME falls in the inclusive 06:00:00-18:15:00 window.
# TIME is a formatted string, so between() compares it lexicographically.
raw_df = (
    spark.table('solarpowerData.RUL_Dataset')
    .withColumn('TIME', date_format(col('DATE_TIME'), 'HH:mm:ss'))
    .filter(col('TIME').between('06:00:00', '18:15:00'))
)

In [0]:
# Add the calendar date of each reading as its own DATE column.
reading_date = to_date(raw_df['DATE_TIME'])
raw_df = raw_df.withColumn('DATE', reading_date)

In [0]:
# Keep just the identifying columns of the readings whose fault flag equals 1.
is_fault = col('Fault_Flag') == 1
only_fault_df = (
    raw_df
    .select('DATE_TIME', 'FAULT_FLAG', 'SOURCE_KEY', 'DATE')
    .where(is_fault)
)

In [0]:
# Preview the fault-only rows (timestamp, flag, source key, date).
display(only_fault_df)

DATE_TIME,FAULT_FLAG,SOURCE_KEY,DATE
2020-05-15T09:00:00.000+0000,1,zVJPv84UY57bAof,2020-05-15
2020-05-15T10:00:00.000+0000,1,1BY6WEcLGh8j5v7,2020-05-15
2020-05-15T10:00:00.000+0000,1,bvBOhCH3iADSZry,2020-05-15
2020-05-15T11:45:00.000+0000,1,1BY6WEcLGh8j5v7,2020-05-15
2020-05-15T15:00:00.000+0000,1,z9Y9gH1T5YWrNuG,2020-05-15
2020-05-17T11:15:00.000+0000,1,1BY6WEcLGh8j5v7,2020-05-17
2020-05-17T11:45:00.000+0000,1,pkci93gMrogZuBj,2020-05-17
2020-05-17T11:45:00.000+0000,1,uHbuxQJl8lW7ozc,2020-05-17
2020-05-17T12:45:00.000+0000,1,z9Y9gH1T5YWrNuG,2020-05-17
2020-05-21T10:00:00.000+0000,1,1BY6WEcLGh8j5v7,2020-05-21


In [0]:
# rank_window = Window.partitionBy(only_fault_df['SOURCE_KEY'],only_fault_df['DATE']).orderBy(only_fault_df['DATE_TIME'])
# fault_rank_on_day = rank().over(rank_window)

In [0]:
# Rename the fault columns with a FAULT_ prefix so they stay distinguishable from the
# reading columns of raw_df once the two frames are joined.
fault_rank_df = only_fault_df.selectExpr(
    'SOURCE_KEY as FAULT_SOURCE_KEY',
    'DATE_TIME as FAULT_DATE_TIME',
)

In [0]:
# Join predicate: pair a reading with a fault when the fault happens at or after the
# reading and both rows carry the same source key.
fault_at_or_after_reading = fault_rank_df['FAULT_DATE_TIME'] >= raw_df['DATE_TIME']
same_source_key = fault_rank_df['FAULT_SOURCE_KEY'] == raw_df['SOURCE_KEY']
rul_oncondition = fault_at_or_after_reading & same_source_key

In [0]:
# Inner join: every reading is matched with each fault that satisfies rul_oncondition, so a
# reading appears once per matching fault and readings with no matching fault are dropped.
rul_df = fault_rank_df.join(raw_df, rul_oncondition, 'inner')

In [0]:
# Inspect the joined fault/reading pairs before the RUL column is added.
display(rul_df)

FAULT_SOURCE_KEY,FAULT_DATE_TIME,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,Fault_Flag,TIME,DATE
1BY6WEcLGh8j5v7,2020-06-14T13:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T12:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T11:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T14:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T14:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T13:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T13:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T12:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15
1BY6WEcLGh8j5v7,2020-06-14T12:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15


In [0]:
# Wall-clock minutes from the reading to the paired fault (timestamps cast to epoch seconds).
elapsed_minutes = (col('FAULT_DATE_TIME').cast('long') - col('DATE_TIME').cast('long'))/60
# 47*15 = 705 minutes are removed for every calendar day crossed. 705 minutes is the length of the
# 18:15 -> 06:00 gap left by the TIME filter applied to raw_df.
# NOTE(review): presumably the intent is to count only in-window minutes - confirm.
excluded_minutes = datediff(col('FAULT_DATE_TIME'), col('DATE_TIME'))*47*15
rul_df = rul_df.withColumn('RUL', elapsed_minutes - excluded_minutes)

In [0]:
# Inspect the joined pairs again, now including the RUL column.
display(rul_df)

FAULT_SOURCE_KEY,FAULT_DATE_TIME,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,Fault_Flag,TIME,DATE,RUL
1BY6WEcLGh8j5v7,2020-06-14T13:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22470.0
1BY6WEcLGh8j5v7,2020-06-14T12:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22425.0
1BY6WEcLGh8j5v7,2020-06-14T11:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22395.0
1BY6WEcLGh8j5v7,2020-06-14T14:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22545.0
1BY6WEcLGh8j5v7,2020-06-14T14:00:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22530.0
1BY6WEcLGh8j5v7,2020-06-14T13:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22515.0
1BY6WEcLGh8j5v7,2020-06-14T13:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22500.0
1BY6WEcLGh8j5v7,2020-06-14T13:15:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22485.0
1BY6WEcLGh8j5v7,2020-06-14T12:45:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22455.0
1BY6WEcLGh8j5v7,2020-06-14T12:30:00.000+0000,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,1BY6WEcLGh8j5v7,37.14285714,3.585714286,0.0,6259559.0,0,06:00:00,2020-05-15,22440.0


In [0]:
# Collapse the one-row-per-(reading, fault) frame back to one row per reading, keeping the
# smallest RUL among the faults paired with it. Columns not listed as keys are dropped.
reading_keys = [
    'SOURCE_KEY', 'DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION',
    'PLANT_ID', 'AC_POWER', 'DC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD',
]
rul_df = (
    rul_df
    .groupBy(*reading_keys)
    .agg({'RUL': 'min'})
    .withColumnRenamed('min(RUL)', 'RUL')
)
# rul_df = rul_df.filter(col('RUL')>=1440)

In [0]:
# Show the per-reading RUL table ordered by source key, then by reading time.
rul_sorted_view = rul_df.orderBy('SOURCE_KEY', 'DATE_TIME')
display(rul_sorted_view)

SOURCE_KEY,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,PLANT_ID,AC_POWER,DC_POWER,DAILY_YIELD,TOTAL_YIELD,RUL
1BY6WEcLGh8j5v7,2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,4135001,3.585714286,37.14285714,0.0,6259559.0,240.0
1BY6WEcLGh8j5v7,2020-05-15T06:15:00.000+0000,24.01163526666667,22.35345866666667,0.0222816074666666,4135001,26.8625,278.0,2.625,6259561.625,225.0
1BY6WEcLGh8j5v7,2020-05-15T06:30:00.000+0000,23.976731266666665,22.893282,0.0494097238666666,4135001,60.68571429,626.5714286,13.28571429,6259572.286,210.0
1BY6WEcLGh8j5v7,2020-05-15T06:45:00.000+0000,24.21899,24.442443933333333,0.0953944536,4135001,119.275,1224.75,34.875,6259593.875,195.0
1BY6WEcLGh8j5v7,2020-05-15T07:00:00.000+0000,24.5373984,27.185652866666665,0.1419404433333333,4135001,170.0142857,1734.571429,72.71428571,6259631.714,180.0
1BY6WEcLGh8j5v7,2020-05-15T07:15:00.000+0000,24.8159595,28.888477857142853,0.1547126757142856,4135001,181.9375,1856.375,116.25,6259675.25,165.0
1BY6WEcLGh8j5v7,2020-05-15T07:30:00.000+0000,24.988789866666668,29.6056438,0.1487991533333333,4135001,180.5857143,1842.285714,162.4285714,6259721.429,150.0
1BY6WEcLGh8j5v7,2020-05-15T07:45:00.000+0000,25.21618033333333,29.547110933333336,0.1447934189333333,4135001,184.0625,1877.875,206.375,6259765.375,135.0
1BY6WEcLGh8j5v7,2020-05-15T08:00:00.000+0000,25.41951306666667,31.41254473333333,0.2016386213333333,4135001,318.6714286,3246.0,263.5714286,6259822.571,120.0
1BY6WEcLGh8j5v7,2020-05-15T08:15:00.000+0000,25.95908213333333,35.5287108,0.3457076533333333,4135001,384.8375,3917.5,359.875,6259918.875,105.0


In [0]:

# Stage 1: map each SOURCE_KEY string to a numeric index.
str_ind = StringIndexer(inputCol='SOURCE_KEY', outputCol='SOURCE_KEY_NUM')
# Stage 2: one-hot encode that numeric index into a vector column.
ohe = OneHotEncoder(inputCol='SOURCE_KEY_NUM', outputCol='encoded_Source_Key')

In [0]:
# Run indexer and encoder as one pipeline, fitted on (and applied to) the full RUL frame.
index_stages = [str_ind, ohe]
ohe_pipeline = Pipeline(stages=index_stages)
ohe_pipeline_model = ohe_pipeline.fit(rul_df)
rul_indexed_df = ohe_pipeline_model.transform(rul_df)

In [0]:
# Model inputs: six scalar sensor/yield columns followed by the one-hot source-key vector.
independent_features = [
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'AC_POWER',
    'DC_POWER',
    'DAILY_YIELD',
    'encoded_Source_Key',
]
# Assembler that concatenates the inputs, in the order above, into a single 'features' vector.
model_input_vector = VectorAssembler(inputCols=independent_features, outputCol='features')

# The regressors below read their target from a column named 'label'.
rul_indexed_df = rul_indexed_df.withColumnRenamed('RUL', 'label')

In [0]:
# 80/20 train/test split. The seed is fixed so that a Restart & Run All yields the same
# partition; without it every run trains and reports on a different random subset.
# NOTE(review): a fixed seed only gives identical splits if the upstream frame is produced
# deterministically - consider caching rul_indexed_df if splits still vary between runs.
rul_indexed_train_df,rul_indexed_test_df = rul_indexed_df.randomSplit([0.8,0.2], seed=42)

In [0]:
# Linear-regression estimator: reads 'features'/'label', writes its output to 'LR_prediction'.
lr_model = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='LR_prediction',
)
# Random-forest estimator: reads 'features'/'label', writes its output to 'RF_prediction'.
# The seed is set explicitly so the forest's random sampling - and therefore its predictions
# and feature importances - can be reproduced across runs.
rf_model = RandomForestRegressor(
    featuresCol='features',
    labelCol='label',
    predictionCol='RF_prediction',
    seed=42,
)


In [0]:
# Pipeline whose only stage is the vector assembler; applying it to the training split adds
# the 'features' column that the regressors consume.
model_training_stages = [model_input_vector]
rul_pipeline = Pipeline(stages=model_training_stages)
rul_pipeline_model = rul_pipeline.fit(rul_indexed_train_df)
rul_trained_df = rul_pipeline_model.transform(rul_indexed_train_df)

In [0]:
# Bare expression: the cell output is the DataFrame's repr, not its rows.
rul_trained_df

In [0]:


# Fit the linear regression on the assembled training frame and append its predictions
# (column 'LR_prediction') to that same frame.
rul_lr_model = lr_model.fit(rul_trained_df)
rul_trained_df = rul_lr_model.transform(rul_trained_df)
# Mean absolute error on the data the model was fitted on (training error).
print('Linear Regression Error: ' ,str(rul_lr_model.summary.meanAbsoluteError))

# The training error alone says nothing about generalisation, and the 20% split was otherwise
# never used. Assemble the held-out rows the same way and report their MAE as well.
rul_test_features_df = model_input_vector.transform(rul_indexed_test_df)
rul_lr_test_summary = rul_lr_model.evaluate(rul_test_features_df)
print('Linear Regression Error (held-out test): ' ,str(rul_lr_test_summary.meanAbsoluteError))

In [0]:
# Fit the random forest on the training frame. That frame already carries the LR_prediction
# column at this point; the estimator is configured to read only 'features' and 'label'.
rul_rf_model = rf_model.fit(rul_trained_df)
# Append the forest's predictions (column 'RF_prediction') to the training frame.
rul_trained_df = rul_rf_model.transform(rul_trained_df)
# NOTE(review): the disabled line below indexes `.stages`, but rul_rf_model is the result of
# fitting the estimator directly, not a pipeline - confirm before re-enabling.
# print('Randomforest Regression Error: ' ,str(rul_rf_model.stages[-1].summary.meanAbsoluteError))

In [0]:
# featureImportances holds one entry per slot of the assembled 'features' vector, not one per
# input column. The six scalar columns occupy one slot each, and the one-hot vector
# 'encoded_Source_Key' (last in independent_features) occupies all remaining slots.
# Zipping the names directly against the vector would stop after seven entries and report only
# the first one-hot slot for 'encoded_Source_Key', silently dropping the rest.
importances = rul_rf_model.featureImportances.toArray()
scalar_features = independent_features[:-1]
for i in zip(scalar_features, (float(score) for score in importances[:len(scalar_features)])):
  print(i)
# Report the one-hot encoded column as the sum over all of its slots.
print((independent_features[-1], float(importances[len(scalar_features):].sum())))

In [0]:
# Compare both models' predictions with the true label on the training rows.
# NOTE(review): 'RF_Prediction'/'LR_Prediction' differ in case from the prediction column names
# set on the estimators ('RF_prediction'/'LR_prediction'); this presumably relies on
# case-insensitive column resolution - confirm the session setting.
display(rul_trained_df.select('SOURCE_KEY','DATE_TIME','RF_Prediction','LR_Prediction','label'))

SOURCE_KEY,DATE_TIME,RF_Prediction,LR_Prediction,label
1BY6WEcLGh8j5v7,2020-05-15T06:00:00.000+0000,1492.8727435615651,1676.9968061894442,240.0
1BY6WEcLGh8j5v7,2020-05-15T06:30:00.000+0000,1531.7112893499586,1590.691254321402,210.0
1BY6WEcLGh8j5v7,2020-05-15T06:45:00.000+0000,1531.7112893499586,1573.4689747629943,195.0
1BY6WEcLGh8j5v7,2020-05-15T07:00:00.000+0000,1531.7112893499586,1506.9580856546354,180.0
1BY6WEcLGh8j5v7,2020-05-15T07:15:00.000+0000,1531.7112893499586,1512.0755258955503,165.0
1BY6WEcLGh8j5v7,2020-05-15T07:30:00.000+0000,1531.7112893499586,1538.5590532057258,150.0
1BY6WEcLGh8j5v7,2020-05-15T08:30:00.000+0000,1512.979047513593,1283.2366657357916,90.0
1BY6WEcLGh8j5v7,2020-05-15T08:45:00.000+0000,1512.979047513593,1579.5554158766572,75.0
1BY6WEcLGh8j5v7,2020-05-15T09:00:00.000+0000,1441.9601956500726,1225.957894776613,60.0
1BY6WEcLGh8j5v7,2020-05-15T09:15:00.000+0000,1461.0837458020485,1553.3224048610136,45.0


In [0]:
# Add the DC-to-AC power ratio (DC_POWER divided by AC_POWER) as column 'acdc'.
dc_over_ac = rul_trained_df['DC_POWER'] / rul_trained_df['AC_POWER']
rul_trained_df = rul_trained_df.withColumn('acdc', dc_over_ac)

In [0]:
# Show the scalar inputs (everything except the trailing one-hot vector) next to the label
# for one source key.
scalar_feature_cols = independent_features[:-1]
single_source_df = rul_trained_df.filter(col('SOURCE_KEY') == '1BY6WEcLGh8j5v7')
display(single_source_df.select('DATE_TIME', *scalar_feature_cols, 'label'))

DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,AC_POWER,DC_POWER,DAILY_YIELD,label
2020-05-15T06:00:00.000+0000,24.088446066666663,22.2067566,0.0058869571866666,3.585714286,37.14285714,0.0,240.0
2020-05-15T06:30:00.000+0000,23.976731266666665,22.893282,0.0494097238666666,60.68571429,626.5714286,13.28571429,210.0
2020-05-15T06:45:00.000+0000,24.21899,24.442443933333333,0.0953944536,119.275,1224.75,34.875,195.0
2020-05-15T07:00:00.000+0000,24.5373984,27.185652866666665,0.1419404433333333,170.0142857,1734.571429,72.71428571,180.0
2020-05-15T07:15:00.000+0000,24.8159595,28.888477857142853,0.1547126757142856,181.9375,1856.375,116.25,165.0
2020-05-15T07:30:00.000+0000,24.988789866666668,29.6056438,0.1487991533333333,180.5857143,1842.285714,162.4285714,150.0
2020-05-15T08:30:00.000+0000,26.430782066666666,40.3180586,0.4053485726666666,424.1285714,4322.0,464.0,90.0
2020-05-15T08:45:00.000+0000,26.8318298,39.08195373333332,0.3124267953333333,417.7875,4257.125,556.125,75.0
2020-05-15T09:00:00.000+0000,27.6209698,45.0092326,0.6231526486666666,559.2285714,5706.714286,688.8571429,60.0
2020-05-15T09:15:00.000+0000,27.988362071428572,46.6177065,0.3448840357142857,394.0125,4015.5,811.625,45.0
