In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datetime as datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

![](https://assets.newatlas.com/dims4/default/9d6cc41/2147483647/strip/true/crop/7360x4907+0+3/resize/1200x800!/quality/90/?url=http%3A%2F%2Fnewatlas-brightspot.s3.amazonaws.com%2Farchive%2Fbacteria-powered-solar-cell-1.jpg)

> Import all data

In [None]:
# Load the raw CSVs for both plants: *_Generation_Data holds the inverter
# readings (solar01 / solar02), *_Weather_Sensor_Data holds the sensor
# readings (sensor01 / sensor02).
solar01 = pd.read_csv('../input/solar-power-generation-data/Plant_1_Generation_Data.csv')
sensor01 = pd.read_csv('../input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv')
solar02 = pd.read_csv('../input/solar-power-generation-data/Plant_2_Generation_Data.csv')
sensor02 = pd.read_csv('../input/solar-power-generation-data/Plant_2_Weather_Sensor_Data.csv')

# Quick Data Analysis

Check Plant 1 and Plant 2 generation data

In [None]:
solar01.head()

In [None]:
solar02.head()

In [None]:
print("Solar Generation Plant 1's info")
solar01.info()
print('\n')
print("Solar Generation Plant 2's info")
solar02.info()

The DATE_TIME column uses a different format in each plant's file, and its dtype is object rather than datetime64.
<br>
Check missing values.

In [None]:
print('Plant 1')
solar01.isnull().sum()

In [None]:
print('Plant 2')
solar02.isnull().sum()

How many inverters are there in Plant 1 and Plant 2?

In [None]:
print ('Plant 1 has '+ str(solar01['SOURCE_KEY'].nunique()) + ' inverters')
print ('Plant 2 has '+ str(solar02['SOURCE_KEY'].nunique()) + ' inverters')

Check data for each inverter

In [None]:
print('Plant 1')
solar01.groupby('SOURCE_KEY').count()

In [None]:
print('Plant 2')
solar02.groupby('SOURCE_KEY').count()

Wait a minute.
<br>
If this is 34 days of data at a 15-minute frequency,
<br>
there should be 34 days x 24 hours x 4 = 3264 rows for each inverter.

Check Sensor 1 and Sensor 2 data.

In [None]:
sensor01.head()

In [None]:
sensor02.head()

In [None]:
print("Sensor 1's info")
sensor01.info()
print('\n')
print("Sensor 2's info")
sensor02.info()

How many source key in sensor data?

In [None]:
print('Sensor 1 has '+str(sensor01['SOURCE_KEY'].nunique())+' source key')
print('Sensor 2 has '+str(sensor02['SOURCE_KEY'].nunique())+' source key')

Check missing values

In [None]:
print('Sensor 1')
sensor01.isnull().sum()

In [None]:
# Missing values per column for the Plant 2 sensor (the cell previously
# inspected sensor01 again under the "Sensor 2" heading).
print('Sensor 2')
sensor02.isnull().sum()

In [None]:
print('Sensor 1')
sensor01.count()

In [None]:
print('Sensor 2')
sensor02.count()

In [None]:
print('the total should be '+ str(34*24*4)+' rows')

# Data Cleaning

Task for solar data:<br>
1. Change format of datetime column as datetime64 <br>
2. Replace temporary source key with simple value <br>
3. Complete the missing data. <br>
<br>
Task for sensor data:<br>
1. Change format of datetime column as datetime64 <br>
2. Complete the missing data.

In [None]:
# Lower-case the column names and discard the constant plant_id column for
# both generation frames (modified in place).
for generation in (solar01, solar02):
    generation.columns = generation.columns.str.lower()
    generation.drop(columns='plant_id', inplace=True)

Change datetime format and create date and time column.

In [None]:
# The two plants store DATE_TIME in different text formats, so each one is
# parsed with its own format string.
solar01['date_time'] = pd.to_datetime(solar01['date_time'], format='%d-%m-%Y %H:%M')
solar02['date_time'] = pd.to_datetime(solar02['date_time'], format='%Y-%m-%d %H:%M:%S')

# Split the timestamp into separate calendar-date and time-of-day columns.
for generation in (solar01, solar02):
    generation['date'] = generation['date_time'].dt.date
    generation['time'] = generation['date_time'].dt.time

Replace temporary source key with simple value.
<br>
Also keep them in variable.

In [None]:
# Keep the original inverter keys (in order of first appearance) so that an
# integer id can be translated back later: solar01_inverter_id[i] is the key
# that was replaced by id i.
solar01_inverter_id = solar01['source_key'].unique()
solar02_inverter_id = solar02['source_key'].unique()

# Replace every key by its position in the arrays above. A dict lookup avoids
# scanning the key array once per row and avoids int() on a 1-element array,
# which NumPy >= 1.25 deprecates.
solar01['source_key'] = solar01['source_key'].map(
    {key: idx for idx, key in enumerate(solar01_inverter_id)})
solar02['source_key'] = solar02['source_key'].map(
    {key: idx for idx, key in enumerate(solar02_inverter_id)})

In [None]:
solar01.head()

In [None]:
solar02.head()

In [None]:
solar01[(solar01['source_key']==0) & (solar01['date_time'].between('2020-05-15','2020-05-21'))]

Next, we sample the data for source_key = 0 at Plant 1 from 2020-05-15 until 2020-05-20,
<br>
to learn something that can be useful for filling in the missing data.

In [None]:
# Work on an explicit copy of the filtered rows: assigning a column to a
# boolean-mask slice triggers SettingWithCopyWarning and is unreliable.
data = solar01[(solar01['source_key']==0) & (solar01['date_time'].between('2020-05-15','2020-05-21'))].copy()
# Time of day as text so seaborn treats it as a categorical x axis.
data['time'] = data['time'].astype(str)
g = sns.relplot(
        data=data,
        x='time',
        y='dc_power',
        row='date',
        kind='line',
        height=2,
        aspect=6)

g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

It seems nothing is missing.
<br>
But when we use a scatter plot, we can see the missing data.

In [None]:
# Explicit copy so the column assignment below does not write to a slice of solar01.
data = solar01[(solar01['source_key']==0) & (solar01['date_time'].between('2020-05-15','2020-05-21'))].copy()
data['time'] = data['time'].astype(str)
# A scatter plot (unlike a line plot) leaves gaps visible where rows are missing.
g = sns.relplot(
        data=data,
        x='time',
        y='dc_power',
        row='date',
        kind='scatter',
        height=2,
        aspect=6
        )

g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

As we can see from those sample, we can conclude 2 things.
<br>
First, from 00:00 to 05:45 dc_power always zero (0).
<br>
Second, after 18:30 until midnight dc_power always zero (0) too.
<br>
those condition also same with ac_power.
<br>
We can use those to fill missing data, like on date 2020-05-16.

In [None]:
# Explicit copy so the column assignment below does not write to a slice of solar01.
data = solar01[(solar01['source_key']==0) & (solar01['date_time'].between('2020-05-15','2020-05-21'))].copy()
data['time'] = data['time'].astype(str)
# One panel per date showing how daily_yield evolves over the day.
g = sns.relplot(
        data=data,
        x='time',
        y='daily_yield',
        row='date',
        kind='scatter',
        height=2,
        aspect=6
        )

g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

Next we investigate daily_yield. We also have 2 conclusion here.
<br>
First, from 00:00 until 05:45 daily_yield is zero (0).
<br>
Second,after 18:30 until midnight daily_yield's value is the same as at 18.15.
<br> 
this makes sense, because without dc_power or ac_power daily_yield will not increase.
<br>
I saw a little mistake, sometimes at 00:00 daily_yield's value is not zero. We can fix them.

In [None]:
# Explicit copy so the column assignment below does not write to a slice of solar01.
data = solar01[(solar01['source_key']==0) & (solar01['date_time'].between('2020-05-15','2020-05-21'))].copy()
# Timestamps as text so the string xlim/xticks below match the axis values.
data['date_time'] = data['date_time'].astype(str)
g = sns.relplot(
        data=data,
        x='date_time',
        y='total_yield',
        kind='scatter',
        height=6,
        aspect=2
        )

g.set(xlim=('2020-05-15 00:00:00', '2020-05-21 00:00:00'), xticks=['2020-05-15 00:00:00','2020-05-17 00:00:00','2020-05-19 00:00:00','2020-05-21 00:00:00'])

Now we are going to fill those missing data.
<br>
First we create datetime from 2020-05-15 to 2020-06-17 with freq 15mins.
<br>
Then we called it fulltime.

In [None]:
# Complete 15-minute grid covering the 34 observed days. '15min' is used
# instead of the '15T' alias, which recent pandas versions deprecate.
fulltime = pd.date_range(start='2020-05-15 00:00', end='2020-06-17 23:45', freq='15min')
# One-column frame so it can be merged with the data on 'date_time'.
fulltime = pd.DataFrame({'date_time': fulltime})
fulltime

Now we try to fill data that source key is 0.

In [None]:
solar01_inv_0 = solar01[solar01['source_key']==0].reset_index(drop=True)

And then merge them with variable fulltime

In [None]:
solar01_inv_0 = pd.merge(fulltime, solar01_inv_0, how='outer')
solar01_inv_0

Now we going to use date_time column as index

In [None]:
# Move date_time out of the columns and into the index in one step.
solar01_inv_0 = solar01_inv_0.set_index('date_time')

Check missing value

In [None]:
sns.heatmap(solar01_inv_0.isnull())

Fill missing value  in date and time by extracting from date_time index.
<br>
Fill source key by 0, because we focus on source key=0 now.

In [None]:
# date and time can always be rebuilt from the timestamp index, and every row
# of this frame belongs to inverter 0.
solar01_inv_0 = solar01_inv_0.assign(
    date=solar01_inv_0.index.date,
    time=solar01_inv_0.index.time,
    source_key=0,
)

In [None]:
solar01_inv_0.isnull().sum()

So we have dc_power, ac_power, daily_yield and total_yield that contains missing value.
<br>
we will divided this dataframe into 3 group:
<br>
  early_morning (data from 00:00:00 until 05:45:00)
<br>
  afternoon     (data from 06:00:00 until 18:30:00)
<br>
  night         (data from 18:45:00 until 23:45:00)

In [None]:
# Split the inverter-0 frame by time of day using its DatetimeIndex.
# The three windows do not overlap and together cover the whole 15-minute grid
# (00:00-05:45, 06:00-18:30, 18:45-23:45).
early_morning = solar01_inv_0.between_time('00:00:00','05:45:00')
afternoon     = solar01_inv_0.between_time('06:00:00','18:30:00')
night         = solar01_inv_0.between_time('18:45:00','23:45:00')

Remember our conclusion before?
<br>
dc_power, ac_power and daily_yield value is always 0 before 05:45.

In [None]:
# Fill the three columns with 0 and bind the result back to early_morning.
# Calling fillna(inplace=True) on a selected column is chained assignment: it
# raises warnings and, with pandas Copy-on-Write, leaves the frame unchanged.
early_morning = early_morning.fillna({'dc_power': 0, 'ac_power': 0, 'daily_yield': 0})

after 18:45, dc_power and ac power also 0
<br>
for daily_yield,i use method ffill that means fill na values with values before that row.

In [None]:
# Power is 0 at night. The frame is re-bound rather than filled through a
# chained inplace call, which is unreliable (a no-op under Copy-on-Write).
night = night.fillna({'dc_power': 0, 'ac_power': 0})
# daily_yield keeps the last known value; .ffill() replaces the deprecated
# fillna(method='ffill').
night = night.assign(daily_yield=night['daily_yield'].ffill())

After we combine early morning, afternoon, and night.
<br>
we can't fillna in afternoon, we have another trick for that.

In [None]:
# Stitch the three time-of-day groups back together in chronological order.
solar01_inv_0 = pd.concat([early_morning, afternoon, night]).sort_index()

In [None]:
# Build a separate plotting frame; `data = solar01_inv_0` would only alias the
# cleaned frame and the string conversion would overwrite its time column.
data = solar01_inv_0.assign(time=solar01_inv_0['time'].astype(str))
sns.set(font_scale =1.5)

# One small panel per date, three panels per row.
g = sns.relplot(
        data=data,
        x='time',
        y='dc_power',
        col='date',
        kind='scatter',
        height=2,
        aspect=3,
        col_wrap=3
        )

g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

In [None]:
# Separate plotting frame so solar01_inv_0 itself is not modified.
data = solar01_inv_0.assign(time=solar01_inv_0['time'].astype(str))
sns.set(font_scale =1.5)

# daily_yield per date after the early-morning / night fills.
g = sns.relplot(
        data=data,
        x='time',
        y='daily_yield',
        col='date',
        kind='scatter',
        height=2,
        aspect=3,
        col_wrap=3
        )

g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

look at date 2020-05-26, because there are value 0(valid data) we can't fill it with last value on night.

In [None]:
solar01_inv_0.isnull().sum()

We still have dc_power, ac_power and daily_yield 's missing value.
<br>
I use interpolate function with time method. this function is guessing data by the time.
<br>
Check plot at date 2020-05-20.

In [None]:
solar01_inv_0[['ac_power','dc_power','daily_yield']] = solar01_inv_0[['ac_power','dc_power','daily_yield']].interpolate(method='time')

In [None]:
# Separate plotting frame so solar01_inv_0 itself is not modified.
data = solar01_inv_0.assign(time=solar01_inv_0['time'].astype(str))
sns.set(font_scale =1.5)

# dc_power per date after time-based interpolation.
g = sns.relplot(
        data=data,
        x='time',
        y='dc_power',
        col='date',
        kind='scatter',
        height=2,
        aspect=3,
        col_wrap=3
        )

g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

In [None]:
# Separate plotting frame so solar01_inv_0 itself is not modified.
data = solar01_inv_0.assign(time=solar01_inv_0['time'].astype(str))
sns.set(font_scale =1.5)

# daily_yield per date after time-based interpolation.
g = sns.relplot(
        data=data,
        x='time',
        y='daily_yield',
        col='date',
        kind='scatter',
        height=2,
        aspect=3,
        col_wrap=3
        )

g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

In [None]:
solar01_inv_0.isnull().sum()

Last, total yield. We also can use interpolate too.

In [None]:
solar01_inv_0['total_yield'] = solar01_inv_0['total_yield'].interpolate(method='time')

In [None]:
# Copy before converting the index to text: `data = solar01_inv_0` is only an
# alias, so the conversion would replace the DatetimeIndex of the cleaned
# frame with strings.
data = solar01_inv_0.copy()
data.index = data.index.astype(str)

f, ax =plt.subplots(figsize=(12,8))
ax = sns.lineplot(x=data.index, 
                  y='total_yield',
                  data = data,
                  ax=ax
                 )

ax.set(xlim=('2020-05-15 00:00:00','2020-06-17 00:00:00'),xticks=['2020-05-15 00:00:00','2020-06-17 00:00:00'])

In [None]:
solar01_inv_0.isnull().sum()

There is no missing data! Yes!
<br>
Wait, we aren't done yet. This is only one source key; we must repeat these steps for the other 21 source keys.
<br>
Relax, I have already wrapped all these steps in a function, so we just call it once for every inverter.

In [None]:
# One slot per Plant 1 inverter; slot i receives the cleaned frame for source_key == i.
solar01_inv = [0]*22

def data_filling(inverter_id):
    """Build the gap-free 15-minute series of one Plant 1 inverter.

    Reads the module-level ``solar01`` and ``fulltime`` frames and stores the
    result in ``solar01_inv[inverter_id]``; nothing is returned.

    Steps:
      1. select the inverter's rows and outer-merge them onto the full
         timestamp grid, so missing timestamps appear as NaN rows;
      2. rebuild date / time / source_key for the new rows;
      3. 00:00-05:45: missing dc_power, ac_power and daily_yield become 0;
      4. 18:45-23:45: missing dc_power and ac_power become 0, daily_yield is
         forward-filled;
      5. everything still missing is interpolated on the time index.

    Parameters
    ----------
    inverter_id : int
        Integer source_key of the inverter, also the index into solar01_inv.
    """
    #create dataframe based on inverter id.
    inverter = solar01[solar01['source_key']==inverter_id].reset_index(drop=True)

    #add full timestamp to dataframe.
    inverter = pd.merge(fulltime, inverter, how='outer', on='date_time')

    #fill na with fix values.
    inverter['date'] = inverter['date_time'].dt.date
    inverter['time'] = inverter['date_time'].dt.time
    inverter['source_key'] = inverter_id

    #convert column date time as index.
    inverter = inverter.set_index('date_time')

    #divide dataframe into 3 groups. The fills return new frames: calling
    #fillna(inplace=True) on a selected column is chained assignment and does
    #nothing under pandas Copy-on-Write.
    early_morning = inverter.between_time('00:00:00','05:45:00').fillna(
        {'dc_power': 0, 'ac_power': 0, 'daily_yield': 0})
    afternoon     = inverter.between_time('06:00:00','18:30:00')
    night         = inverter.between_time('18:45:00','23:45:00').fillna(
        {'dc_power': 0, 'ac_power': 0})
    #.ffill() replaces the deprecated fillna(method='ffill').
    night = night.assign(daily_yield=night['daily_yield'].ffill())

    #join them together again and sort index, so we get sorted timeline.
    inverter = pd.concat([early_morning, afternoon, night]).sort_index()

    #fill others na with interpolate function that use method time
    measured = ['dc_power', 'ac_power', 'daily_yield', 'total_yield']
    inverter[measured] = inverter[measured].interpolate(method='time')

    solar01_inv[inverter_id] = inverter

for i in range(len(solar01_inv)):
    data_filling(i)

Now we have array of dataframe solar01_inv with index of array is source_key.
<br>
solar01_inv[5] that means dataframe with source_key is 5
<br>
we can join all them into 1 dataframe.

In [None]:
solar01 = pd.concat(solar01_inv)

Also, don't forget the data from Plant 2.
<br>
We just copy the function and change solar01 to solar02.

In [None]:
# One slot per Plant 2 inverter; slot i receives the cleaned frame for source_key == i.
solar02_inv = [0]*22


def data_filling(inverter_id):
    """Build the gap-free 15-minute series of one Plant 2 inverter.

    NOTE: this definition replaces the earlier ``data_filling`` that works on
    Plant 1; from this cell on the name refers to the Plant 2 version.

    Reads the module-level ``solar02`` and ``fulltime`` frames and stores the
    result in ``solar02_inv[inverter_id]``; nothing is returned.

    Steps:
      1. select the inverter's rows and outer-merge them onto the full
         timestamp grid, so missing timestamps appear as NaN rows;
      2. rebuild date / time / source_key for the new rows;
      3. 00:00-05:45: missing dc_power, ac_power and daily_yield become 0;
      4. 18:45-23:45: missing dc_power and ac_power become 0, daily_yield is
         forward-filled;
      5. everything still missing is interpolated on the time index.

    Parameters
    ----------
    inverter_id : int
        Integer source_key of the inverter, also the index into solar02_inv.
    """
    #create dataframe based on inverter id.
    inverter = solar02[solar02['source_key']==inverter_id].reset_index(drop=True)

    #add full timestamp to dataframe.
    inverter = pd.merge(fulltime, inverter, how='outer', on='date_time')

    #fill na with fix values.
    inverter['date'] = inverter['date_time'].dt.date
    inverter['time'] = inverter['date_time'].dt.time
    inverter['source_key'] = inverter_id

    #convert column date time as index.
    inverter = inverter.set_index('date_time')

    #divide dataframe into 3 groups. The fills return new frames: calling
    #fillna(inplace=True) on a selected column is chained assignment and does
    #nothing under pandas Copy-on-Write.
    early_morning = inverter.between_time('00:00:00','05:45:00').fillna(
        {'dc_power': 0, 'ac_power': 0, 'daily_yield': 0})
    afternoon     = inverter.between_time('06:00:00','18:30:00')
    night         = inverter.between_time('18:45:00','23:45:00').fillna(
        {'dc_power': 0, 'ac_power': 0})
    #.ffill() replaces the deprecated fillna(method='ffill').
    night = night.assign(daily_yield=night['daily_yield'].ffill())

    #join them together again and sort index, so we get sorted timeline.
    inverter = pd.concat([early_morning, afternoon, night]).sort_index()

    #fill others na with interpolate function that use method time
    measured = ['dc_power', 'ac_power', 'daily_yield', 'total_yield']
    inverter[measured] = inverter[measured].interpolate(method='time')

    solar02_inv[inverter_id] = inverter

for i in range(len(solar02_inv)):
    data_filling(i)

In [None]:
solar02 = pd.concat(solar02_inv)

Next, Sensor data.<br>
Change date_time format

In [None]:
# Lower-case the column names and drop the plant_id / source_key columns for
# both sensor frames (modified in place).
for sensor in (sensor01, sensor02):
    sensor.columns = sensor.columns.str.lower()
    sensor.drop(columns=['plant_id','source_key'], inplace=True)

In [None]:
# Both sensor files use the same timestamp format; parse it and derive the
# separate date and time-of-day columns.
for sensor in (sensor01, sensor02):
    sensor['date_time'] = pd.to_datetime(sensor['date_time'], format='%Y-%m-%d %H:%M:%S')
    sensor['date'] = sensor['date_time'].dt.date
    sensor['time'] = sensor['date_time'].dt.time

In [None]:
sensor01.head()

In [None]:
sensor02.head()

Plotting missing value.

In [None]:
# Separate plotting frame: `data = sensor01` would alias the sensor frame and
# the string conversion would overwrite its time column.
data = sensor01.assign(time=sensor01['time'].astype(str))
sns.set(font_scale =1.5)
g = sns.relplot(data=data,
            x='time',
            y='ambient_temperature',
            col='date',
            kind='scatter',
            height=3,
            aspect=3,
            col_wrap=3
               )
g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

In [None]:
# Separate plotting frame so sensor01 itself is not modified.
data = sensor01.assign(time=sensor01['time'].astype(str))
sns.set(font_scale =1.5)
g = sns.relplot(data=data,
            x='time',
            y='module_temperature',
            col='date',
            kind='scatter',
            height=3,
            aspect=3,
            col_wrap=3)
g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

In [None]:
# Separate plotting frame so sensor01 itself is not modified.
data = sensor01.assign(time=sensor01['time'].astype(str))
sns.set(font_scale =1.5)
g = sns.relplot(data=data,
            x='time',
            y='irradiation',
            col='date',
            kind='scatter',
            height=3,
            aspect=3,
            col_wrap=3)
g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

We join with variable fulltime and change index into datetime

In [None]:
# Outer-merge each sensor frame onto the full 15-minute grid, then index it by
# timestamp. drop=False keeps date_time available as a regular column as well.
sensor01 = fulltime.merge(sensor01, how='outer').set_index('date_time', drop=False)
sensor02 = fulltime.merge(sensor02, how='outer').set_index('date_time', drop=False)

In [None]:
sns.heatmap(sensor01.isnull())

In [None]:
# Rows added by the merge have no date/time yet; rebuild both columns from the
# timestamp index.
for sensor in (sensor01, sensor02):
    sensor['date'] = sensor.index.date
    sensor['time'] = sensor.index.time

but this time we just use interpolate func with time method to fill all missing data.

In [None]:
# Fill every gap in the three measured columns by interpolating on the
# DatetimeIndex.
weather_columns = ['ambient_temperature','module_temperature', 'irradiation']
for sensor in (sensor01, sensor02):
    sensor[weather_columns] = sensor[weather_columns].interpolate(method='time')

now we see the result

In [None]:
# Separate plotting frame so sensor01 itself is not modified.
data = sensor01.assign(time=sensor01['time'].astype(str))
sns.set(font_scale =1.5)
# module_temperature per date after interpolation.
g = sns.relplot(data=data,
            x='time',
            y='module_temperature',
            col='date',
            kind='scatter',
            height=3,
            aspect=3,
            col_wrap=3)
g.set(xlim=('00:00:00', '23:45:00'), xticks=['00:00:00','06:00:00','12:00:00','18:00:00','23:45:00'])

We can combine sensor data with solar generation data.

In [None]:
# The sensor columns are the same for every inverter, so prepare them once.
# date/time are dropped because the inverter frames already carry them.
sensor01_values = sensor01.drop(['date','time'], axis=1)
sensor02_values = sensor02.drop(['date','time'], axis=1)

# Column-wise concat aligns each inverter frame with the sensor readings on
# the shared timestamp index.
solar01_with_sensor01_inv = [
    pd.concat([inverter, sensor01_values], axis=1) for inverter in solar01_inv
]
solar02_with_sensor02_inv = [
    pd.concat([inverter, sensor02_values], axis=1) for inverter in solar02_inv
]

In [None]:
solar01_with_sensor01 = pd.concat(solar01_with_sensor01_inv)
solar02_with_sensor02 = pd.concat(solar02_with_sensor02_inv)

In [None]:
solar01_with_sensor01.head()

In [None]:
solar02_with_sensor02.head()

Actually I make another dataframe that combine both of those dataframe as one.
<br>
We call it full data.

In [None]:
solar01_with_sensor01['plant_ID'] = '1'
solar02_with_sensor02['plant_ID'] = '2'

In [None]:
full_data = pd.concat([solar01_with_sensor01, solar02_with_sensor02], ignore_index=True)

In [None]:
full_data

Now we have:
<br>
solar01 : all solar generation data in plant 1
<br>
solar02 : all solar generation data in plant 2
<br>
sensor01 : all sensor data in plant 1
<br>
sensor02 : all sensor data in plant 2
<br>
solar01_inv : solar generation data for each inverter in plant 1
<br>
solar02_inv : solar generation data for each inverter in plant 2
<br>
full_data : all solar generation and sensor data in both plant

# Visualization

First, let's investigate which inverter is most contribute to power production.

In [None]:
# Total DC power per inverter. Only the power column is summed: summing the
# whole frame also tries to add the date/time object columns, which raises on
# pandas >= 2.0.
data1 = solar01.groupby('source_key')['dc_power'].sum().reset_index()
data1['source_key'] = data1['source_key'].apply(lambda x: solar01_inverter_id[x])
data2 = solar02.groupby('source_key')['dc_power'].sum().reset_index()
data2['source_key'] = data2['source_key'].apply(lambda x: solar02_inverter_id[x])

# Two side-by-side 'domain' cells, the subplot type required for pie traces.
specs = [[{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=specs)

# Pull one Plant 2 slice out of the donut.
# NOTE(review): position 7 is hard-coded; presumably the top contributor - confirm.
pull_factor = [0]*22
pull_factor[7] = 0.05

# title=dict(text=..., font=...) replaces the deprecated `titlefont` argument.
fig.add_trace(go.Pie(labels='P1 '+ data1['source_key'], 
                     values=data1['dc_power'], 
                     name='Plant 1', 
                     title=dict(text='Plant 1', font=dict(size=25)),
                     hovertemplate="%{label} <br />generates %{value:,.0f} kW",
                     marker_colors = px.colors.qualitative.Dark24,
                     legendgroup = 'Plant 1',
                    ), 1, 1)

fig.add_trace(go.Pie(labels='P2 '+ data2['source_key'], 
                     values=data2['dc_power'], 
                     name='Plant 2', 
                     title=dict(text='Plant 2', font=dict(size=25)),
                     hovertemplate="%{label} <br />generates %{value:,.0f} kW",
                     marker_colors = px.colors.qualitative.Light24,   
                     legendgroup = 'Plant 2',
                     pull =pull_factor,
                    ), 1, 2)

fig.update_traces(hole=.4)

fig.update_layout(
    title_text="DC Power Generation of each Inverter"
)

fig.show()

Based on 2 pie charts.
<br>
At Plant 1, each inverter contribute to DC power production equally. (9,2 GW)
<br>
But at Plant 2, inverter Mx2yZCDsyf6DPfv is the one who contributes the most. (930.5 MW) 

In [None]:
# Daily DC power per inverter. Only dc_power is summed so the non-numeric
# time column is never aggregated (pandas >= 2.0 raises on it).
data = solar01.groupby(['source_key','date'])[['dc_power']].sum().reset_index()
data['source_key'] = data['source_key'].apply(lambda x: solar01_inverter_id[x])

fig=px.bar( 
    data_frame = data,
    x = 'date',
    y = 'dc_power',
    color = 'source_key',
    color_discrete_sequence = px.colors.qualitative.Dark24,
    hover_data = {'date':True,
                  'source_key':True,
                  'dc_power':':,.0f',
                 },
    opacity = 0.8,
    labels={'date':'date',
            'dc_power':'DC Power Generated (kW)',
            'source_key':'Inverter ID'
           },
    title='DC Power Generated in Plant 1 based on date',
    height = 650
)

fig.show()

For plant 1, on 2020-05-25 is the date to produce the most DC Power. (7.4 GW)

In [None]:
# Daily DC power per inverter. Only dc_power is summed so the non-numeric
# time column is never aggregated (pandas >= 2.0 raises on it).
data = solar02.groupby(['source_key','date'])[['dc_power']].sum().reset_index()
data['source_key'] = data['source_key'].apply(lambda x: solar02_inverter_id[x])

fig=px.bar( 
    data_frame = data,
    x = 'date',
    y = 'dc_power',
    color = 'source_key',
    color_discrete_sequence = px.colors.qualitative.Light24,
    hover_data = {'date':True,
                  'source_key':True,
                  'dc_power':':,.0f',
                 },
    opacity = 0.8,
    labels={'date':'date',
            'dc_power':'DC Power Generated (kW)',
            'source_key':'Inverter ID'},
    title='DC Power Generated in Plant 2 based on date',
    height = 650
)


fig.show()

For plant 2, on 2020-05-15 is the date to produce the most DC Power. (666.6 MW)

In [None]:
# Total AC power per inverter. Only the power column is summed: summing the
# whole frame also tries to add the date/time object columns, which raises on
# pandas >= 2.0.
data1 = solar01.groupby('source_key')['ac_power'].sum().reset_index()
data1['source_key'] = data1['source_key'].apply(lambda x: solar01_inverter_id[x])
data2 = solar02.groupby('source_key')['ac_power'].sum().reset_index()
data2['source_key'] = data2['source_key'].apply(lambda x: solar02_inverter_id[x])

# Two side-by-side 'domain' cells, the subplot type required for pie traces.
specs = [[{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(rows=1, cols=2, specs=specs)

# Pull one Plant 2 slice out of the donut.
# NOTE(review): position 7 is hard-coded; presumably the top contributor - confirm.
pull_factor = [0]*22
pull_factor[7] = 0.05

# title=dict(text=..., font=...) replaces the deprecated `titlefont` argument.
fig.add_trace(go.Pie(labels='P1 '+ data1['source_key'], 
                     values=data1['ac_power'], 
                     name='Plant 1', 
                     title=dict(text='Plant 1', font=dict(size=25)),
                     hovertemplate="%{label} <br />generates %{value:,.0f} kW",
                     marker_colors = px.colors.qualitative.Dark24,
                     legendgroup = 'Plant 1',
                    ), 1, 1)

fig.add_trace(go.Pie(labels='P2 '+ data2['source_key'], 
                     values=data2['ac_power'], 
                     name='Plant 2', 
                     title=dict(text='Plant 2', font=dict(size=25)),
                     hovertemplate="%{label} <br />generates %{value:,.0f} kW",
                     marker_colors = px.colors.qualitative.Light24,   
                     legendgroup = 'Plant 2',
                     pull =pull_factor,
                    ), 1, 2)

fig.update_traces(hole=.4)

fig.update_layout(
    title_text="AC Power Generation of each Inverter"
)


fig.show()

Next is AC Power
<br>
At Plant 1, each inverter contribute to AC power production equally. (901.1 MW)
<br>
But at Plant 2, inverter Mx2yZCDsyf6DPfv is the one who contributes the most. (909.7 MW) 

In [None]:
# Daily AC power per inverter. Only ac_power is summed so the non-numeric
# time column is never aggregated (pandas >= 2.0 raises on it).
data = solar01.groupby(['source_key','date'])[['ac_power']].sum().reset_index()
data['source_key'] = data['source_key'].apply(lambda x: solar01_inverter_id[x])

fig=px.bar( 
    data_frame = data,
    x = 'date',
    y = 'ac_power',
    color = 'source_key',
    color_discrete_sequence = px.colors.qualitative.Dark24,
    hover_data = {'date':True,
                  'source_key':True,
                  'ac_power':':,.0f',
                 },
    opacity = 0.8,
    labels={'date':'date',
            'ac_power':'AC Power Generated (kW)',
            'source_key':'Inverter ID'},
    title='AC Power Generated in Plant 1 based on date',
    height = 650
)


fig.show()

For plant 1, on 2020-05-25 is the date to produce the most AC Power. (729.6 MW)

In [None]:
# Daily AC power per inverter. Only ac_power is summed so the non-numeric
# time column is never aggregated (pandas >= 2.0 raises on it).
data = solar02.groupby(['source_key','date'])[['ac_power']].sum().reset_index()
data['source_key'] = data['source_key'].apply(lambda x: solar02_inverter_id[x])

fig=px.bar( 
    data_frame = data,
    x = 'date',
    y = 'ac_power',
    color = 'source_key',
    color_discrete_sequence = px.colors.qualitative.Light24,
    hover_data = {'date':True,
                  'source_key':True,
                  'ac_power':':,.0f',
                 },
    opacity = 0.8,
    labels={'date':'date',
            'ac_power':'AC Power Generated (kW)',
            'source_key':'Inverter ID'},
    title='AC Power Generated in Plant 2 based on date',
    height = 650
)


fig.show()

For plant 2, 2020-05-15 is the date that produced the most AC Power. (651.4 MW)

In [None]:
data1=solar01
data2=solar02

# Mean DC power per time of day. The mean is taken on the power column only
# (averaging the whole frame fails on the date object column in pandas >= 2.0)
# and the resulting index is used as x, so x and y always have the same length
# and order.
mean1 = data1.groupby('time')['dc_power'].mean()
mean2 = data2.groupby('time')['dc_power'].mean()

fig = go.Figure()

# Every individual reading, coloured by its own value.
fig.add_trace(go.Scattergl(x=data1['time'], 
                         y=data1['dc_power'],
                         mode='markers',
                         marker=dict(
                             size=4,
                             color= data1['dc_power'],
                             cauto=True,
                             colorscale ='Oryel',
                             opacity=0.3
                         ),
                         name='Plant 1 DC power'))

fig.add_trace(go.Scatter(x=mean1.index, 
                         y=mean1.values,
                         mode='lines',
                             line=dict(
                             color='DarkGray',
                             width=3
                         ),
                         name='Plant 1 Mean'))

fig.add_trace(go.Scattergl(x=data2['time'], 
                         y=data2['dc_power'],
                         mode='markers',
                         marker=dict(
                             size=4,
                             color= data2['dc_power'],
                             cauto=True,
                             colorscale ='Blugrn',
                             opacity=0.3
                         ),
                         name='Plant 2 DC power'))

fig.add_trace(go.Scatter(x=mean2.index, 
                         y=mean2.values,
                         mode='lines',
                             line=dict(
                             color='DarkOliveGreen',
                             width=3
                         ),
                         name='Plant 2 Mean'))

fig.update_layout(title= 'DC Power Generation by time',
                  height = 600)
fig.show()

At plant 1, DC power can be generated almost 13.35 MW on 11:30
<br>
But at plant 2, only 1.4 MW on 13:15

In [None]:
data1=solar01
data2=solar02

# Mean AC power per time of day. The mean is taken on the power column only
# (averaging the whole frame fails on the date object column in pandas >= 2.0)
# and the resulting index is used as x, so x and y always have the same length
# and order.
mean1 = data1.groupby('time')['ac_power'].mean()
mean2 = data2.groupby('time')['ac_power'].mean()

fig = go.Figure()

# Every individual reading, coloured by its own value.
fig.add_trace(go.Scattergl(x=data1['time'], 
                         y=data1['ac_power'],
                         mode='markers',
                         marker=dict(
                             size=4,
                             color= data1['ac_power'],
                             cauto=True,
                             colorscale ='Oryel',
                             opacity=0.3
                         ),
                         name='Plant 1 AC power'))

fig.add_trace(go.Scatter(x=mean1.index, 
                         y=mean1.values,
                         mode='lines',
                             line=dict(
                             color='DarkGray',
                             width=3
                         ),
                         name='Plant 1 Mean'))

fig.add_trace(go.Scattergl(x=data2['time'], 
                         y=data2['ac_power'],
                         mode='markers',
                         marker=dict(
                             size=4,
                             color= data2['ac_power'],
                             cauto=True,
                             colorscale ='Blugrn',
                             opacity=0.3
                         ),
                         name='Plant 2 AC power'))

fig.add_trace(go.Scatter(x=mean2.index, 
                         y=mean2.values,
                         mode='lines',
                             line=dict(
                             color='DarkOliveGreen',
                             width=3
                         ),
                         name='Plant 2 Mean'))

fig.update_layout(title= 'AC Power Generation by time',
                  height = 600)
fig.show()

At plant 1, AC power generation reaches almost 1.30 MW at 11:30
<br>
At plant 2, 1.3 MW on 13:15

Next, let's see the correlation among those variables.
<br>
First, Plant 1.

In [None]:
plt.figure(figsize=(10,8))
# Correlate the numeric columns only: the frame also holds date/time objects
# and a string plant id, on which DataFrame.corr() raises in pandas >= 2.0.
sns.heatmap(solar01_with_sensor01.select_dtypes('number').corr(), annot=True)

Plant 2's correlation

In [None]:
plt.figure(figsize=(10,8))
# Correlate the numeric columns only: the frame also holds date/time objects
# and a string plant id, on which DataFrame.corr() raises in pandas >= 2.0.
sns.heatmap(solar02_with_sensor02.select_dtypes('number').corr(), annot=True)

As we can see, DC power and irradiation have the strongest correlation (0.97 on plant 1, 0.75 on plant 2).
Let's plot how irradiation relates to ambient and module temperature.

In [None]:
# Average over all inverters for each plant and timestamp, then discard the
# grouping keys that the plot does not use.
data = (
    full_data
    .groupby(['plant_ID','date','time'])
    .mean()
    .reset_index()
    .drop(columns=['date','time'])
)
# Module vs ambient temperature, one panel per plant; colour and marker size
# both encode irradiation.
sns.relplot(
    data=data,
    x="ambient_temperature",
    y="module_temperature",
    hue='irradiation',
    size='irradiation',
    sizes=(50,200),
    palette='gist_heat',
    height=12,
    col='plant_ID',
)


Based on graph.
<br>
Even ambient temperature is increasing, irradiation is not always increase.
<br>
for module temperature, that's quite significant.

Let's plot  DC Power vs irradiation.

In [None]:
# Average over all inverters for each plant and timestamp, then discard the
# grouping keys that the plot does not use.
data = (
    full_data
    .groupby(['plant_ID','date','time'])
    .mean()
    .reset_index()
    .drop(columns=['date','time'])
)
# DC power vs irradiation, one panel per plant; colour encodes AC power and
# marker size encodes irradiation.
sns.relplot(
    data=data,
    x="irradiation",
    y="dc_power",
    hue='ac_power',
    size='irradiation',
    sizes=(50,200),
    palette='gist_heat',
    height=12,
    col='plant_ID',
)

As we can see, the more irradiation's value increases, DC Power will increase.
<br>
But there are some data, DC Power is 0. 
<br>
Also Plant 1's DC Power production is better than Plant 2's 

That's all. Thank you for looking at my notebook.
<br>
Next, I will try to make predictions.
<br>
If you think this notebook is useful, please upvote.
<br>
You can also leave a comment.
<br>

In [None]:
sensor01

In [None]:
# Remove the date_time column (the timestamp stays available as the index).
# errors='ignore' keeps the cell re-runnable: the previous inplace drop raised
# KeyError on a second execution.
solar01_with_sensor01 = solar01_with_sensor01.drop(columns='date_time', errors='ignore')

In [None]:
# Plant-level series: average the inverters at every timestamp. The columns
# are selected before .mean() so the date/time/plant_ID object columns are
# never averaged (pandas >= 2.0 raises on them). level= makes it explicit that
# the timestamp index is the grouping key.
plant01 = (
    solar01_with_sensor01
    .groupby(level='date_time')[['dc_power','ambient_temperature','module_temperature','irradiation']]
    .mean()
)

In [None]:
plant01

In [None]:
plant01.plot(subplots=True,figsize=(18,8))

In [None]:
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, Dropout, Reshape
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Magnitude spectrum of the plant-level DC power series, with the frequency
# axis expressed in cycles per day.
# Assumes one sample every 15 minutes with no gaps - TODO confirm, since missing
# timestamps would shift the frequency axis.
# NumPy's real FFT is used; TensorFlow is not needed for this computation.
fft = np.fft.rfft(plant01['dc_power'].to_numpy())
bin_index = np.arange(0, len(fft))

n_samples = len(plant01['dc_power'])
samples_per_day = 4*24  # 4 fifteen-minute samples per hour x 24 hours
days_per_dataset = n_samples/(samples_per_day)

# Bin k completes k cycles over the whole record, i.e. k / days cycles per day.
cycles_per_day = bin_index/days_per_dataset
plt.figure(figsize=(20,6))
plt.step(cycles_per_day, np.abs(fft))
plt.xscale('log')
plt.ylim(0, 8000000)
plt.xlim([0.1, max(plt.xlim())])
# Tick positions are in cycles per day: 1 = daily, 2 = every half day,
# 24 = hourly, 96 = every 15 minutes.
plt.xticks([1,2,24,  4*24], labels=['1/day', 'half day','1/hour','1/15min'])
_ = plt.xlabel('Frequency (log scale)')

In [None]:
# Move the 'date_time' index produced by the groupby back into a regular column.
# Guarded so that re-running the cell cannot add a stray 'index' column once
# the index has already been reset.
if plant01.index.name == 'date_time':
    plant01 = plant01.reset_index()

In [None]:
# Summary statistics, one row per column.
plant01.describe().T

In [None]:
# Remove 'date_time' from plant01 (so only numeric columns remain) and convert it
# to POSIX seconds for the periodic time features.
# NOTE(review): pop() mutates plant01, so this cell cannot be re-run on its own.
date_time = pd.to_datetime(plant01.pop('date_time'), format='%d.%m.%Y %H:%M:%S')
# pd.Timestamp.timestamp treats naive timestamps as UTC. The previous
# datetime.datetime.timestamp interpreted them in the kernel's local time zone,
# which made the values (and the phase of the sin/cos features) depend on the
# machine and on DST changes.
timestamp_s = date_time.map(pd.Timestamp.timestamp)
timestamp_s

In [None]:
# Period lengths in seconds.
half_day = 12*60*60
day = (2)*half_day

# Encode the position within each period as a (sin, cos) pair so the features
# are continuous across the wrap-around point.
for prefix, period_s in (('half_day', half_day), ('day', day)):
    phase = timestamp_s * (2 * np.pi / period_s)
    plant01[prefix + '_sin'] = np.sin(phase)
    plant01[prefix + '_cos'] = np.cos(phase)

In [None]:
# Plot the first 24*4 samples of the daily and half-daily sine features.
# Each curve is labelled so the two can be told apart.
plt.plot(np.array(plant01['day_sin'])[:24*4], label='day_sin')
plt.plot(np.array(plant01['half_day_sin'])[:24*4], label='half_day_sin')
plt.xlabel('Time [15m]')
plt.title('Time of day signal')
plt.legend()

In [None]:
# Feature count = number of columns currently in plant01; the model cells read
# this global to size their output layers.
num_features = len(plant01.columns)
plant01

In [None]:
# Scale the model inputs to the [0, 1] range; the targets (y_df) stay in
# original units.
# The scaler is fitted on the training rows only, so validation rows do not
# leak into the min/max statistics. The 0.8 fraction mirrors the train split
# used by create_dataset; validation inputs may therefore fall slightly
# outside [0, 1].
n_fit_rows = int(0.8 * len(plant01))
scaler = MinMaxScaler()
scaler.fit(plant01.iloc[:n_fit_rows])
x_df = scaler.transform(plant01)
y_df = plant01.values

In [None]:
# First five rows of the scaled input matrix.
x_df[0:5]

In [None]:
# First five rows of the unscaled target matrix (plant01.values).
y_df[0:5]

In [None]:
def create_dataset(x_df, y_df, time_steps=1, train_frac=0.8, val_frac=0.2):
    """Slice two aligned row-wise arrays into sliding (input, target) windows.

    Every sample pairs ``time_steps`` consecutive rows of ``x_df`` (the input)
    with the ``time_steps`` rows of ``y_df`` that immediately follow (the target).

    Parameters
    ----------
    x_df, y_df : array-like with the same number of rows
        Model inputs and targets; row ``i`` of both refers to the same time step.
    time_steps : int, default 1
        Length of each input window and of each target window.
    train_frac, val_frac : float, default 0.8 and 0.2
        Fractions of the rows assigned to the leading training segment and to
        the validation segment that follows it.

    Returns
    -------
    x_train, y_train, x_val, y_val, x_pred : numpy.ndarray
        Window arrays shaped ``(n_windows, time_steps, ...)``. Training targets
        lie entirely inside the training segment; validation targets lie
        entirely inside the validation segment, while validation inputs are the
        ``time_steps`` rows preceding each target (so the first ones reach back
        into the training rows). ``x_pred`` is the single trailing window of
        ``x_df``, shaped ``(1, time_steps, ...)``. When a segment is too short
        for any window, the arrays are empty but keep the
        ``(0, time_steps, ...)`` shape.

    Raises
    ------
    ValueError
        If ``time_steps`` is smaller than 1, if the inputs differ in length, or
        if the two segments together would exceed the available rows.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be a positive integer")

    x_arr = np.asarray(x_df)
    y_arr = np.asarray(y_df)
    if len(x_arr) != len(y_arr):
        raise ValueError("x_df and y_df must have the same number of rows")

    n_rows = len(x_arr)
    train_len = int(train_frac * n_rows)
    val_len = int(val_frac * n_rows)
    if train_len + val_len > n_rows:
        raise ValueError("train_frac + val_frac must not exceed 1")

    def _windows(source, starts):
        # Stack the windows source[s : s + time_steps]; with no windows, return
        # an empty array that still has the (0, time_steps, ...) shape.
        if len(starts) == 0:
            return np.empty((0, time_steps) + source.shape[1:], dtype=source.dtype)
        return np.stack([source[s:s + time_steps] for s in starts])

    # Training: input rows [i, i+T), target rows [i+T, i+2T). The last valid
    # start is train_len - 2T (its target ends exactly at train_len), hence +1.
    train_starts = range(max(train_len - 2 * time_steps + 1, 0))
    x_train = _windows(x_arr, train_starts)
    y_train = _windows(y_arr, [s + time_steps for s in train_starts])

    # Validation: target rows [train_len+i, train_len+i+T) must fit inside the
    # validation segment, so i runs up to val_len - T inclusive. The input
    # window needs T rows before the target, which requires train_len >= T.
    n_val = max(val_len - time_steps + 1, 0) if train_len >= time_steps else 0
    x_val = _windows(x_arr, [train_len - time_steps + i for i in range(n_val)])
    y_val = _windows(y_arr, [train_len + i for i in range(n_val)])

    # Forecast input: the trailing window of the inputs, as a batch of one.
    x_pred = np.array([x_arr[(-time_steps):]])

    return x_train, y_train, x_val, y_val, x_pred

In [None]:
# Window length: 24 hours x 4 fifteen-minute samples per hour = one day.
TIME_STEPS = 24 * 4
x_train, y_train, x_val, y_val, x_pred = create_dataset(
    x_df,
    y_df,
    time_steps=TIME_STEPS,
)

In [None]:
# Report the shape of every split produced by create_dataset.
for split_name, split in (
    ('x train', x_train),
    ('y train', y_train),
    ('x val', x_val),
    ('y val', y_val),
    ('x pred', x_pred),
):
    print(f'{split_name:<7} : {split.shape}')

In [None]:
class FeedBack(tf.keras.Model):
  """LSTM model whose predictions are fed back in as its next inputs.

  Only the layers are created here; the `warmup` and `call` methods are
  attached to the class in later cells.
  """

  def __init__(self, units, out_steps):
    """Create the layers.

    units: size passed to the LSTMCell.
    out_steps: number of prediction steps produced by `call`.
    """
    super().__init__()
    self.out_steps = out_steps
    self.units = units
    self.lstm_cell = tf.keras.layers.LSTMCell(units)
    # Also wrap the LSTMCell in an RNN to simplify the `warmup` method.
    self.lstm_rnn = tf.keras.layers.RNN(self.lstm_cell, return_state=True)
    # Output width comes from the notebook-level global `num_features`.
    self.dense = tf.keras.layers.Dense(num_features)

In [None]:
# Instantiate the feedback model with 96 LSTM units and 96 output steps.
# NOTE(review): feedback_model is not referenced again in the cells that follow,
# which build and train a separate Sequential `model` - confirm it is still needed.
feedback_model = FeedBack(units=96, out_steps=96)

In [None]:
def warmup(self, inputs):
  """Run the input window through the LSTM and produce the first prediction.

  Returns a `(prediction, state)` pair: the dense projection of the RNN output
  and the LSTM state list to continue stepping from.
  """
  # inputs.shape => (batch, time, features)
  # x.shape => (batch, lstm_units)
  x, *state = self.lstm_rnn(inputs)

  # predictions.shape => (batch, features)
  prediction = self.dense(x)
  return prediction, state

# Attach the function to the class as a method.
FeedBack.warmup = warmup

In [None]:
def call(self, inputs, training=None):
  """Produce `self.out_steps` predictions by feeding each one back as input.

  The first prediction comes from `warmup(inputs)`; each later step runs the
  LSTM cell on the previous prediction. Returns a tensor laid out as
  (batch, time, features).
  """
  # Collect the per-step predictions in a Python list.
  predictions = []
  # Initialize the lstm state
  prediction, state = self.warmup(inputs)

  # Insert the first prediction
  predictions.append(prediction)

  # Run the rest of the prediction steps
  for n in range(1, self.out_steps):
    # Use the last prediction as input.
    x = prediction
    # Execute one lstm step.
    x, state = self.lstm_cell(x, states=state,
                              training=training)
    # Convert the lstm output to a prediction.
    prediction = self.dense(x)
    # Add the prediction to the output
    predictions.append(prediction)

  # predictions.shape => (time, batch, features)
  predictions = tf.stack(predictions)
  # predictions.shape => (batch, time, features)
  predictions = tf.transpose(predictions, [1, 0, 2])
  return predictions

# Attach the function to the class as the model's forward pass.
FeedBack.call = call

In [None]:
# Single-shot forecaster: an LSTM summarises the input window, then one Dense
# layer emits all TIME_STEPS * num_features values at once and Reshape lays
# them out as (TIME_STEPS, num_features).
model = Sequential([
    LSTM(units=32, return_sequences=False),
    Dropout(rate=0.2),
    Dense(TIME_STEPS*num_features, kernel_initializer=tf.initializers.zeros),
    Reshape([TIME_STEPS, num_features]),
])

In [None]:
# Adam optimizer minimising mean squared error.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
)

In [None]:
# Stop once val_loss has not improved for 20 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model keeps the weights from the last epoch, i.e. 20 epochs past the
# best validation loss.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20,
                           restore_best_weights=True)

In [None]:
# epochs=4000 is only an upper bound; the early_stop callback is expected to end
# training earlier. Windows are fed in their original order (shuffle=False).
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=4000,
    batch_size=32,
    shuffle=False,
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Predict from x_pred, the single trailing input window returned by create_dataset.
y_pred = model.predict(x_pred)

In [None]:
# The network was fitted against y_df (= plant01.values, original units), so
# y_pred is already unscaled. scaler.inverse_transform does not apply here: it
# expects a 2-D array, whereas y_pred is (1, TIME_STEPS, num_features), and it
# would "un-scale" values that were never scaled.
# Show the forecast as a labelled frame instead.
pd.DataFrame(y_pred[0], columns=plant01.columns)

In [None]:
# Shape of the prediction array returned by model.predict.
y_pred.shape

In [None]:
# First feature column of the first (only) predicted window.
y_pred[0, :, 0].shape

In [None]:
# Display the unscaled target matrix (plant01.values).
y_df

In [None]:
# First column of the last TIME_STEPS rows of the targets. TIME_STEPS replaces
# the hard-coded 96 so the slice follows the window length if it changes.
y_df[-TIME_STEPS:,0].shape

In [None]:
# Compare the forecast with the most recent observations (feature column 0,
# presumably dc_power given the column order of plant01 - verify).
# x_pred is the last input window, so the blue curve is a forecast for the
# steps *after* the data ends; the red curve is the last observed window, not
# the ground truth for that forecast. The legend makes this explicit.
plt.figure(figsize=(16,8))

plt.plot(y_pred[0][:,0], marker='.', color='blue', label='forecast (steps after the last observed row)')
plt.plot(y_df[-TIME_STEPS:,0], marker='.', color='red', label='last observed window')
plt.xlabel('Time [15m]')
plt.legend()

In [None]:
# First rows of the Plant 2 weather-sensor frame.
sensor02.head()

In [None]:
# Repeats the temperature-vs-irradiation plot shown earlier in the notebook.
# Rows sharing a (plant_ID, date, time) key are averaged over their numeric
# columns; numeric_only=True is explicit because pandas >= 2.0 raises on
# non-numeric columns instead of skipping them.
data = (
    full_data
    .groupby(['plant_ID','date','time'])
    .mean(numeric_only=True)
    .reset_index()
    .drop(['date','time'], axis=1)
)
sns.relplot(data=data, 
            x="ambient_temperature", 
            y="module_temperature", 
            hue='irradiation',
            size='irradiation',
            sizes=(50,200),
            palette='gist_heat',
            height=12,
            col='plant_ID'
           )
