import tools

In [None]:
## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

from fbprophet import Prophet

warnings.filterwarnings('ignore')
%matplotlib inline
#show all output lines
pd.set_option('display.max_columns', None)
from IPython.core.interactiveshell import InteractiveShell 
#InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

load and read data

In [None]:
import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

In [None]:
# Plant 1 source files: generation data first, weather sensor data second.
plant1_csv_paths = (
    '/kaggle/input/solar-power-generation-data/Plant_1_Generation_Data.csv',
    '/kaggle/input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv',
)
# Read both files with the same settings and bind them in the order listed above.
gen1, wth1 = [pd.read_csv(csv_path, sep=',') for csv_path in plant1_csv_paths]

In [None]:
# Parse the timestamp columns. The generation file is parsed with an explicit
# day-month-year format; the weather file is left to pandas' automatic parsing.
gen1['DATE_TIME'] = pd.to_datetime(gen1['DATE_TIME'], format='%d-%m-%Y %H:%M')
wth1['DATE_TIME'] = pd.to_datetime(wth1['DATE_TIME'])

# Rename the "SOURCE_KEY" column of the generation data to "inverter_ID".
gen1 = gen1.rename(columns={'SOURCE_KEY': 'inverter_ID'})

# Only the last bare expression of a cell is rendered, so the titles and both
# previews are emitted explicitly.
print("Power generation data")
display(gen1.head())
print("Weather data")
display(wth1.head())

# Descriptive analytics

## check for missing data

In [None]:
# Print both (rows, columns) shapes; as bare expressions only the last one would be shown.
print("gen1 shape:", gen1.shape)
print("wth1 shape:", wth1.shape)

In [None]:
# Number of null values per column in each table.
missing_gen1 = gen1.isnull().sum()
missing_wth1 = wth1.isnull().sum()
# Display both results explicitly; as bare expressions only the last one would be shown.
display(missing_gen1, missing_wth1)

In [None]:
# Find missing data: build the complete (timestamp x inverter) grid so that
# readings absent from the raw table show up as explicit NaN rows.
# Keyword arguments are used because positional arguments to DataFrame.pivot
# are deprecated and rejected by recent pandas versions.
pivoted_DC = gen1.pivot(index='DATE_TIME', columns='inverter_ID', values='DC_POWER')

# Back to long format, keeping the NaN cells (dropna=False).
DC_POWER = pivoted_DC.stack(dropna=False).reset_index().rename(columns={0: 'DC_POWER'})
# Only the key columns are needed; the measurements come from gen1 in the merge below.
DC_POWER = DC_POWER.drop(columns=['DC_POWER'])

# Left merge onto the full grid: combinations with no reading get NaN in every gen1 column.
gen1_full = pd.merge(DC_POWER, gen1, on=['inverter_ID', 'DATE_TIME'], how='left')


In [None]:
# Visualize missing data: heatmap of the timestamp x inverter grid, where
# highlighted cells are missing DC_POWER readings.
sns.heatmap(pivoted_DC.isnull(), yticklabels = False, cmap="YlGnBu")

# Rows of the full grid that have no DC_POWER reading. A bare expression in the
# middle of a cell is not rendered, so the count and a preview are shown explicitly.
missing_rows = gen1_full[gen1_full['DC_POWER'].isnull()]
print("rows with missing DC_POWER:", len(missing_rows))
display(missing_rows.head())

# Flag missing readings and plot the flag over time, one line per inverter.
data = gen1_full.copy()
data['DC_POWER_isnull'] = data['DC_POWER'].isnull()
plt.figure(figsize=(16,8))
sns.lineplot(data=data, x="DATE_TIME", y="DC_POWER_isnull", hue="inverter_ID")
plt.legend(loc="right")


## What is the mean value of daily_yield

In [None]:
# Mean daily yield (per day, all inverters together).
# DAILY_YIELD is a cumulative sum of the power generated on that day (from 00:00:00)
# up to that point in time, so (max - min) within a day is that day's production.
inverter_daily_yield = (
    gen1.set_index('DATE_TIME')
    .groupby('inverter_ID')
    .resample('D')
    .apply({'DAILY_YIELD': lambda x: x.max() - x.min()})
)
# Add up all inverters for each day, then average over the days.
inverter_daily_yield.groupby('DATE_TIME').sum().mean()

## What is the total irradiation per day?

Assume that irradiation is measured in W/m². Since the data was recorded at 15-minute intervals (1/4 hour), the total irradiation per day (in Wh/m²) equals the sum of that day's irradiation readings divided by 4.

In [None]:
# Daily total: sum the IRRADIATION readings of each calendar day and divide by 4.
irradiation_by_day = wth1.set_index('DATE_TIME')['IRRADIATION'].resample('D')
total_irradiation_day = irradiation_by_day.sum() / 4

# Line plot of the daily totals on a fixed 0-7 y-range.
plt.figure(figsize=(12,5))
ax = sns.lineplot(data=total_irradiation_day)
ax.set(ylim=(0,7), ylabel='Day Irradiation: W/m2')
plt.tight_layout()

# What is the max ambient and module temperature?

In [None]:
# Highest ambient and module temperatures recorded in the weather data.
ambiant_temp_max = wth1['AMBIENT_TEMPERATURE'].max()
module_temp_max = wth1['MODULE_TEMPERATURE'].max()
print("max ambient temperature is:", ambiant_temp_max)
# The second value is the module temperature, so it is labelled as such.
print("max module temperature is:", module_temp_max)

## What is the maximum/minimum amount of DC/AC Power generated in a time interval/day?

In [None]:
# Daily totals of DC and AC power: sum of each day's readings divided by 4.
generation_by_day = gen1.set_index('DATE_TIME').resample('D')
day_DC = generation_by_day['DC_POWER'].sum()/4
day_AC = generation_by_day['AC_POWER'].sum()/4

# DC on the left axis and AC on a twin right axis, each with its own fixed y-range.
plt.figure(figsize=(12,6))
ax = sns.lineplot(data=day_DC, color='b', label='DC POWER')
ax.set_ylim(0, 2*1000000)
plt.legend()
ax1 = ax.twinx()
ax1 = sns.lineplot(data=day_AC, color='r', label='AC POWER')
ax1.set_ylim(0, 2*100000)
plt.legend()
plt.tight_layout()

# Extremes of the daily totals computed above.
day_DC_max = day_DC.max()
day_AC_max = day_AC.max()
day_DC_min = day_DC.min()
day_AC_min = day_AC.min()
for message, amount in (
    ("Maximum amount of DC Power generated in a time interval/day is ", day_DC_max),
    ("Maximum amount of AC Power generated in a time interval/day is ", day_AC_max),
    ("Minimum amount of DC Power generated in a time interval/day is ", day_DC_min),
    ("Minimum amount of AC Power generated in a time interval/day is ", day_AC_min),
):
    print(message, amount, "kW")

## DC/AC produced by each inverter

In [None]:
# Number of distinct inverter IDs. It is printed explicitly because a bare
# expression in the middle of a cell is not rendered.
print("Number of inverters:", gen1['inverter_ID'].unique().shape[0])

# Total DC power per inverter over the whole period of the data
# (sum of the readings divided by 4), sorted ascending for the bar chart.
total_DC_per_inverter = gen1.groupby('inverter_ID')['DC_POWER'].agg(lambda x: x.sum()/4).sort_values(ascending=True)
total_DC_per_inverter.plot.barh(title="Total DC power generated per inverter", color='b', alpha=0.7)

There are 22 inverters in Plant 1. The total amounts of DC power produced by the inverters are close to each other, but the inverters "1BY6WEcLGh8j5v7" and "bvBOhCH3iADSZry" produce less DC power than the others.

## Combine power generation data and weather data

In [None]:
# Drop the columns that are not used in the combined table.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
gen1 = gen1.drop(columns=["PLANT_ID"], errors='ignore')
wth1 = wth1.drop(columns=["PLANT_ID", "SOURCE_KEY"], errors='ignore')

# Attach the weather readings to every generation row with the same timestamp.
all_data = pd.merge(gen1, wth1, how='left', on=['DATE_TIME'])
all_data.head()

In [None]:
# Columns of gen1 split by dtype: non-object columns first, then object columns.
numerical_cols = gen1.select_dtypes(exclude='object').columns
categorical_cols = gen1.select_dtypes(include='object').columns
for column_group in (numerical_cols, categorical_cols):
    print(column_group)

# Hand-written feature groups: timestamp, numeric measurements, categorical key.
date_features = ['DATE_TIME']
numeric_features = [
    'DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD',
    'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION',
]
categorical_features = ['inverter_ID']

##  Day,Hour variation analyse

In [None]:
#extract time features
from tqdm import tqdm

def num_to_date(df,date_cols):
    """Add calendar-part columns for each datetime column named in ``date_cols``.

    For every column ``c`` in ``date_cols`` the columns ``c_date``, ``c_month``,
    ``c_day``, ``c_dayofweek``, ``c_hour`` and ``c_time`` are written into ``df``
    from the matching ``.dt`` attributes.

    The frame is modified in place and the same object is returned.
    """
    # Written in this order so the new columns appear in the same sequence.
    calendar_parts = ('date', 'month', 'day', 'dayofweek', 'hour', 'time')
    for column_name in tqdm(date_cols):
        accessor = df[column_name].dt
        for part in calendar_parts:
            df[column_name + '_' + part] = getattr(accessor, part)
    return df

In [None]:
# Add the derived calendar columns (date, month, day, dayofweek, hour, time)
# for every column listed in date_features, then preview the result.
all_data = num_to_date(all_data,date_features)
all_data.head()

In [None]:
# "value_counts" over time: number of rows per calendar date, per month and per hour.
# Exactly one figure is created; a bare plt.figure() before it would only add
# an empty extra figure to the cell output.
plt.figure(figsize=(16, 6))
i = 1
for f in date_features:
    for col in ['date','month', 'hour']:
        plt.subplot(2, 4, i)
        i += 1
        v = all_data[f + '_' + col].value_counts()
        ax = sns.barplot(x=v.index, y=v.values)
        # Rotate the tick labels so they do not overlap.
        for item in ax.get_xticklabels():
            item.set_rotation(90)
        plt.title(f + '_' + col)
plt.tight_layout()
plt.show()

The number of records should be the same for each day and each hour, so there must be missing data on some days.

In [None]:
# Box plots of DC_POWER and AC_POWER grouped by calendar date, month and hour.
# Exactly one figure is created; a bare plt.figure() before it would only add
# an empty extra figure to the cell output.
plt.figure(figsize=(16,10))

i = 1
for f in date_features:
    for col in ['date','month','hour']:
        for y_col in ["DC_POWER","AC_POWER"]:
            plt.subplot(2, 3, i)
            i += 1
            ax = sns.boxplot(x=all_data[f + '_' + col], y=all_data[y_col])
            # Rotate the tick labels so they do not overlap.
            for item in ax.get_xticklabels():
                item.set_rotation(90)
            # Separate the grouping name from the rest of the title with a space.
            plt.title(col + " variation of " + y_col)
plt.tight_layout()
plt.show()

In [None]:
# Box plots of the weather measurements grouped by month, calendar date and hour.
# Exactly one figure is created; a bare plt.figure() before it would only add
# an empty extra figure to the cell output.
plt.figure(figsize=(16,16))

i = 1
for f in date_features:
    for y_col in ["AMBIENT_TEMPERATURE","MODULE_TEMPERATURE","IRRADIATION"]:
        for col in ['month', 'date','hour']:
            plt.subplot(4, 3, i)
            i += 1
            ax = sns.boxplot(x=all_data[f + '_' + col], y=all_data[y_col])
            # Rotate the tick labels so they do not overlap.
            for item in ax.get_xticklabels():
                item.set_rotation(90)
            plt.title(col + " variation of "+y_col)
plt.tight_layout()
plt.show()

The daily profiles of irradiation and module temperature are similar: both increase from 0h to 12h and decrease from 12h until night. At night, when there is no sunlight, module temperature drops close to ambient temperature while irradiation drops to 0, which makes sense.
The hourly variation of ambient temperature shows that its variance is higher in the afternoon.

## correlation analyse

In [None]:
# Pairwise correlation between the numeric features.
corr = all_data[numeric_features].corr()
# Plot the magnitude only: round to two decimals, then take the absolute value.
corr_magnitude = abs(np.around(corr, 2))
plt.figure(figsize=(8,6))
sns.heatmap(corr_magnitude, annot=True, linewidths=0.1, cmap=sns.cm.rocket_r)
plt.show()

The correlation coefficient between DC power and AC power is 1, which means that AC power can be derived from DC power, so I will
concentrate on DC power for the further analysis of power generation.
The correlation coefficients between DC power, irradiation and module temperature are very high as well.

In [None]:
# Pairwise scatter plots of the power and weather columns, with KDE curves on the diagonal.
pairplot_columns = ['DC_POWER', 'AC_POWER','AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']
sns.pairplot(all_data[pairplot_columns], diag_kind='kde')

In [None]:
# Ratio DC_POWER / IRRADIATION, used to evaluate the efficiency of the inverters.
all_data['DC_irr'] = all_data['DC_POWER'].div(all_data['IRRADIATION'])
# Distribution of the ratio over all rows.
sns.distplot(all_data['DC_irr'], label="DCpower/irradiation")

Most values of DC_POWER/IRRADIATION lie between 5000 and 20000; values close to 0 should correspond to irradiation = 0.
In the next step, I will take a closer look at the data for which DC_POWER/IRRADIATION is between 6000 and 20000, and try to find out what causes the differences in the DC_POWER/IRRADIATION ratio.

In [None]:
# Keep only the rows whose DC_POWER/IRRADIATION ratio lies in [left, right].
left = 6000
right = 20000
# Series.between includes both bounds by default. The boolean `inclusive=True`
# form is deprecated and rejected by recent pandas versions, so it is omitted.
cleaned_DCpower = all_data[all_data['DC_irr'].between(left, right)]
# DC power against irradiation, coloured by inverter.
sns.relplot(x="IRRADIATION", y="DC_POWER", hue="inverter_ID", height=6, data=cleaned_DCpower)


In [None]:
# Module temperature & DC_POWER/IRRADIATION.
# Average the ratio and the weather readings over all inverters for each timestamp.
mean_efficiency_DC = cleaned_DCpower.groupby('DATE_TIME').agg({'DC_irr':'mean','MODULE_TEMPERATURE':'mean','IRRADIATION':'mean'})
# Regress the ratio on module temperature, the relationship this cell is meant to show.
sns.lmplot(x="MODULE_TEMPERATURE", y="DC_irr", height=6, data=mean_efficiency_DC)

    DC_POWER/IRRADIATION decreases as module temperature increases; this is probably caused by overheating of the equipment, which reduces its efficiency.

In [None]:
# Density curves of every numeric feature, restricted to rows whose hour is between 6 and 18.
data = all_data[all_data['DATE_TIME_hour'].between(6,18)]
plt.figure(figsize=(15, 15))
for position, col in enumerate(numeric_features, start=1):
    # One panel per feature on a 5 x 4 grid.
    plt.subplot(5, 4, position)
    sns.distplot(data[col], label='', hist=False)
plt.tight_layout()
plt.show()

## Yield 15min

In [None]:
# Yield in each 15-minute interval, computed per inverter as
# (next TOTAL_YIELD reading - current TOTAL_YIELD reading).
# NOTE(review): this presumes TOTAL_YIELD is a running total - confirm against the data description.
# Keyword arguments are used because positional arguments to DataFrame.pivot
# are deprecated and rejected by recent pandas versions.
pivoted_yield = all_data.pivot(index='DATE_TIME', columns='inverter_ID', values='TOTAL_YIELD')
gen_power = pivoted_yield.shift(-1) - pivoted_yield
gen_power = gen_power.stack().reset_index().rename(columns={0:'yieled_15mn'})

# Drop any result left from a previous run of this cell first; otherwise the
# merge would produce suffixed duplicate columns instead of 'yieled_15mn'.
all_data = pd.merge(
    all_data.drop(columns=['yieled_15mn'], errors='ignore'),
    gen_power,
    how='left',
    on=['DATE_TIME','inverter_ID'],
)

## Day variation

In [None]:
# Daily figures per inverter, to compare daily yield with total DC/AC power:
#   DAILY_YIELD -> max - min within the day
#   DC_POWER / AC_POWER -> sum of the day's readings divided by 4
daily_inverter_spec = {
    'DAILY_YIELD': lambda x: x.max() - x.min(),
    'DC_POWER': lambda x: x.sum()/4,
    'AC_POWER': lambda x: x.sum()/4,
}
day_gen_inverter = (
    gen1.set_index('DATE_TIME')
    .groupby('inverter_ID')
    .resample('D')
    .apply(daily_inverter_spec)
)

In [None]:
# Sum the per-inverter daily figures over all inverters to get one row per day.
day_gen_total=day_gen_inverter.groupby('DATE_TIME').sum()
day_gen_total.head()

In [None]:
# Daily weather summary:
#   IRRADIATION         -> sum of the day's readings
#   AMBIENT_TEMPERATURE -> daily range (max - min)
#   MODULE_TEMPERATURE  -> daily mean
daily_weather_spec = {
    'IRRADIATION': sum,
    'AMBIENT_TEMPERATURE': lambda x: x.max() - x.min(),
    'MODULE_TEMPERATURE': lambda x: x.mean(),
}
daily_wth = wth1.set_index('DATE_TIME').resample('D').apply(daily_weather_spec)
daily_wth.head()

In [None]:
fig, ax1 = plt.subplots(figsize=(16,5))

# Left axis: both temperature series over time.
ax1.set_xlabel('DATE_TIME')
ax1.set_ylabel('TEMPERATURE C')
for temperature_column, line_color in (
    ('AMBIENT_TEMPERATURE', 'tab:green'),
    ('MODULE_TEMPERATURE', 'tab:blue'),
):
    ax1.plot(wth1['DATE_TIME'], wth1[temperature_column], color=line_color, label=temperature_column)
ax1.tick_params(axis='y')
plt.legend()

# Right axis: irradiation on a second y-axis that shares the x-axis
# (the x-label was already set on ax1).
ax2 = ax1.twinx()
irradiation_color = 'tab:orange'
ax2.set_ylabel('Irradiation : w/m2')
ax2.plot(wth1['DATE_TIME'], wth1['IRRADIATION'], color=irradiation_color, label='IRRADIATION', alpha=0.5)
ax2.tick_params(axis='y', labelcolor=irradiation_color)

fig.tight_layout()
plt.show()

# Prediction

Try to predict the DC power for the next days.

## ByFbProphet

In [None]:
# Total DC power over all inverters per timestamp, with the columns renamed to
# 'ds' (timestamp) and 'y' (value).
pred_gen_all = (
    gen1.groupby('DATE_TIME')['DC_POWER']
    .sum()
    .reset_index()
    .rename(columns={'DATE_TIME':'ds','DC_POWER':'y'})
)
# The "training set" contains DC power generation data before 2020-06-10;
# data from 2020-06-10 onwards is used as the "validation set".
in_training_period = pred_gen_all['ds'] < '2020-06-10'
pred_gen = pred_gen_all[in_training_period].copy()

pred_gen.plot(x='ds', y='y', figsize=(17,5))
plt.legend('')
plt.title('DC_POWER', size=17)
plt.show()

In [None]:
# Prophet model with yearly seasonality disabled and daily seasonality enabled,
# fitted on the training frame pred_gen.
m = Prophet(yearly_seasonality=False,daily_seasonality=True)
m.fit(pred_gen)

In [None]:
# Predict 7 days into the future at 15-minute resolution
# (7 days * 24 hours * 4 steps per hour = 672 periods).
future =m.make_future_dataframe(periods=7*24*4,freq='15min')
forecast = m.predict(future)

In [None]:
# Plot the forecast, then its components; `fig` is reused, so it ends up
# bound to the result of the second call.
fig=m.plot(forecast,figsize=(15,7))
fig=m.plot_components(forecast,figsize=(15,7))

In [None]:
# Compare the predicted values with the original data on one set of axes.
observed_series = pred_gen_all.set_index('ds')
predicted_series = forecast.set_index('ds')['yhat']
plt.figure(figsize=(20,6))
plt.plot(observed_series, label="original_data")
plt.plot(predicted_series, label="prediction")
plt.legend()

## Predict DC Power by neural network

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation,Flatten,LSTM,TimeDistributed, RepeatVector
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint
from sklearn.preprocessing import StandardScaler,MinMaxScaler

from subprocess import check_output
from keras.models import Sequential
from sklearn.model_selection import  train_test_split
import time #helper libraries
\
from numpy import newaxis


In [None]:


# Time-indexed copy of the aggregated DC power series.
df = pred_gen_all.set_index('ds')
split_date = pd.Timestamp('2020-06-09 00:00:00')

# Label slices (.loc[:split_date] / .loc[split_date:]) include both end points,
# which would put the split_date row in BOTH sets. Boolean masks keep the two
# sets disjoint: train is strictly before the split, test starts at it.
train = df.loc[df.index < split_date]
test = df.loc[df.index >= split_date]

# Draw both parts on one explicit axes; calling plt.figure() and then
# DataFrame.plot() without `ax` would leave an empty extra figure behind.
fig, ax = plt.subplots(figsize=(10, 6))
train.plot(ax=ax)
test.plot(ax=ax)
ax.legend(['train', 'test'])


In [None]:
# Scale to [-1, 1]; the scaler is fitted on the training part only and then
# applied unchanged to the test part.
scaler = MinMaxScaler(feature_range=(-1, 1))
train_sc = scaler.fit_transform(train)
test_sc = scaler.transform(test)

# Training and validation pairs: the input is the value at one step and the
# target is the value at the following step.
X_train, y_train = train_sc[:-1], train_sc[1:]
X_test, y_test = test_sc[:-1], test_sc[1:]

In [None]:
# Small feed-forward network: one input value -> 12 ReLU units -> one output value.
nn_model = Sequential()
for layer in (Dense(12, input_dim=1, activation='relu'), Dense(1)):
    nn_model.add(layer)
nn_model.compile(optimizer='adam', loss='mean_squared_error')

# Stop once the training loss has not improved for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='loss', patience=2, verbose=1)

# Samples are fed one at a time in their original order (no shuffling).
history = nn_model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=1,
    shuffle=False,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# r2_score is not imported anywhere else in the notebook; import it here so the
# cell does not fail with NameError.
from sklearn.metrics import r2_score

# Predictions of the fitted network on the held-out and on the training inputs.
y_pred_test_nn = nn_model.predict(X_test)
y_train_pred_nn = nn_model.predict(X_train)

# "\t" separates label and value (the original strings had a literal "t" there).
print("The R2 score on the Train set is:\t{:0.3f}".format(r2_score(y_train, y_train_pred_nn)))
print("The R2 score on the Test set is:\t{:0.3f}".format(r2_score(y_test, y_pred_test_nn)))

