In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import data

In [None]:
# Load the generation and weather-sensor CSV files of both plants.
plant1, plant2, weather1, weather2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/solar-power-generation-data/Plant_1_Generation_Data.csv',
        '/kaggle/input/solar-power-generation-data/Plant_2_Generation_Data.csv',
        '/kaggle/input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv',
        '/kaggle/input/solar-power-generation-data/Plant_2_Weather_Sensor_Data.csv',
    )
)

In [None]:
# Preview the last rows of the Plant 1 generation data.
plant1.tail()

In [None]:
# Preview the last rows of the Plant 2 generation data.
plant2.tail()

In [None]:
# Convert DATE_TIME from text to datetime and lower-case all column names.
# Plant 1's generation data is parsed with a day-first format without seconds;
# the other three frames are parsed with a year-first format including seconds.
for frame, timestamp_format in (
    (plant1, '%d-%m-%Y %H:%M'),
    (weather1, '%Y-%m-%d %H:%M:%S'),
    (plant2, '%Y-%m-%d %H:%M:%S'),
    (weather2, '%Y-%m-%d %H:%M:%S'),
):
    frame['DATE_TIME'] = pd.to_datetime(frame['DATE_TIME'], format=timestamp_format)
    frame.columns = frame.columns.str.lower()

First, let's take a look at the DC and AC power generated by the two plants, summed by time of day.

In [None]:
# Print the first rows of both generation frames (now with lower-cased columns
# and a parsed date_time column) as plain text.
print(plant1.head())
print(plant2.head())

In [None]:
# Split each timestamp into a calendar-date column and a time-of-day column.
for generation in (plant1, plant2):
    timestamps = generation['date_time'].dt
    generation['date'] = timestamps.date
    generation['time'] = timestamps.time

In [None]:
# Total DC and AC power per time of day, summed over all rows sharing that time.
dc_plant1, ac_plant1 = (
    plant1.groupby('time')[power_column].sum()
    for power_column in ('dc_power', 'ac_power')
)
dc_plant2, ac_plant2 = (
    plant2.groupby('time')[power_column].sum()
    for power_column in ('dc_power', 'ac_power')
)

In [None]:
# Side-by-side comparison of the two plants: DC power on the left, AC on the right.
fig, ax = plt.subplots(1, 2, dpi=100, figsize=(20, 5))
panels = (
    (ax[0], 'DC Power', (dc_plant1, dc_plant2)),
    (ax[1], 'AC Power', (ac_plant1, ac_plant2)),
)
for panel_ax, panel_title, plant_series in panels:
    # Plant 1 is drawn first so the legend order matches the line order.
    for series in plant_series:
        series.plot(ax=panel_ax)
    panel_ax.legend(['plant1', 'plant2'])
    panel_ax.set_title(panel_title)
plt.show()

In these graphs we see that Plant 1 reports much more DC power than Plant 2; nevertheless, the AC power of Plant 1 and Plant 2 does not differ nearly as much as the DC power does.
Maybe Plant 1 has some issues with converting DC power to AC power.

In [None]:
# Daily totals per plant. numeric_only=True restricts the sum to numeric
# columns: the frames also hold datetime/object columns (date_time, time,
# source_key), and recent pandas versions raise when asked to sum those.
# groupby does not modify its input, so no defensive copy is needed.
loss_p1 = plant1.groupby('date').sum(numeric_only=True)
loss_p2 = plant2.groupby('date').sum(numeric_only=True)

# Despite its name, 'losses' holds the share of DC power that ends up as AC
# power (AC / DC, in percent), i.e. the conversion ratio rather than the loss.
loss_p1['losses'] = loss_p1['ac_power'] / loss_p1['dc_power'] * 100
loss_p2['losses'] = loss_p2['ac_power'] / loss_p2['dc_power'] * 100

#Plot the losses
fig, ax = plt.subplots(2, 1, sharex = True, dpi=100, figsize=(13,7))
loss_p1['losses'].plot(style='o--', ax=ax[0])
loss_p2['losses'].plot(style='o--', ax=ax[1])
ax[0].set_title('Percentage of DC Power converted into AC Power for Plant 1')
ax[1].set_title('Percentage of DC Power converted into AC Power for Plant 2')
plt.xticks(rotation=45)
plt.show()

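In these graphs we see that Plant 1 converts only about 9.78 percent of its accumulated DC power into AC power, whereas Plant 2 converts about 97.8 percent. (The almost exact factor of ten between the two figures suggests that Plant 1's DC readings may be recorded on a different scale rather than reflecting a real conversion loss; this is worth verifying against the dataset description.)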

Now let's check whether all solar batteries are working properly.

In [None]:
# Yield gained by each source over the recorded period: last total_yield
# reading minus the first one, in row order (assumes rows are in time order
# within each source — TODO confirm).
unique_keyes = set(plant1['source_key'])
# groupby sorts the keys, so the x-axis order is the same on every run
# (iterating a set of strings is not), and the frame is scanned only once.
total_dc_power_p1 = (
    plant1.groupby('source_key')['total_yield']
    .agg(lambda readings: readings.iloc[-1] - readings.iloc[0])
    .to_dict()
)
print(total_dc_power_p1)
fig, ax = plt.subplots(figsize = (17, 5))
ax.plot(list(total_dc_power_p1.values()), marker = '^', linestyle = '-.')
ax.set(xlabel = 'Source', ylabel='kW', title='Total yielded power by different solar battery for plant 1')
# One tick per source actually present instead of a hard-coded count of 22.
plt.xticks(range(len(total_dc_power_p1)), list(total_dc_power_p1.keys()), rotation = 90)
plt.show()

In [None]:
# Yield gained by each source over the recorded period: last total_yield
# reading minus the first one, in row order (assumes rows are in time order
# within each source — TODO confirm).
unique_keyes2 = set(plant2['source_key'])
# groupby sorts the keys, so the x-axis order is the same on every run
# (iterating a set of strings is not), and the frame is scanned only once.
total_dc_power_p2 = (
    plant2.groupby('source_key')['total_yield']
    .agg(lambda readings: readings.iloc[-1] - readings.iloc[0])
    .to_dict()
)
print(total_dc_power_p2)
fig, ax = plt.subplots(figsize = (17, 5))
ax.plot(list(total_dc_power_p2.values()), marker = '^', linestyle = '-.')
ax.set(xlabel = 'Source', ylabel='kW', title='Total yielded power by different solar battery for plant 2')
# One tick per source actually present instead of a hard-coded count of 22.
plt.xticks(range(len(total_dc_power_p2)), list(total_dc_power_p2.keys()), rotation = 90)
plt.show()

In these graphs we see that two solar batteries at Plant 1 are less efficient than the others. At Plant 2 there are about four batteries that are less efficient than the others.

Now let's take a look at the worst and the best solar battery at Plant 1.

In [None]:
worst_inverter_p1 = plant1[plant1['source_key'] == 'bvBOhCH3iADSZry'].reset_index(drop=True)

# Pivot once: one row per time of day, one column per calendar day.
grouped = worst_inverter_p1.groupby(['time', 'date'])
dc_by_day = grouped['dc_power'].mean().unstack()
yield_by_day = grouped['daily_yield'].mean().unstack()
cols = dc_by_day.columns

# Two panels per row, with as many rows as the number of days requires
# (17 rows for 34 days) instead of a hard-coded layout.
n_rows = -(-len(cols) // 2)
ax = dc_by_day.plot(sharex=True, subplots=True, layout=(n_rows, 2), figsize=(20, 30))
# Only the first len(cols) axes carry data; reuse exactly those for the overlay.
# layout/sharex are not repeated here: pandas ignores them (with a warning)
# when existing axes are passed in.
day_axes = ax.ravel()[:len(cols)]
yield_by_day.plot(subplots=True, ax=day_axes, style='-.')
# Pair every panel with its day directly rather than via a manual counter,
# so the loop cannot run past the available day labels.
for day_ax, day in zip(day_axes, cols):
    day_ax.set_title(day, size=15)
    day_ax.legend(['dc_power','daily_yield'])
plt.tight_layout()
plt.show()

Here we can see that on June 7th and June 14th there were periods when the worst solar battery produced no energy at all.
There is also missing data for some hours from May 19th to May 21st.

In [None]:
best_inverter_p1 = plant1[plant1['source_key'] == 'adLQvlD726eNBSB'].reset_index(drop=True)

# Pivot once: one row per time of day, one column per calendar day.
grouped = best_inverter_p1.groupby(['time', 'date'])
dc_by_day = grouped['dc_power'].mean().unstack()
yield_by_day = grouped['daily_yield'].mean().unstack()
cols = dc_by_day.columns

# Two panels per row, with as many rows as the number of days requires
# (17 rows for 34 days) instead of a hard-coded layout.
n_rows = -(-len(cols) // 2)
ax = dc_by_day.plot(sharex=True, subplots=True, layout=(n_rows, 2), figsize=(20, 30))
# Only the first len(cols) axes carry data; reuse exactly those for the overlay.
# layout/sharex are not repeated here: pandas ignores them (with a warning)
# when existing axes are passed in.
day_axes = ax.ravel()[:len(cols)]
yield_by_day.plot(subplots=True, ax=day_axes, style='-.')
# Pair every panel with its day directly rather than via a manual counter,
# so the loop cannot run past the available day labels.
for day_ax, day in zip(day_axes, cols):
    day_ax.set_title(day, size=15)
    day_ax.legend(['dc_power','daily_yield'])
plt.tight_layout()
plt.show()

Here we also see some missing data, but the best solar battery at Plant 1 was working properly the whole time.

In [None]:
worst_inverter_p2 = plant2[plant2['source_key'] == 'Quc1TzYxW2pYoWX'].reset_index(drop=True)

# Pivot once: one row per time of day, one column per calendar day.
grouped = worst_inverter_p2.groupby(['time', 'date'])
dc_by_day = grouped['dc_power'].mean().unstack()
yield_by_day = grouped['daily_yield'].mean().unstack()
cols = dc_by_day.columns

# Two panels per row, with as many rows as the number of days requires
# (17 rows for 34 days) instead of a hard-coded layout.
n_rows = -(-len(cols) // 2)
ax = dc_by_day.plot(sharex=True, subplots=True, layout=(n_rows, 2), figsize=(20, 30))
# Only the first len(cols) axes carry data; reuse exactly those for the overlay.
# layout/sharex are not repeated here: pandas ignores them (with a warning)
# when existing axes are passed in.
day_axes = ax.ravel()[:len(cols)]
yield_by_day.plot(subplots=True, ax=day_axes, style='-.')
# Pair every panel with its day directly rather than via a manual counter,
# so the loop cannot run past the available day labels.
for day_ax, day in zip(day_axes, cols):
    day_ax.set_title(day, size=15)
    day_ax.legend(['dc_power','daily_yield'])
plt.tight_layout()
plt.show()

In these graphs we also see that the worst inverter at Plant 2 was not producing energy on some days. Maybe this is because some equipment was being repaired on those days. If not, that battery should be repaired or replaced.

Let's take a look at the correlation between the features.


In [None]:
# Attach the weather reading taken at the same timestamp to every generation
# row, add the hour of day, and drop identifier/timestamp columns.
df_plant1 = (
    plant1
    .merge(weather1, on='date_time', suffixes=['', '_w'])
    .assign(hour=lambda merged: merged['date_time'].dt.hour)
    .drop(columns=['source_key_w', 'plant_id', 'plant_id_w', 'date', 'time', 'date_time'])
)
# source_key is a text identifier, so it is excluded from the correlation table.
df_to_corr = df_plant1.drop(columns='source_key')
df_to_corr.corr()

In this table we see that power generation is strongly correlated with module temperature and irradiation, and less strongly correlated with ambient_temperature (Pearson r = 0.725). The daily yielded power is also clearly correlated with the hour of the day.

# Task 2

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

We are going to predict the AC power generated every 15 minutes by Plants 1 and 2 on June 16th and 17th.

We assume that we have a precise forecast of the ambient temperature but no other features, so I will delete the DC power, daily yield, total yield, module temperature and irradiation columns. I will also add an hour column, as a plant produces energy only from 6 AM to 6 PM.

### Step 1. Delete and add new features

In [None]:
# Pair every Plant 1 generation row with the weather reading at the same timestamp.
df1 = plant1.merge(weather1, on='date_time', suffixes=['', '_w'])
# Columns removed before modelling: timestamps, identifiers, and the
# measurements that are treated as unknown at prediction time.
columns_to_drop = [
    'date_time', 'date', 'time',
    'dc_power', 'daily_yield', 'total_yield',
    'module_temperature', 'irradiation',
    'source_key_w', 'plant_id', 'plant_id_w',
]
# The hour of day is appended as the last column of the reduced frame.
df_p1 = df1.drop(columns=columns_to_drop).assign(hour=df1['date_time'].dt.hour)

### Step 2. Encode categorical features (source key)

In [None]:
# Turn the source_key identifier into one indicator column per source.
values = np.array(df_p1['source_key'])
#Encoding
label_enc = LabelEncoder()
integer_enc = label_enc.fit_transform(values)
# The encoder's default sparse output is densified explicitly: the `sparse=`
# keyword used previously is not accepted by current scikit-learn releases,
# while this form works on old and new versions alike.
onehot_enc = OneHotEncoder()
enc_keys = onehot_enc.fit_transform(integer_enc.reshape(-1, 1)).toarray()
# Name each indicator column after its source key (column i belongs to
# label_enc.classes_[i]). String names keep the final feature frame free of
# mixed str/int column names, which recent scikit-learn rejects when fitting.
enc_keys_df = pd.DataFrame(enc_keys, columns=list(label_enc.classes_))
# Create dictionary with categories of encoded feature, to use them in plotting
keys = label_enc.classes_
values = label_enc.transform(label_enc.classes_)
dictionary = dict(zip(values, keys))

In [None]:
# Append the indicator columns and remove the text identifier they replace.
# NOTE: this reassigns df_p1, so re-running this cell alone fails because
# 'source_key' is no longer present; re-run the feature cell first.
df_p1 = pd.concat([df_p1, enc_keys_df], axis=1).drop(columns='source_key')
df_p1.head()

### Step 3. Split dataset into train and test

In [None]:
# Every column except the first is used as a feature (the first column is
# presumably the ac_power target — confirm against df_p1.head()).
train_columns = df_p1.columns[1:]

# Time-based split: rows before June 16th are used for fitting, rows from
# June 16th onwards for evaluation.
is_before_split = df1['date_time'] < '2020-06-16'
is_from_split = df1['date_time'] >= '2020-06-16'

# NOTE: the names below are swapped relative to the usual convention and are
# kept only because later cells refer to them:
#   X_train -> training features     X_test  -> training target
#   y_train -> evaluation features   y_test  -> evaluation target
# The targets are kept as Series (reset_index(drop=True)), matching the Plant 2
# split, instead of being turned into one-column DataFrames.
X_train = df_p1.loc[is_before_split, train_columns].reset_index(drop=True)
X_test = df_p1.loc[is_before_split, 'ac_power'].reset_index(drop=True)
y_test = df_p1.loc[is_from_split, 'ac_power'].reset_index(drop=True)
y_train = df_p1.loc[is_from_split, train_columns].reset_index(drop=True)

### Step 4. Define the model

In [None]:
# Fixed seed so the reported scores are reproducible across runs.
reg = RandomForestRegressor(random_state=42)
# In this notebook X_test holds the training target and y_train the evaluation
# features. np.ravel guarantees a 1-D target whether X_test is a Series or a
# one-column DataFrame.
reg.fit(X_train, np.ravel(X_test))
y_pred_rf = reg.predict(y_train)
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, but R2 is not,
# so the true values must come first.
print(mean_absolute_error(y_test, y_pred_rf))
print(r2_score(y_test, y_pred_rf))

In [None]:
# Fixed seed so the reported scores are reproducible across runs.
gbr = GradientBoostingRegressor(learning_rate = 0.05, n_estimators=200, random_state=42)
# In this notebook X_test holds the training target and y_train the evaluation
# features. np.ravel guarantees a 1-D target whether X_test is a Series or a
# one-column DataFrame.
gbr.fit(X_train, np.ravel(X_test))
y_pred = gbr.predict(y_train)
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, but R2 is not,
# so the true values must come first.
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

### Step 5. Plot the results

In [None]:
# One panel per source: real vs. predicted AC power over the evaluation period.
# The panel count follows the number of encoded sources instead of a
# hard-coded 22; four panels per row (6 rows for 22 sources).
n_sources = len(dictionary)
n_cols = 4
n_rows = -(-n_sources // n_cols)
is_from_split = df1['date_time'] >= pd.to_datetime('2020-06-16')

fig = plt.figure(figsize=(20, 20))
fig.subplots_adjust(wspace=0.2, hspace=0.6)
for i in range(n_sources):
    ax = fig.add_subplot(n_rows, n_cols, i+1)
    index = df1[(df1['source_key'] == dictionary[i]) & is_from_split]['date_time'] # just for index
    # The +2 skips the two leading non-indicator feature columns of y_train
    # (presumably ambient_temperature and hour — confirm), so this selects
    # the evaluation rows belonging to source i.
    source_rows = y_train[y_train.columns[i+2]] == 1
    ax.plot(pd.DataFrame(y_test[source_rows]).set_index(index))
    ax.plot(pd.DataFrame(y_pred[source_rows]).set_index(index), color='darkorange')
    ax.set_title('"{}" source, Plant 1'.format(dictionary[i]))
    ax.legend(['real', 'predicted'])
    plt.xticks(rotation = 45)

As we can see in the graphs, our model was able to predict AC power quite well. There is, however, a pattern: on June 16th the real AC power was lower than the predicted one.
I think that might be because of cloudy weather, when the solar panels couldn't get enough sunlight. If we had another feature describing the weather type, maybe our model would be more precise.

MAE score of our model for Plant 1 - 61 kW, R2 score - 0.88

### Now let's do the same for Plant 2

In [None]:
# Pair every Plant 2 generation row with the weather reading at the same timestamp.
df2 = plant2.merge(weather2, on='date_time', suffixes=['', '_w'])
# Columns removed before modelling: timestamps, identifiers, and the
# measurements that are treated as unknown at prediction time.
columns_to_drop = [
    'date_time', 'date', 'time',
    'dc_power', 'daily_yield', 'total_yield',
    'module_temperature', 'irradiation',
    'source_key_w', 'plant_id', 'plant_id_w',
]
# The hour of day is appended as the last column of the reduced frame.
df_p2 = df2.drop(columns=columns_to_drop).assign(hour=df2['date_time'].dt.hour)

In [None]:
# Turn the source_key identifier into one indicator column per source.
values = np.array(df_p2['source_key'])

#Encoding

label_enc = LabelEncoder()
integer_enc = label_enc.fit_transform(values)
# The encoder's default sparse output is densified explicitly: the `sparse=`
# keyword used previously is not accepted by current scikit-learn releases,
# while this form works on old and new versions alike.
onehot_enc = OneHotEncoder()
enc_keys = onehot_enc.fit_transform(integer_enc.reshape(-1, 1)).toarray()
# Name each indicator column after its source key (column i belongs to
# label_enc.classes_[i]). String names keep the final feature frame free of
# mixed str/int column names, which recent scikit-learn rejects when fitting.
enc_keys_df = pd.DataFrame(enc_keys, columns=list(label_enc.classes_))

# Create dictionary with categories of encoded feature, to use them in plotting

keys = label_enc.classes_
values = label_enc.transform(label_enc.classes_)
dictionary = dict(zip(values, keys))

In [None]:
# Append the indicator columns and remove the text identifier they replace.
# NOTE: this reassigns df_p2, so re-running this cell alone fails because
# 'source_key' is no longer present; re-run the feature cell first.
df_p2 = pd.concat([df_p2, enc_keys_df], axis=1).drop(columns='source_key')
df_p2.head()

In [None]:
# Every column except the first is used as a feature (the first column is
# presumably the ac_power target — confirm against df_p2.head()).
train_columns = df_p2.columns[1:]

# Time-based split at June 16th.
rows_before_split = df2['date_time'] < '2020-06-16'
rows_from_split = df2['date_time'] >= '2020-06-16'

# NOTE: the names are swapped relative to the usual convention: X_test2 is the
# training target and y_train2 holds the evaluation features.
X_train2 = df_p2.loc[rows_before_split, train_columns].reset_index(drop=True)
X_test2 = df_p2.loc[rows_before_split, 'ac_power'].reset_index(drop=True)
y_test2 = df_p2.loc[rows_from_split, 'ac_power'].reset_index(drop=True)
y_train2 = df_p2.loc[rows_from_split, train_columns].reset_index(drop=True)

In [None]:
# Fixed seed so the reported scores are reproducible across runs.
gbr = GradientBoostingRegressor(learning_rate = 0.05, n_estimators=200, random_state=42)
# In this notebook X_test2 holds the training target and y_train2 the
# evaluation features.
gbr.fit(X_train2, X_test2)
y_pred2 = gbr.predict(y_train2)
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, but R2 is not,
# so the true values must come first.
print(mean_absolute_error(y_test2, y_pred2))
print(r2_score(y_test2, y_pred2))

In [None]:
# One panel per source: real vs. predicted AC power over the evaluation period.
# The panel count follows the number of encoded sources instead of a
# hard-coded 22; four panels per row (6 rows for 22 sources).
n_sources = len(dictionary)
n_cols = 4
n_rows = -(-n_sources // n_cols)
is_from_split = df2['date_time'] >= pd.to_datetime('2020-06-16')

fig = plt.figure(figsize=(20, 20))
fig.subplots_adjust(wspace=0.2, hspace=0.6)
for i in range(n_sources):
    ax = fig.add_subplot(n_rows, n_cols, i+1)
    index = df2[(df2['source_key'] == dictionary[i]) & is_from_split]['date_time'] # just for index
    # The +2 skips the two leading non-indicator feature columns of y_train2
    # (presumably ambient_temperature and hour — confirm), so this selects
    # the evaluation rows belonging to source i.
    source_rows = y_train2[y_train2.columns[i+2]] == 1
    ax.plot(pd.DataFrame(y_test2[source_rows]).set_index(index))
    ax.plot(pd.DataFrame(y_pred2[source_rows]).set_index(index), color='darkorange')
    ax.set_title('"{}" source, Plant 2'.format(dictionary[i]))
    ax.legend(['real', 'predicted'])
    plt.xticks(rotation = 45)

As we can see, the prediction of our model for Plant 2 is less precise than for Plant 1. I think this might be because of the days when particular solar batteries at Plant 2 didn't work properly (we saw this in Task 1).

We also see that for the most efficient solar batteries ('xMbIugepa2P7lBB', 'IQ2d7wF4YD8zU1Q' and 'NgDl19wMapZy17u') our model gives more precise predictions. This is because they worked properly during the whole period.

# Conclusion

## Task 1:

- We saw that Plant 1 accumulates more DC power but has problems converting it to AC power.
- We saw that not all solar batteries are working properly at Plant 1 or at Plant 2. We were able to suggest which batteries need to be repaired or replaced.

## Task 2:

- We defined two ML models to predict AC power generation at both plants.
- GradientBoosting gave us more precise results than RandomForest (MAE and R2 score were 61 kW and 88.4%, respectively).
- We plotted the predictions against the real results for each battery to see the patterns in our model's behaviour.

# Thanks for reading my work!
## I am a beginner at Data Science, so if I did something wrong, please leave a comment so I can do better next time