In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def mydateparser(x):
    """Parse timestamps written as ``day-month-year hour:minute``.

    ``pd.datetime`` (the alias the previous implementation relied on) has been
    removed from pandas, so the parsing is delegated to ``pd.to_datetime`` with
    an explicit format instead.

    Parameters
    ----------
    x : str or array-like of str
        Timestamp text such as ``"15-05-2020 13:45"``.

    Returns
    -------
    pandas.Timestamp or pandas.DatetimeIndex
        The parsed value(s).

    Raises
    ------
    ValueError
        If the text does not match the ``%d-%m-%Y %H:%M`` format.
    """
    return pd.to_datetime(x, format="%d-%m-%Y %H:%M")


### Let's concentrate on plant 1 for the time being

In [None]:
# Load the Plant 1 generation data and index it by timestamp.
# The DATE_TIME column is parsed after reading, in a single vectorised call with
# an explicit day-first format, instead of going through read_csv's deprecated
# `date_parser` keyword.
gen_data = pd.read_csv("../input/solar-power-generation-data/Plant_1_Generation_Data.csv")
gen_data["DATE_TIME"] = pd.to_datetime(gen_data["DATE_TIME"], format="%d-%m-%Y %H:%M")
gen_data = gen_data.set_index("DATE_TIME")

In [None]:
# Preview the first five rows of the generation data.
gen_data.head(5)

In [None]:
# Split the timestamp index into a calendar-day column (stored as datetime64)
# and a time-of-day column (stored as datetime.time objects).
gen_data["Date"] = pd.to_datetime(gen_data.index.date)
gen_data["Time"] = gen_data.index.time

In [None]:
# Sanity check: rows reporting AC output while the DC input is zero.
gen_data[gen_data["DC_POWER"].eq(0) & gen_data["AC_POWER"].ne(0)]

In [None]:
# Sanity check: rows with DC input but no AC output.
gen_data[gen_data["DC_POWER"].ne(0) & gen_data["AC_POWER"].eq(0)]

### Exploration

Check for NaNs

In [None]:
# One boolean per column (in column order): True if that column has any null.
gen_data.isnull().any().tolist()

In [None]:
# Partition the column names by dtype: int64/float64 columns versus everything else.
numeric_columns, other_columns = [], []
for name in gen_data.columns:
    is_numeric = gen_data[name].dtype in ('int64', 'float64')
    (numeric_columns if is_numeric else other_columns).append(name)

In [None]:
# One boolean per numeric column: True if the column contains a NaN.
[bool(np.isnan(gen_data[name]).any()) for name in numeric_columns]

In [None]:
# Preview the first five values of the derived Date column.
gen_data["Date"].head(5)

Excellent, no NaNs or empty entries

Plot power production as a function of time for a few inverters

In [None]:
# Show the column labels available in the generation data.
gen_data.columns

In [None]:
# Number of distinct SOURCE_KEY values, i.e. how many inverters report data.
gen_data["SOURCE_KEY"].unique().size

In [None]:
# Distinct inverter identifiers, in order of first appearance.
inverters = pd.unique(gen_data["SOURCE_KEY"])

Visualizing a time lapse of power production for all inverters on a given day (15/05/2020 in this case)

In [None]:
# One log-scaled panel per inverter showing DC and AC power on 2020-05-15.
fig = plt.figure(figsize=(25, 16))
on_day = gen_data["Date"] == "2020-05-15"
for i, inverter in enumerate(inverters, start=1):
    ax = plt.subplot(6, 4, i)
    ax.set_yscale("log")
    readings = gen_data.loc[on_day & (gen_data["SOURCE_KEY"] == inverter)]
    readings["DC_POWER"].plot(ax=ax, label=inverter + " DC")
    readings["AC_POWER"].plot(ax=ax, label=inverter + " AC")
    ax.legend()

* DC and AC seem to follow each other i.e., inverter seems to produce DC and convert it to AC (sanity check)
* Inverters run from roughly 6AM to roughly 6PM

In [None]:
# Non-null row counts per inverter, for every column.
gen_data.groupby(by="SOURCE_KEY").count()

In [None]:
# Readings expected per inverter over the period: 34 days x 24 hours x 4
# (presumably four readings per hour, i.e. 15-minute sampling — confirm against the data).
34 * 24 * 4 #Number of data points required

So not all inverters have data at all points in time within the data-taking period. I'm going to assume the plant is compact enough that the inverters are quite close to each other and receive the same amount of solar irradiation at any point in time (the distribution plots roughly attest to this). The imputation strategy is therefore to fill a missing entry with the average, for that day and time, over the inverters that did report. This means the total power produced in a given time interval is the average power produced multiplied by the total number of inverters.

#### Day totals over the entire period of 34 days for each inverter

In [None]:
# Distinct calendar days present in the generation data.
pd.unique(gen_data["Date"])

In [None]:
# Total DC power per inverter over the whole period.
# The column is selected *before* aggregating: summing every column would also
# try to sum the datetime "Date" column and the "Time" objects, which raises
# in pandas >= 2.0 (non-numeric columns are no longer silently dropped).
gen_data.groupby("SOURCE_KEY")["DC_POWER"].sum()

In [None]:
# Map each inverter ID to the sub-frame holding only that inverter's rows.
split_by_inverters = {
    inverter: gen_data.loc[gen_data["SOURCE_KEY"] == inverter]
    for inverter in inverters
}

In [None]:
# Distinct calendar dates (datetime.date objects) taken from the timestamp index.
unique_dates = pd.Index(gen_data.index.date, name=gen_data.index.name).unique()

In [None]:
# Pull out one inverter's frame for ad-hoc inspection.
# NOTE(review): `temp` is not referenced by any later cell visible here — confirm it is still needed.
temp = split_by_inverters['1BY6WEcLGh8j5v7']

In [None]:
# Daily DC totals for every inverter, one panel each; the per-inverter daily
# series are kept in `inverter_daily_power` for later cells.
fig = plt.figure(figsize = (30,25))
inverter_daily_power = {}
for i,(inverter,data) in enumerate(split_by_inverters.items(),1):
    plt.subplot(6,4,i)
    # Select DC_POWER before summing: aggregating the whole frame would also
    # try to add up the non-numeric "Time"/"SOURCE_KEY" columns, which raises
    # in pandas >= 2.0.
    inverter_daily_power[inverter] = data.groupby("Date")["DC_POWER"].sum()
    inverter_daily_power[inverter].plot(label = inverter)
    plt.legend()

We see a very rough pattern in the daily power production. Let us try to correlate this with the weather data we have

In [None]:
# Load the Plant 1 weather-sensor readings, indexed by parsed timestamp.
weather_data = pd.read_csv(
    "../input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv",
    index_col="DATE_TIME",
    parse_dates=True,
)

In [None]:
# Derive a calendar-day column (datetime64) and a time-of-day column from the index.
weather_data["Date"] = pd.to_datetime(weather_data.index.date)
weather_data["Time"] = weather_data.index.time

In [None]:
# Preview the first five weather readings.
weather_data.head(5)

Temperature and irradiation profile in a given day (15-05-2020)

In [None]:
# Irradiation, ambient temperature and module temperature on 2020-05-15,
# drawn side by side in a 1x3 grid.
fig = plt.figure(figsize=(18, 4))
weather_on_day = weather_data.loc[weather_data["Date"] == "2020-05-15"]
weather_on_day["IRRADIATION"].plot(ax=plt.subplot(1, 3, 1), legend=True)
weather_on_day["AMBIENT_TEMPERATURE"].plot(ax=plt.subplot(1, 3, 2), legend=True)
weather_on_day["MODULE_TEMPERATURE"].plot(ax=plt.subplot(1, 3, 3), legend=True)

In [None]:
# Module temperature against ambient temperature for the readings of 2020-05-15.
sns.scatterplot(
    data=weather_data.loc[weather_data["Date"] == "2020-05-15"],
    x="AMBIENT_TEMPERATURE",
    y="MODULE_TEMPERATURE",
)

The thing with temperatures is that they're not instantaneous (i.e., a change in ambient temperature or irradiation will not immediately change the module temperature). The ambient temperature for example is sort of a delayed response to solar irradiation (because ground takes some time to heat up, and a lot of time to cool down). The module temperature gets complicated because it's influenced by the sun during the day and the cooling ground at night

In [None]:
# When did irradiation peak on 15 May 2020? (index label of the maximum reading)
weather_data[weather_data["Date"] == "2020-05-15"]["IRRADIATION"].idxmax()

In [None]:
# When did the ambient temperature peak on 15 May 2020?
weather_data[weather_data["Date"] == "2020-05-15"]["AMBIENT_TEMPERATURE"].idxmax()

In [None]:
# When did the module temperature peak on 15 May 2020?
weather_data[weather_data["Date"] == "2020-05-15"]["MODULE_TEMPERATURE"].idxmax()

Checking the above assertion that the ambient temperature peaks a while after maximum irradiation/module temperature for all days

In [None]:
# List the inverter IDs for which a daily DC-power series was stored.
inverter_daily_power.keys()

Rough plot of net solar irradiation vs Power produced in a given inverter over the data taking period

In [None]:
# Daily irradiation totals, keyed by calendar date (datetime.date objects).
# The "date" column is kept because later cells group by it.
weather_data["date"] = weather_data.index.map(lambda x : x.date())
# Select IRRADIATION before summing: aggregating every column would also try to
# sum the datetime/time/string columns, which raises in pandas >= 2.0.
daily_irradiation = weather_data.groupby("date")["IRRADIATION"].sum()
# scatterplot builds its plot frame from the x and y Series, and pandas aligns
# them on their index labels. The inverter series is indexed by datetime64
# days while `daily_irradiation` is indexed by datetime.date objects, so the
# irradiation series is re-indexed with datetime64 labels to make the days match.
irradiation_by_day = pd.Series(
    daily_irradiation.to_numpy(),
    index=pd.to_datetime(daily_irradiation.index),
    name="IRRADIATION",
)
sns.scatterplot(x = irradiation_by_day, y = inverter_daily_power["1BY6WEcLGh8j5v7"])

The above plot is only an estimate because we have issues with data taking over the course of days (i.e., some intervals don't have data). We need to take the intersection of timestamps of the inverter output data and the irradiation data to get a better estimate

 #### Irradiation vs Different temperature metrics 

#### Max and Min temperature vs data taking period

In [None]:
# Daily extremes of the ambient temperature.
ambient_by_day = weather_data.groupby("date")["AMBIENT_TEMPERATURE"]
max_temps = ambient_by_day.max()
min_temps = ambient_by_day.min()

In [None]:
# Daily maximum and minimum ambient temperature on a shared axis.
plt.figure(figsize=(12, 6))
for series, label in ((max_temps, "Maximum Temperature"), (min_temps, "Minimum Temperature")):
    series.plot(label=label)
plt.legend()

In [None]:
# Daily ambient-temperature range and daily irradiation total.
# Columns are selected before aggregating so that the non-numeric columns
# (strings, datetimes, time objects) are never summed — doing so raises in
# pandas >= 2.0 — and so that only the needed column is reduced.
ambient_by_day = weather_data.groupby("date")["AMBIENT_TEMPERATURE"]
max_temps = ambient_by_day.max()
min_temps = ambient_by_day.min()
diff_temps = max_temps - min_temps
daily_irradiation = weather_data.groupby("date")["IRRADIATION"].sum()

Irradiation vs (Maximum - Minimum) temperature

In [None]:
# Daily irradiation against the daily temperature difference.
# x and y are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and rejects further positional arguments.
sns.scatterplot(x = daily_irradiation, y = diff_temps)

The first hint of nonzero radiation is when the sun appears in Line of Sight of the solar panels. The temperature at this point is our "baseline" temperature before any solar irradiation, and the maximum temperature is, well, the maximum. The difference between these two temperatures should tell us a measure of irradiation

In [None]:
# Per-day minimum ambient temperature among readings taken before 07:00
# that already show non-zero irradiation.
early_morning = weather_data["Time"] < pd.to_datetime("07:00").time()
has_irradiation = weather_data["IRRADIATION"] > 0
temp_before_sunrise = (
    weather_data.loc[early_morning & has_irradiation]
    .groupby("date")["AMBIENT_TEMPERATURE"]
    .min()
)

In [None]:
# Rise from the early-morning baseline to the daily maximum (aligned by date).
diff_temps = max_temps.sub(temp_before_sunrise)

In [None]:
# Daily irradiation against the temperature rise computed above.
# x and y are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and rejects further positional arguments.
sns.scatterplot(x = daily_irradiation, y = diff_temps)

The above temperature metric doesn't seem to be a very good indicator of irradiation. Temperature and irradiation seem to have a complicated relationship that cannot be modelled linearly.

Since not all inverters have readings for all intervals of time, we "impute" entries for the inverters (and subsequently calculate the total power produced in any instance of time) by using the average DC/AC power produced in that time period as the power produced by an inverter with a missing reading. This implies the total power produced in an interval of time is just the average for that time interval multiplied by the total number of inverters. 

Calculate the average DC and AC power produced for a given timestamp

In [None]:
# Mean DC/AC power across the reporting inverters at each timestamp.
# The two power columns are selected before averaging; taking the mean of the
# whole frame would hit the string/time columns and raise in pandas >= 2.0.
average_power = gen_data.groupby(level="DATE_TIME")[["DC_POWER","AC_POWER"]].mean()

In [None]:
# Plant-wide power per timestamp = per-inverter average x number of inverters.
# Inverters are identified by SOURCE_KEY; the previous PLANT_ID count is the
# number of plants in the file, not the number of inverters, so it did not
# scale the average to the plant total as the imputation scheme requires.
total_power = average_power * gen_data["SOURCE_KEY"].nunique()

In [None]:
# Daily plant-wide DC/AC totals on a log scale.
total_power["Date"] = total_power.index.date
# DataFrame.plot() opens its own figure when no axes are supplied, which left
# the figure created here empty; plotting on explicit axes avoids the stray
# blank figure and makes the log scale apply to the intended axes.
fig, ax = plt.subplots()
total_power.groupby("Date")[["DC_POWER", "AC_POWER"]].sum().plot(ax=ax)
ax.set_yscale("log")

In [None]:
# Regression plot: daily plant-wide DC total against daily irradiation total.
sns.regplot(
    x=weather_data.groupby("Date")["IRRADIATION"].sum(),
    y=total_power.groupby("Date")["DC_POWER"].sum(),
)

In [None]:
from sklearn.linear_model import LinearRegression

# Linear fit of the daily DC total on the daily irradiation total;
# the cell displays the fitted (intercept, slope).
irradiation_per_day = weather_data.groupby("Date")["IRRADIATION"].sum()
dc_per_day = total_power.groupby("Date")["DC_POWER"].sum()
model = LinearRegression()
model.fit(irradiation_per_day.to_numpy().reshape(-1, 1), dc_per_day)
model.intercept_, model.coef_

In [None]:
# Regression plot: daily AC total against daily DC total for plant 1.
sns.regplot(
    data=total_power.groupby("Date")[["DC_POWER", "AC_POWER"]].sum(),
    x="DC_POWER",
    y="AC_POWER",
)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear fit of the daily AC total on the daily DC total;
# the cell displays the fitted (intercept, slope).
daily_power = total_power.groupby("Date")[["DC_POWER", "AC_POWER"]].sum()
a = LinearRegression()
a.fit(daily_power["DC_POWER"].to_numpy().reshape(-1, 1), daily_power["AC_POWER"])
a.intercept_, a.coef_

The fitted slope of daily AC power against daily DC power gives the first plant a DC-to-AC conversion efficiency of 9.74%.

### Repeating these with plant 2 and cross-checking the plants

In [None]:
# Load the Plant 2 generation data, indexed by parsed timestamp.
gen_data_2 = pd.read_csv(
    "../input/solar-power-generation-data/Plant_2_Generation_Data.csv",
    index_col="DATE_TIME",
    parse_dates=["DATE_TIME"],
)

In [None]:
# Calendar-day and time-of-day columns (datetime.date / datetime.time objects)
# taken from the timestamp index.
gen_data_2["Date"] = gen_data_2.index.date
gen_data_2["Time"] = gen_data_2.index.time

In [None]:
# Non-null row counts per Plant 2 inverter, for every column.
gen_data_2.groupby(by="SOURCE_KEY").count()

In [None]:
# One boolean per column (in column order): True if that column has any null.
gen_data_2.isnull().any().tolist()

In [None]:
# One log-scaled panel per Plant 2 inverter showing DC and AC power on 2020-05-15.
inverters = gen_data_2["SOURCE_KEY"].unique()
fig = plt.figure(figsize = (25,16))
# Select the day from the DatetimeIndex itself. The "Date" column of this frame
# holds datetime.date objects, and comparing those with a Timestamp is always
# False in pandas >= 2.0, which left every panel empty.
on_day = pd.Series(
    gen_data_2.index.normalize() == pd.Timestamp("2020-05-15"),
    index = gen_data_2.index,
)
for i,inverter in enumerate(inverters,1):
    plt.subplot(6,4,i)
    plt.yscale("log")
    readings = gen_data_2.loc[on_day & (gen_data_2["SOURCE_KEY"] == inverter)]
    readings["DC_POWER"].plot(label = inverter + " DC")
    readings["AC_POWER"].plot(label = inverter + " AC")
    plt.legend()

In [None]:
# Display the inverter IDs currently held in `inverters`.
inverters

In [None]:
# Mean DC/AC power across reporting inverters at each timestamp, scaled by the
# number of inverters to estimate the plant-wide power.
# The power columns are selected before averaging; taking the mean of the whole
# frame would hit the string/date/time columns and raise in pandas >= 2.0.
average_power_2 = gen_data_2.groupby(level="DATE_TIME")[["DC_POWER","AC_POWER"]].mean()
total_power_2 = average_power_2 * gen_data_2["SOURCE_KEY"].nunique()
# Calendar day (datetime.date) of each timestamp, used for the daily totals.
total_power_2["Date"] = total_power_2.index.date

In [None]:
# Load the Plant 2 weather-sensor readings and derive day / time-of-day columns
# (datetime.date / datetime.time objects) from the timestamp index.
weather_data_2 = pd.read_csv(
    "../input/solar-power-generation-data/Plant_2_Weather_Sensor_Data.csv",
    index_col="DATE_TIME",
    parse_dates=True,
)
weather_data_2["Date"] = weather_data_2.index.date
weather_data_2["Time"] = weather_data_2.index.time

In [None]:
# Regression plot: daily Plant 2 DC total against daily irradiation total.
sns.regplot(
    x=weather_data_2.groupby("Date")["IRRADIATION"].sum(),
    y=total_power_2.groupby("Date")["DC_POWER"].sum(),
)

The fit is not as good as the first power plant. Maybe the nonlinearities play a bigger role here, but hey, we have 100% efficiency!

In [None]:
# Regression plot: daily AC total against daily DC total for plant 2.
sns.regplot(
    data=total_power_2.groupby("Date")[["DC_POWER", "AC_POWER"]].sum(),
    x="DC_POWER",
    y="AC_POWER",
)

In [None]:
# Linear fit of the daily AC total on the daily DC total for plant 2;
# the cell displays the fitted (slope, intercept).
daily_power_2 = total_power_2.groupby("Date")[["DC_POWER", "AC_POWER"]].sum()
model_2 = LinearRegression()
model_2.fit(daily_power_2["DC_POWER"].to_numpy().reshape(-1, 1), y=daily_power_2["AC_POWER"])
model_2.coef_, model_2.intercept_

97% efficiency! (too good to be true!), compared to 10% in case of the other plant