# This Kernel will focus on Data cleaning and Visualization

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the generation (_g) and weather-sensor (_w) tables of both plants.
# The files are read in the same order as the names on the left-hand side.
plant_1_g, plant_1_w, plant_2_g, plant_2_w = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/solar-power-generation-data/Plant_1_Generation_Data.csv",
        "../input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv",
        "../input/solar-power-generation-data/Plant_2_Generation_Data.csv",
        "../input/solar-power-generation-data/Plant_2_Weather_Sensor_Data.csv",
    )
)

## convert date column to pandas datetime

In [None]:
# The Plant 1 generation table stores DATE_TIME as "DD-MM-YYYY HH:MM", while
# the other three tables use "YYYY-MM-DD HH:MM:SS". Parse each column with its
# own explicit format instead of first rearranging the characters by position.
plant_1_g['DATE_TIME'] = pd.to_datetime(plant_1_g['DATE_TIME'], format="%d-%m-%Y %H:%M")

plant_1_w['DATE_TIME'] = pd.to_datetime(plant_1_w['DATE_TIME'], format="%Y-%m-%d %H:%M:%S")
plant_2_g['DATE_TIME'] = pd.to_datetime(plant_2_g['DATE_TIME'], format="%Y-%m-%d %H:%M:%S")
plant_2_w['DATE_TIME'] = pd.to_datetime(plant_2_w['DATE_TIME'], format="%Y-%m-%d %H:%M:%S")

# Stack both plants into one table per data kind. DataFrame.append was removed
# in pandas 2.0; pd.concat is the supported replacement and, like append with
# its defaults, keeps every frame's original row index.
plant_g = pd.concat([plant_1_g, plant_2_g])
plant_w = pd.concat([plant_1_w, plant_2_w])

## Check for missing values

In [None]:
# Number of missing values per column: generation table first, then weather.
missing_generation = plant_g.isna().sum()
print(missing_generation, "\n")
missing_weather = plant_w.isna().sum()
print(missing_weather)

## Power outcome for every day


In [None]:
# Keep only the 23:45 rows of both tables.
# NOTE(review): presumably 23:45 is the last reading of a day, so DAILY_YIELD
# on that row is the day's total - confirm against the data.
last_slot = pd.to_datetime("23:45:00", format="%H:%M:%S").time()

is_last_slot_g = plant_g["DATE_TIME"].dt.time == last_slot
plant_g_day_yield = plant_g[is_last_slot_g]

is_last_slot_w = plant_w["DATE_TIME"].dt.time == last_slot
plant_w_day_yield = plant_w[is_last_slot_w]

# Attach the weather readings to the generation rows of the same plant and
# timestamp so the weather features can be evaluated against the yield.
plant_gw_day = pd.merge(
    plant_g_day_yield,
    plant_w_day_yield,
    how="inner",
    on=["DATE_TIME", "PLANT_ID"],
)

In [None]:
# Scatter plot with a fitted regression line: daily yield vs. ambient temperature.
plot_temp = sns.lmplot(
    x="AMBIENT_TEMPERATURE",
    y="DAILY_YIELD",
    data=plant_gw_day,
)
# Optional zoom on the temperature axis:
# plot_temp.set(xlim=(20, 30))

As shown above, a higher AMBIENT_TEMPERATURE usually goes along with more sunshine. This means that, even though we don't have any information about the luminosity, we can use the AMBIENT_TEMPERATURE to help us.

In [None]:
# Daily yield per day, coloured by ambient temperature.
# NOTE(review): .dt.day is the day of the month; if the data spans more than
# one month, days from different months share the same x position - confirm.
day_of_month = plant_gw_day["DATE_TIME"].dt.day
ax = sns.scatterplot(
    x=day_of_month,
    y="DAILY_YIELD",
    hue="AMBIENT_TEMPERATURE",
    data=plant_gw_day,
)
# Place the legend outside the axes, on the right-hand side.
ax.legend(ncol=1, bbox_to_anchor=(1.5, 0.78), loc='center right')
ax.set_xlabel("DAY")

plt.show()

In addition, the AMBIENT_TEMPERATURE distribution suggests that the days were getting longer during the period in which the data was collected.
Longer days should result in more daylight and more energy.
So the time of year is an important factor as well when predicting the energy output of a solar plant.

In [None]:
# One horizontal box per SOURCE_KEY showing the spread of its 23:45 DAILY_YIELD values.
fig, ax = plt.subplots(figsize=(12, 9))
sns.boxplot(
    x="DAILY_YIELD",
    y="SOURCE_KEY",
    data=plant_g_day_yield,
    ax=ax,
)

It is interesting to see that the solar panels all perform differently. Possible causes:
* Malfunctioning of the panel (panels that perform slightly worse than average)
* Objects between sun and panel (panels that perform worse than average)
* Disconnecting panels from the grid to balance out the energy grid (panels with outliers at yield == 0)

In [None]:
# Mean DC power per time of day for every Plant 1 source key
# (rows: time of day, columns: SOURCE_KEY). assign() returns a new frame,
# so plant_1_g itself is left untouched.
plant_1_g_error = (
    plant_1_g
    .assign(TIME=plant_1_g["DATE_TIME"].dt.time)
    .groupby(["TIME", "SOURCE_KEY"])["DC_POWER"]
    .mean()
    .unstack()
)

# Size the palette to the number of plotted columns: a fixed 12-colour palette
# cannot give every line its own colour once there are more than 12 source keys.
cmap = sns.color_palette("Spectral", n_colors=plant_1_g_error.shape[1])

fig, ax = plt.subplots(dpi=100)
plant_1_g_error.plot(ax=ax, color=cmap)
# Place the legend outside the axes, on the right-hand side.
ax.legend(loc='center right', bbox_to_anchor=(1.5, 0.78), ncol=1)
ax.set_ylabel("DC_POWER_AVERAGE")
plt.show()

There are two outliers that underperform in comparison to the other inverters. This could explain why some inverters generated 0 energy on some days: someone could have been repairing these inverters and turned parts of the power plant off. ([Read more about underperforming inverters here](https://www.kaggle.com/virosky/how-identify-underperforming-inverters))

# Findings

* With DATE_TIME and additional web scraping, it is possible to calculate the average length of daylight in India. This could be used to predict the daily yield.
* Generally, a higher AMBIENT_TEMPERATURE should mean more sunlight, which means higher energy generation.
* It remains hard to predict when solar panels are temporarily shut down on purpose (repair, or balancing out the power grid).
* Module temperature and irradiation don't have a strong correlation with the daily yield.