In [None]:
# data preprocessing
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display
import gc

# Dataset

In [None]:
def get_df(paths, **kwargs):

    ''' Read several CSV files and stack them into a single dataframe.

    Parameters
    ----------
    paths : iterable of str or path-like
        Locations of the CSV files; they are read in the given order.
    **kwargs
        Forwarded unchanged to every ``pd.read_csv`` call.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, in order. When more than one file is read the
        result gets a fresh 0..n-1 index; a single file is returned exactly
        as ``pd.read_csv`` produced it.

    Raises
    ------
    ValueError
        If ``paths`` is empty.
    '''

    # Materialize first so generators and other one-shot iterables work too.
    paths = list(paths)
    if len(paths) == 0:
        raise ValueError("there must be at least one valid path")

    frames = [pd.read_csv(path, **kwargs) for path in paths]
    if len(frames) == 1:
        # Nothing to stack: hand the frame back untouched (index included).
        return frames[0]

    # ``DataFrame.append`` was removed in pandas 2.0 and re-copied the growing
    # frame on every iteration; one ``pd.concat`` stacks everything in one go.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Power generation readings of plants 1 and 2, stacked into one frame.
plant = get_df(
    [
        "../input/solar-power-generation-data/Plant_{}_Generation_Data.csv".format(plant_no)
        for plant_no in (1, 2)
    ],
    parse_dates=["DATE_TIME"],
)
plant.head()

In [None]:
# Weather sensor readings of plants 1 and 2, stacked into one frame.
weather = get_df(
    [
        "../input/solar-power-generation-data/Plant_{}_Weather_Sensor_Data.csv".format(plant_no)
        for plant_no in (1, 2)
    ],
    parse_dates=["DATE_TIME"],
)
weather.head()

Weather and power generation data can be merged into a single dataframe.

In [None]:
# Pair every generation row with the weather reading taken at the same plant
# and timestamp; columns present in both frames get a suffix naming their origin.
df = pd.merge(
    plant,
    weather,
    on=["DATE_TIME", "PLANT_ID"],
    suffixes=("_GENERATION", "_WEATHER"),
)
df.head()

In [None]:
# ``datetime_is_numeric`` was removed in pandas 2.0, where datetime columns are
# summarised numerically by default; pass the keyword only on pandas 1.x so
# this cell runs on both.
pandas_major = int(pd.__version__.split(".")[0])
describe_kwargs = {"datetime_is_numeric": True} if pandas_major < 2 else {}
df.describe(**describe_kwargs)

In [None]:
# Drop the two source frames and run a collection pass to free their memory.
del plant, weather
_ = gc.collect()

In [None]:
# Number of missing values in each column.
df.isna().sum()

The dataset contains no missing values, so it can be used as it is.

In [None]:
# Order rows by plant, then chronologically, and renumber the index 0..n-1.
df = df.sort_values(by=["PLANT_ID", "DATE_TIME"], ignore_index=True)

# Data visualization

In [None]:
def plot_date(df, date):

    ''' Plot power and weather curves of every plant for one calendar day.

    Draws one row of line plots (DC power, AC power, ambient temperature,
    module temperature, irradiation) against ``DATE_TIME``, one line per
    ``PLANT_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PLANT_ID``, a datetime ``DATE_TIME`` column and the
        five plotted columns.
    date : str or datetime-like
        Day to plot; anything ``pd.Timestamp`` accepts. Only its calendar
        date is used.

    Returns
    -------
    None
        The figure is created as a side effect.
    '''

    # Fix the plant -> colour mapping on the full frame: a plant keeps the
    # same colour on every date, and the palette always has one entry per
    # plant no matter how many of them have rows on the selected day.
    plants = sorted(df.PLANT_ID.unique())
    palette = {plant: "C{}".format(i) for i, plant in enumerate(plants)}

    day = pd.Timestamp(date).date()
    # ``.dt.date`` goes through the datetime accessor instead of calling a
    # Python lambda on every row.
    df = df.loc[df.DATE_TIME.dt.date == day]
    y_list = ["DC_POWER", "AC_POWER", "AMBIENT_TEMPERATURE", "MODULE_TEMPERATURE", "IRRADIATION"]
    x = "DATE_TIME"

    fig,ax = plt.subplots(1, len(y_list), figsize=(7*len(y_list),5))
    # Label the figure so plots of different days can be told apart.
    fig.suptitle(str(day))

    for i,y in enumerate(y_list):
        sns.lineplot(data=df, x=x, y=y, hue="PLANT_ID", palette=palette, ax=ax[i])
        ax[i].set_xlabel("")
        ax[i].tick_params(axis="x", rotation=45)

In [None]:
# One figure per sample day.
for date in ("2020-05-15", "2020-06-17"):
    plot_date(df, date)

Plant `4136001` seems to produce less power than `4135001` looking at `DC_POWER` production.
Other variables do not seem to differ a lot between the plants.

In [None]:
# Correlate numeric (and boolean) columns only: since pandas 2.0 ``df.corr()``
# no longer drops string/datetime columns silently and raises on them instead.
# Compute the matrix once and reuse it for both the values and the mask.
corr = df.select_dtypes(include=["number", "bool"]).corr()
# Hide the strict upper triangle: it mirrors the lower one.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool), k=1)

fig,ax = plt.subplots(figsize=(8,7))
ax = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, ax=ax, fmt="1.2f", cmap="bwr", mask=upper_triangle)

Correlation study shows some information:
- second plant produces less DC power;
- temperature increases with irradiation;
- power production increases with irradiation (and so, collaterally, with temperature);
- power production is more strongly related to the module temperature than to the ambient temperature.