In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields bare file names; join with the directory to get a usable path.
        print(os.path.join(dirname, filename))

    
import matplotlib.pyplot as plt
import seaborn as sns
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the generation and weather-sensor CSVs for both plants.
plant1_generation = pd.read_csv("/kaggle/input/solar-power-generation-data/Plant_1_Generation_Data.csv")
plant1_sensor = pd.read_csv("/kaggle/input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv")
plant2_generation = pd.read_csv("/kaggle/input/solar-power-generation-data/Plant_2_Generation_Data.csv")
plant2_sensor = pd.read_csv("/kaggle/input/solar-power-generation-data/Plant_2_Weather_Sensor_Data.csv")

# Report the number of missing values per column for each of the four tables.
# Each report reads the table named in its heading.
print("Missing Generation Values: Plant 1 \n" + '-' * 80)
print(plant1_generation.isnull().sum())

print('\n')

print("Missing Weather Sensor Values: Plant 1 \n" + '-' * 80)
print(plant1_sensor.isnull().sum())
print('\n')

print("Missing Generation Values: Plant 2 \n" + '-' * 80)
print(plant2_generation.isnull().sum())

print('\n')

print("Missing Weather Sensor Values: Plant 2 \n" + '-' * 80)
print(plant2_sensor.isnull().sum())


In [None]:

def process_time(data):
    """
    Parse the DATE_TIME column and derive DATE and HOUR features from it.

    The original author notes that observations are recorded at 15 minute
    intervals; this function only converts the timestamps and adds the two
    derived columns used in further analysis.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'DATE_TIME' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` where 'DATE_TIME' is a datetime column, 'DATE' holds
        the calendar date of each row and 'HOUR' the hour of day.
        The frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame is not changed behind its back;
    # this keeps the raw tables intact and the cell safe to re-run.
    data = data.copy()

    # NOTE(review): no explicit format/dayfirst is passed, so parsing relies on
    # pandas' inference. If any CSV stores day-first dates, ambiguous values
    # (e.g. 01-06-2020) could be misread - confirm against the raw files.
    data['DATE_TIME'] = pd.to_datetime(data['DATE_TIME'])

    data['DATE'] = data['DATE_TIME'].dt.date
    data['HOUR'] = data['DATE_TIME'].dt.hour

    return data

# Parse timestamps and add the DATE/HOUR columns on all four tables,
# in the order: plant 1 generation, plant 1 sensor, plant 2 generation, plant 2 sensor.
plant1_generation, plant1_sensor = map(process_time, (plant1_generation, plant1_sensor))
plant2_generation, plant2_sensor = map(process_time, (plant2_generation, plant2_sensor))

In [None]:
# Which inverter/array performs best?
def ranking_of_mean_inverter_daily_yield(data, plant):
    """
    Rank inverters by the mean of their end-of-day DAILY_YIELD readings.

    Parameters
    ----------
    data : pandas.DataFrame
        Generation table with 'SOURCE_KEY', 'DATE_TIME', 'DATE' and
        'DAILY_YIELD' columns.
    plant : str
        Plant label. Not used in the computation; kept so existing calls
        keep working.

    Returns
    -------
    pandas.DataFrame
        One row per inverter with columns 'SOURCE_KEY' and 'DAILY_YIELD_MEAN',
        sorted from highest to lowest mean, with a fresh 0..n-1 index
        (row 0 is the best-performing inverter).
    """
    # Keep a single row per (inverter, date): after sorting by time, the last
    # duplicate is the latest reading of that day.
    end_of_day = (
        data
        .sort_values(['SOURCE_KEY', 'DATE_TIME', 'DAILY_YIELD'])
        .drop_duplicates(['SOURCE_KEY', 'DATE'], keep='last')
    )

    # Average those per-day readings for every inverter in one grouped pass
    # instead of growing a frame row by row (DataFrame.append no longer exists
    # in pandas 2.x and was quadratic anyway).
    inverter_output_means = (
        end_of_day
        .groupby('SOURCE_KEY')['DAILY_YIELD']
        .mean()
        .rename('DAILY_YIELD_MEAN')
        .reset_index()
    )

    # A stable sort keeps tied inverters in SOURCE_KEY order, so the ranking
    # is deterministic.
    inverter_output_means = (
        inverter_output_means
        .sort_values('DAILY_YIELD_MEAN', ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )

    return inverter_output_means
    





In [None]:
# Plant 1: build the ranking table, then show it under its heading.
plant1_inverter_ranking = ranking_of_mean_inverter_daily_yield(plant1_generation, 'Plant 1')
print("Plant1: Inverter Ranking")
print(plant1_inverter_ranking)

# Three dashed rules separate the two plants' tables in the output.
print(('-' * 80 + '\n') * 3)

# Plant 2: same ranking for the second plant.
plant2_inverter_ranking = ranking_of_mean_inverter_daily_yield(plant2_generation, 'Plant2')
print('Plant2: Inverter Ranking')
print(plant2_inverter_ranking)

In [None]:
# Get Plot of Average Day 
def avg_yield_by_hour_for_inverter(data, inverter):
    """
    Get the average AC and DC output of one inverter for every hour of the day.

    Parameters
    ----------
    data : pandas.DataFrame
        Generation table with 'SOURCE_KEY', 'HOUR', 'AC_POWER' and 'DC_POWER'
        columns.
    inverter :
        The 'SOURCE_KEY' value of the inverter to summarise.

    Returns
    -------
    pandas.DataFrame
        24 rows, one per hour 0..23, with columns 'SOURCE_KEY',
        'HOUR_AVG_OUPTUT_AC', 'HOUR_AVG_OUTPUT_DC' and 'HOUR'. Hours with no
        readings for this inverter have NaN averages.
    """
    inverter_rows = data[data['SOURCE_KEY'] == inverter]

    # One grouped mean replaces the per-hour filtering loop. Reindexing over all
    # 24 hours keeps a row for every hour - including hour 23 - even when the
    # inverter has no readings in it.
    hours = list(range(24))
    hourly_means = (
        inverter_rows
        .groupby('HOUR')[['AC_POWER', 'DC_POWER']]
        .mean()
        .reindex(hours)
    )

    # NOTE: 'HOUR_AVG_OUPTUT_AC' is misspelled, but the plotting code looks the
    # column up by exactly this name, so it is kept as is.
    hourly_output = pd.DataFrame({
        'SOURCE_KEY': inverter,
        'HOUR_AVG_OUPTUT_AC': hourly_means['AC_POWER'].to_numpy(),
        'HOUR_AVG_OUTPUT_DC': hourly_means['DC_POWER'].to_numpy(),
        'HOUR': hours,
    })

    return hourly_output




def plot_inverter_daily_performance(plant=1, inverter_ranking=1):
    """
    Plot how one inverter at one plant performs over the hours of the day.

    Every AC_POWER / DC_POWER reading of the chosen inverter is drawn as a
    scatter point against its HOUR, with the per-hour averages overlaid as lines.

    plant: plant number, 1 or 2. Selects the module-level
        plant<N>_generation and plant<N>_inverter_ranking tables.
    inverter_ranking: 1-based rank of the inverter that you want to analyse,
        - example : inverter_ranking=1 -> best performing inverter at that plant

    Raises ValueError for an unknown plant or a rank below 1, and IndexError
    (from the positional lookup) for a rank past the last ranked inverter.
    """
    if plant == 1:
        data = plant1_generation
        ranking = plant1_inverter_ranking
    elif plant == 2:
        data = plant2_generation
        ranking = plant2_inverter_ranking
    else:
        raise ValueError('Enter a valid plant 1 or 2')

    # Ranks are 1-based. Without this guard a rank of 0 or below would wrap
    # around through negative positional indexing and silently plot an
    # inverter from the bottom of the ranking instead.
    if inverter_ranking < 1:
        raise ValueError('inverter_ranking must be 1 or greater')
    inverter = ranking['SOURCE_KEY'].iloc[inverter_ranking - 1]

    avg_for_hour = avg_yield_by_hour_for_inverter(data, inverter)
    all_data_for_inverter = data[data['SOURCE_KEY'] == inverter]
    inverter_count = len(data['SOURCE_KEY'].unique())

    # Draw on an explicit Axes rather than pyplot's implicit "current" one.
    fig, ax = plt.subplots(figsize=(10, 10))

    # Raw readings: AC in red, DC in blue.
    sns.scatterplot(x='HOUR', y='AC_POWER', data=all_data_for_inverter, color='red', ax=ax)
    sns.scatterplot(x='HOUR', y='DC_POWER', data=all_data_for_inverter, color='blue', ax=ax)

    # Hourly averages in the matching colours.
    sns.lineplot(x='HOUR', y='HOUR_AVG_OUPTUT_AC', data=avg_for_hour, color='red', ax=ax)
    sns.lineplot(x='HOUR', y='HOUR_AVG_OUTPUT_DC', data=avg_for_hour, color='blue', ax=ax)

    # Labels are handed to the artists already on the axes in order
    # (matplotlib behaviour when only `labels` is given).
    ax.legend(labels=["AC_POWER", "DC_POWER"])
    ax.set_ylabel("Power Output (Watts)")
    ax.set_title(f"Plant{plant} , Inverter Perfomance Rank: ({inverter_ranking}/{inverter_count}) // Performance Over a Day")
    
# Plot the rank-1 and rank-22 inverter of each plant, plant 1 first.
for plant_number, rank in ((1, 1), (1, 22), (2, 1), (2, 22)):
    plot_inverter_daily_performance(plant_number, rank)

We see that plant 1 outputs significantly less AC power than plant 2. This could have multiple explanations:

1. The inverters are broken and are operating with heinously low efficiency
2. The power is going to a DC load such as a battery array, and therefore the inverter doesn't need to convert that power