In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Descriptive analytics
### Task Details
 *Explore the data and try to answer questions like:*
 - What is the mean value of daily yield?
 - What is the total irradiation per day?
 - What is the max ambient and module temperature?
 - How many inverters are there for each plant?
 - What is the maximum/minimum amount of DC/AC Power generated in a time interval/day?
 - Which inverter (source_key) has produced maximum DC/AC power?
 - Rank the inverters based on the DC/AC power they produce
 - Is there any missing data?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the four CSV files in one pass; the order of the paths fixes which
# frame each file lands in (df1/df3 = generation, df2/df4 = weather sensor).
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/solar-power-generation-data/Plant_1_Generation_Data.csv',
        '/kaggle/input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv',
        '/kaggle/input/solar-power-generation-data/Plant_2_Generation_Data.csv',
        '/kaggle/input/solar-power-generation-data/Plant_2_Weather_Sensor_Data.csv',
    )
)
print('Load Done')

In [None]:
# Structure and summary statistics of the plant 1 generation frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() adds a stray "None" line to the output.
df1.info()
print('Describe\n', df1.describe())

In [None]:
# Structure and summary statistics of the plant 1 weather-sensor frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() adds a stray "None" line to the output.
df2.info()
print('Describe\n', df2.describe())

In [None]:
# Structure and summary statistics of the plant 2 generation frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() adds a stray "None" line to the output.
df3.info()
print('Describe\n', df3.describe())

In [None]:
# Structure and summary statistics of the plant 2 weather-sensor frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() adds a stray "None" line to the output.
df4.info()
print('Describe\n', df4.describe())

In [None]:
# Build the working frames: a copy of each raw frame whose DATE_TIME column is
# parsed into pandas datetimes. The raw df1..df4 frames are left untouched.
#
# pd.to_datetime replaces astype({'DATE_TIME': np.datetime64}): that cast
# targets a unit-less numpy dtype, which pandas 2.x rejects with a TypeError.
# NOTE(review): the timestamp layout of each CSV is not visible from this
# notebook; if any file is day-first, pass an explicit format= (or
# dayfirst=True) here so days and months cannot be swapped - TODO confirm.
p1_generation_df = df1.assign(DATE_TIME=pd.to_datetime(df1['DATE_TIME']))
p1_generation_df.info()
p2_generation_df = df3.assign(DATE_TIME=pd.to_datetime(df3['DATE_TIME']))
p2_generation_df.info()
p1_sensor_df = df2.assign(DATE_TIME=pd.to_datetime(df2['DATE_TIME']))
p1_sensor_df.info()
p2_sensor_df = df4.assign(DATE_TIME=pd.to_datetime(df4['DATE_TIME']))
p2_sensor_df.info()

### What is the mean value of daily yield?

In [None]:
# Mean of the DAILY_YIELD column over all rows, reported per plant.
p1_mean_daily_yield = p1_generation_df['DAILY_YIELD'].mean()
p2_mean_daily_yield = p2_generation_df['DAILY_YIELD'].mean()
print('Daily yield Generator 1 mean value\n', p1_mean_daily_yield)
print('Daily yield Generator 2 mean value\n', p2_mean_daily_yield)

### What is the total irradiation per day?

In [None]:
import datetime
# Add a calendar-date column to both sensor frames so readings can be
# grouped per day.
for sensor_df in (p1_sensor_df, p2_sensor_df):
    sensor_df['DATE'] = pd.to_datetime(sensor_df['DATE_TIME']).dt.date

In [None]:
# Sum the irradiation readings of each calendar day and draw them as a line.
p1_daily_irradiation = p1_sensor_df.groupby('DATE')['IRRADIATION'].sum()
fig, ax = plt.subplots(figsize=(16, 9))
ax.plot(p1_daily_irradiation)
ax.set_title('Irradiation per day Sensor 1')
ax.set_xlabel('Date')
ax.set_ylabel('Irradiation')
plt.show()

In [None]:
# Sum the irradiation readings of each calendar day and draw them as a line.
p2_daily_irradiation = p2_sensor_df.groupby('DATE')['IRRADIATION'].sum()
fig, ax = plt.subplots(figsize=(16, 9))
ax.plot(p2_daily_irradiation)
ax.set_title('Irradiation per day Sensor 2')
ax.set_xlabel('Date')
ax.set_ylabel('Irradiation')
plt.show()

### What is the max ambient and module temperature?

In [None]:
# Highest ambient and module temperature recorded by the plant 1 sensor.
p1_max_ambient = p1_sensor_df['AMBIENT_TEMPERATURE'].max()
p1_max_module = p1_sensor_df['MODULE_TEMPERATURE'].max()
print('Max ambient temperature Sensor 1\n', p1_max_ambient)
print('Max module temperature Sensor 1\n', p1_max_module)

In [None]:
# Highest ambient and module temperature recorded by the plant 2 sensor.
p2_max_ambient = p2_sensor_df['AMBIENT_TEMPERATURE'].max()
p2_max_module = p2_sensor_df['MODULE_TEMPERATURE'].max()
print('Max ambient temperature Sensor 2\n', p2_max_ambient)
print('Max module temperature Sensor 2\n', p2_max_module)

### How many inverters are there for each plant?

In [None]:
# Each distinct SOURCE_KEY is counted as one inverter.
p1_inverter_count = p1_generation_df['SOURCE_KEY'].nunique()
p2_inverter_count = p2_generation_df['SOURCE_KEY'].nunique()
print('Plant 1 inverters\n', p1_inverter_count)
print('Plant 2 inverters\n', p2_inverter_count)

### What is the maximum/minimum amount of DC/AC Power generated in a time interval/day?

In [None]:
# Split every generation timestamp into a calendar-date column and a
# time-of-day column, used by the per-day / per-interval aggregations.
for generation_df in (p1_generation_df, p2_generation_df):
    timestamps = pd.to_datetime(generation_df['DATE_TIME'])
    generation_df['DATE'] = timestamps.dt.date
    generation_df['TIME'] = timestamps.dt.time

In [None]:
# Total DC_POWER per calendar day (summed over every row of that day),
# computed once per plant; the largest and smallest daily totals are reported.
p1_daily_dc = p1_generation_df.groupby('DATE')['DC_POWER'].sum()
p2_daily_dc = p2_generation_df.groupby('DATE')['DC_POWER'].sum()
print('Maximum amount of DC POWER in a day Gen 1\n', p1_daily_dc.max())
print('Minimum amount of DC POWER in a day Gen 1\n', p1_daily_dc.min())
print('Maximum amount of DC POWER in a day Gen 2\n', p2_daily_dc.max())
print('Minimum amount of DC POWER in a day Gen 2\n', p2_daily_dc.min())

In [None]:
# Total AC_POWER per time-of-day slot, computed once per plant; the largest
# and smallest slot totals are reported.
# NOTE(review): grouping on TIME pools the same clock time across every date
# in the data; if "time interval" means one individual timestamp, group on
# DATE_TIME instead - confirm the intended meaning.
p1_slot_ac = p1_generation_df.groupby('TIME')['AC_POWER'].sum()
p2_slot_ac = p2_generation_df.groupby('TIME')['AC_POWER'].sum()
print('Maximum amount of AC POWER in a time interval Gen 1\n', p1_slot_ac.max())
print('Minimum amount of AC POWER in a time interval Gen 1\n', p1_slot_ac.min())
print('Maximum amount of AC POWER in a time interval Gen 2\n', p2_slot_ac.max())
print('Minimum amount of AC POWER in a time interval Gen 2\n', p2_slot_ac.min())

### Which inverter (source_key) has produced maximum DC/AC power?


In [None]:
# Inverter(s) behind the highest single DC_POWER reading and the highest
# single AC_POWER reading of plant 1.
# The two columns are looked up independently: their maxima need not fall on
# the same row, and a mask requiring both at once comes back empty whenever
# they do not. Ties are kept - every inverter that reached the peak is listed.
# NOTE(review): this answers "highest single reading"; if the question means
# highest total output, compare groupby('SOURCE_KEY') sums instead.
p1_peak_inverters = {
    column: p1_generation_df.loc[
        p1_generation_df[column] == p1_generation_df[column].max(), 'SOURCE_KEY'
    ].unique().tolist()
    for column in ('DC_POWER', 'AC_POWER')
}
print('Inverter that produce maximum DC/AC Power Generator 1\n', p1_peak_inverters)

In [None]:
# Inverter(s) behind the highest single DC_POWER reading and the highest
# single AC_POWER reading of plant 2.
# The two columns are looked up independently: their maxima need not fall on
# the same row, and a mask requiring both at once comes back empty whenever
# they do not. Ties are kept - every inverter that reached the peak is listed.
# NOTE(review): this answers "highest single reading"; if the question means
# highest total output, compare groupby('SOURCE_KEY') sums instead.
p2_peak_inverters = {
    column: p2_generation_df.loc[
        p2_generation_df[column] == p2_generation_df[column].max(), 'SOURCE_KEY'
    ].unique().tolist()
    for column in ('DC_POWER', 'AC_POWER')
}
print('Inverter that produce maximum DC/AC Power Generator 2\n', p2_peak_inverters)

### Rank the inverters based on the DC/AC power they produce

In [None]:
# Rank plant 1's inverters by the power they produced.
# Power is totalled per SOURCE_KEY, then ordered from highest to lowest
# AC_POWER (DC_POWER breaks ties); rank 1 is the top producer.
# The previous version assigned `sort_values(...).index + 1` to RANK, which
# stores shifted row labels in sorted order - not a rank, and not per inverter.
p1_inverter_rank = (
    p1_generation_df
    .groupby('SOURCE_KEY')[['AC_POWER', 'DC_POWER']]
    .sum()
    .sort_values(['AC_POWER', 'DC_POWER'], ascending=False)
)
p1_inverter_rank['RANK'] = range(1, len(p1_inverter_rank) + 1)

# Keep writing a RANK column on the row-level frame, as this cell always did;
# every row now carries the rank of the inverter it belongs to.
p1_generation_df['RANK'] = p1_generation_df['SOURCE_KEY'].map(p1_inverter_rank['RANK'])
print('Inverters rank Generation 1\n', p1_inverter_rank)

In [None]:
# Rank plant 2's inverters by the power they produced.
# Power is totalled per SOURCE_KEY, then ordered from highest to lowest
# AC_POWER (DC_POWER breaks ties); rank 1 is the top producer.
# The previous version assigned `sort_values(...).index + 1` to RANK, which
# stores shifted row labels in sorted order - not a rank, and not per inverter.
p2_inverter_rank = (
    p2_generation_df
    .groupby('SOURCE_KEY')[['AC_POWER', 'DC_POWER']]
    .sum()
    .sort_values(['AC_POWER', 'DC_POWER'], ascending=False)
)
p2_inverter_rank['RANK'] = range(1, len(p2_inverter_rank) + 1)

# Keep writing a RANK column on the row-level frame, as this cell always did;
# every row now carries the rank of the inverter it belongs to.
p2_generation_df['RANK'] = p2_generation_df['SOURCE_KEY'].map(p2_inverter_rank['RANK'])
print('Inverters rank Generation 2\n', p2_inverter_rank)

### Is there any missing data?

There does not appear to be any missing data: the `info()` output shows only non-null values in every column, and there are no duplicates. The data is sufficient to answer the questions in this task.