# 1 sample 

In [None]:
# app.py

import streamlit as st
import pandas as pd
from preprocessor import load_data, preprocess_generation_data, preprocess_weather_data
import matplotlib.pyplot as plt
import seaborn as sns

# App title and description
st.title("Solar Power Data Analysis and Visualization")
st.write("This dashboard provides insights into solar power generation and weather data across two plants.")

# Step 1: Load and preprocess data using the preprocessor functions
df_generation, df_weather = load_data()
df_generation = preprocess_generation_data(df_generation)
df_weather = preprocess_weather_data(df_weather)

# Sidebar for data selection and filters
st.sidebar.header("Filter Data")

# Choose dataset to visualize (Generation or Weather Data)
data_choice = st.sidebar.radio("Choose a dataset to visualize:", ("Generation Data", "Weather Data"))

# Date filter for both datasets
start_date = st.sidebar.date_input("Start date", value=pd.to_datetime("2020-05-15"))
end_date = st.sidebar.date_input("End date", value=pd.to_datetime("2020-06-15"))

# Convert the picked dates to timestamps once. The upper bound is the start
# of the day AFTER the selected end date and is applied exclusively, so every
# reading taken during the end date is kept. Comparing with
# `<= pd.to_datetime(end_date)` would stop at midnight and silently drop the
# rest of that day.
range_start = pd.to_datetime(start_date)
range_end = pd.to_datetime(end_date) + pd.Timedelta(days=1)

# Both datasets are filtered the same way on their DATE_TIME column; the
# radio widget only offers the two options handled here.
df_source = df_generation if data_choice == "Generation Data" else df_weather
in_range = (df_source['DATE_TIME'] >= range_start) & (df_source['DATE_TIME'] < range_end)
df_selected = df_source[in_range]

# Display filtered data
st.write(f"Showing data from {start_date} to {end_date}")
st.write(df_selected.head())

# Step 2: Summarize the dataset
st.subheader("Summary Statistics")
if data_choice == "Generation Data":
    # One line per power column: the column total followed by the kW label.
    for heading, column in (
        ("Total AC Power Generated", 'AC_POWER'),
        ("Total DC Power Generated", 'DC_POWER'),
    ):
        st.write(f"{heading}: {df_selected[column].sum()} kW")
elif data_choice == "Weather Data":
    # One line per weather column: the column mean followed by its unit label.
    for heading, column, unit in (
        ("Average Ambient Temperature", 'AMBIENT_TEMPERATURE', "°C"),
        ("Average Irradiation", 'IRRADIATION', "W/m²"),
    ):
        st.write(f"{heading}: {df_selected[column].mean()} {unit}")

# Step 3: Visualization
st.subheader("Visualizations")

# Every figure is drawn through its own Axes object (instead of pyplot's
# implicit "current axes") and closed right after Streamlit has rendered it.
# Streamlit re-runs this script on each widget interaction, and figures made
# with plt.subplots() stay registered in pyplot until they are closed, so
# leaving them open would accumulate figures for the life of the process.
if data_choice == "Generation Data":
    # Time series plot of AC Power over time
    st.write("AC Power Over Time")
    fig, ax = plt.subplots()
    ax.plot(df_selected['DATE_TIME'], df_selected['AC_POWER'], color='blue', label='AC Power')
    ax.tick_params(axis='x', rotation=45)
    ax.set_xlabel('Date')
    ax.set_ylabel('AC Power (kW)')
    st.pyplot(fig)
    plt.close(fig)

    # Correlation matrix between AC_POWER, DC_POWER, DAILY_YIELD, TOTAL_YIELD
    st.write("Correlation Matrix")
    corr = df_selected[['AC_POWER', 'DC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD']].corr()
    fig, ax = plt.subplots()
    sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

elif data_choice == "Weather Data":
    # Scatter plot: Ambient Temperature vs. Irradiation
    st.write("Ambient Temperature vs. Irradiation")
    fig, ax = plt.subplots()
    ax.scatter(df_selected['AMBIENT_TEMPERATURE'], df_selected['IRRADIATION'], color='green')
    ax.set_xlabel('Ambient Temperature (°C)')
    ax.set_ylabel('Irradiation (W/m²)')
    st.pyplot(fig)
    plt.close(fig)

    # Distribution plot of Ambient Temperature
    st.write("Distribution of Ambient Temperature")
    fig, ax = plt.subplots()
    sns.histplot(df_selected['AMBIENT_TEMPERATURE'], bins=30, ax=ax)
    ax.set_xlabel('Ambient Temperature (°C)')
    st.pyplot(fig)
    plt.close(fig)

# Step 4: Offer the filtered rows as a CSV download from the sidebar
st.sidebar.subheader("Download Filtered Data")


# Wrapped in st.cache_data so Streamlit can cache the serialized result.
@st.cache_data
def convert_df_to_csv(df):
    """Serialize ``df`` to CSV text without the index and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')


csv_bytes = convert_df_to_csv(df_selected)
download_name = f"{data_choice}_filtered_data.csv"

st.sidebar.download_button(
    label="Download CSV",
    data=csv_bytes,
    file_name=download_name,
    mime='text/csv',
)


In [None]:
# preprocessor
import pandas as pd

# Function to load both generation and weather data
def load_data():
    """Read the generation and weather CSV files of both plants.

    The files are looked up by relative name in the current working
    directory. For each kind of data the Plant 1 rows come first, followed
    by the Plant 2 rows, under a fresh 0..n-1 index.

    Returns:
        tuple: ``(df_generation, df_weather)``.
    """
    generation_files = ('Plant_1_Generation_Data.csv', 'Plant_2_Generation_Data.csv')
    weather_files = ('Plant_1_Weather_Sensor_Data.csv', 'Plant_2_Weather_Sensor_Data.csv')

    # Read every file first (generation, then weather), then stack row-wise.
    generation_frames = [pd.read_csv(name) for name in generation_files]
    weather_frames = [pd.read_csv(name) for name in weather_files]

    stacked_generation = pd.concat(generation_frames, axis=0, ignore_index=True)
    stacked_weather = pd.concat(weather_frames, axis=0, ignore_index=True)
    return stacked_generation, stacked_weather

# Function to preprocess the generation data
def preprocess_generation_data(df):
    """Parse the ``DATE_TIME`` column of the generation data into datetimes.

    The frame handed in is typically the concatenation of several CSV files,
    which need not share one timestamp layout. Each value is therefore parsed
    on its own (``format='mixed'``): with a single inferred format, every row
    written in a different layout would be coerced to ``NaT``.

    Args:
        df: DataFrame with a ``DATE_TIME`` column of strings or datetimes.

    Returns:
        The same DataFrame object, with ``DATE_TIME`` converted in place.
        Values that cannot be parsed become ``NaT``.
    """
    # dayfirst=True only affects ambiguous non-ISO strings such as
    # "01-06-2020" (read as 1 June); it matches preprocess_weather_data.
    # NOTE(review): assumes non-ISO timestamps are day-first — confirm
    # against the source CSV files.
    df['DATE_TIME'] = pd.to_datetime(
        df['DATE_TIME'], format='mixed', dayfirst=True, errors='coerce'
    )
    return df

# Function to preprocess the weather data
def preprocess_weather_data(df):
    """Parse the ``DATE_TIME`` column of the weather data into datetimes.

    Parsing happens in a single pass with ``format='mixed'`` so that rows
    written in different timestamp layouts are each parsed on their own.
    Running a single-format parse first and a mixed parse afterwards cannot
    recover anything: the first pass has already coerced every non-matching
    row to ``NaT``.

    Args:
        df: DataFrame with a ``DATE_TIME`` column of strings or datetimes.

    Returns:
        The same DataFrame object, with ``DATE_TIME`` converted in place.
        Values that cannot be parsed become ``NaT``.
    """
    # dayfirst=True only matters for ambiguous non-ISO strings such as
    # "01-06-2020", which are read as day-month-year.
    df['DATE_TIME'] = pd.to_datetime(
        df['DATE_TIME'], format='mixed', dayfirst=True, errors='coerce'
    )
    return df


# sample 2

In [None]:
# pre
import pandas as pd

def load_data():
    """
    Read the generation and weather CSV files of both plants.

    Returns:
        df_generation: Plant 1 rows followed by Plant 2 rows of generation data.
        df_weather: Plant 1 rows followed by Plant 2 rows of weather sensor data.
        If any of the files is missing, the error is printed and
        (None, None) is returned instead.
    """
    generation_files = ('Plant_1_Generation_Data.csv', 'Plant_2_Generation_Data.csv')
    weather_files = ('Plant_1_Weather_Sensor_Data.csv', 'Plant_2_Weather_Sensor_Data.csv')

    try:
        generation_parts = [pd.read_csv(name) for name in generation_files]
        weather_parts = [pd.read_csv(name) for name in weather_files]
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None, None

    # Stack the per-plant frames row-wise under a fresh 0..n-1 index.
    stacked_generation = pd.concat(generation_parts, axis=0, ignore_index=True)
    stacked_weather = pd.concat(weather_parts, axis=0, ignore_index=True)
    return stacked_generation, stacked_weather


def preprocess_data(df_generation, df_weather):
    """
    Merge and preprocess the generation and weather data.

    Steps:
        - Parse 'DATE_TIME' in both frames (each value parsed on its own, so
          files with different timestamp layouts can be combined) and discard
          rows whose timestamp cannot be parsed.
        - Inner-join generation and weather readings on 'DATE_TIME' and, when
          both frames carry it, on 'PLANT_ID' as well.
        - Fill missing values (forward fill, backward fill, then the column
          mean for 'IRRADIATION').
        - Add 'HOUR_OF_DAY', 'DAY_OF_WEEK' and 'MONTH'.
        - Add 'PERFORMANCE_RATIO' = AC_POWER / DC_POWER (NaN where DC_POWER is 0).
        - Drop 'SOURCE_KEY'.

    Args:
        df_generation: Generation data, or None if loading failed.
        df_weather: Weather sensor data, or None if loading failed.

    Returns:
        The merged DataFrame, or an empty DataFrame when either input is None.
        The input frames are not modified.
    """
    if df_generation is None or df_weather is None:
        return pd.DataFrame()  # Return empty DataFrame if there's an error in loading

    # Parse timestamps on copies so the caller's frames stay untouched.
    # NOTE(review): dayfirst=True assumes ambiguous non-ISO dates are
    # day-month-year — confirm against the source CSV files.
    generation = df_generation.assign(
        DATE_TIME=pd.to_datetime(
            df_generation['DATE_TIME'], format='mixed', dayfirst=True, errors='coerce'
        )
    )
    weather = df_weather.assign(
        DATE_TIME=pd.to_datetime(
            df_weather['DATE_TIME'], format='mixed', dayfirst=True, errors='coerce'
        )
    )

    # pandas matches null join keys with each other, so unparseable timestamps
    # (NaT) on both sides would be paired up; remove them before joining.
    generation = generation.dropna(subset=['DATE_TIME'])
    weather = weather.dropna(subset=['DATE_TIME'])

    # A SOURCE_KEY column on both sides would come out of the merge as
    # SOURCE_KEY_x / SOURCE_KEY_y and escape the drop at the end, so the
    # weather-side column is removed up front.
    weather = weather.drop(columns=['SOURCE_KEY'], errors='ignore')

    # Joining on DATE_TIME alone would pair each plant's generation rows with
    # the other plant's weather rows and rename PLANT_ID to PLANT_ID_x/_y.
    merge_keys = ['DATE_TIME']
    if 'PLANT_ID' in generation.columns and 'PLANT_ID' in weather.columns:
        merge_keys.append('PLANT_ID')
    df = pd.merge(generation, weather, on=merge_keys, how='inner')

    # Handling missing values: forward fill, backward fill as a backup, then
    # the mean for IRRADIATION. Plain assignment is used because
    # fillna(method=...) is deprecated and an in-place fillna on a selected
    # column is not guaranteed to write back to the frame.
    df = df.ffill().bfill()
    df['IRRADIATION'] = df['IRRADIATION'].fillna(df['IRRADIATION'].mean())

    # Feature engineering: extracting useful columns from 'DATE_TIME'
    df['HOUR_OF_DAY'] = df['DATE_TIME'].dt.hour
    df['DAY_OF_WEEK'] = df['DATE_TIME'].dt.dayofweek  # 0=Monday, 6=Sunday
    df['MONTH'] = df['DATE_TIME'].dt.month

    # Performance ratio: AC Power / DC Power. Rows with zero DC power get NaN
    # instead of inf so that aggregates over the column stay finite.
    nonzero_dc = df['DC_POWER'].where(df['DC_POWER'] != 0)
    df['PERFORMANCE_RATIO'] = df['AC_POWER'] / nonzero_dc

    # Drop unnecessary columns if needed
    df = df.drop(columns=['SOURCE_KEY'], errors='ignore')

    return df


def summary_statistics(df):
    """
    Collect headline figures from the preprocessed data.

    Args:
        df: DataFrame with 'AC_POWER', 'AMBIENT_TEMPERATURE',
            'MODULE_TEMPERATURE' and 'IRRADIATION' columns.

    Returns:
        A dict mapping a display label to its value, or an empty dict when
        df is empty.
    """
    if df.empty:
        return {}

    irradiation = df['IRRADIATION']

    summary = {}
    # Sum of AC_POWER scaled down by 1000 for the "(MWh)" label.
    summary['Total AC Power (MWh)'] = df['AC_POWER'].sum() / 1000
    summary['Average Ambient Temperature'] = df['AMBIENT_TEMPERATURE'].mean()
    summary['Max Module Temperature'] = df['MODULE_TEMPERATURE'].max()
    summary['Min Irradiation'] = irradiation.min()
    summary['Max Irradiation'] = irradiation.max()
    return summary


In [None]:
# main
import streamlit as st
import pandas as pd
import plotly.express as px
from preprocessor import load_data, preprocess_data, summary_statistics

# Title and Description
st.title("🌞 Solar Power Data Analysis and Visualization 🌞")
st.write("Analyze solar power generation data from two plants and gain key insights through visualizations.")

# Load and preprocess the data
df_generation, df_weather = load_data()
df = preprocess_data(df_generation, df_weather)

# An empty frame means there is nothing to show (for example, the loader
# could not find the CSV files). Stop here with a readable message instead of
# letting the column lookups further down fail with a KeyError.
if df.empty:
    st.error("No data available: the plant CSV files could not be loaded or produced no rows.")
    st.stop()

# Summary Statistics
st.sidebar.subheader("Summary Statistics")
stats = summary_statistics(df)
for key, value in stats.items():
    st.sidebar.write(f"{key}: {value:.2f}")

# Sidebar for filtering
st.sidebar.header("Filter Options")
plant_option = st.sidebar.selectbox("Select Plant", ['Both', 'Plant 1', 'Plant 2'])
date_range = st.sidebar.date_input("Select Date Range", [])

# Apply filters for plant
# NOTE(review): this expects PLANT_ID to hold the strings 'Plant_1' /
# 'Plant_2'; confirm that against the values actually present in the CSVs.
if plant_option == 'Plant 1':
    df_filtered = df[df['PLANT_ID'] == 'Plant_1']
elif plant_option == 'Plant 2':
    df_filtered = df[df['PLANT_ID'] == 'Plant_2']
else:
    df_filtered = df

# Apply date filters (only once both ends of the range have been picked)
if len(date_range) == 2:
    start_date, end_date = date_range
    # The widget yields plain dates; convert them to timestamps because a
    # datetime64 column cannot be compared with datetime.date objects.
    # The upper bound is the start of the following day, applied exclusively,
    # so every reading taken on the selected end date is included.
    range_start = pd.Timestamp(start_date)
    range_end = pd.Timestamp(end_date) + pd.Timedelta(days=1)
    timestamps = df_filtered['DATE_TIME']
    df_filtered = df_filtered[(timestamps >= range_start) & (timestamps < range_end)]

# Display filtered data (Optional)
st.write(f"Displaying data for {plant_option}")
st.dataframe(df_filtered.head())

# Visualization 1: Power Generation Over Time
st.subheader("Power Generation Over Time")
fig = px.line(df_filtered, x='DATE_TIME', y='AC_POWER', title="AC Power Generation Over Time")
st.plotly_chart(fig)

# Visualization 2: Ambient vs Module Temperature
st.subheader("Ambient vs Module Temperature")
fig2 = px.scatter(df_filtered, x='AMBIENT_TEMPERATURE', y='MODULE_TEMPERATURE', color='PLANT_ID',
                  title="Ambient Temperature vs Module Temperature")
st.plotly_chart(fig2)

# Additional Visualizations
# Example of bar chart for average power
st.subheader("Average Daily Power Generation")
# Group by the calendar date derived from DATE_TIME without writing a new
# column into df_filtered: it may be a filtered slice of df (assignment would
# raise SettingWithCopyWarning) or df itself (assignment would mutate it).
calendar_date = df_filtered['DATE_TIME'].dt.date.rename('DATE')
daily_power = df_filtered.groupby(calendar_date)['AC_POWER'].mean().reset_index()
fig3 = px.bar(daily_power, x='DATE', y='AC_POWER', title="Average Daily Power Generation")
st.plotly_chart(fig3)

# Footer with Key Insights (static text, not computed from the data above)
st.write("🚀 **Key Insights:**")
st.write("""
- **Peak Power Generation**: Typically between 10 AM and 2 PM.
- **Seasonal Impact**: Higher generation in summer.
- **Plant Performance**: Certain plants perform better, suggesting geographical advantages.
""")


# this is original

# 1