# **Load Libraries and Data :**

In [None]:
import pandas as pd
import numpy as np # Will be useful for numerical operations later
import matplotlib.pyplot as plt # Will be useful for plotting
import seaborn as sns # Will be useful for plotting
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Upload the input CSV file:

In [None]:
# Ask for an upload only when the CSV is not already in the working directory,
# so that "Restart & Run All" does not stop at an interactive prompt once the
# file has been uploaded. `uploaded` stays an empty dict when nothing is uploaded.
uploaded = {}
if not os.path.exists('input_data.csv'):
    from google.colab import files  # Colab-only module; needed just for the upload prompt
    uploaded = files.upload()

Saving input_data.csv to input_data.csv


In [None]:
# Read the uploaded CSV from the working directory into the main DataFrame.
df = pd.read_csv(filepath_or_buffer='input_data.csv')

# Reaching this line means read_csv did not raise.
print("Data loaded successfully")

Data loaded successfully


Initial Data Inspection :


In [None]:
# Preview the first rows to confirm the columns were read as expected.
df.head()

Unnamed: 0,time,channel,current,voltage,power,sunlight,performance
0,2025-01-09 00:00:00,1,0.0,0.0,0.0,0.0,0
1,2025-01-09 00:00:00,25,0.0,0.0,0.0,0.0,0
2,2025-01-09 00:00:00,27,0.0,0.0,0.0,0.0,0
3,2025-01-09 00:00:00,3,0.0,0.0,0.0,0.0,0
4,2025-01-09 00:00:00,5,0.0,0.0,0.0,0.0,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,channel,current,voltage,power,sunlight,performance
count,2688.0,2688.0,2688.0,2688.0,2688.0,2688.0
mean,14.5,1.548177,674.80253,1.842645,192.074688,26.614955
std,8.07925,2.973587,511.165682,3.532544,275.504947,40.787122
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,7.75,0.0,0.0,0.0,6.74,0.0
50%,14.5,0.0,556.2,0.0,6.74,0.0
75%,21.25,1.3,1186.5,1.6025,290.245,72.0
max,28.0,11.4,1260.3,13.01,783.0,100.0


In [None]:
# Column names, non-null counts, dtypes and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2688 entries, 0 to 2687
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   time         2688 non-null   object 
 1   channel      2688 non-null   int64  
 2   current      2688 non-null   float64
 3   voltage      2688 non-null   float64
 4   power        2688 non-null   float64
 5   sunlight     2688 non-null   float64
 6   performance  2688 non-null   int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 147.1+ KB


In [None]:
# Count missing values per column (this dataset has none).
df.isnull().sum()

Unnamed: 0,0
time,0
channel,0
current,0
voltage,0
power,0
sunlight,0
performance,0


Preprocessing the data :

In [None]:
# The 'time' column is loaded as strings (dtype object); parse it into datetime
# values so each channel's rows can be sorted and plotted as a time series.
df = df.assign(time=pd.to_datetime(df['time']))

In [None]:
# Distinct channel identifiers, in order of first appearance in the data.
# The per-channel loops further down iterate over this array.
channels = pd.unique(df['channel'])
print(channels)  # 28 distinct channel numbers in this dataset

[ 1 25 27  3  5 12 17 14 18 26  4  6 21  9 28 19 24 10 20  7  2 16 23 11
 15 13  8 22]


# **EDA (Exploratory Data Analysis):**

NOTE : ALL THE PLOTS ARE SAVED IN INDIVIDUAL FOLDERS IN THE FILES SECTION

The analysis runs in a loop over all 28 channels, so we don't have to repeat it by hand for each channel.

In [None]:
# Output folders for the EDA figures. exist_ok=True makes the cell safe to
# re-run and replaces the separate "exists?" check followed by a create.
# Folder for the power-vs-sunlight regression plots
os.makedirs('channel_plots', exist_ok=True)
# Folder for Correlation Heatmaps
os.makedirs('correlation_heatmaps', exist_ok=True)


In [None]:
# Loop over every channel and save two diagnostic figures per channel:
#   1. power vs. sunlight with a fitted regression line -> channel_plots/
#   2. voltage / current / power correlation heatmap    -> correlation_heatmaps/

for channel in channels:
    print(f"--- Analyzing Channel {channel} ---")

    # Rows of this channel only, in chronological order.
    # (Boolean filtering and sort_values each return a new frame, so df is untouched.)
    channel_df = df[df['channel'] == channel].sort_values(by='time')

    # --- EDA Part 1: Correlation between Power and Sunlight ---

    # A regression plot shows the trend more clearly than a plain scatter plot.
    # An explicit Figure/Axes pair is used so each plot is drawn on, saved from
    # and closed as its own figure rather than via pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.regplot(data=channel_df, x='sunlight', y='power', line_kws={"color": "red"}, ax=ax)
    ax.set_title(f'Channel {channel}: Power vs. Sunlight with Regression Line')
    ax.set_xlabel('Sunlight')
    ax.set_ylabel('Power (kW)')
    ax.grid(True)
    fig.savefig(f'channel_plots/channel_{channel}_power_vs_sunlight.png')
    plt.close(fig)  # release the figure; one is created per channel

    # --- EDA Part 2: Correlation between Voltage, Current, and Power ---

    correlation_matrix = channel_df[['voltage', 'current', 'power']].corr()
    # A heatmap makes the pairwise correlations easy to read at a glance.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title(f'Channel {channel}: Correlation Matrix (Voltage, Current, Power)')
    fig.savefig(f'correlation_heatmaps/channel_{channel}_correlation_matrix.png')
    plt.close(fig)

# The message names both output folders (the heatmaps are not in 'channel_plots').
print("\nAnalysis complete! Regression plots have been saved in the 'channel_plots' directory "
      "and correlation heatmaps in the 'correlation_heatmaps' directory.")

--- Analyzing Channel 1 ---
--- Analyzing Channel 25 ---
--- Analyzing Channel 27 ---
--- Analyzing Channel 3 ---
--- Analyzing Channel 5 ---
--- Analyzing Channel 12 ---
--- Analyzing Channel 17 ---
--- Analyzing Channel 14 ---
--- Analyzing Channel 18 ---
--- Analyzing Channel 26 ---
--- Analyzing Channel 4 ---
--- Analyzing Channel 6 ---
--- Analyzing Channel 21 ---
--- Analyzing Channel 9 ---
--- Analyzing Channel 28 ---
--- Analyzing Channel 19 ---
--- Analyzing Channel 24 ---
--- Analyzing Channel 10 ---
--- Analyzing Channel 20 ---
--- Analyzing Channel 7 ---
--- Analyzing Channel 2 ---
--- Analyzing Channel 16 ---
--- Analyzing Channel 23 ---
--- Analyzing Channel 11 ---
--- Analyzing Channel 15 ---
--- Analyzing Channel 13 ---
--- Analyzing Channel 8 ---
--- Analyzing Channel 22 ---

Analysis complete! Plots for each channel have been saved in the 'channel_plots' directory.


# **Anomaly Visualization :**

In [None]:
# --- Creating Directories to Save the Plots ---
# Main directory for all anomaly plots
base_anomaly_path = 'anomaly_plots'
# Subdirectory for flat line anomaly plots
flat_line_path = os.path.join(base_anomaly_path, 'flat_lines')
# Subdirectory for sudden drop anomaly plots
sudden_drop_path = os.path.join(base_anomaly_path, 'sudden_drops')

# exist_ok=True makes the cell safe to re-run and removes the separate
# "exists?" check before each create.
os.makedirs(base_anomaly_path, exist_ok=True)
os.makedirs(flat_line_path, exist_ok=True)
os.makedirs(sudden_drop_path, exist_ok=True)

* Below, I use the maximum power (`max_power`) reached by each channel to set relative thresholds for anomaly detection.

* This lets me look for stalled power (flat lines) only during high-production periods (power above 70% of the channel's maximum), and flag sudden drops relative to the channel's own peak (a fall of more than 25% of the maximum between two consecutive readings).

* Anchoring the logic to `max_power` scales the thresholds to each channel's capacity instead of applying one absolute value to every channel, which helps flag genuine operational issues.

In [None]:
# --- Looping through each channel to visualize the anomalies ---

# Detection thresholds. The two fractions are relative to each channel's own peak power.
HIGH_POWER_FRACTION = 0.70   # "high production": power above 70% of the channel's max
FLAT_MAX_CHANGE = 0.1        # a step-to-step change smaller than this counts as flat
SUDDEN_DROP_FRACTION = 0.25  # a one-step fall of more than 25% of the max is a sudden drop


def _save_anomaly_plot(channel_df, anomalies, title, out_path, **scatter_kwargs):
    """Save one channel's power curve with the given anomalous samples highlighted.

    Args:
        channel_df: Rows of a single channel, indexed by time, with a 'power' column.
        anomalies: Subset of ``channel_df`` rows to mark on top of the curve.
        title: Figure title.
        out_path: File path the figure is written to.
        **scatter_kwargs: Marker styling and legend label forwarded to ``Axes.scatter``.
    """
    fig, ax = plt.subplots(figsize=(15, 7))
    # Base power curve
    ax.fill_between(channel_df.index, channel_df['power'], color="yellow", alpha=0.6, label='Power Generation')
    ax.plot(channel_df.index, channel_df['power'], color="orange")
    # Highlighting anomalies (zorder=5 draws the markers above the curve)
    ax.scatter(anomalies.index, anomalies['power'], zorder=5, **scatter_kwargs)
    # Formatting and saving
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Time of Day', fontsize=12)
    ax.set_ylabel('Power (kW)', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)  # release the figure; up to two are created per channel


for channel in channels:
    print(f"--- Visualizing Anomalies for Channel {channel} ---")

    # Rows of this channel only, in chronological order, indexed by time
    channel_df = df[df['channel'] == channel].sort_values(by='time').set_index('time')

    # --- Anomaly Detection Logic ---
    max_power = channel_df['power'].max()
    # Change from the previous sample; computed once and shared by both detectors.
    # The first sample has no predecessor (NaN), so it is never flagged.
    power_diff = channel_df['power'].diff()

    # Anomaly 1: Detect Flat Lines (power barely changes while production is high)
    is_high_power = channel_df['power'] > max_power * HIGH_POWER_FRACTION
    is_flat = power_diff.abs() < FLAT_MAX_CHANGE
    flat_anomalies = channel_df[is_high_power & is_flat]

    # Anomaly 2: Detect Sudden Drops (negative step larger than the threshold)
    sudden_drop_threshold = -SUDDEN_DROP_FRACTION * max_power
    drop_anomalies = channel_df[power_diff < sudden_drop_threshold]

    # --- Plotting: a figure is written only when that anomaly type was found ---
    if not flat_anomalies.empty:
        _save_anomaly_plot(
            channel_df,
            flat_anomalies,
            f'Channel {channel}: Flat Line Anomalies',
            os.path.join(flat_line_path, f'channel_{channel}_flat_lines.png'),
            color='red', s=50, label='Flat Line Anomaly',
        )

    if not drop_anomalies.empty:
        _save_anomaly_plot(
            channel_df,
            drop_anomalies,
            f'Channel {channel}: Sudden Drop Anomalies',
            os.path.join(sudden_drop_path, f'channel_{channel}_sudden_drops.png'),
            color='darkred', marker='v', s=100, label='Sudden Drop Anomaly',
        )

print("\nAnomaly visualization complete! Plots have been saved into their respective folders.")

--- Visualizing Anomalies for Channel 1 ---
--- Visualizing Anomalies for Channel 25 ---
--- Visualizing Anomalies for Channel 27 ---
--- Visualizing Anomalies for Channel 3 ---
--- Visualizing Anomalies for Channel 5 ---
--- Visualizing Anomalies for Channel 12 ---
--- Visualizing Anomalies for Channel 17 ---
--- Visualizing Anomalies for Channel 14 ---
--- Visualizing Anomalies for Channel 18 ---
--- Visualizing Anomalies for Channel 26 ---
--- Visualizing Anomalies for Channel 4 ---
--- Visualizing Anomalies for Channel 6 ---
--- Visualizing Anomalies for Channel 21 ---
--- Visualizing Anomalies for Channel 9 ---
--- Visualizing Anomalies for Channel 28 ---
--- Visualizing Anomalies for Channel 19 ---
--- Visualizing Anomalies for Channel 24 ---
--- Visualizing Anomalies for Channel 10 ---
--- Visualizing Anomalies for Channel 20 ---
--- Visualizing Anomalies for Channel 7 ---
--- Visualizing Anomalies for Channel 2 ---
--- Visualizing Anomalies for Channel 16 ---
--- Visualizing An

# **Channel Classification:**

My initial attempt at channel classification failed because I averaged the performance over all hours. Including the zero-power nighttime hours dragged the average down so much that every channel was incorrectly classified as "Poor".

To fix this, I now compute the average over daytime rows only (rows where power is greater than 0), which gives sensible results.

In [None]:
# --- 1. Keeping only the rows where the channel is producing power ---
# "Daytime" is approximated as power > 0, which removes the zero-power night rows.
# NOTE(review): this also removes any daytime rows where a channel produced
# nothing, so the averages below may be optimistic - confirm this is intended.
daytime_df = df[df['power'] > 0].copy()

# --- 2. Calculating the Average Performance During the Day ---
# Group by channel and average the performance over the daytime rows only.
# Reindexing on every channel present in df keeps channels that never produced
# any power: they have no daytime rows and would otherwise silently drop out of
# the result. Their average is NaN.
daytime_channel_performance = (
    daytime_df.groupby('channel')['performance']
    .mean()
    .reindex(sorted(df['channel'].unique()))
    .rename('daytime_avg_performance')
    .rename_axis('channel')
    .reset_index()
)

# --- 3. Creating the Classification Label using the Given Rules ---
# This function is our rule-based classifier
def classify_performance(score):
    """Map an average performance score to a category label.

    Args:
        score: Average performance score. NaN means the channel had no
            daytime (power > 0) rows at all.

    Returns:
        'Excellent' if score > 80, 'Good' if 60 <= score <= 80, otherwise
        'Poor'. A NaN score is also 'Poor'.
    """
    if pd.isna(score):
        # No producing samples: made explicit instead of relying on NaN
        # failing every comparison below.
        return 'Poor'
    if score > 80:
        return 'Excellent'
    if score >= 60:
        return 'Good'
    return 'Poor'

# Applying the classifier to the DAYTIME AVERAGE score
daytime_channel_performance['category'] = daytime_channel_performance['daytime_avg_performance'].apply(classify_performance)


# --- 4. Displaying the Final Results ---
# Best channels first; channels with a NaN average are listed last.
print("\n--- Final Channel Classifications (Based on Daytime Average Performance) ---")
print(daytime_channel_performance.sort_values(by='daytime_avg_performance', ascending=False))


--- Final Channel Classifications (Based on Daytime Average Performance) ---
    channel  daytime_avg_performance   category
5         6                89.822222  Excellent
20       23                89.317073  Excellent
7         8                89.000000  Excellent
22       26                88.909091  Excellent
9        10                88.800000  Excellent
14       16                88.431818  Excellent
3         4                88.116279  Excellent
23       28                87.727273  Excellent
6         7                87.121951  Excellent
11       12                86.733333  Excellent
17       20                86.377778  Excellent
0         1                86.097561  Excellent
2         3                85.951220  Excellent
18       21                85.878049  Excellent
10       11                85.804878  Excellent
4         5                85.268293  Excellent
15       17                85.121951  Excellent
8         9                84.414634  Excellent
12       1