In [64]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime

In [65]:
# Load the corrected plant log, parsing 'Standardized_Date' as datetimes.
df = pd.read_csv(
    'All_Errors_Corrected.csv',
    parse_dates=['Standardized_Date'],
)

In [66]:
def categorize_remarks(data):
    """
    Add a 'remarks category' column derived from each row's free-text
    'REMARKS' value and its two flow readings.

    Matching is a case-insensitive substring search. A remark counts as a
    power failure when it contains 'power' or 'failure'. Rules are tried in
    this order and the first match wins:

    1. 'Power Failure at Intake' - power failure and either
       - the remark mentions 'intake' but not 'wtp', or
       - the remark does not mention 'wtp', raw water flow is 0/missing and
         clear water pumping flow is present and non-zero.
    2. 'Power Failure at WTP' - power failure and either
       - the remark mentions 'wtp' but not 'intake', or
       - the remark does not mention 'intake', raw water flow is present and
         non-zero, and clear water pumping flow is 0/missing.
    3. 'Power Failure at intake and WTP' - power failure and either
       - the remark mentions both 'intake' and 'wtp', or
       - the remark mentions neither and both flows are 0/missing.
    4. 'intake cleaning' - the remark mentions 'intake' together with
       'cleaning' or 'wash'.
    5. 'maintanance' - the remark contains 'maint'.

    Rows whose remark is missing, or matches none of the rules, get None.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'REMARKS', 'RAW WATER FLOW IN m3/h' and
        'CLEAR WATER PUMPING FLOW m3/h'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the new column is written in
        place.
    """
    def flow_stopped(flow):
        # Missing is tested first so the comparison never sees a null value.
        return bool(pd.isna(flow)) or flow == 0.0

    def categorize(remarks, raw_water, clear_water_pumping):
        if pd.isna(remarks):
            return None

        # str() keeps non-string remarks (e.g. a numeric cell) from raising
        # AttributeError on .lower().
        remarks_lower = str(remarks).lower()
        power_failure = 'power' in remarks_lower or 'failure' in remarks_lower
        has_intake = 'intake' in remarks_lower
        has_wtp = 'wtp' in remarks_lower

        if power_failure:
            raw_stopped = flow_stopped(raw_water)
            clear_stopped = flow_stopped(clear_water_pumping)

            if (has_intake and not has_wtp) or (
                not has_wtp and raw_stopped and not clear_stopped
            ):
                return 'Power Failure at Intake'
            if (has_wtp and not has_intake) or (
                not has_intake and not raw_stopped and clear_stopped
            ):
                return 'Power Failure at WTP'
            if (has_intake and has_wtp) or (
                not has_intake and not has_wtp and raw_stopped and clear_stopped
            ):
                return 'Power Failure at intake and WTP'
            # A power-failure remark naming no site while both flows are
            # running falls through to the keyword rules below.

        if ('cleaning' in remarks_lower or 'wash' in remarks_lower) and has_intake:
            return 'intake cleaning'
        if 'maint' in remarks_lower:
            return 'maintanance'
        return None

    # Only three columns are needed, so walk them in parallel rather than
    # materialising a Series per row with DataFrame.apply(axis=1).
    categories = [
        categorize(remarks, raw_water, clear_water_pumping)
        for remarks, raw_water, clear_water_pumping in zip(
            data['REMARKS'],
            data['RAW WATER FLOW IN m3/h'],
            data['CLEAR WATER PUMPING FLOW m3/h'],
        )
    ]
    data['remarks category'] = pd.Series(categories, index=data.index, dtype=object)
    return data

In [67]:
# Attach the 'remarks category' column, then display the resulting frame.
df = categorize_remarks(df)
df

Unnamed: 0,Standardized_Date,STANDARDIZED_TIME,RAW WATER FLOW IN m3/h,CLEAR WATER SUMP LEVEL IN Meter,CLEAR WATER PUMPING FLOW m3/h,TREATED WATER PRODUCTION IN m3/h,REMARKS,remarks category
0,2024-08-01,1:00,2561.58,2.32,1996.36,2479.10,,
1,2024-08-01,2:00,2510.71,2.49,1984.73,2429.87,,
2,2024-08-01,3:00,2489.32,2.71,1998.56,2409.16,,
3,2024-08-01,4:00,2444.78,2.79,1972.01,2366.06,,
4,2024-08-01,5:00,2417.47,2.99,1945.42,2339.63,,
...,...,...,...,...,...,...,...,...
24788,2021-12-31,20:00,2499.51,2.89,1851.25,2419.03,,
24789,2021-12-31,21:00,2477.10,3.06,1878.95,2397.34,,
24790,2021-12-31,22:00,2470.76,3.25,1913.29,2391.20,,
24791,2021-12-31,23:00,2492.04,3.30,1904.43,2411.80,,


In [69]:
# Show the rows carrying this exact remark to check which category they received.
df.loc[df['REMARKS'].eq('power failed @9.45 to 2.25')]

Unnamed: 0,Standardized_Date,STANDARDIZED_TIME,RAW WATER FLOW IN m3/h,CLEAR WATER SUMP LEVEL IN Meter,CLEAR WATER PUMPING FLOW m3/h,TREATED WATER PRODUCTION IN m3/h,REMARKS,remarks category
13328,2023-02-09,9:00,3532.62,2.93,2787.92,3418.87,power failed @9.45 to 2.25,


In [56]:
# Distinct raw remarks that matched no category. categorize_remarks leaves such
# rows null (it never emits an 'Other' label), so select on missing values;
# comparing against 'Other' would always return an empty array.
df.loc[df['remarks category'].isna(), 'REMARKS'].unique()

array([nan, 'RW pumping stopped',
       'Raw Water Reading not available due to PLC complaint.',
       'power failure at 7.28 to 7.36 pm', 'channel cieaning',
       'inlet chamber cleaning', 'power failed @9.45 to 2.25', '3', '1',
       'voltage fluctation at intake', 'Pumping stopped due to',
       'pumping sopped due to inake', 'chamber cleaning',
       'pumping started', 'under voltage', 'Pumping Stoped due to intake',
       'Sump Level Low', 'Pumping Stopped due to intake',
       'Pumping Stoped due to Intake', 'Pipe line broken at Taliparamba',
       'power failure', 'intake', 'Intake', 'PERMITTED POWER',
       'TWR Tank cleaning', 'Energy Auditing'], dtype=object)

In [6]:
# Function to fill NaN values in 'remarks category' based on the conditions
def fill_remarks_category(df):
    """
    Carry the most recent 'remarks category' forward over uncategorized
    rows whose flow readings are abnormal.

    Rows are visited in stored order:

    * A row that already has a category keeps it, and that category becomes
      the value to carry forward.
    * An uncategorized row receives the carried category when its raw water
      flow is 0, or its clear water pumping flow is 0, or both flows are
      missing.
    * Any other uncategorized row (including one where exactly one flow is
      missing and the other is non-zero) is left untouched and clears the
      carried category, so nothing propagates past it.

    Rows are addressed by position, so the frame may carry any index
    (filtered, sorted, non-integer), not only a default 0..n-1 range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'remarks category',
        'RAW WATER FLOW IN m3/h' and 'CLEAR WATER PUMPING FLOW m3/h'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; 'remarks category' is updated
        in place.
    """
    category_col = df.columns.get_loc('remarks category')

    # Snapshot the inputs once: each row is read before it can be written,
    # so this matches reading cell by cell.
    categories = df['remarks category'].tolist()
    raw_flows = df['RAW WATER FLOW IN m3/h'].tolist()
    clear_flows = df['CLEAR WATER PUMPING FLOW m3/h'].tolist()

    previous_remark = None
    for position, current_remark in enumerate(categories):
        if not pd.isna(current_remark):
            previous_remark = current_remark
            continue

        raw_water_flow = raw_flows[position]
        clear_water_pumping_flow = clear_flows[position]
        raw_missing = bool(pd.isna(raw_water_flow))
        clear_missing = bool(pd.isna(clear_water_pumping_flow))
        # Null checks come first so the == comparison never sees a null value.
        raw_zero = not raw_missing and raw_water_flow == 0
        clear_zero = not clear_missing and clear_water_pumping_flow == 0

        if raw_zero or clear_zero or (raw_missing and clear_missing):
            # Positional write: correct regardless of the index labels.
            df.iloc[position, category_col] = previous_remark
        else:
            # Normal readings end the run; the carried value becomes null.
            previous_remark = current_remark

    return df

# Apply the function to the DataFrame.
# NOTE: fill_remarks_category writes into `df` itself and returns that same
# object, so `df_filled` and `df` refer to one DataFrame after this line.
df_filled = fill_remarks_category(df)