In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from datetime import datetime

# Flow rate of one clear-water pump at the WTP, in m3/hr.
flow_rate_per_pump = 932 # m3/hr for pumps at wtp for clear water pumping

# Read the plant log CSV; the bare `df` on the last line displays it as the cell output.
df = pd.read_csv('Pattuvam_data_till_Aug_2024_corrected.csv')
df

Unnamed: 0,DATE,TIME,RAW WATER FLOW IN m3/h,CLEAR WATER SUMP LEVEL IN Meter,CLEAR WATER PUMPING FLOW m3/h,TREATED WATER PRODUCTION IN m3/h,REMARKS
0,01/08/2024,01:00,2561.58,2.32,1996.36,2479.10,
1,01/08/2024,02:00,2510.71,2.49,1984.73,2429.87,
2,01/08/2024,03:00,2489.32,2.71,1998.56,2409.16,
3,01/08/2024,04:00,2444.78,2.79,1972.01,2366.06,
4,01/08/2024,05:00,2417.47,2.99,1945.42,2339.63,
...,...,...,...,...,...,...,...
24812,31/12/2021,20:00,2499.51,289,1851.25,2419.03,
24813,31/12/2021,21:00,2477.1,3.06,1878.95,2397.34,
24814,31/12/2021,22:00,2470.76,3.25,1913.29,2391.20,
24815,31/12/2021,23:00,2492.04,3.3,1904.43,2411.80,


In [2]:
### For date standardisation

def standardize_date_format(date_str):
    """Normalise a day-first date string to zero-padded ``DD/MM/YYYY``.

    Recognised layouts (surrounding whitespace is ignored):

    * ``D/M/YYYY`` and ``D/M/YY`` with one- or two-digit day and month
    * ``D.M.YYYY`` and ``D.M.YY``
    * ``D-M-YYYY``
    * ``YYYY-MM-DD``
    * ``Month D, YYYY`` with a full or abbreviated month name

    Two-digit years are expanded by ``strptime``'s ``%y`` rule
    (00-68 -> 2000-2068, 69-99 -> 1969-1999).

    Parameters
    ----------
    date_str : object
        Raw value from the DATE column. Anything that is not a ``str``
        (e.g. NaN) is treated as unparseable.

    Returns
    -------
    str
        The date as ``DD/MM/YYYY``, or the sentinel ``'Unknown'`` when the
        value is not a string, matches no recognised layout, or matches a
        layout but names an impossible date (e.g. month 13). A string that
        is already shaped ``DD/MM/YYYY`` is returned unchanged without a
        calendar check.
    """
    if not isinstance(date_str, str):
        return 'Unknown'

    text = date_str.strip()

    # Fast path for the dominant case: already two-digit day/month and a
    # four-digit year. Returned as-is, so no calendar validation happens here.
    if re.fullmatch(r'\d{2}/\d{2}/\d{4}', text):
        return text

    # (shape regex, strptime formats to try) in priority order. The regex
    # decides which layout the text has; strptime then validates the values
    # and handles zero-padding and two-digit-year expansion uniformly.
    layouts = (
        (r'\d{1,2}/\d{1,2}/\d{4}', ('%d/%m/%Y',)),
        (r'\d{1,2}/\d{1,2}/\d{2}', ('%d/%m/%y',)),
        (r'\d{1,2}\.\d{1,2}\.\d{4}', ('%d.%m.%Y',)),
        (r'\d{1,2}\.\d{1,2}\.\d{2}', ('%d.%m.%y',)),
        (r'\d{4}-\d{2}-\d{2}', ('%Y-%m-%d',)),
        (r'\d{1,2}-\d{1,2}-\d{4}', ('%d-%m-%Y',)),
        (r'[A-Za-z]+\s\d{1,2},\s\d{4}', ('%B %d, %Y', '%b %d, %Y')),
    )

    for shape, formats in layouts:
        if not re.fullmatch(shape, text):
            continue
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt).strftime('%d/%m/%Y')
            except ValueError:
                # Right shape but wrong values for this format; try the next.
                continue
        # The shape matched but no format accepted the values: report it as
        # unparseable instead of letting the exception abort a whole .apply().
        return 'Unknown'

    return 'Unknown'

# Add a Standardized_Date column by running every raw DATE value through
# standardize_date_format; values it cannot handle come back as the string 'Unknown'.
df['Standardized_Date'] = df['DATE'].apply(standardize_date_format)


# Print a heading, then display the frame as the cell output.
print("DataFrame with standardized dates:")
df

DataFrame with standardized dates:


Unnamed: 0,DATE,TIME,RAW WATER FLOW IN m3/h,CLEAR WATER SUMP LEVEL IN Meter,CLEAR WATER PUMPING FLOW m3/h,TREATED WATER PRODUCTION IN m3/h,REMARKS,Standardized_Date
0,01/08/2024,01:00,2561.58,2.32,1996.36,2479.10,,01/08/2024
1,01/08/2024,02:00,2510.71,2.49,1984.73,2429.87,,01/08/2024
2,01/08/2024,03:00,2489.32,2.71,1998.56,2409.16,,01/08/2024
3,01/08/2024,04:00,2444.78,2.79,1972.01,2366.06,,01/08/2024
4,01/08/2024,05:00,2417.47,2.99,1945.42,2339.63,,01/08/2024
...,...,...,...,...,...,...,...,...
24812,31/12/2021,20:00,2499.51,289,1851.25,2419.03,,31/12/2021
24813,31/12/2021,21:00,2477.1,3.06,1878.95,2397.34,,31/12/2021
24814,31/12/2021,22:00,2470.76,3.25,1913.29,2391.20,,31/12/2021
24815,31/12/2021,23:00,2492.04,3.3,1904.43,2411.80,,31/12/2021


In [3]:
### For time standardisation

def standardize_time(time_str):
    """Normalise a clock-time string to zero-padded 24-hour ``HH:MM``.

    Accepted inputs (surrounding whitespace is ignored):

    * hour only, one or two digits: ``'5'`` -> ``'05:00'``, ``'13'`` -> ``'13:00'``
    * compact four digits ``HHMM``: ``'0130'`` -> ``'01:30'``
    * hour and minute, one or two digits each, separated by a colon, dot
      or whitespace: ``'1:00'``, ``'01 30'``, ``'01.30'``

    Parameters
    ----------
    time_str : object
        Raw value from the TIME column.

    Returns
    -------
    str or None
        ``'HH:MM'``, or ``None`` when the value is not a string, has an
        unrecognised layout, or is out of range (hour > 23 or minute > 59).
    """
    # Missing cells arrive as float NaN / None; they have no .strip().
    if not isinstance(time_str, str):
        return None

    text = time_str.strip()

    if re.fullmatch(r'\d{1,2}', text):
        # Hour only.
        hour, minute = int(text), 0
    elif re.fullmatch(r'\d{4}', text):
        # Compact HHMM with no separator.
        hour, minute = int(text[:2]), int(text[2:])
    else:
        # Separated form. Matching on structure rather than string length
        # keeps 'H:MM' (length 4) from being mistaken for compact 'HHMM'.
        parts = re.fullmatch(r'(\d{1,2})[:.\s]+(\d{1,2})', text)
        if parts is None:
            return None
        hour, minute = int(parts.group(1)), int(parts.group(2))

    if hour > 23 or minute > 59:
        return None
    return f'{hour:02d}:{minute:02d}'

# Add a STANDARDIZED_TIME column by running every raw TIME value through
# standardize_time, then display the frame as the cell output.
df['STANDARDIZED_TIME'] = df['TIME'].apply(standardize_time)
df

Unnamed: 0,DATE,TIME,RAW WATER FLOW IN m3/h,CLEAR WATER SUMP LEVEL IN Meter,CLEAR WATER PUMPING FLOW m3/h,TREATED WATER PRODUCTION IN m3/h,REMARKS,Standardized_Date,STANDARDIZED_TIME
0,01/08/2024,01:00,2561.58,2.32,1996.36,2479.10,,01/08/2024,01:00
1,01/08/2024,02:00,2510.71,2.49,1984.73,2429.87,,01/08/2024,02:00
2,01/08/2024,03:00,2489.32,2.71,1998.56,2409.16,,01/08/2024,03:00
3,01/08/2024,04:00,2444.78,2.79,1972.01,2366.06,,01/08/2024,04:00
4,01/08/2024,05:00,2417.47,2.99,1945.42,2339.63,,01/08/2024,05:00
...,...,...,...,...,...,...,...,...,...
24812,31/12/2021,20:00,2499.51,289,1851.25,2419.03,,31/12/2021,20:00
24813,31/12/2021,21:00,2477.1,3.06,1878.95,2397.34,,31/12/2021,21:00
24814,31/12/2021,22:00,2470.76,3.25,1913.29,2391.20,,31/12/2021,22:00
24815,31/12/2021,23:00,2492.04,3.3,1904.43,2411.80,,31/12/2021,23:00


In [None]:
# Show the dtype of every column in df.
df.dtypes

In [None]:
# List the column names currently present in df.
df.columns

In [None]:
# Keep only the standardized date/time columns plus the measurement and
# REMARKS columns; the raw DATE/TIME columns and 'Unnamed: 0' are dropped.
# .copy() makes the subset an independent frame, so assigning to its columns
# later does not trigger SettingWithCopyWarning.
df = df[['Standardized_Date', 'STANDARDIZED_TIME', 'RAW WATER FLOW IN m3/h',
         'CLEAR WATER SUMP LEVEL IN Meter', 'CLEAR WATER PUMPING FLOW m3/h',
         'TREATED WATER PRODUCTION IN m3/h', 'REMARKS']].copy()
df

In [None]:
# Count rows whose date could not be standardized. standardize_date_format
# marks those with the string 'Unknown' rather than NaN, so isna() alone
# would always report 0 at this point; count both markers.
(df['Standardized_Date'].isna() | df['Standardized_Date'].eq('Unknown')).sum()

In [None]:
# Display rows at positions 2130-2179 (the slice end is exclusive).
# NOTE(review): the reason for this particular window is not recorded here —
# presumably a spot-check of a region with irregular dates; confirm.
df[2130:2180]

In [None]:
# Convert the DD/MM/YYYY strings to datetime64. The explicit day-first format
# matters: without it pandas reads ambiguous values such as '01/08/2024' as
# month-first (8 January), and may then reject or mis-parse days above 12.
# Anything that does not match the format (including 'Unknown') becomes NaT.
df['Standardized_Date'] = pd.to_datetime(
    df['Standardized_Date'], format='%d/%m/%Y', errors='coerce'
)
df

In [None]:
# Count the rows whose date is missing (NaT) after the datetime conversion.
df['Standardized_Date'].isna().sum()

In [None]:
# Show the rows whose date is missing (NaT) so they can be inspected.
df[df['Standardized_Date'].isna()]