In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#Loading dataset

In [None]:
# Load the plant log; 'Standardized_Date' is parsed into a DatetimeIndex,
# which the later .resample() calls rely on.
data = pd.read_csv(
    'final_data_in_ML.csv',
    parse_dates=True,
    index_col='Standardized_Date',
)
data.shape

In [None]:
# Summary of column dtypes and non-null counts for the loaded frame.
data.info()

In [None]:
# Column labels available in the loaded frame.
data.columns

In [None]:
# Rows whose index lies between 2022-01-11 and 2022-01-12 (both bounds
# inclusive), with the date index moved back into a regular column.
data_2days = data.loc[
    (data.index >= '2022-01-11') & (data.index <= '2022-01-12')
].reset_index()
data_2days

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One frame per day: rows whose index equals the given date, with the date
# index turned back into a column.
data_11_jan_2022 = data.loc[data.index == '2022-01-11'].reset_index()
data_12_jan_2022 = data.loc[data.index == '2022-01-12'].reset_index()
data_12_jan_2022

In [None]:
# Overlay the raw-water flow of the two days against the time column.
plt.plot(data_12_jan_2022['STANDARDIZED_TIME'], data_12_jan_2022['RAW WATER FLOW IN ML'], label='January 12, 2022', marker='o',color='g')
plt.plot(data_11_jan_2022['STANDARDIZED_TIME'], data_11_jan_2022['RAW WATER FLOW IN ML'], label='January 11, 2022', marker='o',color='r')
plt.title('Raw Water Flow: January 11 vs January 12, 2022')
plt.xlabel('Time')
plt.ylabel('Raw Water Flow (ML)')
plt.xticks(rotation=45)
# The label= arguments above are only rendered once a legend is drawn.
plt.legend()
plt.show()

## Resample the data to daily frequency

In [None]:
# Collapse the log to one row per calendar day: flow and production columns
# are summed, the sump level is averaged, and the distinct remark categories
# seen during the day are collected.
data_day = data.resample('D').agg(
    {
        'RAW WATER FLOW IN ML': 'sum',
        'CLEAR WATER SUMP LEVEL IN Meter': 'mean',
        'CLEAR WATER PUMPING FLOW ML': 'sum',
        'TREATED WATER PRODUCTION IN ML': 'sum',
        'remarks category': 'unique',
    }
)

In [None]:
# Inspect the daily aggregates.
data_day

## Recovery Percentage
The significance of this metric is that it indicates how much of the raw water input is successfully converted into treated water output. A higher recovery percentage suggests that the plant is optimizing the utilization of raw water resources and minimizing water losses during the treatment process. This metric is important for evaluating the plant's operational efficiency and water conservation efforts.

In [None]:
# Recovery % = treated water produced / raw water drawn * 100, to 2 decimals.
# NOTE(review): a day with zero raw-water flow yields NaN (0/0) or inf (x/0).
data_day['Recovery_percentage'] = (
    data_day['TREATED WATER PRODUCTION IN ML']
    .div(data_day['RAW WATER FLOW IN ML'])
    .mul(100)
    .round(2)
)
data_day.shape

In [None]:
# Inspect the daily frame, now including Recovery_percentage.
data_day

In [None]:
# Days for which the recovery percentage could not be computed (NaN).
data_day.loc[data_day['Recovery_percentage'].isna()]

In [None]:
# Distinct recovery values, including NaN if present.
data_day['Recovery_percentage'].unique()

In [None]:
# Replace missing recovery values with 0.0. The result is assigned back to the
# column: an in-place fillna on an attribute-accessed column is a chained
# assignment and, under pandas Copy-on-Write, leaves data_day unchanged.
data_day['Recovery_percentage'] = data_day['Recovery_percentage'].fillna(0.0)
# Sanity check: this should now display an empty frame.
data_day[data_day['Recovery_percentage'].isna()]

In [None]:
# Distinct recovery values after the fill step.
data_day['Recovery_percentage'].unique()

In [None]:
# Days on which treated output is at least as large as the raw-water input.
data_day.loc[data_day['Recovery_percentage'] >= 100]

In [None]:
# Days on which recovery falls below 96 %.
data_day.loc[data_day['Recovery_percentage'] < 96]

In [None]:
# Show the recovery series itself. Indexing the frame with the float-valued
# series would treat each value as a column label and raise KeyError.
data_day['Recovery_percentage']

In [None]:
# Daily recovery percentage as a time series.
plt.figure(figsize=(10, 6))
plt.plot(data_day.index, data_day['Recovery_percentage'], marker='o', color='b')

plt.gca().set(
    title='Recovery Percentage Over Time',
    xlabel='Date',
    ylabel='Recovery Percentage (%)',
)
plt.grid(True)
plt.xticks(rotation=45)

plt.show()

## Capacity Utilization 

This metric is the ratio of the total treated water production in MLD to the design capacity of the WTP.
This metric indicates the extent to which the WTP is utilizing its designed capacity. This provides an insight into the operational efficiency of the plant.

In [None]:
# Design capacity of the WTP that daily production is compared against
# (ML per day).
DESIGN_CAPACITY_MLD = 93

# Capacity utilization % = daily treated production / design capacity * 100.
data_day['Capacity_utilization'] = ((data_day['TREATED WATER PRODUCTION IN ML']/DESIGN_CAPACITY_MLD)*100).round(2)
data_day.shape

In [None]:
# First rows of the daily frame, now including Capacity_utilization.
data_day.head()

In [None]:
# Days on which production reached or exceeded the design capacity.
data_day.loc[data_day['Capacity_utilization'] >= 100]

In [None]:
# Highest daily capacity utilization (%).
data_day['Capacity_utilization'].max()

In [None]:
# Lowest daily capacity utilization (%).
data_day['Capacity_utilization'].min()

In [None]:
# Days with no treated-water production at all (utilization exactly 0).
data_day.loc[data_day['Capacity_utilization'] == 0.0]

In [None]:
# The 60 lowest daily utilization values, smallest first.
data_day['Capacity_utilization'].nsmallest(60)

In [None]:
# Lowest utilization among days that produced any water.
data_day.loc[data_day['Capacity_utilization'] != 0.0, 'Capacity_utilization'].min()

In [None]:
# Rows whose utilization equals 10.5 exactly.
# NOTE(review): 10.5 is hard-coded — presumably the non-zero minimum found for
# the current data set; the filter returns nothing if the data changes.
data_day[data_day['Capacity_utilization']==10.5]

# Time Series Plot

In [None]:
# Daily capacity utilization as a time series.
plt.figure(figsize=(10, 6))
plt.plot(data_day.index, data_day['Capacity_utilization'], marker='o', color='g')

plt.gca().set(
    title='Capacity Utilization Over Time',
    xlabel='Date',
    ylabel='Capacity Utilization (%)',
)
plt.grid(True)
plt.xticks(rotation=45)
plt.show()

# Histogram

In [None]:
# Distribution of daily capacity utilization across 20 equal-width bins.
plt.figure(figsize=(10, 6))
plt.hist(
    data_day['Capacity_utilization'],
    bins=20,
    edgecolor='black',
    color='purple',
)

plt.gca().set(
    title='Distribution of Capacity Utilization',
    xlabel='Capacity Utilization (%)',
    ylabel='Frequency',
)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Same histogram as before, keeping the bin counts and edges for annotation.
plt.figure(figsize=(10, 6))
n, bins, patches = plt.hist(data_day['Capacity_utilization'], bins=20, color='purple', edgecolor='black')

# Write each bar's count just above it, centred on the bin.
for count, left_edge, right_edge in zip(n, bins[:-1], bins[1:]):
    bin_centre = left_edge + (right_edge - left_edge) / 2
    plt.text(bin_centre, count, f'{int(count)}', ha='center', va='bottom', fontsize=10)

plt.title('Distribution of Capacity Utilization')
plt.xlabel('Capacity Utilization (%)')
plt.ylabel('Count')
plt.show()


## Plant Availability (need to discuss with Rajeshchetan)

The plant availability metric indicates how often the plant is available to perform its functions compared to the total time it could potentially operate. 

 

In [None]:
# Planned operating time per day — presumably in hours (TODO confirm unit).
planned_operating_time = 24
# actual_operating_time

In [None]:
# Number of non-null remark entries recorded on each day.
data.resample('D')[['remarks category']].count()

In [None]:
# Rows whose remark category mentions 'WTP'.
# na=False treats rows without a remark as non-matching; otherwise the mask
# contains NaN and boolean indexing raises ValueError.
data_wtp = data[data['remarks category'].str.contains('WTP', na=False)]
data_wtp

In [None]:
# Per-day count of WTP-related remark rows.
# NOTE(review): treated downstream as hours — assumes one row per hour; confirm.
data_wtp_hours = data_wtp.resample('D')[['remarks category']].count()
data_wtp_hours

In [None]:
# data[data.index=='2022-01-10']

In [None]:
# Availability % = (24 - WTP remark count) / 24 * 100, rounded to 2 decimals.
data_wtp_hours['plant_availability'] = (
    data_wtp_hours['remarks category'].rsub(24).div(24).mul(100).round(2)
)

In [None]:
# Inspect the per-day WTP remark counts and the derived availability.
data_wtp_hours

## Shift wise treated water production 

An aggregate of treated water production during each shift helps the operator to understand the volume of raw water, treated water and clear water pumping flow during each shift. 

Shift timings: Shift 1 from 6 am to 2 pm, Shift 2 from 2 pm to 10 pm and Shift 3 from 10 pm to 6 am 

In [None]:
# New frame with the date index moved into a regular 'Standardized_Date' column.
df = data.reset_index()

In [None]:
def assign_shift(hour):
    """Return the shift label ('1', '2' or '3') for an hour of the day.

    Hours 6-13 belong to shift '1' and hours 14-21 to shift '2'. Any other
    value (for valid hours: 22-23 and 0-5) is assigned to shift '3'.
    """
    # (label, first hour inclusive, last hour exclusive)
    shift_windows = (('1', 6, 14), ('2', 14, 22))
    for label, start, end in shift_windows:
        if start <= hour < end:
            return label
    return '3'

In [None]:
# Hour of day extracted from the time column; this is the input to assign_shift.
# NOTE(review): relies on pd.to_datetime inferring the STANDARDIZED_TIME format — confirm.
df['Hour'] = pd.to_datetime(df['STANDARDIZED_TIME']).dt.hour

In [None]:
# Check column dtypes after adding the Hour column.
df.dtypes

In [None]:
# Inspect the frame with the new Hour column.
df

In [None]:
# Label every row with its shift ('1', '2' or '3') based on the hour of day.
df['Shift'] = df['Hour'].map(assign_shift)
df

In [None]:
import matplotlib.pyplot as plt

# Total treated water produced in each shift over the whole data set,
# shown as each shift's share of the total.
shiftwise_production = df.groupby('Shift')['TREATED WATER PRODUCTION IN ML'].sum()

plt.figure(figsize=(8, 6))
plt.pie(
    shiftwise_production,
    labels=shiftwise_production.index,
    colors=['skyblue', 'lightgreen', 'salmon'],
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Shift-wise Treated Water Production')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Calendar year of each record, taken from the date column.
df['Year'] = pd.to_datetime(df['Standardized_Date']).dt.year

# Total treated water per (year, shift) pair.
year_shiftwise_production = df.groupby(['Year', 'Shift'])['TREATED WATER PRODUCTION IN ML'].sum().reset_index()

# Grouped bar chart: one cluster per year, one bar per shift.
# Drawn with seaborn, which this cell imports; plotly.express (`px`) is not
# imported in the notebook, so calling it fails with NameError.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=year_shiftwise_production,
            x='Year',
            y='TREATED WATER PRODUCTION IN ML',
            hue='Shift',
            palette='Set2',
            ax=ax)

# Titles and labels for readability
ax.set_title('Year-wise Treated Water Production by Shift')
ax.set_xlabel('Year')
ax.set_ylabel('Treated Water Production (ML)')
ax.legend(title='Shift')

# Show the plot
plt.show()




## Water demand analysis 

The total volume of clear water pumped from treated sources reflects the water consumption by the Overhead Storage Tanks (OHSR). This water is then distributed to individual households, representing the overall demand for water. 

The comparison of demand can be visualized for a desired time range (month on month, year on year)

In [None]:
# Month-level aggregates: flows and production are summed, the sump level is
# averaged, and distinct remark categories are collected.
# NOTE(review): the 'M' alias is deprecated in recent pandas in favour of 'ME'.
data_month = data.resample('M').agg(
    {
        'RAW WATER FLOW IN ML': 'sum',
        'CLEAR WATER SUMP LEVEL IN Meter': 'mean',
        'CLEAR WATER PUMPING FLOW ML': 'sum',
        'TREATED WATER PRODUCTION IN ML': 'sum',
        'remarks category': 'unique',
    }
)
data_month

In [None]:
# Year-level aggregates: flows and production are summed, the sump level is
# averaged, and distinct remark categories are collected.
# NOTE(review): the 'Y' alias is deprecated in recent pandas in favour of 'YE'.
data_year = data.resample('Y').agg(
    {
        'RAW WATER FLOW IN ML': 'sum',
        'CLEAR WATER SUMP LEVEL IN Meter': 'mean',
        'CLEAR WATER PUMPING FLOW ML': 'sum',
        'TREATED WATER PRODUCTION IN ML': 'sum',
        'remarks category': 'unique',
    }
)
data_year

In [None]:
# TODO: energy analysis. `kwh` and `energy_cost` are not defined anywhere in
# this notebook, so evaluating them raises NameError; kept as a placeholder
# until those inputs are available.