Water scarcity is an increasingly global issue, with urban households playing a major role in water wastage due to inefficient consumption habits. Traditional water meters provide only total usage data without insights into consumption patterns, making it difficult for homeowners to optimize their water usage effectively. Smart water monitoring systems, powered by machine learning, can help households predict their water consumption and adopt conservation measures.

Task

The goal of this project is to develop a Machine Learning model that predicts daily water consumption for individual households based on historical usage patterns, household characteristics, weather conditions, and conservation behaviors.

In [17]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [1]:
import pandas as pd
import numpy as np

test=pd.read_csv("test.csv")
train=pd.read_csv("train.csv")

In [4]:
print(train.shape)

(14000, 12)


In [None]:
print(train.head)

<bound method NDFrame.head of            Timestamp  Residents Apartment_Type  Temperature Humidity  \
0      01/01/2002 00          1         Studio        15.31    46.61   
1      01/01/2002 08          4            NaN        21.01    66.11   
2      01/01/2002 16          2        Cottage        12.86    60.86   
3      02/01/2002 00          2           1BHK        20.16    50.58   
4      02/01/2002 08          2        Cottage        16.23    52.25   
...              ...        ...            ...          ...      ...   
13995  10/10/2014 00          2           1BHK        25.61     61.5   
13996  10/10/2014 08          5           2BHK        13.27    52.58   
13997  10/10/2014 16          4           2BHK          NaN    46.93   
13998  11/10/2014 00          4           3BHK        11.62    64.48   
13999  11/10/2014 08          4           2BHK        23.78    44.88   

       Water_Price  Period_Consumption_Index  Income_Level  Guests  \
0             1.06                 

In [None]:
train.columns

Index(['Timestamp', 'Residents', 'Apartment_Type', 'Temperature', 'Humidity',
       'Water_Price', 'Period_Consumption_Index', 'Income_Level', 'Guests',
       'Amenities', 'Appliance_Usage', 'Water_Consumption'],
      dtype='object')

In [9]:
# Convert Timestamp to datetime (correct format with dayfirst=True)
train['Timestamp'] = pd.to_datetime(train['Timestamp'], format="%d/%m/%Y %H", dayfirst=True)
test['Timestamp'] = pd.to_datetime(test['Timestamp'], format="%d/%m/%Y %H", dayfirst=True)


In [12]:
# Feature Engineering from Timestamp
for df in [train, test]:
    df['Hour'] = df['Timestamp'].dt.hour
    df['Day'] = df['Timestamp'].dt.day
    df['Month'] = df['Timestamp'].dt.month
    df['Weekday'] = df['Timestamp'].dt.weekday
    df.drop('Timestamp', axis=1, inplace=True)

In [13]:
# Separate target variable
y = train['Water_Consumption']
train.drop('Water_Consumption', axis=1, inplace=True)

In [14]:
# Identify numeric and categorical features
numerical_cols = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()

In [18]:
numerical_transformer = SimpleImputer(strategy='mean')


In [19]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


In [20]:
# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [21]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])


In [22]:
X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.2, random_state=42)


In [23]:
model.fit(X_train, y_train)


In [24]:
# Validation score
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
score = max(0, 100 - rmse)
print(f"Validation RMSE: {rmse:.2f}, Score: {score:.2f}")

Validation RMSE: 20.33, Score: 79.67


In [25]:
# Final prediction on test data
final_preds = model.predict(test)

In [29]:
test = pd.read_csv('test.csv')
test_index = test['Timestamp']  # Save this before any processing


In [30]:
# Make predictions
final_preds = model.predict(test)

# Create a submission DataFrame using test index
submission = pd.DataFrame({
    'Timestamp': test_index,  # Must match the test set's Timestamp
    'Water_Consumption': final_preds
})

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("✅ Submission file saved as 'submission.csv'")


✅ Submission file saved as 'submission.csv'


In [35]:
# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Parse timestamp
train['Timestamp'] = pd.to_datetime(train['Timestamp'], dayfirst=True, errors='coerce')
test['Timestamp'] = pd.to_datetime(test['Timestamp'], dayfirst=True, errors='coerce')

# Drop rows with failed date parsing
train = train.dropna(subset=['Timestamp'])
test = test.dropna(subset=['Timestamp'])

# Extract datetime features
for df in [train, test]:
    df['Hour'] = df['Timestamp'].dt.hour
    df['Day'] = df['Timestamp'].dt.day
    df['Month'] = df['Timestamp'].dt.month
    df['Weekday'] = df['Timestamp'].dt.weekday

# Drop Timestamp column
train.drop(columns=['Timestamp'], inplace=True)
test.drop(columns=['Timestamp'], inplace=True)

# Separate target
y = train['Water_Consumption']
X = train.drop(columns=['Water_Consumption'])

In [37]:
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [38]:
from xgboost import XGBRegressor

# Preprocessing
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# XGBoost pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42))
])

In [39]:
# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model.fit(X_train, y_train)


In [40]:
val_preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
score = max(0, 100 - rmse)
print(f"Validation RMSE: {rmse:.2f}, Score: {score:.2f}")

Validation RMSE: 16.74, Score: 83.26


In [44]:
final_preds_xag = model.predict(test)


In [45]:


# Create a submission DataFrame using test index
submission = pd.DataFrame({
    'Timestamp': test_index,  # Must match the test set's Timestamp
    'Water_Consumption': final_preds_xag
})

# Save to CSV
submission.to_csv('submission_xag.csv', index=False)
print("✅ Submission file saved as 'submission.csv'")


✅ Submission file saved as 'submission.csv'
