In [3]:
### Step 1: Importing Necessary Libraries
import pandas as pd  # For data manipulation
import torch  # PyTorch for building neural networks
import torch.nn as nn  # PyTorch neural network modules
import torch.optim as optim  # PyTorch optimization algorithms
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Data preprocessing tools
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.metrics import r2_score, f1_score  # For evaluating model performance
import numpy as np  # For numerical operations


### Step 2: Loading and Preparing Dataset
# Absolute location of the water-quality CSV on the machine this was run on
file_path = 'C:/Users/Sanjana Shah/AV GenAI Certification/DL_Pytorch/Assignment/water_quality.csv'
# Parse the CSV into the DataFrame that the following steps clean and encode
water_quality_df = pd.read_csv(filepath_or_buffer=file_path)


### Step 2.1: Handling Missing and Infinite Values

# Numeric columns are the only ones that can carry a mean or an Inf value
numeric_cols = water_quality_df.select_dtypes(include=[np.number]).columns

# Replacing NaN values with column means to avoid issues with missing data.
# The means are taken over finite entries only: +/-Inf is masked to NaN first,
# otherwise one infinite value would turn the column mean into Inf (or NaN)
# and every gap in that column would be filled with it.
if water_quality_df.isnull().values.any():
    print("Replacing NaN values with column means.")
    finite_means = (
        water_quality_df[numeric_cols]
        .replace([np.inf, -np.inf], np.nan)
        .mean()
    )
    water_quality_df.fillna(finite_means, inplace=True)

# Replacing Inf values with 0 to avoid computational errors
water_quality_df[numeric_cols] = water_quality_df[numeric_cols].replace([np.inf, -np.inf], 0)


### Step 3: Encoding Categorical Columns
# Columns handled as categorical; each one is replaced by integer codes
categorical_columns = ['State', 'District', 'Block', 'Village']
for column in categorical_columns:
    # Cast to str first so every entry (including any non-string value) is a label,
    # then fit a fresh encoder so codes are assigned per column independently
    column_as_text = water_quality_df[column].astype(str)
    water_quality_df[column] = LabelEncoder().fit_transform(column_as_text)


### Step 4: Encoding Target Column
# The classification label gets the same treatment as the categorical features:
# cast to str, then map each distinct label to an integer code
target_as_text = water_quality_df['Water Quality Classification'].astype(str)
water_quality_df['Water Quality Classification'] = LabelEncoder().fit_transform(target_as_text)


### Step 5: Preparing Features and Targets
# Columns kept out of the model inputs: both targets plus the Well_ID column
# (presumably a row identifier -- confirm against the dataset description)
excluded_columns = ['WQI', 'Water Quality Classification', 'Well_ID']
X = water_quality_df.drop(columns=excluded_columns)

# Regression target (WQI) and the integer-encoded classification target
y_reg = water_quality_df['WQI']
y_clf = water_quality_df['Water Quality Classification']


### Step 6: Train/Test Split
# Split the *unscaled* features first. The partition depends only on the number
# of rows and random_state, so it is the same 80/20 split as before.
X_train_raw, X_test_raw, y_train_reg, y_test_reg, y_train_clf, y_test_clf = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42
)


### Step 7: Data Scaling
# Fit the scaler on the training rows only, so the test set's mean/variance
# never influence the transformation applied during training (no data leakage).
scaler = StandardScaler()  # Standardizing features (mean=0, variance=1 on the training set)
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)  # Re-use training statistics; do not re-fit

# Full feature matrix scaled with the training statistics; kept so that any
# code still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)


### Step 8: Convert Data to PyTorch Tensors
# Feature matrices become float32 tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Regression targets: float32, with a trailing axis of size 1 added so they
# line up with the (N, 1) output of the regression head
y_train_reg_tensor = torch.tensor(y_train_reg.to_numpy(), dtype=torch.float32)[:, None]
y_test_reg_tensor = torch.tensor(y_test_reg.to_numpy(), dtype=torch.float32)[:, None]

# Classification targets: int64 class indices, as used with CrossEntropyLoss below
y_train_clf_tensor = torch.tensor(y_train_clf.to_numpy(), dtype=torch.long)
y_test_clf_tensor = torch.tensor(y_test_clf.to_numpy(), dtype=torch.long)


### Step 9: Define Improved Neural Network Model
class ImprovedNetwork(nn.Module):
    """Multi-task MLP: a shared trunk feeding a regression head and a classification head.

    Args:
        input_dim: Number of input features. When ``None`` (the default) the
            width is read from the module-level ``X_train`` array, which is
            what the original zero-argument constructor always did.
        num_classes: Number of logits produced by the classification head.
            Defaults to 5, the value that used to be hard-coded.
    """

    def __init__(self, input_dim=None, num_classes=5):
        super().__init__()
        if input_dim is None:
            # Backward-compatible fallback for callers using ImprovedNetwork()
            input_dim = X_train.shape[1]

        # Shared Network Layers: three Linear -> ReLU -> BatchNorm stages,
        # with dropout after the first two
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )
        # Separate layers for regression and classification
        self.regressor = nn.Linear(64, 1)  # Regression output layer
        self.classifier = nn.Linear(64, num_classes)  # Classification output layer (raw logits)

    def forward(self, x):
        """Run ``x`` through the shared trunk and both heads.

        Args:
            x: Tensor of shape ``(batch, input_dim)``.

        Returns:
            A ``(reg_output, clf_output)`` tuple: ``reg_output`` has shape
            ``(batch, 1)``; ``clf_output`` has shape ``(batch, num_classes)``
            and holds unnormalised logits (no softmax is applied here).
        """
        shared_features = self.shared(x)  # Passing data through shared layers
        reg_output = self.regressor(shared_features)  # Producing regression output
        clf_output = self.classifier(shared_features)  # Producing classification output
        return reg_output, clf_output


### Step 10: Initializing Model, Losses, and Optimizer
# Build the two-headed network with freshly initialised weights
model = ImprovedNetwork()

# One loss per head: Smooth L1 for the regression output,
# cross-entropy for the classification logits
criterion_reg = nn.SmoothL1Loss()
criterion_clf = nn.CrossEntropyLoss()

# AdamW over every parameter of the model
optimizer = optim.AdamW(params=model.parameters(), lr=3e-4)

# Plateau scheduler: halves the learning rate when the monitored value
# (lower is better) stops improving, tolerating 5 non-improving steps first
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)


### Step 11: Training the Model
num_epochs = 100
report_every = 5  # Evaluate on the test split and print every this many epochs

# The regression head is trained against WQI / 1000. The scaled targets do not
# change between epochs, so they are built once instead of on every iteration.
y_train_reg_scaled = y_train_reg_tensor / 1000
y_test_reg_scaled = y_test_reg_tensor / 1000

for epoch in range(num_epochs):
    model.train()  # Set model to training mode (dropout active, batch-norm uses batch stats)
    optimizer.zero_grad()  # Clear gradients

    # Forward pass over the whole training set (full-batch training)
    y_pred_reg_train, y_pred_clf_train = model(X_train_tensor)

    # Calculating training losses; the total is the unweighted sum of both heads
    train_loss_reg = criterion_reg(y_pred_reg_train, y_train_reg_scaled)
    train_loss_clf = criterion_clf(y_pred_clf_train, y_train_clf_tensor)
    train_loss = train_loss_reg + train_loss_clf

    # Backpropagation
    train_loss.backward()
    optimizer.step()
    # The scheduler monitors the training loss; pass a plain float, not the graph tensor
    scheduler.step(train_loss.item())

    # Evaluate on test data only when the result is actually reported.
    # Evaluation runs under eval()/no_grad() and does not touch the weights,
    # so skipping it on the other epochs leaves training unchanged.
    if (epoch + 1) % report_every == 0:
        model.eval()
        with torch.no_grad():
            y_pred_reg_test, y_pred_clf_test = model(X_test_tensor)
            test_loss_reg = criterion_reg(y_pred_reg_test, y_test_reg_scaled)
            test_loss_clf = criterion_clf(y_pred_clf_test, y_test_clf_tensor)
            test_loss = test_loss_reg + test_loss_clf

        print(f'Epoch {epoch+1}: Train Loss: {train_loss.item():.4f}, Test Loss: {test_loss.item():.4f}')


### Step 12: Evaluating Model Performance
# The training loop fits the regression head to WQI / 1000, so raw predictions
# are in that scaled unit. This factor must match the divisor used in training.
reg_target_scale = 1000

model.eval()  # Inference mode: dropout off, batch-norm uses running statistics
with torch.no_grad():
    y_test_pred_reg, y_test_pred_clf = model(X_test_tensor)

    # Bring predictions back to the original WQI scale before scoring; comparing
    # unscaled targets with scaled predictions would make R² meaningless.
    y_test_pred_wqi = y_test_pred_reg * reg_target_scale
    r2 = r2_score(y_test_reg_tensor.numpy(), y_test_pred_wqi.numpy())

    # Predicted class = index of the largest logit in each row
    _, predicted_classes = torch.max(y_test_pred_clf, 1)
    f1 = f1_score(y_test_clf_tensor.numpy(), predicted_classes.numpy(), average='weighted')
    accuracy = (predicted_classes == y_test_clf_tensor).sum().item() / len(y_test_clf_tensor)

print(f'R² Score (Regression): {r2:.4f}')
print(f'F1 Score (Classification): {f1:.4f}')
print(f'Test Accuracy (Classification): {accuracy * 100:.2f}%')

Replacing NaN values with column means.
Epoch 5: Train Loss: 1.7275, Test Loss: 1.6459
Epoch 10: Train Loss: 1.4971, Test Loss: 1.5532
Epoch 15: Train Loss: 1.3512, Test Loss: 1.4256
Epoch 20: Train Loss: 1.2372, Test Loss: 1.2799
Epoch 25: Train Loss: 1.1457, Test Loss: 1.1396
Epoch 30: Train Loss: 1.0680, Test Loss: 1.0195
Epoch 35: Train Loss: 0.9929, Test Loss: 0.9232
Epoch 40: Train Loss: 0.9322, Test Loss: 0.8461
Epoch 45: Train Loss: 0.8690, Test Loss: 0.7817
Epoch 50: Train Loss: 0.8148, Test Loss: 0.7266
Epoch 55: Train Loss: 0.7621, Test Loss: 0.6764
Epoch 60: Train Loss: 0.7190, Test Loss: 0.6306
Epoch 65: Train Loss: 0.6768, Test Loss: 0.5885
Epoch 70: Train Loss: 0.6456, Test Loss: 0.5510
Epoch 75: Train Loss: 0.6088, Test Loss: 0.5180
Epoch 80: Train Loss: 0.5809, Test Loss: 0.4885
Epoch 85: Train Loss: 0.5559, Test Loss: 0.4612
Epoch 90: Train Loss: 0.5297, Test Loss: 0.4369
Epoch 95: Train Loss: 0.5096, Test Loss: 0.4157
Epoch 100: Train Loss: 0.4880, Test Loss: 0.3966
