In [180]:
import pandas as pd

# Purpose: count the rows carrying one quality label whose measurements fall
# outside the acceptable range for any of the four monitored parameters.

# Load the dataset
df = pd.read_csv("Cleaned_Good_WQI_Removed_Extremes.csv")

# Clean column names: trim surrounding whitespace and drop stray 'Â'
# characters (presumably an encoding artifact in the header - confirm).
df.columns = df.columns.str.strip().str.replace('Â', '', regex=False)

# Label column used by the filter below, and the label being checked.
# NOTE(review): earlier comments/messages in this cell referred to 'Good'
# while the filter has always selected 'Excellent'; the code's value is kept
# and the report now names the label that is really used - confirm intent.
checked_label_column = 'WAWQI'
checked_label = 'Excellent'

# Standardize label formatting. The filtered column is normalized so that
# casing/whitespace variants cannot slip past the equality test;
# 'WAWQI_Label' is still normalized when the file carries it.
for label_col in (checked_label_column, 'WAWQI_Label'):
    if label_col in df.columns:
        df[label_col] = df[label_col].astype(str).str.strip().str.capitalize()

# Acceptable (low, high) range per parameter; anything strictly outside
# the range counts as extreme.
extreme_bounds = {
    'Ammonia-Total (as N)': (0, 0.099),
    'Conductivity @25°C': (0, 1200),
    'pH': (6.0, 9.0),
    'Total Hardness (as CaCO3)': (0, 600),
}

# Ensure relevant columns are numeric (unparseable entries become NaN)
columns = list(extreme_bounds)
df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')

# Keep only the rows carrying the checked label
good_df = df[df[checked_label_column] == checked_label]

# Flag rows where at least one parameter is out of range. Comparisons with
# NaN evaluate to False, so values that failed numeric coercion are not
# counted as extreme.
extreme_condition = pd.Series(False, index=good_df.index)
for param, (low, high) in extreme_bounds.items():
    extreme_condition |= (good_df[param] < low) | (good_df[param] > high)

# Apply condition and count
extreme_count = int(extreme_condition.sum())

print(f"Number of '{checked_label}' labeled rows with any extreme value: {extreme_count}")


Number of 'Good' labeled rows with any extreme value: 0


In [139]:
# Count the rows carrying one quality label and report it under that label.
# The message previously said 'Good' while the code counted 'Poor' rows; the
# counted label is kept and the message is now derived from it.
# NOTE(review): `total_good` keeps its historical name in case other cells
# read it, even though it holds the count for `counted_label`.
counted_label = 'Poor'
total_good = int((df['WAWQI'] == counted_label).sum())
print(f"Total number of rows labeled '{counted_label}': {total_good}")


Total number of rows labeled 'Good': 456


In [140]:
import pandas as pd

# Purpose: drop rows of one quality label whose measurements fall outside
# the valid ranges below, keep every row of the other labels, and save.

# Load the dataset
df = pd.read_csv("Cleaned_Good_WQI_Removed_Extremes.csv")

# Clean column names: trim surrounding whitespace and drop stray 'Â'
# characters (presumably an encoding artifact in the header - confirm).
df.columns = df.columns.str.strip().str.replace('Â', '', regex=False)

# Standardize label formatting
df['WAWQI'] = df['WAWQI'].astype(str).str.strip().str.capitalize()

# Label whose rows are validated.
# NOTE(review): earlier comments/messages in this cell said 'Good' while the
# filter has always targeted 'Poor'; the code's value is kept and the report
# now names the label that is really filtered - confirm intent.
filtered_label = 'Poor'

# Inclusive (low, high) range each parameter must satisfy for a row of
# `filtered_label` to be kept.
valid_ranges = {
    'Ammonia-Total (as N)': (0, 0.08),
    'Conductivity @25°C': (100, 350),
    'pH': (7.0, 8.0),
    'Total Hardness (as CaCO3)': (0, 250),
}

# Ensure relevant columns are numeric (unparseable entries become NaN)
columns = list(valid_ranges)
df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')

# A row is valid only when every parameter lies inside its range.
# `between` is inclusive on both ends and False for NaN, so rows with a
# missing/unparseable measurement are treated as invalid.
valid_condition = pd.Series(True, index=df.index)
for param, (low, high) in valid_ranges.items():
    valid_condition &= df[param].between(low, high)

# Keep all rows of other labels, plus the valid rows of `filtered_label`
has_filtered_label = df['WAWQI'] == filtered_label
df_cleaned = df[~has_filtered_label | valid_condition]

# Optional: Save cleaned data.
# NOTE: this writes to the same path the cell reads from, so the input file
# is overwritten and the removed rows cannot be recovered from it.
df_cleaned.to_csv("Cleaned_Good_WQI_Removed_Extremes.csv", index=False)

print(f"Rows with invalid values in '{filtered_label}' label removed.")
print(f"New dataset shape: {df_cleaned.shape}")


Rows with invalid values in 'Good' label removed.
New dataset shape: (8747, 6)
