In [6]:
import pandas as pd
import numpy as np

# Function to classify aquaculture water quality
def classify_aquaculture_health(row) -> int:
    """Label one water sample as healthy (1) or unhealthy (0).

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide the keys ``'ph'``, ``'Turbidity'`` and ``'Solids'``.

    Returns
    -------
    int
        1 when pH is within [4.5, 9.5], turbidity is within [10, 50] and
        solids (TDS) is at least 50; otherwise 0.

    Notes
    -----
    Missing measurements (NaN) are labelled 0. The range checks are written
    as positive ``lo <= x <= hi`` tests on purpose: every comparison with NaN
    is False, so a NaN value fails the "in range" test instead of slipping
    past a ``x < lo or x > hi`` rejection test.
    """
    ph, turbidity, tds = row['ph'], row['Turbidity'], row['Solids']

    # pH must lie in the accepted range; NaN fails this test and is rejected.
    if not (4.5 <= ph <= 9.5):
        return 0  # Unhealthy

    # Turbidity must lie in the optimal 10 to 50 NTU range; NaN is rejected.
    if not (10 <= turbidity <= 50):
        return 0  # Unhealthy

    # TDS (Solids): the freshwater (50-500), brackish (500-5000) and marine
    # (>5000) bands are all labelled healthy, so only the lower bound matters.
    if tds >= 50:
        return 1  # Healthy

    return 0  # Default to Unhealthy (TDS below 50 or missing)

# Tunable settings for the augmentation step
TARGET_SIZE = 6500  # Desired number of rows in the augmented dataset
SEED = 42           # Seed shared by synthetic sampling and the final shuffle

# Load dataset
input_file = "water_potability.csv"  # Change to your actual CSV filename
df = pd.read_csv(input_file)

# Select only relevant columns
df = df[['ph', 'Turbidity', 'Solids']].copy()

# Apply classification function
df['Labeled Data'] = df.apply(classify_aquaculture_health, axis=1)

# Generate synthetic healthy samples based on conditions.
# Clamp at zero: if the input already has TARGET_SIZE rows or more, a negative
# sample count would make the random generator raise ValueError.
num_samples_needed = max(0, TARGET_SIZE - len(df))

# Seeded generator so the synthetic rows are identical on every run
rng = np.random.default_rng(SEED)

synthetic_data = pd.DataFrame({
    'ph': rng.uniform(6.5, 8.5, num_samples_needed),  # Optimal pH range
    'Turbidity': rng.uniform(10, 50, num_samples_needed),  # Optimal Turbidity range
    'Solids': rng.uniform(50, 5000, num_samples_needed),  # Covering Freshwater & Brackish Water ranges
    'Labeled Data': np.ones(num_samples_needed, dtype=np.int64)  # Mark as Healthy
})

# Combine original and synthetic data (skip the synthetic frame when it is
# empty so the concat does not have to infer dtypes from a zero-row frame)
frames = [df, synthetic_data] if num_samples_needed > 0 else [df]
augmented_df = pd.concat(frames, ignore_index=True)

# Shuffle the dataset randomly
augmented_df = augmented_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Save the new dataset with labels
output_file = "augmented_water_data.csv"
augmented_df.to_csv(output_file, index=False)

print(f"Augmented dataset saved as {output_file} with {len(augmented_df)} entries.")


Augmented dataset saved as augmented_water_data.csv with 6500 entries.
