In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the water-potability dataset from a CSV file given by a relative path
# (resolved against the notebook's working directory). The file is decoded as Latin-1.
data = pd.read_csv('water_potability.csv', encoding="latin1")

# Print the first rows (DataFrame.head() defaults to five) as a quick sanity check of the load.
print(data.head())

# 1. Data Cleaning
# Numeric feature columns fed to the numeric pipeline. The target column
# ('Potability') is intentionally not listed here.
num_features = [
    'ph',
    'Hardness',
    'Solids',
    'Chloramines',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]

# Numeric branch: replace missing values with the column mean, then standardise
# each column with StandardScaler.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# This dataset has no categorical columns, so the list is empty.
cat_features = []

# Categorical branch: fill gaps with the most frequent value, then one-hot encode
# into a dense array, ignoring categories not seen during fitting.
# NOTE(review): with cat_features empty this pipeline has nothing to operate on;
# it is kept in case later cells reference it.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# 2. Preprocessing definition
# Only the numeric branch is registered, because the dataset has no categorical
# columns. remainder='passthrough' carries every column not listed in
# num_features (here the 'Potability' target) through untouched; with pandas
# output it is emitted as 'remainder__Potability', and the numeric columns are
# prefixed with 'num__'.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
    ],
    remainder='passthrough'
)
preprocessor.set_output(transform="pandas")

# 3. Data Splitting
# Decide train/test membership BEFORE fitting anything. Fitting the imputer and
# scaler on the full dataset would leak test-set statistics (column means and
# standard deviations) into the training features and bias later evaluation.
# Row positions are split with the same test_size and random_state the split of
# the transformed frame would use, so the partition itself is reproducible.
train_rows, test_rows = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

# Learn imputation means and scaling parameters from the training rows only,
# then apply that fitted transformation to every row.
preprocessor.fit(data.iloc[train_rows])
data_preprocessed = preprocessor.transform(data)

# Inspect the generated column names and the scaled values.
print(data_preprocessed.head())

# 'Potability' is the target; everything else is a feature.
X = data_preprocessed.drop(columns=['remainder__Potability'])
y = data_preprocessed['remainder__Potability']

# Materialise the four splits from the row positions chosen above.
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Display the first few rows of the training features and target.
print(X_train.head())
print(y_train.head())

         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.558279        31.997993   4.075075           0  
        num__ph  num__Hardness  num__Solids  num__Chloramines  num__Sulfate  \
0 -6.043133e-16       0.259195    -0.139471          0.112415  9.613574e-01   
1

In [None]:
# Print the column labels of the loaded frame to confirm which columns are available.
print(data.columns)

Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
