In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Read the raw water-potability measurements and preview the first rows.
raw_csv_path = "../data/raw/water_potability.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [4]:
# Structural overview: column dtypes and non-null counts (info() prints directly).
df.info()
# Only the last expression of a cell is displayed automatically, so the
# per-column missing-value counts must be printed explicitly to be seen.
print(df.isnull().sum())
# Summary statistics (count, mean, std, quartiles); shown as the cell output.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


In [5]:
# Fill gaps with each numeric column's median (ph, Sulfate and Trihalomethanes
# are the columns reported with missing values). The result is reassigned
# rather than mutated with inplace=True, so the cell produces a new frame.
df = df.fillna(df.median(numeric_only=True))
# Per-column count of remaining missing values.
df.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [9]:
# Single-flag sanity check: True if any cell in the frame is still missing.
df.isna().to_numpy().any()

np.False_

In [10]:
# Label every reading against its limit. The comparisons run on whole columns
# instead of calling a Python lambda once per value.
PH_LOW, PH_HIGH = 6.5, 8.5   # inclusive band labelled 'Normal' for pH
MAX_SOLIDS = 500             # largest 'Solids' value still labelled 'Normal'
MAX_TURBIDITY = 5            # largest 'Turbidity' value still labelled 'Normal'

# A missing value fails every comparison, so it receives the 'Abnormal'/'High'
# label, exactly as it did with the element-wise version.
ph_in_band = (df['ph'] >= PH_LOW) & (df['ph'] <= PH_HIGH)
df['pH_Status'] = np.where(ph_in_band, 'Normal', 'Abnormal')
df['TDS_Status'] = np.where(df['Solids'] <= MAX_SOLIDS, 'Normal', 'High')
df['Turbidity_Status'] = np.where(df['Turbidity'] <= MAX_TURBIDITY, 'Normal', 'High')


In [11]:
# Preview the first rows, now including the three status columns.
df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability,pH_Status,TDS_Status,Turbidity_Status
0,7.036752,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0,Normal,High,Normal
1,3.71608,129.422921,18630.057858,6.635246,333.073546,592.885359,15.180013,56.329076,4.500656,0,Abnormal,High,Normal
2,8.099124,224.236259,19909.541732,9.275884,333.073546,418.606213,16.868637,66.420093,3.055934,0,Normal,High,Normal
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0,Normal,High,Normal
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0,Abnormal,High,Normal


In [12]:
def recommend_treatment(row, ph_range=(6.5, 8.5), max_solids=500, max_turbidity=5):
    """Build the treatment recommendation for a single water sample.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'ph', 'Solids', 'Turbidity' and 'Potability'.
    ph_range : tuple of (low, high), default (6.5, 8.5)
        Inclusive pH band that needs no adjustment; values outside it add
        "Boiling / pH Adjustment".
    max_solids : number, default 500
        'Solids' values strictly above this add "RO Filtration".
    max_turbidity : number, default 5
        'Turbidity' values strictly above this add "Sediment Filter".

    Returns
    -------
    str
        The applicable treatments in alphabetical order joined by " + ", or
        "No Treatment Required" when none applies.

    Notes
    -----
    A sample with Potability == 0 always gets "UV Purification". NaN
    measurements compare False against every threshold and therefore do not
    trigger the corresponding treatment.
    """
    ph_low, ph_high = ph_range

    # Each rule pairs its trigger condition with the treatment it recommends.
    rules = (
        (row['ph'] < ph_low or row['ph'] > ph_high, "Boiling / pH Adjustment"),
        (row['Solids'] > max_solids, "RO Filtration"),
        (row['Turbidity'] > max_turbidity, "Sediment Filter"),
        (row['Potability'] == 0, "UV Purification"),
    )
    treatments = [label for triggered, label in rules if triggered]

    # Labels are unique by construction; sorting keeps the output order stable.
    return " + ".join(sorted(treatments)) if treatments else "No Treatment Required"

# Run the rule-based recommender once per row (axis=1 hands each row to the function).
df['Recommended_Treatment'] = df.apply(recommend_treatment, axis=1)

# Show the recommender's inputs next to its output for the first rows.
preview_columns = ['ph', 'Solids', 'Turbidity', 'Potability', 'Recommended_Treatment']
df[preview_columns].head()


Unnamed: 0,ph,Solids,Turbidity,Potability,Recommended_Treatment
0,7.036752,20791.318981,2.963135,0,RO Filtration + UV Purification
1,3.71608,18630.057858,4.500656,0,Boiling / pH Adjustment + RO Filtration + UV P...
2,8.099124,19909.541732,3.055934,0,RO Filtration + UV Purification
3,8.316766,22018.417441,4.628771,0,RO Filtration + UV Purification
4,9.092223,17978.986339,4.075075,0,Boiling / pH Adjustment + RO Filtration + UV P...


In [16]:
# Persist the imputed, annotated dataset for later use.
processed_path = Path("../data/processed/water_treatment_clean.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists first (e.g. on a fresh checkout without data/processed).
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
print("Dataset saved successfully")

Dataset saved successfully
