In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [28]:
df = pd.read_csv('ground water quality.csv')
df.head()

Unnamed: 0,Well ID,S.No,STATE,DISTRICT,BLOCK,LOCATION,LATITUDE,LONGITUDE,Year,pH,...,PO4,TH,Ca,Mg,Na,K,F,SiO2,TDS,U(ppb)
0,W113126092431802,1,A&N Islands,South Andaman,PORT BLAIR,Kodiaghat,11.545,92.74583,2021,7.74,...,,175.0,34.0,22.0,10.0,0.0,0.84,,206.0,4.8
1,W113616092411001,2,A&N Islands,South Andaman,PORT BLAIR,Sippighat,11.6025,92.73,2021,7.53,...,,250.0,52.0,29.0,51.0,1.0,0.28,,381.0,6.4
2,W113640092443301,3,A&N Islands,South Andaman,PORT BLAIR,Brichganj,11.62083,92.73028,2021,7.5,...,,125.0,4.0,28.0,10.0,1.0,0.46,,151.0,BDL
3,W114421092422801,4,A&N Islands,South Andaman,FERRARGUNJ,Wimberleyganj,11.7325,92.70861,2021,7.6,...,,135.0,26.0,17.0,15.0,2.0,0.37,,198.0,0.4
4,W113955092391801,5,A&N Islands,South Andaman,FERRARGUNJ,Ograbraj,11.66361,92.65806,2021,7.82,...,,465.0,56.0,79.0,336.0,18.0,0.95,,1424.0,BDL


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5375 entries, 0 to 5374
Data columns (total 26 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Well ID    5375 non-null   object 
 1   S.No       5375 non-null   int64  
 2   STATE      5375 non-null   object 
 3   DISTRICT   5375 non-null   object 
 4   BLOCK      5304 non-null   object 
 5   LOCATION   5375 non-null   object 
 6   LATITUDE   5374 non-null   object 
 7   LONGITUDE  5374 non-null   object 
 8   Year       5375 non-null   int64  
 9   pH         5375 non-null   float64
 10  EC         5375 non-null   float64
 11  CO3        5375 non-null   float64
 12  HCO3       5375 non-null   float64
 13  Cl         5375 non-null   float64
 14  SO4        5374 non-null   float64
 15  NO3        5375 non-null   float64
 16  PO4        3554 non-null   object 
 17  TH         5375 non-null   float64
 18  Ca         5375 non-null   float64
 19  Mg         5375 non-null   float64
 20  Na      

In [41]:
df.shape

(5375, 26)

In [30]:
df.isnull().sum()

Well ID         0
S.No            0
STATE           0
DISTRICT        0
BLOCK          71
LOCATION        0
LATITUDE        1
LONGITUDE       1
Year            0
pH              0
EC              0
CO3             0
HCO3            0
Cl              0
SO4             1
NO3             0
PO4          1821
TH              0
Ca              0
Mg              0
Na              0
K               1
F               0
SiO2         2621
TDS          2159
U(ppb)       3732
dtype: int64

In [31]:
df['PO4']  = pd.to_numeric(df['PO4'],errors='coerce')
df['U(ppb)']  = pd.to_numeric(df['U(ppb)'],errors='coerce')
df['SiO2']  = pd.to_numeric(df['SiO2'],errors='coerce')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5375 entries, 0 to 5374
Data columns (total 26 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Well ID    5375 non-null   object 
 1   S.No       5375 non-null   int64  
 2   STATE      5375 non-null   object 
 3   DISTRICT   5375 non-null   object 
 4   BLOCK      5304 non-null   object 
 5   LOCATION   5375 non-null   object 
 6   LATITUDE   5374 non-null   object 
 7   LONGITUDE  5374 non-null   object 
 8   Year       5375 non-null   int64  
 9   pH         5375 non-null   float64
 10  EC         5375 non-null   float64
 11  CO3        5375 non-null   float64
 12  HCO3       5375 non-null   float64
 13  Cl         5375 non-null   float64
 14  SO4        5374 non-null   float64
 15  NO3        5375 non-null   float64
 16  PO4        2842 non-null   float64
 17  TH         5375 non-null   float64
 18  Ca         5375 non-null   float64
 19  Mg         5375 non-null   float64
 20  Na      

In [33]:
meanpo4 = df['PO4'].mean
meansio2 = df['SiO2'].mean
meantds = df['TDS'].mean
meanu = df['U(ppb)'].mean
meank = df['K'].mean
df['PO4'] = df['PO4'].fillna(meanpo4)
df['SiO2'] = df['SiO2'].fillna(meansio2)
df['TDS'] = df['TDS'].fillna(meantds)
df['U(ppb)'] = df['U(ppb)'].fillna(meanu)
df['K'] = df['K'].fillna(meank)

In [40]:
mean_df=df['K'].mean
mean_df

<bound method NDFrame._add_numeric_operations.<locals>.mean of 0        0.0
1        1.0
2        1.0
3        2.0
4       18.0
        ... 
5370    3.83
5371    3.55
5372    2.45
5373    8.53
5374    4.54
Name: K, Length: 5375, dtype: object>

In [34]:
df.isnull().sum()

Well ID       0
S.No          0
STATE         0
DISTRICT      0
BLOCK        71
LOCATION      0
LATITUDE      1
LONGITUDE     1
Year          0
pH            0
EC            0
CO3           0
HCO3          0
Cl            0
SO4           1
NO3           0
PO4           0
TH            0
Ca            0
Mg            0
Na            0
K             0
F             0
SiO2          0
TDS           0
U(ppb)        0
dtype: int64

In [None]:
naya column  banna hai
usko assign krni hai values 
eda
feature selection 
feature engineering(hypothesis testing)
split the data
model
test
