In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the water-potability CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/water-potability/water_potability.csv')

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

# Checking the percentage of null values

In [None]:
# Percentage of missing values per column, largest first, rounded to two decimals.
df.isna().mean().mul(100).sort_values(ascending=False).round(2)

### Correlation map to understand the data better

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on a 9x9-inch figure.
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(data=df.corr(), annot=True, ax=ax)

## Approach used to fill null values:
1. **Identify the columns with null values**
2. **Build a correlation chart**
3. **Form a new dataset containing one column with missing values together with the fully populated columns that are positively correlated with it**
4. **Drop the rows with null values, then fit an ML model with hyper-parameter tuning to predict the missing values**
5. **Repeat steps 3-4 until all the columns are filled**

In [None]:
# Working frame for the Sulfate step: the column to impute plus its two predictors.
df_sul = df.loc[:, ['Sulfate', 'Organic_carbon', 'Chloramines']]

### Dropping null values

In [None]:
# Keep only the rows in which every selected column has a value.
df_sul = df_sul.dropna(axis=0, how='any')

### Scaling the dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the predictor columns (everything except the Sulfate target).
scaler = StandardScaler()
scled = scaler.fit_transform(df_sul.drop(columns='Sulfate'))

### Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the complete rows; the fixed seed makes the split reproducible.
x_tr, x_te, y_tr, y_te = train_test_split(
    scled,
    df_sul['Sulfate'],
    test_size=0.3,
    random_state=42,
)

### Importing the model used to predict the values of the missing column

In [None]:
from sklearn.svm import SVR

# Support-vector regressor created with its default settings.
svr = SVR()

### For tuning the hyper-parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values explored for the SVR.
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2],
    'gamma': ['scale', 'auto'],
    'C': [1.2, 1.3, 1.4, 1.7],
}
# Exhaustive 3-fold search over the grid, using all cores and verbose progress output.
gs = GridSearchCV(estimator=svr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=3)

### Fitting and predicting the data in Grid-search 

In [None]:
# Run the grid search on the training split, then predict the held-out split.
gs.fit(x_tr, y_tr)
sul_pr = gs.predict(x_te)

### Getting the column ready to be predicted

In [None]:
# Predict Sulfate for every row of the full dataset.
# The predictors must be scaled with the statistics the scaler already learned
# from the training rows, so call transform(): fit_transform() would re-fit the
# scaler on different data and hand the model features on a different scale
# than the one it was trained on.
sulfatee = gs.predict(scaler.transform(df[['Organic_carbon', 'Chloramines']]))

### Adding new column in dataframe

In [None]:
# Build the imputed column: keep every measured Sulfate value and use the
# model's prediction only where the original measurement is missing.
# The predictions are wrapped in a Series sharing df's index so that fillna
# aligns them row by row.
df['sulfatee'] = df['Sulfate'].fillna(pd.Series(sulfatee, index=df.index))

### dropping the old column

In [None]:
# Remove the original Sulfate column from df in place.
df.drop(columns=['Sulfate'], inplace=True)

### Fitting data for next column

In [None]:
# Rows in which Trihalomethanes and all of its chosen predictors are present.
# NOTE(review): 'Potability' is used as a predictor here and is also the target
# of the final classifier, so the imputed column may leak the label downstream
# - confirm this is intended.
tr_df = df[['Chloramines', 'Conductivity', 'Potability', 'Trihalomethanes']].dropna()

# Re-fit the shared scaler on this step's predictor columns.
tr_sc = scaler.fit_transform(tr_df.drop(columns='Trihalomethanes'))

# 70/30 split with a fixed seed.
tr_tr, tr_te, ytr_tr, yte_te = train_test_split(
    tr_sc,
    tr_df['Trihalomethanes'],
    test_size=0.3,
    random_state=42,
)

# Re-run the same grid search, this time with Trihalomethanes as the target.
gs.fit(tr_tr, ytr_tr)

### Predicting the values

In [None]:
# Predict Trihalomethanes for every row of the full dataset.
# Use transform() so the features are scaled with the statistics learned from
# the training rows; fit_transform() would re-fit the scaler on different data
# and feed the model inputs on a different scale than it was trained on.
trr = gs.predict(scaler.transform(df[['Chloramines', 'Conductivity', 'Potability']]))

### Adding the data in df and dropping the old data

In [None]:
# Build the imputed column: keep every measured Trihalomethanes value and use
# the model's prediction only where the original measurement is missing.
# The predictions are wrapped in a Series sharing df's index so that fillna
# aligns them row by row.
df['trr'] = df['Trihalomethanes'].fillna(pd.Series(trr, index=df.index))
# The original column is now redundant; remove it from df in place.
df.drop('Trihalomethanes', axis=1, inplace=True)

### Predicting values for missing ph column

In [None]:
# Predictor columns used to model ph.
ph_features = ['Hardness', 'sulfatee', 'Conductivity', 'Organic_carbon', 'trr']

# Training rows: those where ph and all predictors are present. dropna() returns
# a new frame, which avoids mutating a slice of df in place
# (the source of pandas' SettingWithCopyWarning).
ph_df = df[['ph'] + ph_features].dropna()

# Fit the shared scaler on this step's predictors and split 70/30 with a fixed seed.
ph_sc = scaler.fit_transform(ph_df[ph_features])
x_tr, x_te, y_tr, y_te = train_test_split(ph_sc, ph_df['ph'], random_state=42, test_size=0.3)

# Re-run the grid search with ph as the target.
gs.fit(x_tr, y_tr)

# Predict ph for every row. The model was trained on scaled features, so the
# full-dataset features must go through the same fitted scaler first.
phh = gs.predict(scaler.transform(df[ph_features]))

# Keep every measured ph value and use the prediction only where ph is missing;
# the Series wrapper aligns the predictions with df's index.
df['phh'] = df['ph'].fillna(pd.Series(phh, index=df.index))
df.drop('ph', axis=1, inplace=True)

### Final data-set

In [None]:
# Preview the first rows of the dataset after the imputation steps.
df.head()

In [None]:
# Percentage of missing values remaining in each column, rounded to two decimals.
df.isna().mean().mul(100).round(2)

### Building a model to predict the Potability

In [None]:
from sklearn.svm import SVC

# Support-vector classifier created with its default settings.
svc = SVC()

# Candidate hyper-parameter values explored for the classifier.
param_grid = {
    'C': [1.2, 1.5, 2.2, 3.5, 3.2, 4.1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 4, 8, 10],
    'gamma': ['scale', 'auto'],
}
# Exhaustive 3-fold search over the grid, using all cores and verbose progress output.
gridsearch = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3, n_jobs=-1, verbose=4)

### Scaling features

In [None]:
# Standardise every feature column (all columns except the Potability target).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so the held-out rows contribute to the scaling statistics.
scaled_x = scaler.fit_transform(df.drop(columns=['Potability']))

### Splitting the final Data-set

In [None]:
# 70/30 train/test split of the scaled features and the Potability labels, fixed seed.
x_tr, x_te, y_tr, y_te = train_test_split(
    scaled_x,
    df['Potability'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the training features and training labels.
(x_tr.shape, y_tr.shape)

### Fitting The data

In [None]:
# Run the classifier grid search on the training split.
gridsearch.fit(X=x_tr, y=y_tr)

In [None]:
# Show the hyper-parameter combination selected by the grid search.
gridsearch.best_params_

### Predicting the values with best model

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the held-out split and print the confusion matrix of true vs. predicted labels.
predicted_y = gridsearch.predict(x_te)
print(confusion_matrix(y_true=y_te, y_pred=predicted_y))

### Testing the accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose predicted label matches the true label.
accuracy_score(y_true=y_te, y_pred=predicted_y)

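### 100% Accuracy achieved

**Caveat:** a perfect score on real-world data is unusual and should be verified before it is relied on. In particular, check that none of the imputed feature columns was derived from `Potability` and that the scaler was not fitted on the held-out rows, since either would leak information about the test labels or test data into the model.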