# Water Potability 

<img src="https://media.istockphoto.com/photos/mineral-water-is-being-poured-into-glass-picture-id491962870?k=6&m=491962870&s=612x612&w=0&h=oCZyh1EKST_hoRI-x4Fh59S53JM-UTPbR7sIkvBTH-o=" width="800">

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing Libraries**

In [None]:
import pandas as pd
import missingno as mno
import itertools
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

# Activate seaborn's default plot styling (called with no arguments) for the plots below.
sns.set()

**Getting the dataset**

In [None]:
# Load the water-potability CSV from the Kaggle input directory and preview the first 10 rows.
df = pd.read_csv('../input/water-potability/water_potability.csv')
df.head(10)

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()   # no duplicated values

Checking if there is any null value using .isnull() or .isna() methods

In [None]:
# True if at least one cell anywhere in the frame is null.
df.isnull().values.any()

The output is True, so the dataset contains null values. Let's see which columns have them.

In [None]:
# Per-column non-null counts and dtypes; a column whose non-null count is
# below the number of rows contains missing values.
df.info()

From the above output, it is clear that ph, Sulfate and Trihalomethanes are the columns with null values.

We can also use Missingno library to visualise null values.

In [None]:
# Bar chart of how complete each column is, sorted in ascending order.
mno.bar(df, figsize=(10,5), fontsize=12, color='blue', sort='ascending')

In [None]:
# Fill missing values in two passes:
#   1. interpolate along each column between neighbouring valid values;
#   2. back-fill whatever is still missing, i.e. NaNs that sit before the
#      first valid value of a column and therefore cannot be interpolated.
# `DataFrame.bfill()` is used instead of `fillna(method='bfill')`, which is
# deprecated in recent pandas releases; the result is the same.
new_df = df.interpolate().bfill()

In [None]:
# Preview the first rows after the missing values have been filled.
new_df.head()

Now let's look at how each attribute is correlated with 'Potability of water'.

In [None]:
# Pairwise correlation between all columns, then every column's correlation
# with the Potability target, highest first.
corr_matrix = new_df.corr()
corr_matrix['Potability'].sort_values(ascending=False)

In [None]:
plt.figure(figsize=(11, 11))
# The correlation matrix is symmetric, so hide the redundant upper triangle.
# An explicit boolean mask is used (True = hidden) instead of a float copy of
# the correlations, which only hid cells whose correlation was non-zero.
# k=1 starts the mask one step above the diagonal, so the diagonal stays visible.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
sns.heatmap(data=corr_matrix, mask=mask, square=True, annot=True)

Inferences from heatmap:
* Solids has a positive correlation with Potability.
* Organic carbon has a negative correlation with Potability.
    

Visualizing and removing outliers

In [None]:
def box_plot(df: pd.DataFrame, ft: str) -> None:
    """Show a box plot of column `ft` of `df` with the grid turned off.

    Args:
        df: Frame that contains the column to plot.
        ft: Name of the column to plot.
    """
    df.boxplot(column=[ft])
    # Disable the grid on the current axes before rendering.
    plt.grid(False)
    plt.show()

In [None]:
box_plot(new_df,'Turbidity')

In [None]:
box_plot(new_df, 'Solids')

From the above box plot, we can infer that values above roughly 45,000 in the Solids column are outliers for that feature.

In [None]:
def outlier(df, ft):
    """Return the index labels of the rows whose `ft` value is an IQR outlier.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile of the
    column. Missing values are never flagged.

    Args:
        df: Frame that contains the column to inspect.
        ft: Name of the column to inspect.

    Returns:
        The subset of `df.index` belonging to the outlying rows.
    """
    column = df[ft]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = column < first_quartile - whisker
    too_high = column > third_quartile + whisker
    return df.index[too_low | too_high]

Checking whether the outliers returned by the outlier function match the ones seen in the previous box plot.

In [None]:
# Column whose outliers are compared with the box plot above
solids = new_df['Solids']

# Index labels of the rows flagged as outliers in the Solids column
sol_out = outlier(new_df, 'Solids').tolist()

# Print the outlying Solids values on one line, separated by " , "
for label in sol_out:
    print(solids[label], end=" , ")

So it is clear that these outliers are the same as the ones in the box plot.
Now let's find the outliers for all the other features and remove them from our data.

In [None]:
lst = ['Solids','Chloramines','Trihalomethanes','Turbidity','ph','Conductivity','Hardness','Sulfate','Organic_carbon']

# Flatten the outlier row labels of every feature into one list; a row that is
# an outlier for several features appears once per feature.
index_list = [label for feature in lst for label in outlier(new_df, feature)]

In [None]:
# Number of collected outlier labels (duplicates included).
len(index_list)

In [None]:
def remove(df, ls):
    """Return a copy of `df` without the rows whose index labels are in `ls`.

    Repeated labels in `ls` are collapsed before dropping, so a row flagged
    more than once is removed only once. `df` itself is left untouched.

    Args:
        df: Frame to remove the outlier rows from.
        ls: Iterable of index labels to drop (may contain duplicates).

    Returns:
        A new frame with the listed rows removed.
    """
    unique_labels = sorted(set(ls))
    return df.drop(index=unique_labels)

# Drop every row that was flagged as an outlier for at least one feature.
df_cleaned = remove(new_df, index_list)

In [None]:
# (rows, columns) left after the outlier rows were removed.
df_cleaned.shape

In [None]:
import plotly.graph_objects as go

# One figure holding both Solids distributions: first trace = data with
# outliers, second trace = data after outlier removal.
fig = go.Figure(
    data=[
        go.Histogram(x=new_df['Solids']),
        go.Histogram(x=df_cleaned['Solids']),
    ]
)

# Draw the two histograms on top of each other
fig.update_layout(barmode='overlay')

# Semi-transparent bars so that both histograms remain visible
fig.update_traces(opacity=0.75)
fig.show()

The blue histogram is the **Solids** column of our data with outliers; in the cleaned data (the second histogram) the outliers have been removed.

In [None]:
# Column dtypes and non-null counts of the cleaned data.
df_cleaned.info()

Visualizing dataset

In [None]:
# Pairwise feature plots, coloured by the Potability class.
# NOTE(review): this plots the raw `df` (with missing values and outliers),
# not `df_cleaned` — confirm that this is intended.
sns.pairplot(df, hue='Potability')

From the above pair plot we can infer that the two classes overlap heavily and are not linearly separable, so a linear model such as Logistic Regression is unlikely to perform well.

**Training and Finding best model**

In [None]:
# Feature matrix (the nine columns named in `lst`, in that order) and the target column.
X = df_cleaned[lst]
y = df_cleaned['Potability']

In [None]:
# Rescale every feature with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so statistics of the test rows leak into preprocessing; consider
# fitting it on the training split only.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(X)

# Label the scaled array with X's own columns and index. X follows the column
# order of `lst`, which differs from `df_cleaned.columns[:-1]`, so using the
# latter would attach the wrong name to every column. Re-using X's index keeps
# the rows aligned with `y`.
df_feat = pd.DataFrame(scaled_features, columns=X.columns, index=X.index)
df_feat.head()

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_feat, y, random_state=42, test_size=0.3)

In [None]:
# Candidate classifiers with default hyper-parameters. The tree-based models
# get a fixed seed so that the accuracy table is reproducible between runs.
models = [LinearSVC(), SVC(kernel='rbf'), KNeighborsClassifier(),
          RandomForestClassifier(random_state=42),
          DecisionTreeClassifier(random_state=42), GaussianNB()]
names = ['Linear SVC','SVC', 'KNearestNeighbors', 'RandomForestClassifier', 'DecisionTree', 'GaussianNB']

# Fit each classifier on the training split and record its accuracy on the
# held-out test split, in the same order as `names`.
acc = []
for clf in models:
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # accuracy_score expects (y_true, y_pred)
    acc.append(accuracy_score(y_test, pred))

# Summary table: one row per algorithm (built directly so that `models`
# keeps referring to the list of fitted estimators).
models_df = pd.DataFrame({'Algorithm': names, 'Accuracy': acc})
models_df

SVC is the best model.

In [None]:
class_names = np.array(['0', '1'])


# Function to plot the confusion Matrix
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw the confusion matrix `cm` as an annotated image on the current figure.

    Args:
        cm: 2-D array of integer counts; rows are labelled as true classes,
            columns as predicted classes.
        classes: Tick labels, used for both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Write each count into its cell; counts above half of the largest count
    # are printed in white, the others in black.
    half_max = cm.max() / 2.
    for row, col in np.ndindex(*cm.shape):
        count = cm[row, col]
        plt.text(col, row, format(count, 'd'),
                 horizontalalignment="center",
                 color="white" if count > half_max else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Fit a fresh RBF-kernel SVC on the training split for a closer look at its predictions.
model_best = SVC(kernel = 'rbf')
model_best.fit(X_train, y_train)

In [None]:
prediction_SVM = model_best.predict(X_test)  # predict the labels of the test split
# Confusion matrix of true vs. predicted test labels, drawn with plot_confusion_matrix.
cm = confusion_matrix(y_test, prediction_SVM )
plot_confusion_matrix(cm,class_names)

In [None]:
# Accuracy = correctly classified samples (the diagonal of the confusion
# matrix) divided by the total number of samples.
accuracy = np.trace(cm) / cm.sum()
print("the accuracy is : " + str(accuracy))

If anyone knows how to improve the accuracy (it is currently only about 66%), please share your suggestions.

**Thank you !**