# Water Potability Prediction 

In this project, we will predict whether water is safe for human consumption based on its water quality metrics.

**Importing Libraries**

In [None]:
# EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.utils import shuffle

# Model and Prediction
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

%matplotlib inline

In [None]:
# Load the water-quality dataset into `wp`. The relative path looks like the
# Kaggle `../input/<dataset>/` layout — adjust it when running elsewhere.
wp=pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
# Preview the first rows to confirm the columns loaded as expected.
wp.head()

**Exploratory Data Analysis**

In [None]:
# (rows, columns) of the raw dataset.
wp.shape

In [None]:
# Column dtypes and non-null counts per column.
wp.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
wp.describe()

In [None]:
# Number of missing values in each column.
wp.isnull().sum()

The counts above show that the data contains missing values, so we drop every row that has at least one missing value.

In [None]:
# Drop every row containing at least one missing value (pandas default how='any').
# This rebinds `wp`, so the raw frame is no longer available after this cell.
wp = wp.dropna()

In [None]:
# Use seaborn's dark grid theme for every plot drawn from here on.
sns.set_style("darkgrid")
# Bar chart of the number of rows in each Potability class.
sns.countplot(x='Potability',data=wp)

In [None]:
# Count rows per class to check how balanced the target is.
wp['Potability'].value_counts()

In [None]:
# Resampling the data: upsample the Potability == 1 rows so both classes end up
# the same size.
# NOTE(review): this happens before the train/test split further down, so copies
# of the same row can land in both splits and inflate the test accuracy.
# Consider resampling the training split only.
RESAMPLE_SEED = 101  # fixed seed so Restart & Run All reproduces the same sample

# Deliberately not named `np`: that would rebind the numpy alias imported above.
not_potable = wp[wp['Potability'] == 0]
potable = wp[wp['Potability'] == 1]

# Draw with replacement until the upsampled class matches the other class's size
# (replaces the hard-coded 1200 so the cell follows the data).
wp_minority_upsampled = resample(
    potable,
    replace=True,
    n_samples=len(not_potable),
    random_state=RESAMPLE_SEED,
)

# Recombine and shuffle so the classes are interleaved.
wp = pd.concat([not_potable, wp_minority_upsampled])
wp = shuffle(wp, random_state=RESAMPLE_SEED)

In [None]:
# Re-check the class counts after resampling.
wp.Potability.value_counts()

In [None]:
# Pairwise relationships between the columns, coloured by Potability class.
sns.pairplot(wp, hue = 'Potability')

In [None]:
# Pairwise correlation matrix of all columns (pandas default method: Pearson).
wp.corr()

In [None]:
# Annotated heatmap of the column correlation matrix on a 14x10-inch figure.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(wp.corr(), annot=True, ax=ax)

**Splitting and Scaling the Data**

In [None]:
# Features: every column except the label.
X = wp.drop(columns=['Potability'])
# Target: the Potability label column.
Y = wp.Potability

In [None]:
# Train/test split: 70% train, 30% test (test_size=0.3), with a fixed seed so the
# split is reproducible. train_test_split is already imported in the imports cell
# at the top of the notebook, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=101
)

In [None]:
# Scaling the data
# Fit the scaler on the training split only, then apply those same training
# means/standard deviations to the test split. Re-fitting on X_test would scale
# the two splits differently and let test-set statistics leak into evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Models and Accuracy**

In [None]:
def models(X_train,y_train):
    """Fit four baseline classifiers and print each one's training accuracy.

    Parameters
    ----------
    X_train : feature matrix passed to every estimator's ``fit`` and ``score``.
    y_train : labels passed to every estimator's ``fit`` and ``score``.

    Returns
    -------
    tuple
        ``(log, decision_tree, random_forest, knn)`` — the fitted logistic
        regression, decision tree, random forest and k-nearest-neighbours
        estimators, in that order.
    """
    # Build all estimators up front; seeds are fixed where the estimator takes one.
    log = LogisticRegression(random_state=0)
    decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    random_forest = RandomForestClassifier(
        n_estimators=10, criterion='entropy', random_state=0
    )
    knn = KNeighborsClassifier(n_neighbors=3)

    fitted = (log, decision_tree, random_forest, knn)
    for estimator in fitted:
        estimator.fit(X_train, y_train)

    # Model Accuracy on Training Data (same data the models were fitted on).
    report = (
        ('[0]Logistic Regression Training Acc:', log),
        ('[1]Decision Tree Training Acc:', decision_tree),
        ('[2]Random Forest Training Acc:', random_forest),
        ('[3]KNN Training Acc:', knn),
    )
    for label, estimator in report:
        print(label, estimator.score(X_train, y_train))

    return fitted

In [None]:
# Fit all four classifiers; `model` is the tuple
# (logistic regression, decision tree, random forest, KNN) returned by models().
model = models(X_train,y_train)

In [None]:
# Accuracy on Testing Data
# Model order follows the tuple returned by models():
# 0 = Logistic Regression, 1 = Decision Tree, 2 = Random Forest, 3 = KNN.

for i, clf in enumerate(model):
    print('Model ', i)
    cm = confusion_matrix(y_test, clf.predict(X_test))

    # scikit-learn lays the matrix out as cm[true label][predicted label], so
    # with labels {0, 1} and label 1 treated as the positive class the cells
    # flatten to (tn, fp, fn, tp).
    tn, fp, fn, tp = cm.ravel()

    print(cm)
    print('Testing Acc = ', (tp + tn)/(tp +tn +fn + fp))
    print()

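**The testing accuracies from one run of this notebook are (approximate; exact values depend on the random resampling and split):**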
* Logistic Regression : 0.48
* Decision Tree : 0.7
* Random Forest : 0.76
* KNN : 0.66