# Water Quality 


### Predict whether water is safe for human consumption

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import os
# Walk the Kaggle input directory tree and print the full path of every
# file found, so the available datasets are visible in the cell output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the water potability CSV into `df`, the DataFrame used by the
# rest of the notebook.
df = pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
# Display the first rows of df as a quick look at the loaded data.
df.head()

In [None]:
# Report the dataset dimensions three ways: the raw shape tuple, the row
# count via len(), and a labelled rows/columns summary.
n_rows, n_cols = df.shape
print(df.shape)
print(len(df))
print(f'Number of rows: {n_rows} \nNumber of columns: {n_cols}')

In [None]:
# Print df's column dtypes and non-null counts (reveals which columns
# contain missing values).
df.info()

In [None]:
# Show the column labels of df.
df.columns

In [None]:
# Display summary statistics for df's columns.
df.describe()

# Exploratory Data Analysis

## Missing Data

In [None]:
# Heatmap of the boolean missing-value mask (df.isnull()); row tick
# labels are hidden and the colour bar is shown.
sns.heatmap(df.isnull(),yticklabels=False,cbar=True)

In [None]:
# Replace missing values in each affected column with that column's mean.
# The result is assigned back to df[column] rather than calling
# fillna(inplace=True) on the column selection: the in-place form acts on
# an intermediate object, emits a FutureWarning in pandas 2.x, and leaves
# df unchanged once Copy-on-Write is enabled.
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Count the missing values remaining in each column after the mean
# imputation.
df.isnull().sum()

In [None]:
# Redraw the missing-value heatmap to check the result of the imputation
# visually.
sns.heatmap(df.isnull(),yticklabels=False,cbar=True)

In [None]:
# Re-check column dtypes and non-null counts after the imputation.
df.info()

In [None]:
# Bar chart of how many samples fall in each Potability class.
# The column is passed as the keyword argument `x`: recent seaborn
# releases accept data vectors by keyword only and would interpret a
# positional Series as the `data` argument instead.
sns.countplot(x=df.Potability);
# Explicitly keep the bottom and left spines; despine still removes the
# top and right ones (its defaults).
sns.despine(bottom=False,left=False);

In [None]:
# Draw seaborn's pairplot for df. The trailing semicolon suppresses the
# returned object's repr in the notebook output.
sns.pairplot(df);
# Additionally remove the bottom and left spines.
# NOTE(review): despine is called without fig/ax, so it presumably acts on
# the current figure — confirm it targets the pairplot.
sns.despine(bottom=True,left=True);

In [None]:
# Annotated heatmap of the pairwise column correlations on a wide
# 20x10 figure, with the colour bar turned off.
plt.subplots(figsize=(20, 10));
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cbar=False, linewidth=.1);

In [None]:
# Correlation of every column with the Potability target, listed from
# the largest value to the smallest.
corr = df.corr()
potability_corr = corr.loc[:, "Potability"]
potability_corr.sort_values(ascending=False)

**Solids has the highest correlation**

In [None]:
# One facet per Potability value, each showing a 25-bin histogram of the
# Solids column.
g = sns.FacetGrid(data=df, col='Potability')
g.map(plt.hist, 'Solids', bins=25)

## Apply ML Algorithms

In [None]:
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Split and Train the Data

In [None]:
# Feature matrix: every column except the Potability target.
# `columns=` states the intent explicitly; the previous `axis=True` only
# worked because True compares equal to 1.
x = df.drop(columns=['Potability'])
# Target vector: the Potability label.
y = df['Potability']

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
x_train,x_test,y_train,y_test = tts(x,y,test_size=0.33, random_state = 101)

## Logistic Regression

In [None]:
# Fit a logistic regression classifier with default settings on the
# training split.
# NOTE(review): the features are passed unscaled; the default solver may
# not converge within its default iteration limit — confirm whether a
# ConvergenceWarning is raised and consider scaling.
logmodel = LogisticRegression()
logmodel.fit(x_train,y_train)

### **Prediction and Evaluations**

In [None]:
# Predict labels for the held-out test features with the fitted logistic
# regression model.
predictions = logmodel.predict(x_test)

In [None]:
# Test-set accuracy of the logistic regression model; kept in `Acc` for
# the final model comparison.
Acc = accuracy_score(y_true=y_test, y_pred=predictions)
print(Acc)

**Classification Report and Confusion Matrix**

In [None]:
# Print the classification report for `predictions`.
# Assumes cells are run in order, so `predictions` still holds the
# logistic regression output at this point.
print(classification_report(y_test,predictions))

## Decision Tree

In [None]:
# Decision tree classifier with default hyperparameters.
# NOTE(review): no random_state is set, so results may vary between
# runs — confirm whether reproducibility matters here.
dtree = DecisionTreeClassifier()

In [None]:
# Fit the decision tree on the training split.
dtree.fit(x_train,y_train)

### **Prediction and Evaluations**

In [None]:
# Predict test labels with the decision tree. This rebinds `predictions`,
# replacing the logistic regression output stored under the same name.
predictions = dtree.predict(x_test)

In [None]:
# Test-set accuracy of the decision tree; kept in `Acc_dt` for the final
# model comparison.
Acc_dt = accuracy_score(y_true=y_test, y_pred=predictions)
print(Acc_dt)

**Classification Report and Confusion Matrix**

In [None]:
# Print the classification report for `predictions`.
# Assumes cells are run in order, so `predictions` holds the decision
# tree output here.
print(classification_report(y_test,predictions))

In [None]:
# Print the raw confusion-matrix counts for `predictions` (decision tree
# output when cells are run in order).
print(confusion_matrix(y_test,predictions))

In [None]:
# Confusion matrix for `predictions`, drawn as a heatmap in which each
# cell is annotated with its share of all test samples.
cmd = confusion_matrix(y_test, predictions)
cmd_share = cmd / cmd.sum()
sns.heatmap(cmd_share, annot=True, fmt='0.2%', cmap='twilight_shifted')

## Random Forest Model

In [None]:
# n_estimators is the number of trees built before their predictions are
# combined by majority vote / averaging.
# More trees generally give better performance but make training and
# prediction slower.
# NOTE(review): no random_state is set, so the reported accuracy may vary
# between runs — confirm whether reproducibility matters here.
rfc = RandomForestClassifier(n_estimators = 600)

In [None]:
# Fit the random forest on the training split.
rfc.fit(x_train,y_train)

### **Prediction and Evaluations**

In [None]:
# Predict test labels with the random forest. The result is stored in
# `prediction` (singular), a different name from the `predictions`
# variable used for the earlier models.
prediction = rfc.predict(x_test)

In [None]:
# Test-set accuracy of the random forest; kept in `Acc_rfc` for the final
# model comparison.
Acc_rfc = accuracy_score(y_true=y_test, y_pred=prediction)
print(Acc_rfc)

**Classification Report and Confusion Matrix**

In [None]:
# Print the classification report for the random forest.
# The random forest output lives in `prediction`; `predictions` (plural)
# holds the decision tree results, so using it here would report the
# wrong model.
print(classification_report(y_test,prediction))

In [None]:
# Print the raw confusion-matrix counts for the random forest.
# Uses `prediction` (random forest output), not `predictions`, which
# holds the decision tree results.
print(confusion_matrix(y_test,prediction))

In [None]:
# Confusion matrix for the random forest, drawn as a heatmap in which
# each cell is annotated with its share of all test samples.
# Built from `prediction` (random forest output); `predictions` (plural)
# holds the decision tree results and would plot the wrong model.
cmr = confusion_matrix(y_test, prediction)
sns.heatmap(cmr/np.sum(cmr), annot=True, fmt='0.2%', cmap='coolwarm')

**Create a KNN model instance with n_neighbors=1**

In [None]:
# K-nearest-neighbours classifier that uses only the single closest
# training sample (n_neighbors=1).
knn = KNeighborsClassifier(n_neighbors = 1)

**Fit this KNN model to the training data.**

In [None]:
# Fit the KNN model on the training split.
# NOTE(review): the features are passed unscaled — confirm this is
# intended for a distance-based model.
knn.fit(x_train,y_train)

### **Prediction and Evaluations**

**Use the predict method to predict values using your KNN model and `x_test`.**

In [None]:
# Predict test labels with the fitted KNN model; stored in `pred`.
pred = knn.predict(x_test)

In [None]:
# Test-set accuracy of the KNN model; kept in `Knn_ACC` for the final
# model comparison.
Knn_ACC = accuracy_score(y_true=y_test, y_pred=pred)
print(Knn_ACC)

**Create a confusion matrix and classification report.**

In [None]:
# Print the classification report for the KNN predictions in `pred`.
print(classification_report(y_test,pred))

In [None]:
# Print the raw confusion-matrix counts for the KNN predictions in `pred`.
print(confusion_matrix(y_test,pred))

In [None]:
# Confusion matrix for the KNN predictions, drawn as a heatmap in which
# each cell is annotated with its share of all test samples.
cmk = confusion_matrix(y_test, pred)
cmk_share = cmk / cmk.sum()
sns.heatmap(cmk_share, annot=True, fmt='0.2%', cmap='Blues')

## XGBoost Classifier

In [None]:
# Gradient-boosted tree classifier: hyperparameters are gathered in one
# mapping, then the model is fitted on the training split and used to
# predict the test labels.
xgb_params = {
    'max_depth': 8,
    'n_estimators': 125,
    'random_state': 0,
    'learning_rate': 0.03,
    'n_jobs': 5,
}
Xgb = XGBClassifier(**xgb_params)
Xgb.fit(x_train, y_train)
pred_Xgb = Xgb.predict(x_test)

In [None]:
# Test-set accuracy of the XGBoost model; kept in `Xgb_ACC` for the final
# model comparison.
Xgb_ACC = accuracy_score(y_true=y_test, y_pred=pred_Xgb)
print(Xgb_ACC)

In [None]:
# Print the classification report for the XGBoost predictions.
print(classification_report(y_test, pred_Xgb))

In [None]:
# Print the raw confusion-matrix counts for the XGBoost predictions.
print(confusion_matrix(y_test, pred_Xgb))

In [None]:
# Confusion matrix for the XGBoost predictions, drawn as a heatmap in
# which each cell is annotated with its share of all test samples.
cmx = confusion_matrix(y_test, pred_Xgb)
cmx_share = cmx / cmx.sum()
sns.heatmap(cmx_share, annot=True, fmt='0.2%', cmap='Reds')

# Final Report 

In [None]:
# Collect each model's test accuracy into one comparison table.
model_names = ['Logistic', 'Random Forest', 'Decision Tree', 'XGBoost', 'KNeighbours']
model_scores = [Acc, Acc_rfc, Acc_dt, Xgb_ACC, Knn_ACC]
models = pd.DataFrame({"Model": model_names, "Accuracy": model_scores})
# Display the table ordered from highest to lowest accuracy. The sorted
# copy is only shown, not assigned, so `models` keeps its original order.
models.sort_values(by='Accuracy', ascending=False)

In [None]:
# Build the model/accuracy table to be exported.
output = pd.DataFrame({"Model":['Logistic','Random Forest','Decision Tree','XGBoost','KNeighbours'],
           "Accuracy":[Acc,Acc_rfc,Acc_dt,Xgb_ACC,Knn_ACC]})
# Save the table without the index column. The file name now carries a
# .csv extension so the output is recognisable as a CSV file.
output.to_csv('Water Quality EDA & Prediction.csv', index=False)

In [None]:
# Display the first rows of the exported table as a sanity check.
output.head()

In [None]:
# Bar chart of accuracy per model. `models` was never reassigned after
# sorting, so bars appear in the table's original order.
# NOTE(review): newer seaborn releases may warn when `palette` is passed
# without `hue` — confirm against the installed version.
sns.barplot(x ='Model', y ='Accuracy', data = models,palette = 'coolwarm')

# The Best Model We Can Apply is Random Forest