In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory and print
# its full path, one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the water-potability CSV into a DataFrame and preview the first rows.
WATER_CSV = "../input/water-potability/water_potability.csv"
potwater = pd.read_csv(WATER_CSV)
potwater.head()

In [None]:
# Bar chart of how many samples fall into each Potability class.
class_counts = potwater["Potability"].value_counts()
potability_ax = class_counts.plot.bar()
potability_ax.set_title("Potability");

In [None]:
# Summary statistics for every numeric column of the raw (pre-imputation) frame.
potwater.describe()

In [None]:
# Histogram of every column, to inspect each distribution.
# Plotting straight from the frame yields the same figure as first copying all
# column names into a Series and indexing the frame with it, so the
# intermediate (and misleadingly named) `numbers` Series is not needed.
potwater.hist(figsize=(14, 14))
plt.show();

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr = potwater.corr()
f, ax = plt.subplots(figsize=(20, 9))
# Draw on the axes created above; the colour scale is capped at 0.8.
sns.heatmap(data=corr, vmax=0.8, annot=True, ax=ax);

In [None]:
#Imputation
from sklearn.impute import SimpleImputer
# Fill missing values with SimpleImputer's default strategy and rebuild a
# DataFrame that carries the original column names (fit_transform returns a
# bare array).
# NOTE(review): the imputer is fitted on the whole frame, target column
# included, before any train/test split is made -- confirm this is acceptable
# for the evaluation that follows.
my_imputer = SimpleImputer()
imputed_values = my_imputer.fit_transform(potwater)
imputed_potwater = pd.DataFrame(imputed_values, columns=potwater.columns)

In [None]:
# Short alias for the imputed frame; this binds the same object, not a copy.
imwater = imputed_potwater
# Preview the first rows of the imputed data.
imwater.head()

In [None]:
#ScatterMatrix
import plotly.express as px
# Pairwise scatter plots of five features, coloured by the Potability column.
scatter_dims = ["ph", "Sulfate", "Hardness", "Chloramines", "Solids"]
fig = px.scatter_matrix(imwater, dimensions=scatter_dims, color="Potability")
fig.show();

In [None]:
# Marginal plot: ph vs. Sulfate coloured by Potability, with a box plot on the
# x margin and a violin plot on the y margin.
fig = px.scatter(
    imwater,
    x="ph",
    y="Sulfate",
    color="Potability",
    marginal_x="box",
    marginal_y="violin",
)
fig.show();

In [None]:
# Marginal plot: ph vs. Hardness coloured by Potability, with a box plot on the
# x margin and a violin plot on the y margin.
fig = px.scatter(
    imwater,
    x="ph",
    y="Hardness",
    color="Potability",
    marginal_x="box",
    marginal_y="violin",
)
fig.show();

In [None]:
# Marginal plot: ph vs. Sulfate, this time coloured by the Chloramines value,
# with a box plot on the x margin and a violin plot on the y margin.
fig = px.scatter(
    imwater,
    x="ph",
    y="Sulfate",
    color="Chloramines",
    marginal_x="box",
    marginal_y="violin",
)
fig.show();

In [None]:
# Marginal plot: ph vs. Solids coloured by Potability, with a box plot on the
# x margin and a violin plot on the y margin.
fig = px.scatter(
    imwater,
    x="ph",
    y="Solids",
    color="Potability",
    marginal_x="box",
    marginal_y="violin",
)
fig.show();

In [None]:
#FeatureSelection
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Global plot styling; this affects every matplotlib figure drawn afterwards.
plt.rcParams['figure.figsize'] = 18, 8
sns.set_style("darkgrid")

# Select the target by name rather than by position so the split does not
# silently depend on "Potability" being the last column.
x = imwater.drop(columns="Potability")
y = imwater["Potability"]

# ExtraTrees is randomised; a fixed seed keeps the importance ranking (and
# therefore the plot below) reproducible across runs.
model = ExtraTreesClassifier(random_state=1)
model.fit(x, y)
print(model.feature_importances_)

# Horizontal bar chart of the five most important features.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(5).plot(kind='barh')
plt.title("Top 5 feature importances (ExtraTreesClassifier)")
plt.show();

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from mlxtend.plotting import plot_confusion_matrix
# Model inputs: five hand-picked feature columns and the Potability target.
Features = ['Sulfate', 'ph', 'Hardness', 'Chloramines', 'Solids']
x, y = imwater[Features], imwater["Potability"]

# Hold out 20% of the rows for testing; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
#Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
# Fit a logistic regression on the training split and score it on the test split.
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
# Predictions are kept as an ad-hoc attribute on the estimator because the
# confusion-matrix cell reads `log_reg.pred`.
log_reg.pred = log_reg.predict(x_test)

print(classification_report(y_test, log_reg.pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(log_reg, x_test, y_test);

In [None]:
# Confusion matrix for the logistic regression predictions.
cm = confusion_matrix(y_test, log_reg.pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("Logistic Regression Model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();

In [None]:
#Support Vector
from sklearn.svm import SVC
# Fit a support vector classifier with default settings and score it on the test split.
sv_clf = SVC()
sv_clf.fit(x_train, y_train)
sv_clf_pred = sv_clf.predict(x_test)
sv_clf_acc = accuracy_score(y_test, sv_clf_pred)
print(classification_report(y_test, sv_clf_pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(sv_clf, x_test, y_test);

In [None]:
# Confusion matrix for the SVC predictions.
cm = confusion_matrix(y_test, sv_clf_pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("SVC Model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();

In [None]:
#K Neighbours Classifier
from sklearn.neighbors import KNeighborsClassifier
# Fit a 6-nearest-neighbours classifier and score it on the test split.
kn_clf = KNeighborsClassifier(n_neighbors=6)
kn_clf.fit(x_train, y_train)
kn_pred = kn_clf.predict(x_test)
kn_acc = accuracy_score(y_test, kn_pred)
print(classification_report(y_test, kn_pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(kn_clf, x_test, y_test);


In [None]:
# Confusion matrix for the k-nearest-neighbours predictions.
cm = confusion_matrix(y_test, kn_pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("KN model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Fit a small entropy-based decision tree (at most 3 leaves) and score it on the test split.
dt_clf = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0, criterion='entropy')
dt_clf.fit(x_train, y_train)
dt_pred = dt_clf.predict(x_test)
dt_acc = accuracy_score(y_test, dt_pred)
print(classification_report(y_test, dt_pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(dt_clf, x_test, y_test);


In [None]:
# Confusion matrix for the decision tree predictions.
cm = confusion_matrix(y_test, dt_pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("DT model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();

In [None]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest (half of the features per split, depth capped at 15) and
# score it on the test split.
r_clf = RandomForestClassifier(max_features=0.5, max_depth=15, random_state=1)
r_clf.fit(x_train, y_train)
r_pred = r_clf.predict(x_test)
r_acc = accuracy_score(y_test, r_pred)
print(classification_report(y_test, r_pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(r_clf, x_test, y_test);

In [None]:
# Confusion matrix for the random forest predictions.
cm = confusion_matrix(y_test, r_pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("RF model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();

In [None]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
# Fit a gradient boosting classifier (depth-2 trees) and score it on the test split.
gradientboost_clf = GradientBoostingClassifier(max_depth=2, random_state=1)
gradientboost_clf.fit(x_train, y_train)
gradientboost_pred = gradientboost_clf.predict(x_test)
gradientboost_acc = accuracy_score(y_test, gradientboost_pred)
# Report this model's own predictions; the report was previously printed for
# the random forest predictions (`r_pred`).
print(classification_report(y_test, gradientboost_pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(gradientboost_clf, x_test, y_test);

In [None]:
# Confusion matrix for the gradient boosting predictions.
cm = confusion_matrix(y_test, gradientboost_pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("Gradient Boosting model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();

In [None]:
#xgbrf classifier
import xgboost
# Fit an XGBoost random-forest classifier (depth-3 trees) and score it on the test split.
xgb_clf = xgboost.XGBRFClassifier(max_depth=3, random_state=1)
xgb_clf.fit(x_train, y_train)
xgb_pred = xgb_clf.predict(x_test)
xgb_acc = accuracy_score(y_test, xgb_pred)
print(classification_report(y_test, xgb_pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(xgb_clf, x_test, y_test);

In [None]:
# Confusion matrix for the XGBRF predictions.
cm = confusion_matrix(y_test, xgb_pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("XGBRF model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();

In [None]:
#LGBM classifier
import lightgbm
# Fit a LightGBM classifier (depth-2 trees) and score it on the test split.
lgb_clf = lightgbm.LGBMClassifier(max_depth=2, random_state=4)
lgb_clf.fit(x_train, y_train)
lgb_pred = lgb_clf.predict(x_test)
lgb_acc = accuracy_score(y_test, lgb_pred)
# Report this model's own predictions; the report was previously printed for
# the XGBRF predictions (`xgb_pred`).
print(classification_report(y_test, lgb_pred))
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its documented replacement.
metrics.RocCurveDisplay.from_estimator(lgb_clf, x_test, y_test);

In [None]:
# Confusion matrix for the LightGBM predictions.
cm = confusion_matrix(y_test, lgb_pred)
# No plt.figure() beforehand: plot_confusion_matrix is given its own figsize,
# and an extra plt.figure() call only leaves an empty figure in the output.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("LightGBM model - Confusion Matrix")
# confusion_matrix orders classes by sorted label value, so tick 0 is class 0
# and tick 1 is class 1.
# Assumes Potability is encoded 0 = not potable, 1 = potable -- confirm against
# the dataset description.
class_names = ["Not Potable", "Potable"]
plt.xticks(range(2), class_names, fontsize=18)
plt.yticks(range(2), class_names, fontsize=18)
plt.show();