In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the water-potability CSV into the working DataFrame.
csv_path = '../input/water-potability/water_potability.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

# Handling Null Values with Means

In [None]:
# Count the missing values in each column of the full dataset.
df.isna().sum()

In [None]:
# Report the dataset dimensions, then show how many rows fall in each class.
n_rows, n_cols = df.shape
print("number of rows: ", n_rows)
print("number of column: ", n_cols)
df['Potability'].value_counts()

In [None]:
# Rows with missing values are kept rather than dropped; the data is split by
# Potability class so each class can be handled separately.
# .copy() makes each subset an independent frame, so later column assignments
# on it do not trigger pandas' SettingWithCopyWarning.
df_notpotable = df[df['Potability'] == 0].copy()
df_potable = df[df['Potability'] == 1].copy()

In [None]:
# Missing values per column among the non-potable rows.
df_notpotable.isna().sum()

In [None]:
# Missing values per column among the potable rows.
df_potable.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Columns whose missing values are replaced by the per-class column mean.
impute_columns = ['ph', 'Sulfate', 'Trihalomethanes']

# Work on explicit copies so the assignments below write to independent frames
# instead of filtered slices of `df` (avoids SettingWithCopyWarning).
df_notpotable = df_notpotable.copy()
df_potable = df_potable.copy()

# A single imputer only remembers its most recent fit(). Fitting it on one
# column after another and transforming afterwards would fill every column
# with the mean of the last fitted column (or fail on a feature-name mismatch).
# Fitting and transforming all three columns together gives each column its
# own mean.

#for df_notpotable
impute = SimpleImputer(missing_values=np.nan, strategy='mean')
df_notpotable[impute_columns] = impute.fit_transform(df_notpotable[impute_columns])

#for df_potable
impute = SimpleImputer(missing_values=np.nan, strategy='mean')
df_potable[impute_columns] = impute.fit_transform(df_potable[impute_columns])

In [None]:
# Re-check the non-potable rows for remaining missing values.
df_notpotable.isna().sum()

In [None]:
# Number of rows in each Potability class.
df['Potability'].value_counts()

In [None]:
# Re-check the potable rows for remaining missing values.
df_potable.isna().sum()

In [None]:
# Stack the two per-class frames back into one dataset (non-potable rows first).
class_frames = [df_notpotable, df_potable]
df = pd.concat(class_frames)

In [None]:
# Preview the first five rows of the recombined data.
df.head(n=5)

# Normalizing the dataset

In [None]:
# Shuffle all rows (frac=1 keeps every row). The fixed seed makes the shuffled
# order, and everything derived from it, reproducible across runs.
df = df.sample(frac=1, random_state=1)

In [None]:
# Preview the first five rows after shuffling.
df.head(n=5)

In [None]:
# Features: every column except the target.
x = df.drop(columns='Potability')
# Target: the Potability label.
y = df.Potability

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature into the 0-to-1 range.
scaler = MinMaxScaler()
scaler.fit(x)

# transform() returns a bare NumPy array. Rebuild the DataFrame with the
# original column names and row index so the features stay identifiable and
# keep the same index as `y`.
x = pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)
x

In [None]:
# One histogram per column of the dataset.
hist_options = dict(bins=10, figsize=(20, 15), color='teal')
df.hist(**hist_options)

In [None]:
import matplotlib.pyplot as plt

# (column in df, text used for the panel title and x-label), listed in the
# order the panels fill the 2x2 grid.
panels = [
    ('ph', 'pH'),
    ('Hardness', 'Hardness'),
    ('Solids', 'Solids'),
    ('Chloramines', 'Chloramines'),
]
# Legend entries: the Potability class of each overlaid histogram.
labels = ["0", "1"]

fig = plt.figure(figsize=(25,10))

for position, (column, title) in enumerate(panels, start=1):
    p1 = fig.add_subplot(2, 2, position)
    # Overlay both classes with transparency so their distributions can be compared.
    for potability in (0, 1):
        p1.hist(df.loc[df.Potability == potability, column], bins=20, alpha=.4)
    p1.set_title(title)
    p1.set_xlabel(title)
    p1.set_ylabel('Count')
    p1.legend(labels)

fig.subplots_adjust(wspace=.1, hspace=.3)
plt.show()

In [None]:
# Bar chart of how many rows belong to each Potability class.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
count_classes = pd.Series(y).value_counts(sort=True)
count_classes.plot(kind = 'bar', rot = 0)
plt.title("Potability Class Distribution")
plt.xticks(range(2))
plt.xlabel("Potability")
plt.ylabel("Frequency")

# Oversampling

In [None]:
# Separate the rows by class to compare their sizes.
is_potable = df['Potability'].eq(1)
zero = df[df['Potability'].eq(0)]
one = df[is_potable]

In [None]:
# Shapes of the class-0 and class-1 subsets, printed side by side.
print(*(subset.shape for subset in (zero, one)))

In [None]:
from imblearn.over_sampling import SMOTE

# Resample x and y with SMOTE. The fixed seed makes the generated samples,
# and anything computed from x_res / y_res, reproducible.
oversample = SMOTE(random_state=1)
x_res, y_res = oversample.fit_resample(x, y)

In [None]:
# Shapes of the resampled features and labels.
(x_res.shape, y_res.shape)

In [None]:
# Bar chart of the class counts after resampling.
# Series.value_counts is used because the top-level pd.value_counts is
# deprecated; wrapping in pd.Series also covers an array-like y_res.
count_classes = pd.Series(y_res).value_counts(sort=True)
count_classes.plot(kind = 'bar', rot = 0)
plt.title("Potability Class Distribution")
plt.xticks(range(2))
plt.xlabel("Potability")
plt.ylabel("Frequency")

# Classifiers

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Seeded so the reported accuracy is reproducible.
randomForestClassifier = RandomForestClassifier(n_estimators = 1000, random_state=1)

# SMOTE runs inside the pipeline, so it is fitted only on the training part of
# each fold. Oversampling the whole dataset before splitting lets synthetic
# points derived from validation rows leak into training and inflates the score.
rf_pipeline = Pipeline([
    ('smote', SMOTE(random_state=1)),
    ('rf', randomForestClassifier),
])

# Evaluate on the original (not pre-oversampled) data; each validation fold
# therefore contains only real samples.
scores = cross_val_score(rf_pipeline, x, y, scoring='accuracy', cv=cv, n_jobs=-1)

print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

In [None]:
# Collects one accuracy (in percent) per experiment for the final comparison plot.
accuracy_list = {"K fold with RF": mean(scores) * 100}

In [None]:
# Cross-validate an XGBoost classifier, reusing the `cv` splitter defined earlier.

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

xgbClassifier = XGBClassifier()

# SMOTE runs inside the pipeline, so it is fitted only on the training part of
# each fold. Oversampling before splitting would leak synthetic neighbours of
# validation rows into training and inflate the score.
xgb_pipeline = Pipeline([
    ('smote', SMOTE(random_state=1)),
    ('xgb', xgbClassifier),
])

# Evaluate on the original (not pre-oversampled) data.
scores = cross_val_score(xgb_pipeline, x, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

In [None]:
# Store the mean cross-validated XGBoost accuracy (in percent) and show the running tally.
accuracy_list.update({"K fold with XG Boost": mean(scores) * 100})
print(accuracy_list)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same class ratio. The fixed seed makes the split, and every score
# computed from it, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=1
)
# Ensure y_train is a Series, since later code accesses y_train.values.
y_train = pd.Series(y_train)

In [None]:
from sklearn.metrics import accuracy_score
#Random Forest
# Seeded so the reported holdout accuracy is reproducible.
randomForestClassifier = RandomForestClassifier(n_estimators = 1000, random_state=1)
randomForestClassifier.fit(x_train, y_train.values.ravel())
predictionsRandomForestClassifier = randomForestClassifier.predict(x_test)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# the conventional order avoids wrong results if the metric is ever swapped
# for an asymmetric one (precision, recall, ...).
accuracyRandomForestClassifier = accuracy_score(y_test, predictionsRandomForestClassifier)
print(accuracyRandomForestClassifier)

In [None]:
# Store the random-forest holdout accuracy (in percent) and show the running tally.
accuracy_list.update({"Random Forest": accuracyRandomForestClassifier * 100})
print(accuracy_list)

In [None]:
# Fit an XGBoost classifier on the training split, then print its configuration.
model = XGBClassifier(use_label_encoder=False).fit(x_train, y_train, verbose=True)
print(model)

In [None]:

# Predict on the held-out rows, then round each prediction to the nearest integer.
y_pred = model.predict(x_test)
predictions = list(map(round, y_pred))

In [None]:
# Score the XGBoost predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_percent)

In [None]:

# Store the XGBoost holdout accuracy (in percent) and show the running tally.
accuracy_list.update({"XG Boost": accuracy * 100})
print(accuracy_list)

In [None]:
# NOTE(review): an earlier plotting cell already imports matplotlib.pyplot as
# plt, so this import is redundant (harmless, but it could be removed).
import matplotlib.pyplot as plt
# Print every accuracy collected so far (values are percentages), keyed by experiment name.
print(accuracy_list)

In [None]:
# Experiment labels and their accuracies, in insertion order.
names = [label for label in accuracy_list]
values = [accuracy_list[label] for label in names]

# Three stacked panels showing the same numbers as bars, points and a line;
# the y-axis is shared so the panels are directly comparable.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(15, 15), sharey=True)
bar_ax, scatter_ax, line_ax = axs
bar_ax.bar(names, values)
scatter_ax.scatter(names, values)
line_ax.plot(names, values)
fig.suptitle('Categorical Plotting')