In [None]:
import numpy as np 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
#Scikit-learn models
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
# Scikit-learn metrics
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [None]:
# Load the water-potability table from the notebook's input directory.
csv_path = '../input/water-potability/water_potability.csv'
df = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Peek at the last five rows.
df.tail(n=5)

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# Pairwise Pearson correlation between columns.
df.corr(method='pearson')

In [None]:
# One histogram per column; assigning the result keeps the axes repr out of the cell output.
hist_axes = df.hist(figsize=(12, 8))

# **Missing Values Analysis**

In [None]:
# Per-column count of missing values and the share of rows they represent (in %).
null_counts = df.isnull().sum()
null_percent = round((null_counts / df.shape[0]) * 100, 2)
missing = {"missing": null_counts, " % of missing": null_percent}
pd.DataFrame(missing)

In [None]:
# Impute missing pH values with the mean pH of the rows that share the same
# Potability label (missing values are skipped when each class mean is taken).
#
# groupby(...).transform('mean') yields the class mean aligned to df's own
# index, so a single fillna covers both classes without filtering a class
# subset through a boolean mask built on the full frame.
#
# NOTE(review): the fill value depends on the target label and is computed on
# all rows before any train/test split, so label information leaks into this
# feature — confirm that is acceptable for the evaluation further down.
ph_is_missing = df['ph'].isna()

# Row labels of the imputed entries per class, kept for later inspection.
pH_nan_1 = df.index[ph_is_missing & (df['Potability'] == 1)]
pH_nan_0 = df.index[ph_is_missing & (df['Potability'] == 0)]

df['ph'] = df['ph'].fillna(df.groupby('Potability')['ph'].transform('mean'))

In [None]:
# Impute missing Sulfate values with the mean Sulfate of the rows that share
# the same Potability label (missing values are skipped in each class mean).
#
# groupby(...).transform('mean') yields the class mean aligned to df's own
# index, so a single fillna covers both classes without filtering a class
# subset through a boolean mask built on the full frame.
#
# NOTE(review): the fill value depends on the target label and is computed on
# all rows before any train/test split, so label information leaks into this
# feature — confirm that is acceptable for the evaluation further down.
sulfate_is_missing = df['Sulfate'].isna()

# Row labels of the imputed entries per class, kept for later inspection.
Sulfate_nan_1 = df.index[sulfate_is_missing & (df['Potability'] == 1)]
Sulfate_nan_0 = df.index[sulfate_is_missing & (df['Potability'] == 0)]

df['Sulfate'] = df['Sulfate'].fillna(df.groupby('Potability')['Sulfate'].transform('mean'))

In [None]:
# Rows with no Trihalomethanes reading are discarded rather than imputed.
df = df.dropna(axis=0, subset=["Trihalomethanes"])

In [None]:
# Remaining missing values per column after imputation and row removal.
df.isna().sum()

In [None]:
# Class balance of the target: number of rows per Potability label.
Potability = df.Potability.value_counts()
Potability

In [None]:
# Pie chart of the class balance.
# value_counts() orders its rows by frequency, so the wedge labels are looked
# up from the index instead of being fixed by position; otherwise they would
# be silently swapped if the class proportions ever flipped.
# NOTE(review): assumes 0 = non-potable and 1 = potable — confirm against the
# dataset description.
class_labels = {0: "Non-potable", 1: "potable"}
wedge_labels = [class_labels.get(cls, str(cls)) for cls in Potability.index]

# Pull the first (largest) wedge out of the pie; the rest stay attached.
wedge_offsets = [0.3] + [0] * (len(Potability) - 1)

plt.pie(Potability, labels=wedge_labels, startangle=90, explode=wedge_offsets)
plt.show()

In [None]:
# Heat map of the pairwise Pearson correlations between all columns.
data = df.corr()  # missing values are excluded pairwise by DataFrame.corr

# Hide the upper triangle and the diagonal: the matrix is symmetric, so those
# cells are redundant, and every column trivially correlates with itself.
mask = np.triu(np.ones_like(data, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    data,
    mask=mask,
    annot=True,
    cmap='seismic',
    vmin=-0.2,
    vmax=0.2,
    ax=ax,
)

# Title positioned just above the plotting area, in axes coordinates.
fig.text(
    0.5, 1.05, 'Correlation Heat Map',
    ha='center', va='center',
    fontsize=14, fontweight='bold',
    transform=ax.transAxes,
)

# NOTE(review): the style is switched after this figure has been drawn, so it
# presumably only affects figures created later — verify the intent.
sns.set_style('white')

In [None]:
# Box plots of every feature, grouped by potability, laid out on a 2x5 grid.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
fig.subplots_adjust(hspace=.5)  # extra vertical room between the two rows

# 'Potability' is the label, so it gets no panel of its own.
feature_names = df.columns.drop('Potability')

for a, feature in enumerate(feature_names):
    # (a % 2, a % 5) lands on a different cell for each of the first ten
    # values of a (2 and 5 are coprime), zig-zagging between the two rows
    # while moving right. df.boxplot copes with missing values itself.
    df.boxplot(column=[feature], by='Potability', ax=axes[a % 2, a % 5], grid=False)

# Assuming nine feature columns (a = 0..8), cell [1, 4] is the one never used.
axes[1, 4].remove()
plt.show()

# **train_test_split**

In [None]:
# Snapshot of the cleaned frame, reused by the deep-learning section.
# .copy() makes it a genuinely independent object; a plain assignment would
# only create a second name for the same DataFrame.
df_copy = df.copy()

# Feature matrix (every column except the label) and target vector.
x = df.drop(columns=["Potability"])
y = df["Potability"]


In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Test accuracy (in %) per model, filled in by the model sections below.
Accuracy = dict()

# **Logistic regression**

In [None]:
# Logistic-regression baseline with default hyper-parameters.
# fit() returns the estimator itself, so construction and training chain.
logistic_regression = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

In [None]:
# Actual-vs-predicted contingency table, drawn as an annotated heat map.
# Named logreg_confusion rather than `confusion_matrix` so that the
# sklearn.metrics function imported under that name stays callable.
logreg_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(logreg_confusion, annot=True, fmt='g')

In [None]:
# Logistic-regression test accuracy as a percentage with two decimals,
# recorded in the summary dict and echoed.
Accuracy_Logistic_regresion = round(accuracy_score(y_test, y_pred) * 100, 2)
Accuracy["Logisticregresion"] = Accuracy_Logistic_regresion
print('Accuracy Logistic regresion: ', Accuracy_Logistic_regresion, "%")

# **SVM**

In [None]:
from sklearn import svm

In [None]:
# Support-vector classifier with default hyper-parameters.
# fit() returns the estimator itself, so construction and training chain.
svm_class = svm.SVC().fit(X_train, y_train)
y_pred = svm_class.predict(X_test)

In [None]:
# Actual-vs-predicted contingency table for the SVM, as an annotated heat map.
# Named svm_confusion rather than `confusion_matrix` so that the
# sklearn.metrics function imported under that name stays callable.
svm_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(svm_confusion, annot=True, cmap="YlGnBu", fmt='g')

In [None]:
# SVM test accuracy as a percentage with two decimals, recorded and echoed.
Accuracy_Svm = round(accuracy_score(y_test, y_pred) * 100, 2)
Accuracy["SVC"] = Accuracy_Svm
print('Accuracy: ', Accuracy_Svm, "%")

# **Decision Tree Classifier**

In [None]:
from sklearn import tree

In [None]:
# Decision-tree classifier with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because tree
# construction is randomised; without it the reported accuracy changes from
# one run to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Actual-vs-predicted contingency table for the decision tree, as a heat map.
# Named tree_confusion rather than `confusion_matrix` so that the
# sklearn.metrics function imported under that name stays callable.
tree_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(tree_confusion, annot=True, cmap="YlGnBu", fmt='g')

In [None]:
# Decision-tree test accuracy as a percentage with two decimals, recorded and echoed.
Accuracy_Decision_Tree = round(accuracy_score(y_test, y_pred) * 100, 2)
Accuracy["DecisionTreeClassifier"] = Accuracy_Decision_Tree
print('Accuracy: ', Accuracy_Decision_Tree, "%")

# **RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random-forest classifier with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the
# forest is built from bootstrap samples and random feature subsets; without
# it the reported accuracy changes from one run to the next.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Actual-vs-predicted contingency table for the random forest, as a heat map.
# Named forest_confusion rather than `confusion_matrix` so that the
# sklearn.metrics function imported under that name stays callable.
forest_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(forest_confusion, annot=True, cmap="YlGnBu", fmt='g')

In [None]:
# Random-forest test accuracy as a percentage with two decimals, recorded and echoed.
Accuracy_RandomForestClassifier = round(accuracy_score(y_true=y_test, y_pred=y_pred) * 100, 2)
Accuracy["RandomForestClassifier"] = Accuracy_RandomForestClassifier
print("Accuracy_RandomForestClassifier : ", Accuracy_RandomForestClassifier, "%")

# **KNeighborsClassifier**

In [None]:
# k-nearest-neighbours classifier with default hyper-parameters.
# fit() returns the estimator itself, so construction and training chain.
clf = KNeighborsClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Actual-vs-predicted contingency table for k-NN, as an annotated heat map.
# Named knn_confusion rather than `confusion_matrix` so that the
# sklearn.metrics function imported under that name stays callable.
knn_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(knn_confusion, annot=True, cmap="YlGnBu", fmt='g')


In [None]:
# k-NN test accuracy as a percentage with two decimals, recorded and echoed.
Accuracy_KNeighborsClassifier = round(accuracy_score(y_test, y_pred) * 100, 2)
Accuracy["Accuracy_KNeighborsClassifier"] = Accuracy_KNeighborsClassifier
print("Accuracy_KNeighborsClassifier : ", Accuracy_KNeighborsClassifier, "%")

# **XGBClassifier**

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted tree classifier (a classifier, despite the variable name),
# evaluated with log-loss.
xg_reg = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
)

In [None]:
# Train the boosted classifier and predict the held-out rows.
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)

# Actual-vs-predicted contingency table, drawn as an annotated heat map.
# Named xgb_confusion rather than `confusion_matrix` so that the
# sklearn.metrics function imported under that name stays callable.
xgb_confusion = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(xgb_confusion, annot=True, cmap="YlGnBu", fmt='g')

In [None]:
# Feature-importance bar chart for the fitted booster.
# The default figure size is configured BEFORE plotting: rcParams are read at
# the moment a figure is created, so setting them after plot_importance()
# would leave this chart at the old size and only affect later figures.
plt.rcParams['figure.figsize'] = [5, 5]
xgb.plot_importance(xg_reg)
plt.show()

# **All Model Building**

In [None]:
# Train every candidate classifier on the same split and collect its test
# accuracy (in %, two decimals).
#
# A single name -> estimator mapping keeps each label next to its model, so
# the two cannot drift out of step the way parallel lists can. Details:
#   * XGBClassifier is reached through the `xgb` module alias, because only
#     `import xgboost as xgb` is in scope (a bare XGBClassifier is undefined);
#   * AdaBoostClassifier appears once — a second entry under the same name
#     would just overwrite the first result;
#   * the randomised estimators get a fixed seed for repeatable scores.
named_models = {
    "LogisticRegression": LogisticRegression(),
    "SVC": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "XGBClassifier": xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GaussianNB": GaussianNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
}

# Same module-level names as before, derived from the mapping.
models = list(named_models.values())
model_name = list(named_models)

prints = {}
for name, model in named_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = round(accuracy_score(y_test, y_pred) * 100, 2)
    prints[name] = acc

In [None]:
# One-row table of the collected accuracies: one column per model, row label 1.
accuracy_all = pd.DataFrame([prints], index=[1])
accuracy_all

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Candidate neighbourhood sizes for tuning the k-nearest-neighbours classifier.
random_grid = {"n_neighbors": [3, 4, 5, 6, 7, 8, 10]}

from pprint import pprint
pprint(random_grid)

# Exhaustive search over the grid using GridSearchCV's default
# cross-validation and scoring. The estimator is created once; a second
# identical instantiation would be redundant.
rf = KNeighborsClassifier()
grid_cv_dtm = GridSearchCV(rf, random_grid)

In [None]:
# Run the cross-validated grid search on the training split.
grid_cv_dtm.fit(X=X_train, y=y_train)

In [None]:
# Tabulate the grid-search results (one row per n_neighbors candidate).
# Stored under its own name so that `df`, the cleaned dataset, is not
# overwritten — re-running any earlier cell that reads `df` keeps working.
cv_results = pd.DataFrame(data=grid_cv_dtm.cv_results_)
cv_results.head(7)

# **Deep Learning**

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras import models
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam, Adagrad, RMSprop, SGD
from tensorflow.keras.layers import Activation
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import BatchNormalization
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout, Activation

In [None]:
# Rebuild the feature matrix and the target from the saved cleaned frame.
x = df_copy.drop(columns=['Potability'])
y = df_copy.Potability

In [None]:
# Standardise every feature column in place (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows before the train/val/test
# split below, so held-out rows influence the scaling statistics — consider
# fitting on the training split only.
st = StandardScaler()
x_columns = x.columns
x[x_columns] = st.fit(x[x_columns]).transform(x[x_columns])

In [None]:
# First five rows of the standardised features.
x.head(n=5)

In [None]:
# Summary statistics of the standardised features.
x.describe()

In [None]:
# Keep 80% of the rows for training; the remaining 20% is divided further below.
# Note: this rebinds X_train / X_test, which the scikit-learn section also used.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Split the held-out 20% evenly into a validation half and a test half.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_test, Y_test,
    test_size=0.5,
    random_state=0,
)

In [None]:
# Feed-forward binary classifier: three hidden Dense blocks (16 -> 32 -> 16
# units), each followed by batch normalisation and ReLU, then one sigmoid unit.
# The input width is taken from the training matrix instead of a hard-coded
# constant, so the network keeps matching the data if a column is added or
# removed upstream.
n_features = X_train.shape[1]

model = models.Sequential()

# First hidden block; it also declares the input shape.
model.add(layers.Dense(16, input_shape=(n_features,)))
model.add(BatchNormalization())
model.add(Activation("relu"))

# Remaining hidden blocks.
for units in (32, 16):
    model.add(layers.Dense(units))
    model.add(BatchNormalization())
    model.add(Activation("relu"))

# Output: a single probability.
model.add(layers.Dense(1))
model.add(Activation("sigmoid"))

In [None]:
# Binary cross-entropy objective, Adam optimiser, accuracy tracked during training.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 200 epochs in mini-batches of 32, evaluating on the validation
# split after every epoch (verbose=2 prints one line per epoch).
# NOTE(review): the seed is set after the model has been built, so it
# presumably does not cover the initial weights — verify if exact
# reproducibility matters.
tf.random.set_seed(0)
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=200,
    batch_size=32,
    verbose=2,
)


In [None]:
# Training curves: accuracy (top) and loss (bottom) per epoch.
# The second curve in each panel comes from the validation_data passed to
# model.fit, so it is labelled 'validation'; the test split is never used
# during training.
fig, (ax_acc, ax_loss) = plt.subplots(2, 1)

ax_acc.plot(history.history['accuracy'], label='train')
ax_acc.plot(history.history['val_accuracy'], label='validation')
ax_acc.set(title='model accuracy', xlabel='epoch', ylabel='accuracy')
ax_acc.legend(loc='lower right')

ax_loss.plot(history.history['loss'], label='train')
ax_loss.plot(history.history['val_loss'], label='validation')
ax_loss.set(title='model loss', xlabel='epoch', ylabel='loss')
ax_loss.legend(loc='upper right')

fig.tight_layout()