In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, classification_report, plot_confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
%matplotlib inline


In [None]:
# Read the water-potability CSV (path is relative to the notebook) into the working frame.
df = pd.read_csv(
    filepath_or_buffer='../input/water-potability/water_potability.csv',
)

# Basic information about the dataset

In [None]:
# Print the frame's summary via DataFrame.info().
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that contains a missing value, then renumber the rows.
# NOTE: reset_index() is called without drop=True, so the old row labels are
# kept as a new leading 'index' column and the frame gains one column here.
df = df.dropna().reset_index()

In [None]:
# Per-column count of missing values (sanity check of the current frame).
df.isna().sum()

In [None]:
# Preview the first five rows.
df.head(n=5)

### Exploratory Data Analysis

In [None]:
# Pie chart of the class balance: samples with Potability == 0 vs Potability == 1.
potability_counts = df['Potability'].value_counts()
labels = ['Non-Potable', 'Potable']
data = [potability_counts[label] for label in (0, 1)]  # same order as `labels`

fig1, ax1 = plt.subplots(figsize=(15, 6))
ax1.pie(
    data,
    labels=labels,
    explode=[0.05, 0.05],
    autopct='%1.1f%%',
    pctdistance=0.5,
    shadow=True,
    colors=['#1f78b4', '#b2df8a'],
)
ax1.set_title("Water Potability", fontsize=15)
plt.show()

### Correlation

In [None]:
# Correlation heatmap of the dataset columns, annotated as percentages.
new_df = pd.get_dummies(df)
plt.figure(figsize=(10, 10))
# The 'index' column is only a leftover of reset_index(); correlating row numbers
# with the measurements is noise, so it is excluded when present.
corr = new_df.drop(columns=['index'], errors='ignore').corr()
sns.heatmap(corr*100, cmap="Paired", annot=True, fmt=".0f")
plt.title("Correlation between columns (%)", fontsize=15)
plt.show()

### Relationship between Trihalomethanes and Chloramines
* Trihalomethanes (THMs) are chemicals that may be found in water treated with chlorine.

In [None]:
# Hex-binned joint distribution of Chloramines (x) against Trihalomethanes (y).
sns.jointplot(
    data=df,
    x="Chloramines",
    y="Trihalomethanes",
    kind="hex",
    color='#b2df8a',
    marginal_kws={'color': '#1f78b4'},
);


### Density Distribution

In [None]:
# One stacked KDE panel per feature, split by Potability.
# Features are picked by name instead of by position, so the number of panels
# always matches the number of features whether or not the 'index' column left
# behind by reset_index() is present.
feature_cols = [c for c in df.columns if c not in ('index', 'Potability')]

# squeeze=False always yields a 2-D array of axes (even for a single panel);
# ravel() flattens it so panels can be paired with features one-to-one.
fig, ax = plt.subplots(nrows=len(feature_cols), figsize=(10, 35), squeeze=False)
ax = ax.ravel()

for panel, cols in zip(ax, feature_cols):
    # Long-form call (data= / x= / hue=): both variables are looked up by name in
    # the same frame instead of passing a Series positionally with a hue vector.
    sns.kdeplot(data=df, x=cols, hue='Potability', fill=True, alpha=1,
                palette='Paired', multiple='stack', ax=panel).set_title(cols, fontsize=15)

    # The panel title already names the feature; blank out the axis labels.
    panel.set_xlabel(' ')
    panel.set_ylabel(' ')
plt.show()


In [None]:
# Pairwise relationships between the columns with regression fits, coloured by Potability.
# The 'index' column (a leftover of reset_index()) is not a measurement, so it is
# excluded when present instead of being plotted as an extra variable.
ax = sns.pairplot(
    df.drop(columns=['index'], errors='ignore'),
    hue="Potability",
    kind="reg",
    palette="Paired",
)
plt.show()

### Preparing the Data

In [None]:
# Feature matrix: every column except the target and the 'index' column that
# reset_index() may have added (row numbers carry no information about the water).
X = df.drop(columns=['index', 'Potability'], errors='ignore').values
# Target as a 1-D label vector, the shape scikit-learn estimators expect for y.
Y = df['Potability'].values

In [None]:
# Hold out 15% of the rows for testing.
# random_state makes the split (and every score computed from it) reproducible;
# stratify keeps the potable / non-potable ratio the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.15, stratify=Y, random_state=42
)

In [None]:
# Standardise the features using statistics learned from the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(x_train)
# transform (not fit_transform): the test rows must be scaled with the training
# mean/std; re-fitting on the test set would scale the two sets inconsistently.
X_test = sc.transform(x_test)

### Modelling

In [None]:
def model_evaluate(model, test):
    """Print a classification report and draw the confusion matrix for `model`.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test : feature matrix to predict on. Its rows must line up with the
        notebook-level ``y_test`` labels, which are used as the ground truth.
    """
    y_pred = model.predict(test)
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)

    # confusion_matrix orders rows/columns by sorted label value, so position 0
    # is class 0 (not potable) and position 1 is class 1 (potable).
    categories = ['Not Potable', 'Potable']

    sns.heatmap(cm, cmap = 'Blues', fmt = 'd', annot = True,
                xticklabels = categories, yticklabels = categories)

    plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    plt.ylabel("Actual values"   , fontdict = {'size':14}, labelpad = 10)
    plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

In [None]:
# Candidate classifiers, keyed by the display name used in the printout and chart.
all_models = {'LogisticRegression':LogisticRegression(random_state = 0), 
              'SVC':SVC(), 
              'GaussianNaiveBayes':GaussianNB(), 
              'Bernoulli':BernoulliNB(), 
              'KneighborsClassifier':KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski'), 
              'DecisionTree':DecisionTreeClassifier(random_state = 0), 
              'RandomForest':RandomForestClassifier(n_estimators = 10, criterion = 'entropy'),
              'AdaBoostClassifier':AdaBoostClassifier(n_estimators = 50),
              'XGBClassifier': XGBClassifier(n_estimators = 100),
              'LGBMClassifier': LGBMClassifier(n_estimators=100)
         }
model_names = []
model_scores = []

for model_name, estimator in all_models.items():
    # The pipeline does its own scaling, so it is given the raw split: the scaler
    # is fit on the training rows only and reused unchanged on the test rows.
    pipeline = make_pipeline(StandardScaler(), estimator)
    # np.ravel guarantees 1-D labels whether y was built as a vector or a column.
    pipeline.fit(x_train, np.ravel(y_train))
    predictions = pipeline.predict(x_test)
    accuracy = accuracy_score(np.ravel(y_test), predictions)
    model_names.append(model_name)
    model_scores.append(accuracy * 100)
    # Format numerically; slicing str(float) mis-renders values such as 100.0 or 1e-05.
    print(f"{model_name} got {accuracy * 100:.2f}% Accuracy.")

# Horizontal bar chart of test accuracy per model, each bar annotated with its value.
plt.figure(figsize = (9,5))
plt.barh(model_names, model_scores)

for index, value in enumerate(model_scores):
    plt.text(value, index, f"{value:.1f}%")

plt.title("Models vs Accuracy")
plt.show()

In [None]:
# XGBoost: cross-validate on the training split, then fit once on all of it and
# report on the held-out split.
model = XGBClassifier(
    n_estimators=100,
    eval_metric='logloss',
    use_label_encoder=False,
)
scores = cross_validate(
    model, X_train, y_train,
    n_jobs=-1,
    return_train_score=True,
)
# (train, test) pairs; assigned here but not read anywhere in this cell.
evaluation = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train)

model_evaluate(model, X_test)
# Both figures are means over the cross-validation folds of X_train; the
# "Test Score" is the fold validation score, not the score on X_test.
mean_train_score = np.mean(scores['train_score'])
mean_test_score = np.mean(scores['test_score'])
print("Train Score : ", mean_train_score)
print("Test Score : ", mean_test_score)

In [None]:
# LightGBM: cross-validate on the training split, then fit once on all of it and
# report on the held-out split.
# Fixed keyword: `n_estimators` (was misspelled `n_etimators`, which is not a
# LightGBM parameter and therefore never configured the number of trees).
model = LGBMClassifier(n_estimators=100)
scores = cross_validate(model, X_train, y_train, return_train_score=True, n_jobs=-1)
# (train, test) pairs; assigned here but not read anywhere in this cell.
evaluation = [( X_train, y_train), ( X_test, y_test)]
model.fit(X_train, y_train)

model_evaluate(model, X_test)
# Both figures are means over the cross-validation folds of X_train.
print("Train Score : ", np.mean(scores['train_score']))
print("Test Score : ", np.mean(scores['test_score']))