# Importing the libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
# Use matplotlib's dark theme for every figure in this notebook; the pie chart
# further down sets white title/label text to match it.
plt.style.use("dark_background")

from scipy.stats import boxcox, skew
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Loading the data

In [None]:
# NOTE(review): the relative path looks like a Kaggle input mount — adjust it
# when running the notebook elsewhere.
df = pd.read_csv("../input/water-potability/water_potability.csv")
# Preview the first rows of the loaded table.
df.head()

# Exploring the data

Knowledge about the data you are working on is very important for data analysis.

What I have used:

- [pandas.DataFrame.info](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.info.html)

- [pandas.DataFrame.isna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isna.html)

- [pandas.DataFrame.describe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html)

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

# Dropping null values

In [None]:
# Remove every row that contains at least one missing value, modifying df
# itself, then re-count missing values to confirm none remain.
df.dropna(inplace=True)
df.isna().sum()

# Visualizing the data

In [None]:
# Class balance: number of samples in each Potability class.
sns.countplot(data=df, x="Potability", palette="Set2")
plt.show()

In [None]:
# Share of each Potability class as a pie chart.
#
# Labels and wedge sizes are taken from the SAME value_counts() result so each
# wedge is paired with its own class label. unique() orders values by first
# appearance while value_counts() orders them by descending frequency, so
# mixing the two can attach a label to the wrong wedge.
class_counts = df["Potability"].value_counts()
labels = class_counts.index.to_numpy()
x = class_counts
# Offset of each wedge is 0.1 * its class value, so with 0/1 labels only the
# class-1 wedge is pulled out of the pie.
explode = 0.1 * labels

fig1, ax1 = plt.subplots()
patches, texts, autotexts = ax1.pie(
    x=x,
    explode=explode,
    labels=labels,
    autopct="%1.2f%%",
    shadow=True,
    startangle=45,
)
ax1.axis("equal")  # equal aspect ratio keeps the pie circular

plt.legend(patches, labels, fontsize="xx-large")
plt.title("Potability", color="white", size=20)
# White wedge labels for the dark background, black percentages on the wedges.
plt.setp(texts, color="white", fontsize=15)
plt.setp(autotexts, color="black", size=12)

plt.tight_layout()
plt.show()

In [None]:
# One box plot per feature, split by Potability class.
# Every column except the last one is treated as a feature.
feature_names = df.columns[:-1]
for feature in feature_names:
    sns.catplot(data=df, x="Potability", y=feature, kind="box")

In [None]:
# Pairwise correlation between the features (the label column is excluded).
cor = df.drop("Potability", axis=1).corr()
plt.figure(figsize=(10, 7))
# `linewidths` is the keyword sns.heatmap documents for the width of the lines
# separating cells; the singular `linewidth` only worked by falling through
# **kwargs to matplotlib.
sns.heatmap(cor, annot=True, linewidths=0.6, linecolor="black")
plt.show()

In [None]:
# Scatter-plot matrix of all columns, coloured by Potability class.
sns.pairplot(df, hue="Potability", palette="Set2")
plt.show()

Looking at the plot above, our data appears to be skewed.

# Fixing Skewed Data

In [None]:
# Print the sample skewness of every column except the last one.
for feature in df.columns[:-1]:
    feature_skew = skew(df[feature])
    print(f"{feature} : {feature_skew}")

## Log Transformation

In [None]:
# from sklearn.preprocessing import FunctionTransformer

# transformer = FunctionTransformer(np.log10, validate=True)
# for x in df.columns[:-1]:
#     df[x] = transformer.transform(df[[x]])

## Power Transformation

In [None]:
# Replace every feature column with its Box-Cox transform; the fitted lambda
# returned alongside the transformed values is discarded.
# NOTE(review): boxcox expects strictly positive input, and lambda is fitted
# here on the full data set (before the train/test split) — confirm both are
# acceptable for this analysis.
for feature in df.columns[:-1]:
    transformed, _ = boxcox(df[feature])
    df[feature] = transformed

In [None]:
# Print the sample skewness again, now on the transformed columns.
for feature in df.columns[:-1]:
    feature_skew = skew(df[feature])
    print(f"{feature} : {feature_skew}")

In [None]:
# Scatter-plot matrix of all columns after the transformation, coloured by class.
sns.pairplot(df, hue="Potability", palette="Set2")
plt.show()

# Building the models

## Creating features (X) and label (y)

Features are often referred to as "independent variables", and the label is often referred to as the "dependent variable".

Here `Potability` is our label because it depends on other features.

In [None]:
# Label: the Potability column. Features: every other column.
y = df["Potability"]
X = df.drop(columns="Potability")

## Splitting the data into training and testing set

- The training data set is used to fit the model so that it can learn the patterns in the data.
- The testing data set is used for prediction and for an unbiased evaluation of the final model.

We can do this by using [sklearn.model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).

Training data set - 80% of the total data

Testing data set - 20% of the total data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Scaling the data

Scaling means transforming the data so that it fits within a specific scale. We can do this by using [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

StandardScaler is useful for the features that follow a __Normal Distribution__. Previously when we fixed the skewed data we got something similar to a normal distribution.

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def score(y_test=y_test, y_pred=None):
    """Print accuracy and MAE for a set of predictions and plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels. Defaults to the ``y_test`` object that existed when this
        function was defined (default arguments are evaluated once, at
        definition time).
    y_pred : array-like
        Predicted labels to compare against ``y_test``. Must be supplied.

    Returns
    -------
    float
        Accuracy as a percentage, rounded to two decimal places.
    """
    # Convert to percent BEFORE rounding. Rounding the 0-1 fraction to two
    # decimals first throws away everything below one percentage point and
    # leaves float artefacts (e.g. 56.99999999999999) in the returned value.
    acc = round(accuracy_score(y_test, y_pred) * 100, 2)
    mae = round(mean_absolute_error(y_test, y_pred), 2)
    print(f"Accuracy: {acc:.2f}%, MAE: {mae}")

    cm = confusion_matrix(y_test, y_pred)
    # `linewidths` is the keyword sns.heatmap documents for cell separators;
    # fmt="g" prints the counts without scientific notation.
    sns.heatmap(cm, annot=True, linewidths=0.8, linecolor="black", fmt="g")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Truth")
    plt.show()

    return acc

In [None]:
# Eight float slots, initialised to zero, that will hold each model's accuracy.
accuracy_scores = np.zeros(shape=8, dtype=np.float64)

## Logistic Regression

In [None]:
# Logistic regression with the Newton-CG solver; accuracy is stored in slot 0.
clf1 = LogisticRegression(solver="newton-cg", random_state=42)
clf1.fit(X_train, y_train)
y_pred1 = clf1.predict(X_test)
accuracy_scores[0] = score(y_pred=y_pred1)

## Random Forest Classifier

In [None]:
# Random forest of 15 trees; accuracy is stored in slot 1.
clf2 = RandomForestClassifier(n_estimators=15, random_state=42)
clf2.fit(X_train, y_train)
y_pred2 = clf2.predict(X_test)
accuracy_scores[1] = score(y_pred=y_pred2)

## Decision Tree Classifier

In [None]:
# Entropy-based decision tree limited to depth 6; accuracy is stored in slot 2.
clf3 = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    max_depth=6,
    random_state=42,
)
clf3.fit(X_train, y_train)
y_pred3 = clf3.predict(X_test)
accuracy_scores[2] = score(y_pred=y_pred3)

## Support Vector Classifier

In [None]:
# Support vector classifier with library defaults; accuracy is stored in slot 3.
clf4 = SVC(random_state=42)
clf4.fit(X_train, y_train)
y_pred4 = clf4.predict(X_test)
accuracy_scores[3] = score(y_pred=y_pred4)

## Naive Bayes

In [None]:
# Gaussian naive Bayes; accuracy is stored in slot 4.
clf5 = GaussianNB()
clf5.fit(X_train, y_train)
y_pred5 = clf5.predict(X_test)
accuracy_scores[4] = score(y_pred=y_pred5)

## Stochastic Gradient Descent

In [None]:
# Linear classifier trained with stochastic gradient descent; accuracy is
# stored in slot 5.
clf6 = SGDClassifier(random_state=42)
clf6.fit(X_train, y_train)
y_pred6 = clf6.predict(X_test)
accuracy_scores[5] = score(y_pred=y_pred6)

## K-Nearest Neighbours

In [None]:
# Error rate on the test split for every K from 1 to 49.
# NOTE(review): K is chosen using the test split, so the test accuracy reported
# for KNN is optimistic — consider a validation split or cross-validation.
k_values = range(1, 50)
err_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    err_rate.append(np.mean(y_pred != y_test))

min_err = min(err_rate)
k_index = err_rate.index(min_err)  # 0-based position inside err_rate
# err_rate[0] belongs to K = 1, so map the list position back to the K that
# was actually evaluated instead of reporting the raw index (off by one).
best_k = k_values[k_index]
print(f"Minimum error of {min_err} at K = {best_k}.")

In [None]:
# K-nearest neighbours with K fixed at 12; accuracy is stored in slot 6.
# NOTE(review): 12 is presumably the best K from the search cell — confirm.
clf7 = KNeighborsClassifier(n_neighbors=12)
clf7.fit(X_train, y_train)
y_pred7 = clf7.predict(X_test)
accuracy_scores[6] = score(y_pred=y_pred7)

## Gradient Boosting Classifier

In [None]:
# Gradient boosting: 100 trees of depth 9, learning rate 0.1; accuracy is
# stored in slot 7.
clf8 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=9,
    random_state=42,
)
clf8.fit(X_train, y_train)
y_pred8 = clf8.predict(X_test)
accuracy_scores[7] = score(y_pred=y_pred8)

# Conclusion

In [None]:
# Model names, listed in the same order as the slots of accuracy_scores.
models = [
    "Logistic Regression",
    "Random Forest Classifier",
    "Decision Tree Classifier",
    "Support Vector Classifier",
    "Naive Bayes",
    "Stochastic Gradient Descent",
    "K-Nearest Neighbours",
    "Gradient Boosting Classifier",
]

# One bar per model showing the accuracy recorded for it.
plt.figure(figsize=(10, 6))
sns.barplot(x=models, y=accuracy_scores)

plt.xlabel("Model Name")
plt.ylabel("Accuracy")
# Rotate the long model names so they do not overlap.
plt.xticks(rotation=-90)

plt.show()