# Understanding the data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the water-potability CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/water-potability/water_potability.csv')
# Show the loaded frame as the cell output.
df

Check for basic information about the features

In [None]:
# Print each column's dtype and non-null count; unequal counts reveal missing values.
df.info()

### Preliminary stats

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

#### Null Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

#### Other Statistical parameters

In [None]:
# Per-column kurtosis (pandas reports Fisher's definition, where a normal distribution scores 0).
df.kurtosis()

In [None]:
# Per-column skewness; values far from 0 indicate an asymmetric distribution.
df.skew()

## Visualization of Distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One histogram per feature on a 3x3 grid, filled row by row in the order listed.
# Driving the panels from a list replaces nine copy-pasted hist/set_title pairs
# and keeps the panel title tied to the column actually plotted.
hist_features = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

fig = plt.figure(figsize=(12, 12), constrained_layout=True)
axis = fig.subplots(3, 3)

# axis.flat walks the grid in row-major order: (0,0), (0,1), (0,2), (1,0), ...
for panel, feature in zip(axis.flat, hist_features):
    panel.hist(df[feature])
    panel.set_title(feature)

In [None]:
import seaborn as sns

In [None]:
fig = plt.figure(figsize=(12,12), constrained_layout = True)
axis = fig.subplots(3, 3)

# Draw one filled KDE per feature onto the 3x3 grid created above, row by row.
# A single loop replaces nine copy-pasted kdeplot calls.
kde_features = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

# axis.flat walks the grid in row-major order: (0,0), (0,1), (0,2), (1,0), ...
for panel, feature in zip(axis.flat, kde_features):
    sns.kdeplot(df[feature], ax=panel, fill=True)

#### Correlation Between Features

In [None]:
# Pairwise correlation between all columns (pandas defaults to Pearson's coefficient).
df.corr()

In [None]:
# Annotated heatmap of the pairwise correlation matrix.
corr_matrix = df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(data=corr_matrix, annot=True, cmap="inferno")

### Observations:

The total number of features: 9


1. The differing non-null counts across columns show that the dataset contains missing (null) values.
2. The feature ranges vary widely, from as small as 0.0 up to 61227.19, hence feature scaling is required.
3. The features are largely uncorrelated; the only exception is a small correlation between the features "Solids" and "Sulfate".


### Preparing the data

Removing null values:

We can use multiple methods here:
1. Remove the rows with missing values (but we lose data, so use this only when few values are missing).
2. Replace missing values with the mean (generalization takes place, but this works well only when there are no outliers).
3. Replace missing values with the mode (repeating the most common value might bias the model later on).
4. Replace missing values with the median (good to use when many outliers are present in the dataset).

#### Methods used in this notebook

- Replacing pH null values with the median, since the mean of the pH values is higher than the preferred pH range, i.e. 6.52-6.83.
- Replacing the missing Trihalomethanes values with the mean, since there are very few outliers.

Note that the Sulfate values are not imputed at this stage: the number of missing values is very high, and using any of the above methods would only introduce bias into the model. They will instead be imputed using regression.

In [None]:
# Fill-in values for the two columns imputed directly; pandas skips NaN by default
# when computing the median and the mean.
ph_median = df["ph"].median()
tri_mean = df["Trihalomethanes"].mean()

In [None]:
# Fill the missing values and assign the result back to the column.
# Calling .replace(..., inplace=True) on df["col"] mutates an intermediate Series:
# pandas warns about this pattern and, with Copy-on-Write enabled, leaves df
# unchanged, so the NaNs would silently survive.
df["ph"] = df["ph"].fillna(ph_median)
df["Trihalomethanes"] = df["Trihalomethanes"].fillna(tri_mean)

#### Removing Skewness

The features Solids and Conductivity are skewed, as we can observe from the KDE and histogram plots, so it is necessary to reduce this skewness before going forward. The most popular method is the log transform, which requires non-zero, positive values; this notebook applies the milder square-root transform instead, which only requires non-negative values.
In our case both of these features are positive and do not contain any zero values.

In [None]:
# Apply a square-root transform to the two skewed features to reduce their skewness.
# Running this cell twice would apply the transform twice.
for skewed_col in ('Solids', 'Conductivity'):
    df[skewed_col] = np.sqrt(df[skewed_col])

In [None]:
# Re-check per-column skewness now that Solids and Conductivity have been square-root transformed.
df.skew()

We keep comparing the data manipulations with normal distributions to ensure proper data cleaning.

In [None]:
# One filled KDE per feature on a 3x3 grid, filled row by row in the order listed.
# A single loop replaces nine copy-pasted kdeplot calls.
kde_features = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

fig = plt.figure(figsize=(12, 12), constrained_layout=True)
axis = fig.subplots(3, 3)

# axis.flat walks the grid in row-major order: (0,0), (0,1), (0,2), (1,0), ...
for panel, feature in zip(axis.flat, kde_features):
    sns.kdeplot(df[feature], ax=panel, fill=True)

### Standardization (feature scaling) of each feature

This is necessary because the typical magnitude of each feature differs hugely. Failing to do so will result in a biased model, since features with large values would dominate the others.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scale = StandardScaler()

# Standardize the first nine columns in one call instead of nine copy-pasted
# single-column calls. StandardScaler centres and scales every column
# independently, so the per-column result is the same; NaNs (e.g. in Sulfate)
# are ignored while fitting and kept as NaN in the output.
# Presumably the remaining column is the Potability label, which stays unscaled
# -- TODO confirm against the CSV column order.
feature_cols = df.columns[:9]
# NOTE(review): the scaler is fit on every row, before the train/test split made
# later in the notebook, so statistics of the test rows leak into the features.
df[feature_cols] = scale.fit_transform(df[feature_cols])

In [None]:
# Display the frame after standardization.
df

#### Regressive Imputation

Imputing the missing Sulfate field values using regression, because of the high number of missing values.

In [None]:
# Training set for the imputation model: rows whose Sulfate value is present.
has_sulfate = df['Sulfate'].notna()

# Keep the label and the regression target aside so they can be re-attached later.
pot_reg = df.loc[has_sulfate, 'Potability']
sulfate_reg = df.loc[has_sulfate, 'Sulfate']

# Predictor matrix: every remaining column.
df_reg = df.loc[has_sulfate].drop(columns=['Potability', 'Sulfate'])

# Check that no predictor still contains missing values.
df_reg.isnull().sum()

In [None]:
# Prediction set for the imputation model: rows whose Sulfate value is missing.
lacks_sulfate = df['Sulfate'].isna()

# Keep the label aside; sulfate_pred holds only NaN here and is overwritten
# with the model's predictions in a later cell.
pot_pred = df.loc[lacks_sulfate, 'Potability']
sulfate_pred = df.loc[lacks_sulfate, 'Sulfate']

# Predictor matrix with the same columns as the training predictors.
df_pred = df.loc[lacks_sulfate].drop(columns=['Potability', 'Sulfate'])

In [None]:
# Check that the predictors of the rows to be imputed contain no missing values.
df_pred.isnull().sum()

In [None]:
from sklearn import linear_model

In [None]:
# Linear regressor used to impute Sulfate.
# Note: the name `model` is reassigned to the Keras network later in the notebook.
model = linear_model.LinearRegression()

In [None]:
# Fit on the rows with a known Sulfate value: the remaining features are the predictors.
model.fit(X=df_reg, y=sulfate_reg)

In [None]:
# Predict Sulfate for the rows where it is missing; this replaces the all-NaN placeholder.
sulfate_pred = model.predict(df_pred)

In [None]:
# Re-attach the Sulfate column (observed or predicted) and the Potability label
# to each subset, in that order, so both frames share the same column layout.
df_reg = df_reg.assign(Sulfate=sulfate_reg, Potability=pot_reg)
df_pred = df_pred.assign(Sulfate=sulfate_pred, Potability=pot_pred)

In [None]:
# Stack the two subsets back into one frame. The rows with an observed Sulfate value
# now come first, followed by the imputed rows, and the columns end with
# Sulfate then Potability.
df = pd.concat([df_reg, df_pred])

In [None]:
# One filled KDE per feature on a 3x3 grid, filled row by row in the order listed.
# The features are named explicitly because the column order of df changed when
# the imputed subsets were concatenated. A single loop replaces nine copy-pasted
# kdeplot calls.
kde_features = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

fig = plt.figure(figsize=(12, 12), constrained_layout=True)
axis = fig.subplots(3, 3)

# axis.flat walks the grid in row-major order: (0,0), (0,1), (0,2), (1,0), ...
for panel, feature in zip(axis.flat, kde_features):
    sns.kdeplot(df[feature], ax=panel, fill=True)

#### Shuffling the data by rows.

In [None]:
# Shuffle the rows with a fixed seed, so that every downstream result that depends
# on row order is repeatable across kernel restarts. Without random_state each run
# produced a different ordering. The seed matches the one used for the
# train/test split.
df = df.sample(frac=1, random_state=120)

The data is now ready for modelling and classification.

# Models

In [None]:
# Target vector: the Potability label as an integer NumPy array.
y = df['Potability'].to_numpy(dtype=int)

In [None]:
# Remove the label so that df holds only the feature columns.
# Re-running this cell fails because the column is already gone.
df = df.drop(columns='Potability')

In [None]:
# Feature matrix as a float32 NumPy array, one row per sample.
x = df.to_numpy(dtype='float32')

Splitting into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
# for a given row order.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=120)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Base classifiers, all with default hyper-parameters.
# The tree-based estimators are randomized, so they get a fixed seed to make the
# reported scores repeatable; the value matches the train/test split seed.
SEED = 120

clf1 = LogisticRegression()
clf2 = KNeighborsClassifier()
clf3 = RandomForestClassifier(random_state=SEED)
clf4 = DecisionTreeClassifier(random_state=SEED)
clf5 = GaussianNB()
clf6 = ExtraTreesClassifier(random_state=SEED)

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# (report label, ensemble key, estimator) for every base model.
base_models = [
    ('Logistic Regression', 'lr', clf1),
    ('Knn', 'kn', clf2),
    ('Random Forest', 'rf', clf3),
    ('Decision Tree', 'dt', clf4),
    ('Gaussian NB', 'gnb', clf5),
    ('Extra Tree', 'etc', clf6),
]

# Majority-vote ensemble over all base models.
eclf = VotingClassifier(
    estimators=[(key, estimator) for _, key, estimator in base_models],
    voting='hard',
)

# Score each base model, then the ensemble, with 5-fold cross-validation on the full data.
candidates = [(title, estimator) for title, _, estimator in base_models]
candidates.append(('Ensemble', eclf))

for label, clf in candidates:
    scores = cross_val_score(clf, x, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Small feed-forward binary classifier: 9 inputs -> 12 -> dropout -> 8 -> 1 sigmoid output.
# Note: this rebinds `model`, which earlier held the Sulfate imputation regressor.
model = Sequential([
    Dense(12, input_dim=9, activation='relu'),
    Dropout(0.3),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 200 epochs with mini-batches of 16.
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported val_* metrics are computed on the test set.
model.fit(x_train, y_train, epochs=200, batch_size=16, validation_data=(x_test,y_test))

# Final Thoughts

The data contains some inherent variation that causes fundamental confusion during classification. The usual solutions for such a problem are:
- Increasing the size of the dataset
- Adding more features

After modelling using various techniques, it is clear that only these models are usable,
- Ensemble Model
- Random Forest
- Extra Tree Classification

Even though the other models do not produce significant results on their own, they are nevertheless necessary for reducing confusion in the ensemble model.

# Thank You