In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the water-potability CSV from the input directory into a DataFrame
# and preview its first rows.
df = pd.read_csv('../input/water-potability/water_potability.csv')
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
# Check for balance of the data in target variable

In [None]:
df.shape

In [None]:
import plotly.express as px

In [None]:
# Histogram of the target column, to see how the samples are distributed
# across the Potability values.
fig = px.histogram(df['Potability'])
fig.update_layout(bargap=0.2)  # leave a visible gap between the bars
fig.show()

In [None]:
# Correlation plot: pairwise correlation of all columns, annotated with the values.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    annot_kws={'size': 9},
    xticklabels=df.columns,
    yticklabels=df.columns,
    ax=ax,
)

In [None]:
import plotly.express as px

# Pairwise scatter plots of every feature (the target column is left out).
# `dimensions` expects column names, so the feature names are passed explicitly
# instead of handing over a whole DataFrame as the second positional argument.
feature_names = [col for col in df.columns if col != 'Potability']
fig = px.scatter_matrix(df, dimensions=feature_names, height=1250, width=1250)
fig.show()

# Different approaches of dealing with missing values
## Approach 1:
If data is missing, drop the rows that contain NaN values

In [None]:
na_cols = ['ph', 'Sulfate', 'Trihalomethanes']

In [None]:
df2 = df.dropna(subset = na_cols)

In [None]:
df2.head()

In [None]:
df2.shape

In [None]:
df2.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Renumber the rows from 0 and discard the old index (drop=True keeps it from
# being added back as a column).
df2 = df2.reset_index(drop=True)

In [None]:
# df2.drop(['index'], axis=1, inplace=True)

In [None]:
df2.head()

In [None]:
y1 = df2.loc[:, 'Potability']

In [None]:
y1.head()

In [None]:
X1 = df2.drop(['Potability'], axis=1)

In [None]:
# Hold out 25% of the rows for testing; random_state=0 makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(X1, y1, test_size=0.25, random_state=0)

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression(random_state=0)
classifier1.fit(train_x, train_y)

In [None]:
pred1 = classifier1.predict(test_x)

In [None]:
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(test_y, pred1)

In [None]:
cm1

In [None]:
(308+3)/test_x.shape[0]

## Approach 2: dropping the columns with nan values

In [None]:
df3 = df.drop(['Potability'], axis=1)

In [None]:
df3.head()

In [None]:
df3.isnull().sum()

In [None]:
y2 = df['Potability']

In [None]:
# Build the feature frame straight from df: remove the target and the columns
# listed in na_cols. Unlike an in-place drop on df3, this cell can be re-run
# without raising KeyError for columns that were already removed.
df3 = df.drop(columns=['Potability', *na_cols])

In [None]:
df3.head()

In [None]:
train_x2, test_x2, train_y2, test_y2 = train_test_split(df3, y2, test_size=0.25, random_state=0)

In [None]:
# Logistic regression on the approach-2 training split, with the iteration
# limit raised to 1000.
classifier2 = LogisticRegression(random_state=0, max_iter=1000)
classifier2.fit(train_x2, train_y2)

In [None]:
pred2 = classifier2.predict(test_x2)

In [None]:
cm2 = confusion_matrix(test_y2, pred2)
cm2

In [None]:
# DecisionTree
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeClassifier

# Fit a single tree on the approach-2 training split, then predict the hold-out rows.
tree1 = DecisionTreeClassifier(random_state=0).fit(train_x2, train_y2)
pred3 = tree1.predict(test_x2)

In [None]:
# Mean absolute difference between true and predicted labels.
# NOTE(review): assuming Potability is coded 0/1, this equals the share of
# misclassified samples (1 - accuracy) — confirm the label coding.
mean_absolute_error(test_y2, pred3)

In [None]:
from sklearn.metrics import precision_score

In [None]:
precision_score(test_y2, pred2, average='macro')

In [None]:
precision_score(test_y2, pred3, average='macro')

In approach 2 (dropping the columns with NaN values), the decision tree gave a better precision_score than logistic regression.

## Approach 3: filling the NaN values using an imputer

In [None]:
X = df.drop('Potability', axis=1)

In [None]:
X.shape

In [None]:
y = df['Potability']
y.shape

In [None]:
# 80/20 split of the full feature set (missing values still present at this point).
# NOTE: this rebinds train_y / test_y, which were first assigned by the
# approach-1 split; re-running approach-1 cells after this one would pair
# their features with these labels instead.
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)


In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values with SimpleImputer's default strategy (the column mean).
# The imputer is fitted on the training rows only and then applied to the test
# rows. Column names and the original row index are passed to the DataFrame
# constructor so the imputed frames stay labelled like train_X / test_X and
# remain aligned with train_y / test_y.
my_imputer = SimpleImputer()
imputed_train_X = pd.DataFrame(
    my_imputer.fit_transform(train_X), columns=train_X.columns, index=train_X.index
)
imputed_test_X = pd.DataFrame(
    my_imputer.transform(test_X), columns=test_X.columns, index=test_X.index
)

In [None]:
imputed_train_X.columns = train_X.columns
imputed_test_X.columns = test_X.columns

In [None]:
imputed_test_X.isnull().sum()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Random forest of 100 trees with a fixed seed, trained on the imputed training features.
forest_model = RandomForestClassifier(n_estimators=100, random_state=0)
forest_model.fit(imputed_train_X, train_y)


In [None]:
# Predict the imputed hold-out rows with the forest, then report MAE and macro precision.
forest_preds = forest_model.predict(imputed_test_X)
forest_mae = mean_absolute_error(test_y, forest_preds)
print(forest_mae)
forest_precision = precision_score(test_y, forest_preds, average='macro')
print(forest_precision)

In [None]:
# Single decision tree on the same imputed split, scored with the same two metrics.
tree_model2 = DecisionTreeClassifier(random_state=0).fit(imputed_train_X, train_y)
tree_preds = tree_model2.predict(imputed_test_X)

tree_mae = mean_absolute_error(test_y, tree_preds)
print(tree_mae)
tree_precision = precision_score(test_y, tree_preds, average='macro')
print(tree_precision)

## scaling the data

In [None]:
from mlxtend.preprocessing import minmax_scaling

# Min-max scale every column of X.
# NOTE(review): this runs on the full X before the train/test split further
# down, so the scaling statistics also see rows that later land in the test
# set — consider scaling after the split.
scaled_X = minmax_scaling(X, columns = X.columns)

In [None]:
# Side-by-side distribution of 'Solids' before and after min-max scaling.
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the same kind of figure (density-normalised histogram plus a KDE curve).
fig, ax = plt.subplots(1, 2)
sns.histplot(X['Solids'], kde=True, stat='density', ax=ax[0])
ax[0].set_title("Original data")
sns.histplot(scaled_X['Solids'], kde=True, stat='density', ax=ax[1])
ax[1].set_title("Scaled data")

In [None]:
train_X3, test_X3, train_y3, test_y3 = train_test_split(scaled_X, y, train_size=0.8, test_size=0.2, random_state=0)



In [None]:
train_X3.isnull().sum()

In [None]:
# Fill missing values in the scaled features with SimpleImputer's default
# strategy (the column mean), fitted on the training rows only. Column names and
# the original row index are passed to the DataFrame constructor so the imputed
# frames stay labelled like train_X3 / test_X3 and aligned with train_y3 / test_y3.
my_imputer = SimpleImputer()
imputed_train_X3 = pd.DataFrame(
    my_imputer.fit_transform(train_X3), columns=train_X3.columns, index=train_X3.index
)
imputed_test_X3 = pd.DataFrame(
    my_imputer.transform(test_X3), columns=test_X3.columns, index=test_X3.index
)

In [None]:
imputed_train_X3.columns = train_X3.columns
imputed_test_X3.columns = test_X3.columns

In [None]:
imputed_train_X3.isnull().sum()

In [None]:
# Random forest on the scaled and imputed split, scored with MAE and macro precision.
forest_model2 = RandomForestClassifier(n_estimators=100, random_state=0).fit(imputed_train_X3, train_y3)
forest_preds2 = forest_model2.predict(imputed_test_X3)

forest2_mae = mean_absolute_error(test_y3, forest_preds2)
print(forest2_mae)
forest2_precision = precision_score(test_y3, forest_preds2, average='macro')
print(forest2_precision)

### Random Forest gives the best result, with an accuracy_score of 69.66%

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test samples that forest_model2 classified correctly.
print(accuracy_score(test_y3, forest_preds2))