In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then echo them one per line.
input_files = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the water-potability CSV inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/water-potability/water_potability.csv'

# Read the whole file into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame to see the columns and typical values.
df.head()

In [None]:
# Print the frame summary: one line per column with its non-null count and dtype.
df.info()

In [None]:
# Number of missing entries in each column.
# Author's note: the gaps are in the ph, Sulfate and Trihalomethanes columns.
df.isna().sum()

In [None]:
# Fraction of each column that is missing: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
df.isna().mean()
# These missing values have to be handled before any learning model is trained.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
# Relative frequency of each class of the target variable.
# Author's note: roughly a 61% / 39% split between the two classes
# (NOTE(review): confirm from the output which class is the majority).
potability_col = df['Potability']
potability_col.value_counts(normalize=True)

In [None]:
# Skewness of every feature column (the target is excluded).
feature_frame = df.drop(columns='Potability')
feature_frame.skew()
# Author's note: the features look close to normally distributed.

In [None]:
# One histogram per feature column (the target is excluded), drawn on a 12x8 figure.
df.drop(columns='Potability').hist(figsize=(12, 8))
# Author's note: Solids is skewed towards lower concentrations, as expected.

In [None]:
# Pairwise correlation between all columns, shown as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True)
# Author's conclusion: no strong correlation between the features,
# so the exploratory analysis stops here.

In [None]:
# Reminder of the fraction of missing data per column
# (ph, Sulfate and Trihalomethanes are the affected columns).
df.isna().mean()

In [None]:
# The missing values will be imputed with scikit-learn's SimpleImputer.
# Before that, separate the label from the predictors so a train-test split
# can be performed first.
y = df['Potability']
X = df.drop(columns='Potability')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing.
# - random_state pins the shuffle so the partition is reproducible.
# - stratify=y keeps the class proportions of the target the same in the train
#   and test splits as in the full data, which matters because the two classes
#   are not balanced (about 61% / 39%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101, stratify=y
)

In [None]:
# Sanity check: the per-column missing fraction of the training split should
# resemble the one computed earlier on the entire dataset.
X_train.isna().mean()

In [None]:
# Per-feature skewness of the training split, for comparison with the
# full-data skew computed earlier.
X_train.skew()

In [None]:
# Histograms of the training features, for comparison with the full-data
# histograms drawn earlier.
X_train.hist(figsize=(12,8))

In [None]:
# The training split mirrors the full dataset, so the missing values can now be
# filled in with scikit-learn's SimpleImputer.

from sklearn.impute import SimpleImputer

# Column medians are learned from the training split only and then applied to
# both splits, so no information from the test rows leaks into the imputer.
imputer = SimpleImputer(strategy='median')
X_train, X_test = imputer.fit_transform(X_train), imputer.transform(X_test)

In [None]:
# The imputer returns bare NumPy arrays. Wrap them back into DataFrames and
# restore what the arrays lost:
# - the feature names (taken from X, which still holds the original columns), so
#   later summaries and models are labelled by feature instead of by 0..8;
# - the row index (taken from the matching target split, which train_test_split
#   keeps aligned row-for-row with the features), so X and y share labels again.
X_train = pd.DataFrame(X_train, columns=X.columns, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=X.columns, index=y_test.index)

In [None]:
# Summary statistics of the training features after imputation.
X_train.describe()

In [None]:
# Count the remaining missing values per column after imputation.
X_train.isna().sum()

In [None]:
# we've successfully imputed the missing data values and thereby have retained 
# more of the dataset

# Training models

In [None]:
# In classifying water as potable, we'll compare three classification models:
# Logistic Regression, Decision Tree Classifier and Random Forest Classifier. 

# Logistic Regression

In [None]:
# Import and train the logistic-regression baseline. Imputation already
# happened above, so this cell only scales, fits and predicts.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression is L2-regularised by default and solved iteratively, so it
# is sensitive to features measured on very different scales. Standardising
# inside a pipeline puts all features on a comparable scale and helps the solver
# converge; the scaler is fit on the training split only, so nothing leaks from
# the test rows. max_iter is raised from the default (100) as extra headroom.
# `lr` still exposes fit/predict; the fitted estimator itself is `lr[-1]`.
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

In [None]:
# Assess the logistic-regression predictions on the held-out test split.
from sklearn.metrics import classification_report, confusion_matrix

lr_report = classification_report(y_test, lr_preds)
lr_matrix = confusion_matrix(y_test, lr_preds)
# A single print with a newline separator emits the report, the blank spacer
# lines and the confusion matrix.
print(lr_report, '\n', lr_matrix, sep='\n')
# Author's observation: many false positives here

# Decision Tree

In [None]:
# Import and train the decision-tree classifier on the already-imputed features.
from sklearn.tree import DecisionTreeClassifier

# The tree builder permutes the features randomly at each split, so an unseeded
# tree can differ from run to run. Pin the seed (same value as the train-test
# split) so the reported metrics are reproducible.
dtc = DecisionTreeClassifier(random_state=101)
dtc.fit(X_train, y_train)
dtc_preds = dtc.predict(X_test)

In [None]:
# Assess the decision-tree predictions on the held-out test split.
from sklearn.metrics import classification_report, confusion_matrix

dtc_report = classification_report(y_test, dtc_preds)
dtc_matrix = confusion_matrix(y_test, dtc_preds)
# A single print with a newline separator emits the report, the blank spacer
# lines and the confusion matrix.
print(dtc_report, '\n', dtc_matrix, sep='\n')

# Random Forest Classifier

In [None]:
# Import and train the random-forest classifier on the already-imputed features.
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets, so an
# unseeded forest gives different metrics on every run. Pin the seed (same value
# as the train-test split) so the comparison between models is reproducible.
rfc = RandomForestClassifier(random_state=101)
rfc.fit(X_train, y_train)
rfc_preds = rfc.predict(X_test)

In [None]:
# Assess the random-forest predictions on the held-out test split.
from sklearn.metrics import classification_report, confusion_matrix

rfc_report = classification_report(y_test, rfc_preds)
rfc_matrix = confusion_matrix(y_test, rfc_preds)
# A single print with a newline separator emits the report, the blank spacer
# lines and the confusion matrix.
print(rfc_report, '\n', rfc_matrix, sep='\n')