In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import packages and dataset

In [None]:
# Import standard packages and dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

data = pd.read_csv("../input/water-potability/water_potability.csv")

# First look at the data

In [None]:
data.describe()

In [None]:
data.head()

In [None]:
data.shape

#### Check for any missing values

In [None]:
data.isnull().sum()

# Create a dataframe for missing values

In [None]:
# How many % of missing data are there for each feature?
def isnull(data):
    """Summarise the missing values of every column in a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        Maps each column name to ``[missing_count, missing_fraction]`` where
        ``missing_fraction`` is ``missing_count / number_of_rows`` (a value in
        the 0-1 range, not scaled to 0-100). For a frame without rows the
        fraction is reported as 0.0 instead of dividing by zero.
    """
    total_rows = data.shape[0]
    missing_val_dict = {}
    for col in data.columns:
        # Count once and reuse it for both the count and the ratio.
        n_missing = data[col].isnull().sum()
        fraction = n_missing / total_rows if total_rows else 0.0
        missing_val_dict[col] = [n_missing, fraction]
    return missing_val_dict

# Build the per-column missing-value summary for the loaded dataset.
null_dict = isnull(data)

# One row per feature. Note: 'miss_percent' holds the ratio returned by isnull()
# (missing / total rows, i.e. a 0-1 fraction), not a value scaled to 0-100.
data_miss = pd.DataFrame.from_dict(null_dict, orient="index", columns=['missing', 'miss_percent'])
data_miss

There is quite a lot of missing data.

- Ph: 491 (~15%)
- Sulfate: 781 (~24%)
- Trihalomethanes: 162 (~5%)

In [None]:
# Plot how many samples fall into each Potability class.
# The column is passed by keyword: seaborn >= 0.12 no longer treats a positional
# vector as `x`, while the keyword form works on older releases as well.
graph = sns.countplot(x="Potability", data=data)
# Rotate the tick labels without re-setting them (avoids matplotlib's
# fixed-locator warning that set_xticklabels can trigger).
graph.tick_params(axis="x", labelrotation=90)

# Write the count above each bar; bar heights are floats, so format them as
# whole numbers instead of e.g. "1998.0".
for p in graph.patches:
    height = p.get_height()
    graph.text(p.get_x() + p.get_width() / 2., height + 0.1, f"{height:.0f}", ha="center")

In the dataset, 1998 water samples are not potable (i.e. not safe for human consumption).

# Data cleaning

#### Let's see if there's any correlation between the potability of the water and the different features.

In [None]:
# Annotated heat map of the pairwise correlations between all columns of `data`
fig, ax = plt.subplots(figsize=(8, 8))  # figure size in inches
corr = data.corr()
sns.heatmap(corr, annot=True, ax=ax);

From the data, it seems that most of the features have a similar correlation with the potability of the water.

In [None]:
data.isnull().sum()

In [None]:
# Distribution of the three features that contain missing values, drawn from one
# loop instead of three copy-pasted cells.
# sns.distplot is deprecated and scheduled for removal; histplot with kde=True and
# stat="density" is the replacement seaborn documents for it.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, col in zip(axes, ['ph', 'Trihalomethanes', 'Sulfate']):
    sns.histplot(data[col], kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {col}")
fig.tight_layout();

In [None]:
# Replace the NA values of each affected column with that column's own median.
# The result is assigned back instead of calling `data[col].fillna(..., inplace=True)`:
# the in-place call operates on a selection of `data`, which pandas warns about and
# which leaves `data` unchanged once copy-on-write is enabled.
# NOTE(review): the medians are computed on the full dataset before the train/test
# split, so test-set information leaks into the imputation; consider computing them
# on the training split only.
cols_with_na = ['Trihalomethanes', 'Sulfate', 'ph']
data[cols_with_na] = data[cols_with_na].fillna(data[cols_with_na].median())
data.isnull().sum()

In [None]:
# Box plot of each feature split by Potability. The same eight features as before,
# drawn from a single loop on a shared grid instead of eight copy-pasted cells.
boxplot_features = ['ph', 'Hardness', 'Solids', 'Sulfate',
                    'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
for ax, feature in zip(axes.ravel(), boxplot_features):
    sns.boxplot(y=feature, x='Potability', data=data, ax=ax)
    ax.set_title(f"{feature} by Potability")
fig.tight_layout();

# Model training

In [None]:
# Import standard packages
from sklearn import svm
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score

# Shared feature scaler plus one estimator per model family. Everything uses the
# library defaults except the decision tree, which gets a fixed random_state.
scaler = StandardScaler()
model_svm = svm.SVC()
# Re-instantiated in the Random Forest cell further down, which replaces this instance.
model_clf = RandomForestClassifier()
# Re-instantiated in the Decision tree cell further down, which replaces this instance.
model_tree = DecisionTreeClassifier(random_state=0)
model_knn = KNeighborsClassifier()

In [None]:
# Split training and testing dataset
X = data.drop(columns=['Potability'])
y = data['Potability']

from sklearn.model_selection import train_test_split
# Fix the seed so the split (and every score computed from it) is reproducible across
# runs, and stratify on the target so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Standardise the features: learn the scaling parameters from the training split only,
# then apply that same transformation to the test split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Prediction using SVM

In [None]:
# Predictions using SVM
model_svm.fit(X_train, y_train)
predictions_svm = model_svm.predict(X_test)
score_svm = accuracy_score(y_test, predictions_svm)
print(score_svm)

# Precision of the class labelled 1 (precision_score's default pos_label).
# average='micro' was dropped: for single-label predictions micro-averaged precision
# is always identical to the accuracy printed above, so it carried no extra information.
pred_svm = precision_score(y_test, predictions_svm)
pred_svm

#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Seed the forest so the fitted model (and its score) is the same on every run.
model_clf = RandomForestClassifier(random_state=0)

model_clf.fit(X_train, y_train)
predictions_clf = model_clf.predict(X_test)
score_clf = accuracy_score(y_test, predictions_clf)
print(score_clf)

# Precision of the class labelled 1 (precision_score's default pos_label).
# average='micro' was dropped: for single-label predictions micro-averaged precision
# is always identical to the accuracy printed above, so it carried no extra information.
pred_clf = precision_score(y_test, predictions_clf)
pred_clf

#### Predictions using KNN

In [None]:
# Fit k-nearest neighbours on the scaled training data and score it on the test split.
model_knn.fit(X_train, y_train)
predictions_knn = model_knn.predict(X_test)
score_knn = accuracy_score(y_test, predictions_knn)
print(score_knn)

# Precision of the class labelled 1 (precision_score's default pos_label).
# average='micro' was dropped: for single-label predictions micro-averaged precision
# is always identical to the accuracy printed above, so it carried no extra information.
pred_knn = precision_score(y_test, predictions_knn)
pred_knn

#### Predictions using Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fixed random_state keeps the fitted tree identical between runs.
model_tree = DecisionTreeClassifier(random_state=0)

model_tree.fit(X_train, y_train)
predictions_tree = model_tree.predict(X_test)
score_tree = accuracy_score(y_test, predictions_tree)
print(score_tree)

# Precision of the class labelled 1 (precision_score's default pos_label).
# average='micro' was dropped: for single-label predictions micro-averaged precision
# is always identical to the accuracy printed above, so it carried no extra information.
pred_tree = precision_score(y_test, predictions_tree)
pred_tree

In [None]:
# Accuracy of each model on the test split, printed side by side for comparison.
for label, accuracy in (
    ("SVM: ", score_svm),
    ("Random forest classifer:", score_clf),
    ("Decision tree: ", score_tree),
    ("KNN: ", score_knn),
):
    print(label, accuracy)

Based on the different models tested, it seems that the support vector machine model produces the highest prediction score.

# Please give me feedback on how to improve!