In [None]:
## Data Analysis Phase
## The main aim is to understand more about the data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
## Show every column when a DataFrame is displayed (None removes the column cap).
## Uses the public `pd.set_option` entry point rather than going through `pd.pandas`,
## which is not a documented attribute of the package.
pd.set_option('display.max_columns', None)

In [None]:
## Load the water-potability CSV into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='../input/water-potability/water_potability.csv')

## Report the size of the dataset as (rows, columns)
print(dataset.shape)

In [None]:
## Preview the first five records
dataset.head(n=5)

## Exploring data

In [None]:
## Structural summary (dtypes, non-null counts) followed by summary statistics.
## DataFrame.info() writes its report itself and returns None, so it must not be
## wrapped in print() -- doing so appends a stray "None" line to the output.
dataset.info()
print(dataset.describe())

In [None]:
# Pairwise Pearson correlation between the columns
dataset.corr(method='pearson')

In [None]:
# Visualise the correlation matrix as a heatmap
correlation_matrix = dataset.corr()
sns.heatmap(data=correlation_matrix)
## Observation: most of the features are only weakly correlated with each other

## Missing values

In [None]:
## Report the percentage of NaN values present in each feature.
## Step 1: collect every feature that has at least one missing value.
## The threshold is "> 0": a column with exactly one NaN still has missing data
## and must be listed.
features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]

## Step 2: print each such feature together with its missing-value percentage
## (mean of the null mask * 100, rounded to 4 decimals).
for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean()*100, 4),  ' % missing values')

In [None]:
# Inspect the distribution of every column to decide how to impute the missing values.
# The histogram is drawn straight from `dataset`: hist() only reads the column,
# so copying the whole frame on each iteration is unnecessary work.
for feature in dataset.columns:
    dataset[feature].hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.title(feature)
    plt.show()
# Observation (from the plots): the features look mostly Gaussian distributed

In [None]:
import sklearn

# Report which scikit-learn release this notebook is running against
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

In [None]:
## Impute missing values with a KNN imputer (n_neighbors=3).
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
# Impute the input features only. The target column ('Potability') is kept out
# of the neighbour search so the label cannot leak into the imputed feature values.
feature_cols = [col for col in dataset.columns if col != 'Potability']
imputed = imputer.fit_transform(dataset[feature_cols])
# Start from a float copy of the original frame (same column order and index,
# all-float dtypes as the imputer output) and overwrite the feature columns
# with their imputed values; the target column is carried over unchanged.
df_imputed = dataset.astype(float)
df_imputed[feature_cols] = imputed

In [None]:
## Check for remaining missing values: info() lists the non-null count of every
## column, which can be compared against the total number of entries it reports
df_imputed.info()

## Outliers

In [None]:
# Work on an independent (deep) copy so that df_imputed itself stays untouched
water_df = df_imputed.copy(deep=True)

In [None]:
## Box plot of every input feature, used to look for outliers.
## 'Potability' is the target, so it is excluded from the loop. Passing `x` as a
## Series makes any `data=` frame irrelevant to seaborn, so the exclusion has to
## happen in the list of columns that is iterated over.
feature_columns = water_df.drop('Potability', axis=1).columns
for feature in feature_columns:
    # Plot directly from water_df; boxplot only reads the column, no copy is needed
    sns.boxplot(x=water_df[feature])
    plt.xlabel(feature)
    plt.ylabel("Value")
    plt.title(feature)
    plt.show()

In [None]:
## Pairwise scatter plots of all columns, coloured by class, to look for relationships
sns.pairplot(data=water_df, hue='Potability')
## Observation: no relationship between the features is visible

In [None]:
# Look more closely at how the outliers behave across the classes.
# Example pair: Chloramines and Sulfate, two features with many outliers.
sns.scatterplot(data=water_df, x='Chloramines', y='Sulfate', hue='Potability')
# The outliers are left in place, since they show no clear impact on the classes.

In [None]:
# Building the model
# Potability is safety-relevant, so false positives should be kept low, i.e. we need high precision
# Try the boosting ensemble techniques AdaBoost and XGBoost
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [None]:
## Separate the target vector from the input feature matrix (both as NumPy arrays)
y = water_df['Potability'].to_numpy()
x = water_df.drop(columns='Potability').to_numpy()

In [None]:
## Standardise the input variables with a StandardScaler fitted on `x`
scaler = StandardScaler()
scaled_x = scaler.fit(x).transform(x)

In [None]:
## Stratified 70/30 train/test split; `stratify=y` keeps the class ratio the same
## in both parts.
## The split is made on the unscaled features and the scaler is then fitted on the
## training part only, so that test-set statistics never leak into preprocessing.
## (random_state and stratify are unchanged, so the same rows land in each part.)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42, stratify=y)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
# The test set is only transformed with the statistics learned from the training set
X_test = scaler.transform(X_test_raw)

## ADAboost

In [None]:
# Baseline AdaBoost model built on depth-1 decision trees (stumps)
from sklearn.tree import DecisionTreeClassifier
# `algorithm` is left at the library default: "SAMME.R" was the default in the
# releases that supported it, while recent scikit-learn releases reject that value.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, learning_rate=0.5)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# ROC AUC is a ranking metric: score it with the predicted probability of the
# second class (column 1 of predict_proba) instead of the thresholded labels,
# which would collapse the ROC curve to a single operating point.
print(roc_auc_score(y_test, ada_clf.predict_proba(X_test)[:, 1]))

In [None]:
# Grid search over learning rate and number of estimators for AdaBoost
from sklearn.metrics import make_scorer

# Two metrics are tracked during the search
scoring = {
    'AUC': 'roc_auc',
    'Accuracy': make_scorer(accuracy_score),
}
param_grid = {
    'learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200, 250, 300, 400, 500],
}
# 5-fold cross-validation; the final model is refit according to the 'AUC' scorer
grid = GridSearchCV(estimator=ada_clf, param_grid=param_grid, scoring=scoring, refit='AUC', cv=5)
grid.fit(X_train, y_train)
grid.best_params_

In [None]:
# AdaBoost with hand-picked hyper-parameters on depth-2 trees.
# `algorithm` is left at the library default: "SAMME.R" was the default in the
# releases that supported it, while recent scikit-learn releases reject that value.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100, learning_rate=0.1)
ada_clf.fit(X_train, y_train)
# y_pred keeps the hard labels (used for accuracy here and by later report cells)
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# ROC AUC needs a continuous score: use the predicted probability of the second
# class (column 1 of predict_proba) rather than the thresholded labels.
print(roc_auc_score(y_test, ada_clf.predict_proba(X_test)[:, 1]))

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the current predictions
report_text = classification_report(y_test, y_pred)
print(report_text)
# Confusion matrix for the same predictions
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

## XGB

In [None]:
# Baseline XGBoost classifier with the library's default hyper-parameters
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# ROC AUC needs a continuous score: use the predicted probability of the second
# class (column 1 of predict_proba) rather than the thresholded labels.
print(roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1]))

In [None]:
# Grid search over learning rate, number of estimators and tree depth for XGBoost
scoring = {
    'AUC': 'roc_auc',
    'Accuracy': make_scorer(accuracy_score),
}
param_grid = {
    'learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200, 250, 300, 400, 500],
    'max_depth': [1, 2, 3, 4, 5, 6],
}
# 5-fold cross-validation; the final model is refit according to the 'AUC' scorer
grid = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring=scoring, refit='AUC', cv=5)
grid.fit(X_train, y_train)
grid.best_params_

In [None]:
# XGBoost with tuned hyper-parameters.
# `max_features` is a scikit-learn tree option, not an XGBoost parameter, so it is
# not passed to XGBClassifier here.
xgb_clf = XGBClassifier(learning_rate=0.01,
                        max_depth=6,
                        n_estimators=300)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# ROC AUC needs a continuous score: use the predicted probability of the second
# class (column 1 of predict_proba) rather than the thresholded labels.
print(roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1]))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in the random forest: 100 evenly spaced values from 100 to 1000
n_estimators = [int(value) for value in np.linspace(start = 100, stop = 1000, num = 100)]
# Number of features to consider at every split.
# 'auto' is not listed: for a classifier it was only an alias of 'sqrt', and
# recent scikit-learn releases reject it, which makes those candidates fail.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 5 values from 1 to 50, plus None (unlimited)
max_depth = [int(value) for value in np.linspace(1, 50, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Assemble the search space consumed by the randomized search
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for the best hyper-parameters.
# The base model gets a fixed seed: the search itself is seeded, but an unseeded
# forest would still give different cross-validation scores (and possibly a
# different winner) on every run.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters: 100 sampled combinations, 3-fold cross-validation,
# all available cores (n_jobs=-1)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# The search was created without a `refit` argument, so its default applies and
# best_estimator_ has already been refit on the whole of X_train. Fitting it a
# second time is redundant and, for an unseeded forest, would replace the selected
# model with a different random draw.
rf_clf = rf_random.best_estimator_
y_pred = rf_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# ROC AUC needs a continuous score: use the predicted probability of the second
# class (column 1 of predict_proba) rather than the thresholded labels.
print(roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:, 1]))

## Constructing a Voting classifier

### Hard Voting

In [None]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble of the three classifiers built above
base_models = [('ada', ada_clf), ('xgb', xgb_clf), ('rf', rf_clf)]
vot_clf = VotingClassifier(estimators=base_models, voting='hard')
vot_clf.fit(X_train, y_train)
y_pred = vot_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# Note: this AUC is computed from the predicted labels, not from probabilities
print(roc_auc_score(y_test, y_pred))

### Soft Voting

In [None]:
# Soft-voting ensemble: the class probabilities of the three models are averaged
vot_clf = VotingClassifier(estimators = [('ada', ada_clf), ('xgb', xgb_clf), ('rf', rf_clf)], voting = 'soft')
vot_clf.fit(X_train, y_train)
y_pred = vot_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# With soft voting the ensemble exposes predict_proba, so ROC AUC is scored on the
# probability of the second class (column 1) rather than the thresholded labels.
print(roc_auc_score(y_test, vot_clf.predict_proba(X_test)[:, 1]))