In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.utils import resample
from joblib import dump, load

In [None]:
# Read the water-potability CSV into a DataFrame and preview its first ten rows.
wq = pd.read_csv('../input/water-potability/water_potability.csv')
wq.head(10)

In [None]:
# Dataset dimensions, followed by the per-column dtype / non-null summary.
print(wq.shape)
print("--------")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
wq.info()

In [None]:
# Share of missing values per column: the mean of the boolean NA mask is the
# number of NAs divided by the number of rows.
wq.isna().mean()

## 14% of ph, 24% of Sulfate and 5% of Trihalomethanes values are missing

In [None]:
# Keep only complete rows: any row with at least one missing value is removed.
wq = wq.dropna(axis=0, how='any')

In [None]:
# Number of rows per Potability class.
wq['Potability'].value_counts()

## The dataset is slightly imbalanced, i.e. the number of observations with potable water is not equal to the number of observations with non-potable water. We will upsample the training data so that it has the same number of potable and non-potable observations. Another option would be to choose a probability threshold from the ROC curve when predicting potability (a higher AUC means a better model). I will be using upsampling for this effort.

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(wq.corr(), annot=True, fmt='.2f', cmap='BuGn', ax=ax)
plt.show()

## Potability of water is very weakly correlated with the features in the dataset. It would be interesting to see if these features can collectively predict potability.

In [None]:
# Seed shared by the train/test split, the resampling step and the seeded models.
random_state = 7

# Target vector and feature matrix (every column except the target) as NumPy arrays.
y = wq['Potability'].to_numpy()
X = wq.drop(columns=['Potability']).to_numpy()

In [None]:
# Hold out 10% of the rows for testing, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=random_state,
)

In [None]:
def upsample_train_data(X, y):
    '''Balance the classes of a training set by upsampling with replacement.

    Every class with fewer rows than the largest class is resampled (with
    replacement) up to the size of the largest class. Classes that already
    have that many rows are passed through untouched, so input that is
    already balanced keeps all of its classes, and no class is discarded
    when there are more than two of them.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, aligned row-by-row with ``X``.

    Returns
    -------
    X, y : numpy.ndarray
        Upsampled features and labels. Rows are grouped by class, largest
        class first; they are not shuffled.

    Notes
    -----
    The notebook-level ``random_state`` is used as the resampling seed.
    Empty input is returned as empty arrays instead of raising.
    '''
    df = pd.concat([pd.DataFrame(X), pd.DataFrame(y, columns=['outcome'])], axis=1)

    # value_counts() is ordered by descending frequency, so iterating over it
    # visits the majority class first.
    class_counts = df['outcome'].value_counts()

    if not class_counts.empty:
        target_size = int(class_counts.max())
        groups = []
        for label, count in class_counts.items():
            rows = df[df['outcome'] == label]
            # Only classes smaller than the majority are resampled; a class
            # that is already full-size is kept exactly as it is.
            if count < target_size:
                rows = resample(rows, n_samples=target_size, replace=True, random_state=random_state)
            groups.append(rows)
        df = pd.concat(groups, axis=0, ignore_index=True)

    X = df.drop('outcome', axis=1).to_numpy()
    y = df['outcome'].to_numpy()

    return X, y

In [None]:
# Balance the training split only; X_test / y_test are left as they were split.
# This rebinds X_train / y_train, so re-running this cell feeds the already
# upsampled arrays back in -- re-run the split cell first.
X_train, y_train = upsample_train_data(X=X_train, y=y_train)

In [None]:
# Candidate estimators as (pipeline step name, estimator) pairs. The step name
# is the prefix used by the "<step>__<param>" keys of the matching grid below.
models = [
    ("model_RF", RandomForestClassifier()),
    ("model_Ada", AdaBoostClassifier()),
    ("model_KNN", KNeighborsClassifier()),
]

# One hyper-parameter grid per estimator, in the same order as `models`.
param_grids = [
    # Random forest
    {
        "model_RF__min_samples_leaf": [2, 4, 6],
        "model_RF__criterion": ['gini', 'entropy'],
        "model_RF__n_estimators": [250, 500, 1000],
        "model_RF__random_state": [random_state],
    },
    # AdaBoost: 50, 100 and 150 estimators
    {
        "model_Ada__n_estimators": range(50, 200, 50),
        "model_Ada__random_state": [random_state],
    },
    # k-nearest neighbours: k = 2 .. 74
    {
        "model_KNN__n_neighbors": range(2, 75),
    },
]

In [None]:
# Tune each candidate inside its own scale-then-classify pipeline and report the
# accuracy of the refitted best configuration on the held-out test split.
# NOTE(review): X_train was upsampled with replacement before this 5-fold CV, so
# copies of the same row can land in both the fitting and validation part of a
# fold; the cross-validated scores used for model selection may be optimistic.
for model_step, grid in zip(models, param_grids):
    pipeline = Pipeline(steps=[('StdScaler', StandardScaler()), model_step])

    # Both metrics are recorded; 'Accuracy' decides which configuration is refit.
    scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
    gridsearch = GridSearchCV(
        estimator=pipeline,
        param_grid=grid,
        scoring=scoring,
        refit='Accuracy',
        cv=5,
        return_train_score=False,
    )
    gridsearch.fit(X_train, y_train)

    best_pipeline = gridsearch.best_estimator_
    print(best_pipeline)
    print(accuracy_score(y_test, best_pipeline.predict(X_test)))
    print('**********************************************************')

# We get the highest accuracy, 69.30% on the test data, with RandomForestClassifier.