In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedShuffleSplit, StratifiedKFold

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder, Normalizer, QuantileTransformer, RobustScaler, \
PowerTransformer

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

In [None]:
# Read the water-potability dataset from the Kaggle input directory.
csv_path = "/kaggle/input/water-potability/water_potability.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to check the columns loaded as expected.
df.head(n=5)

# Check for missing values

In [None]:
# Percentage of missing entries in each column.
missing_per_column = df.isna().sum()
missing_per_column / len(df) * 100

# View class distribution 

In [None]:
# Fraction of rows belonging to each value of the last column (the class label).
label_counts = df.iloc[:, -1].value_counts()
label_counts / len(df)

# Visualization

In [None]:
# Draw one histogram per column on an 8x8 figure, then tighten the subplot layout.
hist_axes = df.hist(figsize=(8, 8))
plt.tight_layout()

# Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
plt.figure(figsize=(8, 8))
sns.heatmap(corr_matrix, annot=True)

# Create training and validation sets

In [None]:
# Hold out a validation set using train_test_split's default split size.
# The split is seeded so the same rows are held out on every Restart & Run All,
# and stratified on the label (column 9) so both parts keep the same class balance.
training, validation = train_test_split(
    df, random_state=0, stratify=df.iloc[:, 9]
)
# Columns 0-8 are the features; column 9 is the class label.
X, y = training.iloc[:, 0:9], training.iloc[:, 9]

# Modeling

In [None]:
# Candidate classifiers to spot-check, stored as (short name, estimator) pairs.
# Estimators whose fitting involves randomness are seeded so that repeated runs
# of the notebook report the same cross-validation scores.
models = [
    ('LR', LogisticRegression(solver='liblinear', random_state=0)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=0)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# Baseline

In [None]:
# Baseline: fill missing values with a default SimpleImputer, then score every
# classifier in `models` by accuracy using seeded, shuffled 10-fold stratified CV.
imputer = SimpleImputer()
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

results, names = [], []
for name, model in models:
    pipe = Pipeline(steps=[('imputer', imputer), ('model', model)])
    fold_scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=kfold)
    names.append(name)
    results.append(fold_scores)
    # Report mean accuracy with its standard deviation across the folds.
    print("%s: %f (%f)" % (name, fold_scores.mean(), fold_scores.std()))

# Feature importance with random forest

In [None]:
# Fit a random forest on the imputed training data so its feature importances
# can be inspected. The forest is seeded so the importances are repeatable.
imputer = SimpleImputer()
model = RandomForestClassifier(random_state=0)
pipe = Pipeline([('imputer', imputer), ('model', model)])
pipe.fit(X, y)

In [None]:
# Feature importances reported by the fitted 'model' step of the pipeline.
fitted_model = pipe.named_steps['model']
fitted_model.feature_importances_

# Scaler grid search

In [None]:
imputer = SimpleImputer()

# Candidate transformers tried in the pipeline's 'scaler' step.
space = {'scaler': [MinMaxScaler(), StandardScaler(), Normalizer(), RobustScaler(), PowerTransformer(), QuantileTransformer()]}

# Seeded, shuffled stratified folds, matching the splitter used for the
# baseline scores. This splitter used to be built inside the loop but was
# never used, because GridSearchCV received cv=10 (unshuffled folds) instead.
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

for name, model in models:
    # MinMaxScaler is only a placeholder; the grid substitutes each candidate.
    pipe = Pipeline([('imputer', imputer), ('scaler', MinMaxScaler()), ('model', model)])
    grid = GridSearchCV(estimator=pipe, param_grid=space, cv=kfold, scoring='accuracy')
    grid.fit(X, y)
    print(grid.best_params_, grid.best_score_.round(3), name)

# Ensemble

In [None]:
# Ensemble classifiers to spot-check, stored as (short name, estimator) pairs.
# All four are randomised learners, so each is seeded to make the reported
# cross-validation scores repeatable between runs.
ensembles = [
    ('AB', AdaBoostClassifier(random_state=0)),
    ('GBM', GradientBoostingClassifier(random_state=0)),
    ('RF', RandomForestClassifier(n_estimators=10, random_state=0)),
    ('ET', ExtraTreesClassifier(n_estimators=10, random_state=0)),
]

# Ensemble baseline

In [None]:
# Ensemble baseline: impute missing values, then score every estimator in
# `ensembles` by accuracy using seeded, shuffled 10-fold stratified CV.
imputer = SimpleImputer()
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

results, names = [], []
for name, model in ensembles:
    pipe = Pipeline(steps=[('imputer', imputer), ('model', model)])
    fold_scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=kfold)
    names.append(name)
    results.append(fold_scores)
    # Report mean accuracy with its standard deviation across the folds.
    print("%s: %f (%f)" % (name, fold_scores.mean(), fold_scores.std()))

# Ensemble scaler grid search

In [None]:
imputer = SimpleImputer()

# Candidate transformers tried in the pipeline's 'scaler' step.
space = {'scaler': [MinMaxScaler(), StandardScaler(), Normalizer(), RobustScaler(), PowerTransformer(), QuantileTransformer()]}

# Seeded, shuffled stratified folds, matching the splitter used for the
# ensemble baseline. This splitter used to be built inside the loop but was
# never used, because GridSearchCV received cv=10 (unshuffled folds) instead.
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

for name, model in ensembles:
    # MinMaxScaler is only a placeholder; the grid substitutes each candidate.
    pipe = Pipeline([('imputer', imputer), ('scaler', MinMaxScaler()), ('model', model)])
    grid = GridSearchCV(estimator=pipe, param_grid=space, cv=kfold, scoring='accuracy')
    grid.fit(X, y)
    print(grid.best_params_, grid.best_score_.round(3), name)

# Oversampling

In [None]:
# Randomly resample the minority class of the training set. The sampler is
# seeded so X_over / y_over contain the same rows on every run.
# NOTE: cross-validating directly on X_over / y_over gives optimistic scores,
# because copies of the same row can land in both the train and the test fold.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)
X_over, y_over = oversample.fit_resample(X, y)

In [None]:
# Shapes of the training features and labels before oversampling.
tuple(part.shape for part in (X, y))

In [None]:
# Number of training rows per class before oversampling.
class_counts = y.value_counts()
class_counts

In [None]:
# Shapes of the training features and labels after oversampling.
tuple(part.shape for part in (X_over, y_over))

In [None]:
# Number of training rows per class after oversampling.
class_counts_over = y_over.value_counts()
class_counts_over

# Baseline with oversampling

In [None]:
# Baseline with oversampling.
# The oversampler runs as a step of an imbalanced-learn Pipeline instead of
# being applied to the data before cross-validation. Resampling first places
# copies of the same row in both the training and the test part of a fold,
# which inflates accuracy; as a pipeline step the sampler only resamples the
# training part of each fold and the test part keeps its original rows.
imputer = SimpleImputer()
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

results = []
names = []
for name, model in models:
    pipe = ImbPipeline([
        ('oversample', RandomOverSampler(sampling_strategy='minority', random_state=0)),
        ('imputer', imputer),
        ('model', model),
    ])
    cv_results = cross_val_score(pipe, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy with its standard deviation across the folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Scaler grid search with oversampling

In [None]:
imputer = SimpleImputer()

# Candidate transformers tried in the pipeline's 'scaler' step.
space = {'scaler': [MinMaxScaler(), StandardScaler(), Normalizer(), RobustScaler(), PowerTransformer(), QuantileTransformer()]}

# Seeded, shuffled stratified folds. This splitter used to be built inside the
# loop but was never used, because GridSearchCV received cv=10 instead.
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

for name, model in models:
    # Oversampling is a pipeline step so that only the training part of each
    # fold is resampled; oversampling before the search would leak duplicated
    # rows into the test folds and inflate the scores.
    # MinMaxScaler is only a placeholder; the grid substitutes each candidate.
    pipe = ImbPipeline([
        ('oversample', RandomOverSampler(sampling_strategy='minority', random_state=0)),
        ('imputer', imputer),
        ('scaler', MinMaxScaler()),
        ('model', model),
    ])
    grid = GridSearchCV(estimator=pipe, param_grid=space, cv=kfold, scoring='accuracy')
    grid.fit(X, y)
    print(grid.best_params_, grid.best_score_.round(3), name)

# Ensemble baseline with oversampling

In [None]:
# Ensemble baseline with oversampling.
# The oversampler runs as a step of an imbalanced-learn Pipeline instead of
# being applied to the data before cross-validation. Resampling first places
# copies of the same row in both the training and the test part of a fold,
# which inflates accuracy; as a pipeline step the sampler only resamples the
# training part of each fold and the test part keeps its original rows.
imputer = SimpleImputer()
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

results = []
names = []
for name, model in ensembles:
    pipe = ImbPipeline([
        ('oversample', RandomOverSampler(sampling_strategy='minority', random_state=0)),
        ('imputer', imputer),
        ('model', model),
    ])
    cv_results = cross_val_score(pipe, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy with its standard deviation across the folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Ensemble scaler grid search with oversampling

In [None]:
imputer = SimpleImputer()

# Candidate transformers tried in the pipeline's 'scaler' step.
space = {'scaler': [MinMaxScaler(), StandardScaler(), Normalizer(), RobustScaler(), PowerTransformer(), QuantileTransformer()]}

# Seeded, shuffled stratified folds. This splitter used to be built inside the
# loop but was never used, because GridSearchCV received cv=10 instead.
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

for name, model in ensembles:
    # Oversampling is a pipeline step so that only the training part of each
    # fold is resampled; oversampling before the search would leak duplicated
    # rows into the test folds and inflate the scores.
    # MinMaxScaler is only a placeholder; the grid substitutes each candidate.
    pipe = ImbPipeline([
        ('oversample', RandomOverSampler(sampling_strategy='minority', random_state=0)),
        ('imputer', imputer),
        ('scaler', MinMaxScaler()),
        ('model', model),
    ])
    grid = GridSearchCV(estimator=pipe, param_grid=space, cv=kfold, scoring='accuracy')
    grid.fit(X, y)
    print(grid.best_params_, grid.best_score_.round(3), name)

# Make predictions on the validation set

In [None]:
# Final model: impute, standardise, then fit extra-trees on the oversampled
# training data. The classifier is seeded so the result is repeatable.
model = ExtraTreesClassifier(random_state=0)
scaler = StandardScaler()
imputer = SimpleImputer()
pipe = Pipeline([('imputer', imputer), ('scaler', scaler), ('model', model)])
pipe.fit(X_over, y_over)

# Score on the held-out validation rows (columns 0-8 features, column 9 label).
X_val, y_val = validation.iloc[:, 0:9], validation.iloc[:, 9]
predictions = pipe.predict(X_val)

# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix and exchanged false positives with false negatives.
confusion_matrix(y_val, predictions)