In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Data Analysis Phase
## Main aim is to understand more about the data

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
## Display all the columns of the dataframe
# Passing None removes the column limit, so wide frames are shown in full.
# The setter is called through the public `pd.set_option` entry point rather
# than `pd.pandas.set_option`, which only works when the package happens to
# expose itself as an attribute of its own namespace.
pd.set_option('display.max_columns', None)

In [None]:
# Read the water-potability CSV into `dataset` and preview its first rows.
dataset = pd.read_csv(
    filepath_or_buffer="/kaggle/input/water-potability/water_potability.csv",
)
dataset.head(n=5)


In [None]:
## Check the percentage of NaN values present in each feature.
## Step 1: list every feature that has at least one missing value.
## The comparison is `> 0` so that a column with exactly one NaN is included.
features_with_na = [
    feature
    for feature in dataset.columns
    if dataset[feature].isnull().sum() > 0
]

## Step 2: print each feature name with its percentage of missing values.
## isnull().mean() is a fraction between 0 and 1, so it is scaled by 100 to
## agree with the '%' label in the printed message.
for feature in features_with_na:
    missing_pct = np.round(dataset[feature].isnull().mean() * 100, 4)
    print(feature, missing_pct,  ' % missing values')

In [None]:
# All cleaning below is applied to `water_df`, a deep copy of `dataset`,
# so the frame that was read from disk is not modified.
water_df = dataset.copy(deep=True)
water_df.head(n=5)

In [None]:
# Fill missing pH values with the mean pH of the row's own Potability class.
# groupby(...).transform('mean') returns, aligned to every row, the mean of the
# non-missing pH values within that row's class; fillna then applies it only
# where pH is NaN. This replaces the per-class boolean indexing, which filtered
# a queried subset with a mask built on the full frame's index.
# NOTE(review): the fill value is derived from the target label and computed on
# the whole frame before any train/test split, so label information leaks into
# this feature - confirm this is intended.
ph_class_mean = water_df.groupby('Potability')['ph'].transform('mean')
water_df['ph'] = water_df['ph'].fillna(ph_class_mean)

In [None]:
# Relabel as not potable (0) every row whose pH is not inside the inclusive
# 6.5 to 8.5 guideline band.
ph_within_guideline = water_df['ph'].between(6.5, 8.5)
water_df.loc[~ph_within_guideline, 'Potability'] = 0

In [None]:
# Preview the first five rows after the pH imputation and relabelling.
water_df.head(n=5)

In [None]:
# One pH histogram per Potability value.
water_df.hist(by='Potability', column='ph')


In [None]:
# Summary statistics for the numeric columns, with the quartiles spelled out.
water_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# First replace the NaN values with the mean of the row's Potability class.
# groupby(...).transform('mean') gives each row the mean of the non-missing
# values in its own class; fillna uses it only where the value is NaN.
# NOTE(review): class-conditional fill values use the target label and are
# computed before the train/test split - confirm this leakage is intended.
thm_class_mean = water_df.groupby('Potability')['Trihalomethanes'].transform('mean')
water_df['Trihalomethanes'] = water_df['Trihalomethanes'].fillna(thm_class_mean)

# Set any value that fails the guideline for Trihalomethanes not to be potable.
water_df.loc[water_df['Trihalomethanes'] > 80, 'Potability'] = 0

#####################################################
# Sulfate is imputed after the relabelling above, so its class means are
# computed from the updated Potability column (same order as before).
sulfate_class_mean = water_df.groupby('Potability')['Sulfate'].transform('mean')
water_df['Sulfate'] = water_df['Sulfate'].fillna(sulfate_class_mean)

In [None]:
# Re-check the previously flagged features on the cleaned frame.
# isnull().mean() is a fraction between 0 and 1, so it is scaled by 100 to
# agree with the '%' label in the printed message.
for feature in features_with_na:
    missing_pct = np.round(water_df[feature].isnull().mean() * 100, 4)
    print(feature, missing_pct,  ' % missing values')

In [None]:
from sklearn.model_selection import train_test_split

# Split into 75% train / 25% test with a fixed seed.
# The target is removed from the feature matrix by name instead of slicing
# `columns[:-1]`, which would leak the label into the features if 'Potability'
# were ever not the last column.
features = water_df.drop(columns=['Potability'])
target = water_df['Potability']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=0
)

In [None]:

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
# Wrap a seeded Lasso estimator in SelectFromModel and fit it on the training split.
# remember to set the seed, the random state in this function
lasso_estimator = Lasso(alpha=0.0005, random_state=0)
feature_sel_model = SelectFromModel(lasso_estimator)
feature_sel_model.fit(X_train, y_train)

In [None]:
# Boolean mask over the training columns marking the features the selector kept.
feature_sel_model.get_support(indices=False)


In [None]:
## Apply RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported scores are reproducible on a fresh
# Restart & Run All; 0 matches the seed used for the split and the Lasso.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)

# ROC-AUC is computed from the second predict_proba column on each split.
ytrain_pred = rf_model.predict_proba(X_train)
print('RF train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = rf_model.predict_proba(X_test)
print('RF test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression and report ROC-AUC on both splits,
# scoring with the second predict_proba column.
log_classifier = LogisticRegression().fit(X_train, y_train)

ytrain_pred = log_classifier.predict_proba(X_train)
train_auc = roc_auc_score(y_train, ytrain_pred[:, 1])
print('Logistic train roc-auc: {}'.format(train_auc))

ytest_pred = log_classifier.predict_proba(X_test)
test_auc = roc_auc_score(y_test, ytest_pred[:, 1])
print('Logistic test roc-auc: {}'.format(test_auc))

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Seed the booster so repeated runs give the same scores; 0 matches the seed
# used for the split and the Lasso elsewhere in this notebook.
ada_classifier = AdaBoostClassifier(random_state=0)
ada_classifier.fit(X_train, y_train)

# ROC-AUC is computed from the second predict_proba column on each split.
ytrain_pred = ada_classifier.predict_proba(X_train)
print('Adaboost train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = ada_classifier.predict_proba(X_test)
print('Adaboost test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a default k-nearest-neighbours classifier and report ROC-AUC on both
# splits. The printed labels say "KNN"; they previously said "Adaboost",
# which made this output indistinguishable from the AdaBoost cell's.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
ytrain_pred = knn_classifier.predict_proba(X_train)
print('KNN train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:,1])))
ytest_pred = knn_classifier.predict_proba(X_test)
print('KNN test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:,1])))

In [None]:
# Average the four models' second-column probabilities on the test split,
# row by row, and score the averaged prediction.
ensemble_members = [rf_model, log_classifier, ada_classifier, knn_classifier]
pred = [
    pd.Series(member.predict_proba(X_test)[:, 1])
    for member in ensemble_members
]
stacked_predictions = pd.concat(pred, axis=1)
final_prediction = stacked_predictions.mean(axis=1)
ensemble_auc = roc_auc_score(y_test, final_prediction)
print('Ensemble test roc-auc: {}'.format(ensemble_auc))

In [None]:
# ROC curve of the averaged ensemble score; the candidate thresholds are displayed.
roc_points = roc_curve(y_test, final_prediction)
fpr, tpr, thresholds = roc_points
thresholds

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the ensemble at every candidate threshold from roc_curve.
# A row is predicted positive when its score is >= the threshold, which is the
# comparison roc_curve itself uses to define each threshold; the previous
# strict `>` mislabelled the sample sitting exactly on each threshold.
threshold_accuracies = [
    accuracy_score(y_test, np.where(final_prediction >= thres, 1, 0), normalize=True)
    for thres in thresholds
]

# `accuracy_ls` is built directly as the result table, so the name is not used
# for a plain list first, and it is sorted by reassignment rather than in place
# so that re-running the cell is idempotent.
accuracy_ls = pd.DataFrame({
    'thresholds': thresholds,
    'accuracy': threshold_accuracies,
})
accuracy_ls = accuracy_ls.sort_values(by='accuracy', ascending=False)
accuracy_ls.head()