In [None]:
# import 'Pandas' 
import pandas as pd 

# import 'Numpy' 
import numpy as np

# import subpackage of Matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# import 'Seaborn' 
import seaborn as sns

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# display all columns of the dataframe
pd.options.display.max_columns = None

# display all rows of the dataframe
pd.options.display.max_rows = None
 
# to display the float values upto 6 decimal places     
pd.options.display.float_format = '{:.6f}'.format

# import train-test split 
from sklearn.model_selection import train_test_split

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler 

# import various functions from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# import the XGBoost function for classification
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the water-potability CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/water-potability/water_potability.csv',
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns before any imputation.
df.describe()

In [None]:
#missing values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
#percentage of missing values

In [None]:
# Share of missing entries per column, expressed in percent.
df.isna().sum().div(len(df)).mul(100)

In [None]:
# Visual map of where the missing cells sit (colour bar hidden).
sns.heatmap(df.isnull(), cbar = False)

In [None]:
# Histogram of every column; the default figure size is changed globally here.
plt.rcParams.update({'figure.figsize': [12, 12]})
df.hist()
plt.show()

In [None]:
# Skewness and kurtosis of the first nine columns, gathered into one table.
j = list(df.columns[:9])
skew = [df[col].skew() for col in j]
kurtosis = [df[col].kurt() for col in j]
skew_kurtosis = pd.DataFrame(
    {
        'column name': j,
        'skew': skew,
        'kurtosis': kurtosis,
    }
)
skew_kurtosis

In [None]:
# Distribution (histogram + KDE) of each of the first nine columns.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is seaborn's documented replacement.
# A 3x3 grid holds the nine panels exactly (the former 5x3 grid left two
# rows empty), so the figure height is reduced to keep the panel shape.
plt.figure(figsize=(30, 18))
for k, i in enumerate(df.columns[:9], start=1):
    plt.subplot(3, 3, k)
    sns.histplot(df[i], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# One box plot per non-object column among the first nine, laid out on a 4x4 grid.
plt.figure(figsize=(20, 20))
plottable = [col for col in df.columns[:9] if df[col].dtypes != 'object']
for slot, col in enumerate(plottable, start=1):
    plt.subplot(4, 4, slot)
    sns.boxplot(x=df[col])

In [None]:
#since we have outliers we use median

In [None]:
#As the median value is not affected by the presence of outliers, replace the null values in the variables by median

In [None]:
# Fill the gaps in the three listed columns with each column's own median.
# NOTE(review): the medians are taken over the whole frame, i.e. before the
# train/test split further down — confirm this is acceptable.
for col in ('ph', 'Sulfate', 'Trihalomethanes'):
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Re-count the missing entries per column after the median fill.
df.isna().sum()

In [None]:
# Summary statistics again, now that the missing values have been filled.
df.describe()

In [None]:
# Histogram of every column after imputation, on a 12x12 figure.
plt.rcParams.update({'figure.figsize': [12, 12]})
df.hist()
plt.show()

In [None]:
# Count of distinct values in each column.
df.nunique()

In [None]:
# Frequency of each value of the 'Potability' target.
print(df['Potability'].value_counts())


In [None]:
# Bar chart of the 'Potability' value frequencies.
sns.countplot(data=df, x='Potability')

In [None]:
# Pairwise scatter plots (and per-column distributions) for all columns.
sns.pairplot(df)

In [None]:
# Annotated heat map of the pairwise column correlations.
plt.figure(figsize=(10, 5))
sns.heatmap(
    data=df.corr(),
    cmap="YlGnBu",
    annot=True,
)

In [None]:
# Hardness and pH have a high correlation
# Sulfate and Solids have a low correlation

In [None]:
# Predictors: every column except the target; target: the 'Potability' column.
X = df.drop(columns='Potability')
y = df.loc[:, 'Potability']

In [None]:
# Hold out 30 % of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=20
)

In [None]:
# Learn the scaling parameters from the training split only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Logistic regression

In [None]:
# Logistic regression with default settings: fit on the training split, report on the test split.
lr = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_lr))

In [None]:
# Confusion-matrix heat map for the logistic-regression predictions.
y_act = y_test  # true labels
y_pred = y_pred_lr  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_log = (tp + tn) / (tp + tn + fp + fn)
acc_log

#### Decision Tree 

In [None]:
# Decision tree (seeded): fit on the training split, report on the test split.
dt = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_dt))

In [None]:
# Confusion-matrix heat map for the decision-tree predictions.
y_act = y_test  # true labels
y_pred = y_pred_dt  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_dec = (tp + tn) / (tp + tn + fp + fn)
acc_dec

#### Random forest

In [None]:
# Random forest (seeded): fit on the training split, report on the test split.
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

In [None]:
# Confusion-matrix heat map for the random-forest predictions.
y_act = y_test  # true labels
y_pred = y_pred_rf  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_ran = (tp + tn) / (tp + tn + fp + fn)
acc_ran

#### XG BOOST

In [None]:
# XGBoost classifier (seeded): fit on the training split, report on the test split.
xgb = XGBClassifier(random_state=1).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_xgb))

In [None]:
# Confusion-matrix heat map for the XGBoost predictions.
y_act = y_test  # true labels
y_pred = y_pred_xgb  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_XG = (tp + tn) / (tp + tn + fp + fn)
acc_XG

#### ADA BOOST

In [None]:
# AdaBoost (seeded): fit on the training split, report on the test split.
ada = AdaBoostClassifier(random_state=1).fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_ada))

In [None]:
# Confusion-matrix heat map for the AdaBoost predictions.
y_act = y_test  # true labels
y_pred = y_pred_ada  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_Ada = (tp + tn) / (tp + tn + fp + fn)
acc_Ada

#### GRADIENT BOOSTING

In [None]:
# Gradient boosting (seeded): fit on the training split, report on the test split.
gra = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
y_pred_gra = gra.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_gra))

In [None]:
# Confusion-matrix heat map for the gradient-boosting predictions.
y_act = y_test  # true labels
y_pred = y_pred_gra  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_GB = (tp + tn) / (tp + tn + fp + fn)
acc_GB

In [None]:
#Random forest ranks first with accuracy of 0.6673448626653102

In [None]:
# Let's treat the outliers and then re-fit the models to see how they fare

In [None]:
def outlier_treatment(column, data=None):
    """Return the IQR-based outlier limits for one column.

    Despite its name, this function does not modify any data; it only
    computes the lower and upper limits (1.5 * IQR beyond the quartiles).

    Parameters
    ----------
    column : label
        Name of the column to analyse.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``
        so existing single-argument calls keep working.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)``, i.e. ``(Q1 - 1.5*IQR, Q3 + 1.5*IQR)``.
    """
    # Fall back to the global frame only when no explicit frame is supplied.
    frame = df if data is None else data
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    ul = q3 + 1.5 * iqr
    ll = q1 - 1.5 * iqr
    return ll, ul

In [None]:
# Drop, column by column, every row lying outside that column's IQR limits.
# The limits are recomputed on the already-filtered frame at each step.
for i in df.columns[:9]:
    lower, upper = outlier_treatment(i)
    is_outlier = df[i].lt(lower) | df[i].gt(upper)
    df = df.loc[~is_outlier]

In [None]:
# Box plots of the non-object columns among the first nine, after the outlier filter.
plt.figure(figsize=(20, 20))
plottable = [col for col in df.columns[:9] if df[col].dtypes != 'object']
for slot, col in enumerate(plottable, start=1):
    plt.subplot(4, 4, slot)
    sns.boxplot(x=df[col])

In [None]:
# (rows, columns) remaining after the IQR filtering loop.
df.shape

In [None]:
# Summary statistics of the filtered frame.
df.describe()

In [None]:
# Rebuild predictors and target from the outlier-filtered frame.
X = df.drop(columns='Potability')
y = df.loc[:, 'Potability']

In [None]:
# Same 70/30 split and seed as before, now on the filtered data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=20
)

In [None]:
# Learn the scaling parameters from the training split only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Logistic regression

In [None]:
# Logistic regression with default settings, refitted on the filtered data.
lr = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_lr))

In [None]:
# Confusion-matrix heat map for the logistic-regression predictions.
y_act = y_test  # true labels
y_pred = y_pred_lr  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_log1 = (tp + tn) / (tp + tn + fp + fn)
acc_log1

#### Decision Tree 

In [None]:
# Decision tree (seeded), refitted on the filtered data.
dt = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_dt))

In [None]:
# Confusion-matrix heat map for the decision-tree predictions.
y_act = y_test  # true labels
y_pred = y_pred_dt  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_dec1 = (tp + tn) / (tp + tn + fp + fn)
acc_dec1

#### Random forest

In [None]:
# Random forest (seeded), refitted on the filtered data.
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

In [None]:
# Confusion-matrix heat map for the random-forest predictions.
y_act = y_test  # true labels
y_pred = y_pred_rf  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_ran1 = (tp + tn) / (tp + tn + fp + fn)
acc_ran1

#### XG BOOST

In [None]:
# XGBoost classifier (seeded), refitted on the filtered data.
xgb = XGBClassifier(random_state=1).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_xgb))

In [None]:
# Confusion-matrix heat map for the XGBoost predictions.
y_act = y_test  # true labels
y_pred = y_pred_xgb  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_XG1 = (tp + tn) / (tp + tn + fp + fn)
acc_XG1

#### ADA BOOST

In [None]:
# AdaBoost (seeded), refitted on the filtered data.
ada = AdaBoostClassifier(random_state=1).fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_ada))

In [None]:
# Confusion-matrix heat map for the AdaBoost predictions.
y_act = y_test  # true labels
y_pred = y_pred_ada  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_Ada1 = (tp + tn) / (tp + tn + fp + fn)
acc_Ada1

#### GRADIENT BOOSTING

In [None]:
# Gradient boosting (seeded), refitted on the filtered data.
gra = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
y_pred_gra = gra.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_gra))

In [None]:
# Confusion-matrix heat map for the gradient-boosting predictions.
y_act = y_test  # true labels
y_pred = y_pred_gra  # model predictions
# fmt='d' prints the integer counts as-is; the default '.2g' would show
# counts of 100 or more in scientific notation. `linewidths` is the
# documented heatmap parameter for the cell separators.
sns.heatmap(confusion_matrix(y_act, y_pred), annot=True, fmt='d', linewidths=0.1)

In [None]:
# Pull the four cells out of the confusion matrix (row = actual, column = predicted).
cnf_mat = confusion_matrix(y_true=y_act, y_pred=y_pred)
(tn, fp), (fn, tp) = cnf_mat[:2, :2]
tn, tp, fp, fn

In [None]:
# Accuracy: correctly classified test rows over all test rows.
acc_GB1 = (tp + tn) / (tp + tn + fp + fn)
acc_GB1

In [None]:
# again random forest has accuracy of 0.6353383458646616

In [None]:
# Side-by-side accuracy of every model before and after the outlier filter.
models = pd.DataFrame(
    {
        'Model': [
            'Logistic Regression', 'Decision Tree', 'Random Forest',
            'ADA Boost', 'Gredient Boost', 'XG Boost',
        ],
        'Accuracy before outliers treatment': [
            acc_log, acc_dec, acc_ran, acc_Ada, acc_GB, acc_XG,
        ],
        'Accuracy after outliers treatment': [
            acc_log1, acc_dec1, acc_ran1, acc_Ada1, acc_GB1, acc_XG1,
        ],
    }
)
# Display the table ranked by pre-treatment accuracy; `models` itself stays unsorted.
models.sort_values('Accuracy before outliers treatment', ascending=False)

In [None]:
# Write the comparison table to disk without the index column.
# NOTE(review): the file name mentions an SVM, but no SVM is fitted in this notebook — confirm the intended name.
models.to_csv(path_or_buf="svm_linear.csv", index=False)