## Water Quality
- *Predict whether water is safe for human consumption*

## Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

## Reading Data

In [None]:
# Load the water-potability CSV into a DataFrame and preview the first rows.
csv_path = "/kaggle/input/water-potability/water_potability.csv"
df = pd.read_csv(csv_path)
df.head()

## Size of data

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

## Target feature

In [None]:
# Distinct values that occur in the target column.
df["Potability"].unique()

## Null values check

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## Replace Null values
- With zeros

In [None]:
# Replace missing values with 0 in the three columns listed below.
# NOTE(review): 0 is not a plausible measurement for these quantities (e.g.
# pH); a median/mean fill may distort the distributions less — confirm.
for column in ("ph", "Sulfate", "Trihalomethanes"):
    df[column] = df[column].fillna(0)

- No null present now.

In [None]:
# Re-count missing values per column after the fill above.
df.isnull().sum()

## Outlier Check

In [None]:
def lowerupper(col):
    """Return the values of ``col`` lying outside the 1.5 * IQR fences.

    Args:
        col: Iterable of numeric values (list, NumPy array, ...).

    Returns:
        A list, in input order, of every value strictly below
        ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
    """
    first_quartile = np.quantile(col, .25)
    third_quartile = np.quantile(col, .75)
    whisker = (third_quartile - first_quartile) * 1.5
    lower_fence = first_quartile - whisker
    upper_fence = third_quartile + whisker
    return [
        value
        for value in col
        if value > upper_fence or value < lower_fence
    ]

In [None]:
# Keep the column index in `col`; the outlier loops below iterate over it.
col=df.columns
print(col)

In [None]:
# Report how many IQR outliers each column contains.
for column in col:
    flagged = lowerupper(df[column].values)
    print(len(flagged), "outliers are present in", column)

## Replacing Outlier
- With Upper limit and lower limit of that particular column.

In [None]:
def lowerupper_b(col):
    """Compute the 1.5 * IQR outlier fences for ``col``.

    Args:
        col: Array-like of numeric values.

    Returns:
        A ``(lowerbound, upperbound)`` tuple equal to
        ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)``.
    """
    first_quartile = np.quantile(col, .25)
    third_quartile = np.quantile(col, .75)
    whisker = (third_quartile - first_quartile) * 1.5
    return first_quartile - whisker, third_quartile + whisker

In [None]:
# Cap every column's outliers at its 1.5 * IQR fences.
for i in col:
    outlier = lowerupper(df[i].values)
    # lowerupper returns a list, so "no outliers" means an empty list;
    # comparing the list with 0 was always False and this branch never ran.
    if not outlier:
        print("No Outlier is present")
    else:
        lb, ub = lowerupper_b(df[i])
        # Values below the lower fence become the fence value, and likewise
        # for values above the upper fence; everything else is untouched.
        df[i] = np.where(df[i] < lb, lb, df[i])
        df[i] = np.where(df[i] > ub, ub, df[i])

- No outlier present now.

In [None]:
# Count the IQR outliers per column again, after the capping step.
for column in col:
    remaining = lowerupper(df[column].values)
    print(len(remaining), "outliers are present in", column)

## Descriptive statistical values

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# Preview the first rows of the cleaned DataFrame.
df.head()

## Boxplot

In [None]:
# Side-by-side boxplots of ph, Hardness and Solids.
for position, feature in enumerate(("ph", "Hardness", "Solids"), start=1):
    plt.subplot(1, 3, position)
    if position == 1:
        # Adjust the figure layout once, right after the first axes exists.
        plt.subplots_adjust(left=0, right=3, bottom=1, top=2, wspace=0.2, hspace=0.4)
    plt.title(feature)
    plt.boxplot(df[feature])

plt.show()

In [None]:
# Side-by-side boxplots of Chloramines, Sulfate and Conductivity.
for position, feature in enumerate(("Chloramines", "Sulfate", "Conductivity"), start=1):
    plt.subplot(1, 3, position)
    if position == 1:
        # Adjust the figure layout once, right after the first axes exists.
        plt.subplots_adjust(left=0, right=3, bottom=1, top=2, wspace=0.2, hspace=0.4)
    plt.title(feature)
    plt.boxplot(df[feature])

plt.show()

In [None]:
# Side-by-side boxplots of Organic_carbon, Trihalomethanes and Turbidity.
for position, feature in enumerate(("Organic_carbon", "Trihalomethanes", "Turbidity"), start=1):
    plt.subplot(1, 3, position)
    if position == 1:
        # Adjust the figure layout once, right after the first axes exists.
        plt.subplots_adjust(left=0, right=3, bottom=1, top=2, wspace=0.2, hspace=0.4)
    plt.title(feature)
    plt.boxplot(df[feature])

plt.show()

## Histogram
- ph
- Hardness
- Solids

In [None]:
# Side-by-side histograms of ph, Hardness and Solids.
for position, feature in enumerate(("ph", "Hardness", "Solids"), start=1):
    plt.subplot(1, 3, position)
    if position == 1:
        # Adjust the figure layout once, right after the first axes exists.
        plt.subplots_adjust(left=0, right=3, bottom=1, top=2, wspace=0.2, hspace=0.4)
    plt.title(feature)
    plt.hist(df[feature])

plt.show()

- Chloramines
- Sulfate
- Conductivity

In [None]:
# Side-by-side histograms of Chloramines, Sulfate and Conductivity.
for position, feature in enumerate(("Chloramines", "Sulfate", "Conductivity"), start=1):
    plt.subplot(1, 3, position)
    if position == 1:
        # Adjust the figure layout once, right after the first axes exists.
        plt.subplots_adjust(left=0, right=3, bottom=1, top=2, wspace=0.2, hspace=0.4)
    plt.title(feature)
    plt.hist(df[feature])

plt.show()

- Organic_carbon
- Trihalomethanes
- Turbidity

In [None]:
# Side-by-side histograms of Organic_carbon, Trihalomethanes and Turbidity.
for position, feature in enumerate(("Organic_carbon", "Trihalomethanes", "Turbidity"), start=1):
    plt.subplot(1, 3, position)
    if position == 1:
        # Adjust the figure layout once, right after the first axes exists.
        plt.subplots_adjust(left=0, right=3, bottom=1, top=2, wspace=0.2, hspace=0.4)
    plt.title(feature)
    plt.hist(df[feature])

plt.show()

## Pie Chart
- Potability

In [None]:
# Pie chart of how many rows fall into each Potability class.
la = [0, 1]
class_counts = df.groupby(df["Potability"]).size()
plt.figure()
plt.title("Potability")
plt.pie(class_counts, labels=la)
plt.show()

In [None]:
# Row count per Potability value (the numbers behind the pie chart).
print(df.groupby(df["Potability"]).size())

## Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(10, 10))
corr = df.corr()
sb.heatmap(data=corr, annot=True)
plt.show()

In [None]:
# List the column names to pick the feature set from.
df.columns

In [None]:
# Feature matrix (the nine measurement columns) and target vector.
feature_names = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]
X = df[feature_names]
y = df['Potability']

## Random Over Sampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Resample X and y with a seeded RandomOverSampler and show the new shapes.
# The sampler gets its own name so it no longer rebinds `os`, which the
# first cell imported as the standard-library module.
oversampler = RandomOverSampler(random_state=100)
X_train_res, Y_train_res = oversampler.fit_resample(X, y)
X_train_res.shape, Y_train_res.shape

## Splitting Data

In [None]:
# Hold out 30% of the rows for testing (seeded), then show each part's shape.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.30, random_state=100
)
for part in (train_x, test_x, train_y, test_y):
    print(part.shape)

## Standard Scaler

In [None]:
# Learn the scaling statistics from the training split only and reuse them on
# the test split. Refitting on the test data would scale the two splits with
# different means/variances and leak test-set information into preprocessing.
sc = StandardScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.transform(test_x)

## Logistic Regression

In [None]:
# Fit a default logistic regression on the scaled training data and predict
# labels for the scaled test data.
model_lr2 = LogisticRegression()
model_lr2.fit(train_x, train_y)
prd_lr2 = model_lr2.predict(test_x)

## Accuracy

In [None]:
# Share of test rows whose predicted label equals the true label.
test_accuracy = accuracy_score(y_true=test_y, y_pred=prd_lr2)
print("The Accuracy of the model is : ", test_accuracy)

## Classification report

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts predictions rather
# than true labels.
print("Classification report :  \n", classification_report(test_y, prd_lr2))

## Hyperparameter Optimization 
- Randomized search CV

In [None]:
# Candidate values sampled by the randomized hyperparameter search.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [None]:
# Base XGBoost classifier with default settings; the search below tunes it.
classifier=xgboost.XGBClassifier()

In [None]:
# Randomized search: 5 sampled parameter combinations, each scored by ROC AUC
# with 5-fold cross-validation. random_state pins which combinations are
# drawn, so reruns of the notebook select the same candidates (the sampler
# and the train/test split already use the same seed).
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=100,
)

In [None]:
# Run the search on the oversampled data.
# NOTE(review): X_train_res / Y_train_res were oversampled before any split,
# so copies of the same row can land in both the fitting and validation folds;
# the reported CV AUC is likely optimistic — confirm and consider resampling
# inside each fold instead.
random_search.fit(X_train_res, Y_train_res)

## Best Estimator

In [None]:
# Estimator refit with the best parameter combination found by the search.
random_search.best_estimator_

## Best Parameters

In [None]:
# Parameter combination that achieved the best mean CV score.
random_search.best_params_

In [None]:
# XGBoost classifier with pinned hyperparameters.
# NOTE(review): these values look like they were copied from an earlier
# search result; they are not tied to `random_search` above — confirm.
tuned_settings = {
    "base_score": 0.5,
    "booster": 'gbtree',
    "colsample_bylevel": 1,
    "colsample_bynode": 1,
    "colsample_bytree": 0.5,
    "gamma": 0.3,
    "gpu_id": -1,
    "importance_type": 'gain',
    "learning_rate": 0.1,
    "max_delta_step": 0,
    "max_depth": 15,
    "min_child_weight": 5,
    "n_estimators": 100,
    "n_jobs": 4,
    "num_parallel_tree": 1,
    "random_state": 0,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "scale_pos_weight": 1,
    "subsample": 1,
    "tree_method": 'exact',
    "validate_parameters": 1,
    "verbosity": None,
}
classifier = xgboost.XGBClassifier(**tuned_settings)

In [None]:
# 10-fold cross-validation of the tuned classifier.
# Oversampling now happens inside each training fold (imblearn pipelines apply
# samplers during fit only), and the folds are drawn from the original X / y.
# Scoring on data that was oversampled up front let duplicated rows appear in
# both the training and validation folds, which inflates the score.
from imblearn.pipeline import make_pipeline

cv_pipeline = make_pipeline(RandomOverSampler(random_state=100), classifier)
score = cross_val_score(cv_pipeline, X, y, cv=10)

In [None]:
# Per-fold cross-validation scores.
score

## Score

In [None]:
# Average score across the cross-validation folds.
score.mean()