In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from warnings import filterwarnings
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier,PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC,LinearSVC,NuSVC
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.ensemble import VotingClassifier
filterwarnings('ignore')

# Evaluation & CV Libraries
from sklearn.metrics import precision_score,accuracy_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,RepeatedStratifiedKFold

In [2]:
WQ = pd.read_csv("water_potability.csv")

In [3]:
# Impute the missing values
Before_imputation = WQ
#print dataset before imputaion
print("Data Before performing imputation\n",WQ)
  
# create an object for KNNImputer
imputer = KNNImputer(n_neighbors=4)
After_Imputation = imputer.fit_transform(Before_imputation)
WQI = pd.DataFrame(After_Imputation)
WQI.rename(columns = {0:'ph', 1:'Hardness', 2:'Solids', 3:'Chloramines', 4:'Sulfate', 5:'Conductivity', 6:'Organic_carbon', 7:'Trihalomethanes', 8:'Turbidity', 9:'Potability'}, inplace = True)
print("\n\nAfter performing imputation\n",WQI)

Data Before performing imputation
             ph    Hardness        Solids  Chloramines     Sulfate  \
0          NaN  204.890455  20791.318981     7.300212  368.516441   
1     3.716080  129.422921  18630.057858     6.635246         NaN   
2     8.099124  224.236259  19909.541732     9.275884         NaN   
3     8.316766  214.373394  22018.417441     8.059332  356.886136   
4     9.092223  181.101509  17978.986339     6.546600  310.135738   
...        ...         ...           ...          ...         ...   
3271  4.668102  193.681735  47580.991603     7.166639  359.948574   
3272  7.808856  193.553212  17329.802160     8.061362         NaN   
3273  9.419510  175.762646  33155.578218     7.350233         NaN   
3274  5.126763  230.603758  11983.869376     6.303357         NaN   
3275  7.874671  195.102299  17404.177061     7.509306         NaN   

      Conductivity  Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       564.308654       10.379783        86.990970   2.963

In [4]:
# Import Data Pre-processing Libraries
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

In [5]:
# RSeparate the data set columns in to dependant and independant variables
X = WQI.drop('Potability',axis=1).values
y = WQI['Potability'].values

# Split the dataset into train test parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [6]:
# Standardisation of data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_t = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#Sample = [[6, 250, 20000, 5, 259, 350, 6, 58, 4]]
#X_Sample = scaler.transform(Sample)
#print(X_Sample)

In [7]:
# import SMOTE module from imblearn library
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train_t, y_train.ravel())

In [8]:
# Defining model parameters
model_params = {
    'GB':
    {
        'model':GradientBoostingClassifier(),
        'params':
        {
            'learning_rate':[0.1],
            'n_estimators':[500],
            'max_features':['log2'],
            'max_depth':[9]
        }
    }
}


In [9]:
cv = RepeatedStratifiedKFold(n_splits=5,n_repeats=2)
scores=[]
for model_name,params in model_params.items():
    rs = RandomizedSearchCV(params['model'],params['params'],cv=cv,n_iter=20)
    rs.fit(X_train_res,y_train_res)
    rs_prediction = rs.predict(X_test)
    #scores_test = precision_score(y_test, rs_prediction,average='macro')
    scores.append([model_name,dict(rs.best_params_),rs.best_score_])
    #print(model_name,scores_test)
data=pd.DataFrame(scores,columns=['Model','          Parameters           ','Score'])
data.style.set_properties(subset=['Parameters'], **{'width': '400px'})
data

Unnamed: 0,Model,Parameters,Score
0,GB,"{'n_estimators': 500, 'max_features': 'log2', ...",0.717656


In [10]:
rs_prediction = rs.predict(X_test)
score_test_precision = precision_score(y_test, rs_prediction,average='macro')
score_test_accuracy = accuracy_score(y_test, rs_prediction)
print(model_name,score_test_precision, score_test_accuracy)

GB 0.6472004758615396 0.6678876678876678


In [11]:
import pickle 
pickle_out = open("classifier.pkl", mode = "wb") 
pickle.dump(rs, pickle_out) 
pickle_out.close()

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, rs_prediction))

              precision    recall  f1-score   support

         0.0       0.73      0.74      0.73       506
         1.0       0.57      0.55      0.56       313

    accuracy                           0.67       819
   macro avg       0.65      0.65      0.65       819
weighted avg       0.67      0.67      0.67       819



In [13]:
print(confusion_matrix(y_test, rs_prediction))

[[375 131]
 [141 172]]


In [14]:
Sample = [[6, 250, 20000, 5, 259, 350, 6, 58, 4],[2,370,40000, 12, 900,800,15,110,12]]
# Standardisation of data
#scaler = StandardScaler()
#scaler.fit(X_train)
X_Sample= scaler.transform(Sample)
X_Sample

array([[-0.71860171,  1.64439984, -0.23751439, -1.34421881, -1.96740767,
        -0.95757518, -2.52663952, -0.54923047,  0.02857098],
       [-3.41051783,  5.3261417 ,  2.04425412,  3.08064748, 14.98782357,
         4.63843525,  0.22124304,  2.73616009, 10.32625122]])

In [15]:
rs.predict(X_Sample)

array([1., 0.])