In [None]:
# Load the water-potability CSV into `water` and preview the first rows.
import pandas as pd
water=pd.read_csv('../input/water-potability/water_potability.csv')
water.head()

In [None]:
# Column dtypes and non-null counts, to spot which columns have missing values.
water.info()

In [None]:
# Summary statistics for the full dataset.
water.describe()

In [None]:
# Rows with Potability == 0. `.copy()` makes this an independent frame, so
# filling its missing values later does not raise SettingWithCopyWarning.
Potability_0 = water[water.Potability == 0].copy()
Potability_0.head()

In [None]:
# Percentage of missing values per column among the Potability == 0 rows,
# rounded to two decimals and listed from most to least missing.
(
    Potability_0.isnull()
    .sum()
    .mul(100)
    .div(len(Potability_0))
    .round(2)
    .sort_values(ascending=False)
)

In [None]:
# Summary statistics for the Potability == 0 rows before imputation.
Potability_0.describe()

#### Treating missing values for Potability_0 records

In [None]:
# Fill each column's missing values with that column's median over the
# Potability == 0 rows. Reassigning (rather than `inplace=True`) avoids mutating
# a frame that was sliced out of `water`, which raises SettingWithCopyWarning.
# NOTE(review): the medians are class-conditional and are computed before the
# modeling/unseen split further down, so hold-out rows are imputed using their
# labels -- consider imputing inside the modeling pipeline instead.
Potability_0 = Potability_0.fillna(Potability_0.median())
Potability_0.describe()

In [None]:
# Rows with Potability == 1. `.copy()` makes this an independent frame, so
# filling its missing values later does not raise SettingWithCopyWarning.
Potability_1 = water[water.Potability == 1].copy()
Potability_1.head()

In [None]:
# Percentage of missing values per column among the Potability == 1 rows,
# rounded to two decimals and listed from most to least missing.
(
    Potability_1.isnull()
    .sum()
    .mul(100)
    .div(len(Potability_1))
    .round(2)
    .sort_values(ascending=False)
)

#### Treating missing values for Potability_1 records

In [None]:
# Fill each column's missing values with that column's median over the
# Potability == 1 rows. Reassigning (rather than `inplace=True`) avoids mutating
# a frame that was sliced out of `water`, which raises SettingWithCopyWarning.
# NOTE(review): the medians are class-conditional and are computed before the
# modeling/unseen split further down, so hold-out rows are imputed using their
# labels -- consider imputing inside the modeling pipeline instead.
Potability_1 = Potability_1.fillna(Potability_1.median())
Potability_1.describe()

In [None]:
import numpy as np

# Recombine the two imputed subsets and shuffle the rows.
# The permutation is drawn from a seeded generator (42, matching the seeds used
# elsewhere in this notebook) so the row order -- and therefore the 80/20 split
# sampled from it afterwards -- is identical on every Restart & Run All. The
# previous unseeded np.random.permutation produced a different order each run.
water = pd.concat([Potability_1, Potability_0], axis=0)
water = water.iloc[np.random.default_rng(42).permutation(len(water))]
water = water.reset_index(drop=True)
water.head()

In [None]:
# Number of distinct values in each column of the recombined dataset.
water.nunique()

In [None]:
# Share of each Potability class in the full dataset, as a percentage
# rounded to two decimals.
(
    water["Potability"]
    .value_counts()
    .mul(100)
    .div(len(water))
    .round(2)
)

In [None]:
# Hold out 20% of the rows as "unseen" data; the remaining 80% is used for modeling.
data = water.sample(frac=0.8, random_state=42)

# The unseen rows are whatever was not sampled; this must be computed while
# `data` still carries the original index labels from `water`.
data_unseen = water.drop(index=data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

print(f"Data for Modeling: {data.shape}")

In [None]:
# Report the size of the held-out split.
print(f"Unseen Data For Predictions: {data_unseen.shape}")

In [None]:
# Share of each Potability class in the modeling split, as a percentage
# rounded to two decimals.
(
    data["Potability"]
    .value_counts()
    .mul(100)
    .div(len(data))
    .round(2)
)

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import *

In [None]:
# Initialise the PyCaret classification experiment on the modeling split,
# with `Potability` as the target and a fixed session seed.
water_test= setup(data = data, target = 'Potability', session_id=42,
                  normalize = True, 
                  transformation = True, 
                  log_experiment = True,
                  handle_unknown_categorical = True, 
                  unknown_categorical_method = 'most_frequent',
                  remove_multicollinearity = True, # drop one of each pair of features that are highly correlated with each other
                  ignore_low_variance = True,# remove categorical features with statistically insignificant variance
                  combine_rare_levels = True,
                  fix_imbalance = True
                 )

In [None]:
# Train the model registered under the ID 'rf' in the experiment configured above.
rf = create_model('rf')

In [None]:
# Print the trained estimator's text representation.
print(rf)

In [None]:
# Tune `rf`, asking PyCaret to optimise for Precision.
tuned_rf = tune_model(rf,optimize = 'Precision')

In [None]:
# 'auc' plot for the tuned model.
plot_model(tuned_rf, plot = 'auc')

In [None]:
# 'pr' plot for the tuned model.
plot_model(tuned_rf, plot = 'pr')

In [None]:
# 'feature' plot for the tuned model.
plot_model(tuned_rf, plot='feature')

In [None]:
# Confusion matrix plot for the tuned model.
plot_model(tuned_rf, plot = 'confusion_matrix')

In [None]:
# Open PyCaret's evaluation view for the tuned model.
evaluate_model(tuned_rf)

In [None]:
# Predict with the tuned model; no `data` is passed here, so PyCaret chooses
# the evaluation set itself (presumably the hold-out split from setup() -- confirm).
predict_model(tuned_rf)

In [None]:
# Predict on the rows that were held out of modeling entirely and preview the result.
unseen_predictions = predict_model(tuned_rf, data=data_unseen)
unseen_predictions.head()

In [None]:
# Mean of the `Score` column over the unseen predictions, rounded to two decimals.
print(f"Confidence Score :   {round(unseen_predictions['Score'].mean(), 2)}")