In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory, one path per line.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the water-potability CSV from the Kaggle input mount into the raw frame.
DATASET_CSV = '/kaggle/input/water-potability/water_potability.csv'
data = pd.read_csv(DATASET_CSV)

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Summarise the raw frame: column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values per column, largest first.
missing_counts = data.isna().sum()
missing_counts.sort_values(ascending=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how many rows fall into each Potability value.
sns.countplot(data=data, x='Potability')

In [None]:
# One box plot per feature, grouped by Potability, laid out side by side.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 10))
for panel, feature in zip(axes, ('ph', 'Sulfate', 'Trihalomethanes')):
    sns.boxplot(data=data, x='Potability', y=feature, ax=panel)

In [None]:
# Work on a copy so the raw `data` frame keeps its missing values.
# Each listed column has its gaps filled with that column's median,
# computed over all rows of `data`.
df = data.copy()
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    column_median = data[column].median()
    df[column] = data[column].fillna(column_median)

In [None]:
# Preview the first rows of the imputed copy.
df.head()

In [None]:
import warnings
# Silence only FutureWarning noise; a blanket 'ignore' would also hide every
# other warning category raised by later cells.
warnings.filterwarnings('ignore', category=FutureWarning)

# One histogram + KDE per feature on a 3x3 grid, drawn from the raw frame.
# `sns.distplot` is deprecated, so `sns.histplot` is used instead, with density
# scaling and the KDE extended past the data range (cut=3) to reproduce the
# look of the old function.
feature_grid = [
    ['ph', 'Hardness', 'Solids'],
    ['Chloramines', 'Sulfate', 'Conductivity'],
    ['Organic_carbon', 'Trihalomethanes', 'Turbidity'],
]
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
for axes_row, feature_row in zip(axes, feature_grid):
    for ax, feature in zip(axes_row, feature_row):
        sns.histplot(data=data, x=feature, kde=True, stat='density',
                     kde_kws={'cut': 3}, ax=ax)

In [None]:
# Pairwise scatter/distribution grid of the raw columns, coloured by Potability.
sns.pairplot(data=data, hue='Potability')

In [None]:
# Separate the imputed frame into the target column and the remaining features.
y = df['Potability']
X = df.drop(columns=['Potability'])

In [None]:
# Annotated heatmap of the pairwise correlations between feature columns.
feature_corr = X.corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(feature_corr, annot=True, cmap='copper_r', ax=corr_ax)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw (un-imputed) features so that fill values can be learned from
# the training rows only. `y`, test_size and random_state are unchanged, so the
# rows land in the same partitions as when splitting the pre-imputed `X`.
X_raw = data.drop(columns=['Potability'])
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.3, random_state=101
)

# Impute with training-set medians. Medians computed over the whole dataset
# would let test-set values influence preprocessing (data leakage).
# Any numeric column with gaps is filled, not just a hard-coded subset.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score,accuracy_score

In [None]:
# Fixed seed for the estimators that draw random numbers during fitting, so the
# ranking produced below is reproducible across reruns.
MODEL_SEED = 42

# Baseline candidates, all with default hyperparameters.
models = [
    ("LR", LogisticRegression()),
    ("SVC", SVC()),
    ("KNN", KNeighborsClassifier()),
    ("DTC", DecisionTreeClassifier(random_state=MODEL_SEED)),
    ("SGDC", SGDClassifier(random_state=MODEL_SEED)),
    ("Ridge", RidgeClassifier()),
    ("RF", RandomForestClassifier(random_state=MODEL_SEED)),
]

results = []       # scores, in `models` order
names = []         # model labels, in `models` order
finalResults = []  # (label, score) pairs; sorted best-first after the loop

for name, model in models:
    model.fit(X_train, y_train)
    model_results = model.predict(X_test)
    # Macro average: unweighted mean of the per-class precision values.
    score = precision_score(y_test, model_results, average='macro')
    results.append(score)
    names.append(name)
    finalResults.append((name, score))

# Highest precision first.
finalResults.sort(key=lambda k: k[1], reverse=True)

In [None]:
# Tabulate the (model, score) pairs in the order they appear in `finalResults`.
FR = pd.DataFrame(
    {
        'Model': [label for label, _ in finalResults],
        'Score': [value for _, value in finalResults],
    }
)
FR

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for the random-forest hyperparameter search, keyed by the
# estimator argument each list applies to.
hyperF = {
    'n_estimators': [100, 300, 500, 800, 1200],
    'max_depth': [5, 10, 15, 25, 30],
    'min_samples_split': [2, 5, 10, 15, 100],
    'min_samples_leaf': [1, 2, 5, 10],
}

# Keep the individual lists available under their own names as well.
n_estimators = hyperF['n_estimators']
max_depth = hyperF['max_depth']
min_samples_split = hyperF['min_samples_split']
min_samples_leaf = hyperF['min_samples_leaf']

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised search over `hyperF`: 10 sampled parameter combinations, each
# evaluated with 5-fold cross-validation.
# The forest itself is seeded too: `random_state` on RandomizedSearchCV only
# fixes which combinations are sampled, not the randomness inside each
# RandomForestClassifier fit, so without it reruns give different results.
rf_random = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    hyperF,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the hyperparameter search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Show the parameter combination the search selected.
rf_random.best_params_

In [None]:
# Predict on the held-out test split with the fitted search object.
y_pred_random = rf_random.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of `cm` are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred_random)
print("Confusion Matrix : ")
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' format would
# render counts of three or more digits in scientific notation.
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='BuGn')
cm_ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()
print(f"Accuracy Score : {accuracy_score(y_test, y_pred_random)}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Report precision, recall and F1 of the tuned model on the test split,
# one line per metric.
test_metrics = {
    'Precision Score': precision_score,
    'Recall Score': recall_score,
    'f1 Score': f1_score,
}
for metric_label, metric_fn in test_metrics.items():
    print(f'{metric_label} : {metric_fn(y_test, y_pred_random)}')