In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Import Dataset

In [None]:
# Load the water-potability CSV into a DataFrame and display it.
csv_path = '../input/water-potability/water_potability.csv'
df = pd.read_csv(csv_path)
df

## Exploratory Data Analysis (EDA)

In [None]:
# Distribution of the `ph` column.
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(figsize=...)` only leaves an empty figure behind
# and its size is ignored. The plot size is therefore set through
# `height` (inches per facet) and `aspect` (width / height) instead.
sns.displot(df['ph'], height=10, aspect=1)

In [None]:
# Grid of pairwise plots across the columns of the dataset.
sns.pairplot(df)

In [None]:
# Show the column labels of the DataFrame.
df.columns

In [None]:
# Bar plot of `ph` against the `Potability` class, drawn on an explicit 10x10 axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x='Potability', y='ph', data=df, ax=ax)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns.
correlations = df.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)

## Data Cleaning

In [None]:
# Number of missing values in each column before any cleaning.
missing_before = df.isna().sum()
missing_before

In [None]:
# Show the data type of each column.
df.dtypes

In [None]:
# Fill missing values: every numeric column that contains at least one NaN
# has its gaps replaced by that column's median.
# NOTE(review): the medians are computed on the full dataset, before the
# train/validation split further down, so validation rows influence the
# imputed values. Consider fitting the imputation on the training split only.
numeric_with_gaps = [
    col for col in df.columns
    if pd.api.types.is_numeric_dtype(df[col]) and df[col].isna().any()
]
for col in numeric_with_gaps:
    column_median = df[col].median()
    df[col] = df[col].fillna(column_median)

In [None]:
# Re-count the missing values per column now that the fill step has run.
missing_after = df.isna().sum()
missing_after

## Split the Dataset

In [None]:
# Separate the target column (`Potability`) from the feature columns.
y = df['Potability']
X = df.drop(columns='Potability')

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the split repeatable. The last line shows the size of each part.
from sklearn.model_selection import train_test_split

splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = splits
tuple(len(part) for part in splits)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix, classification_report

## Modelling

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier





In [None]:
# Candidate classifiers keyed by the display name used when reporting scores.
# Every estimator is created with its library-default settings.
models = {
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "SVM": LinearSVC(),
    # The key used to be "XGB " with a stray trailing space, which leaked
    # into the reported model names.
    "XGB": XGBClassifier(),
}

In [None]:
def fit_and_score(X_train, X_val, y_train, y_val, models, seed=42):
    """Fit every model on the training data and score it on the validation data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to each model's ``fit``.
    X_val, y_val
        Validation features and labels, passed to each model's ``score``.
    models : dict
        Mapping of display name -> estimator object exposing ``fit(X, y)``
        and ``score(X, y)``. The estimators are fitted in place.
    seed : int, default 42
        Value passed to ``np.random.seed`` once, before the first model is
        fitted. The default reproduces the previously hard-coded seed.

    Returns
    -------
    dict
        Mapping of each name in ``models`` to whatever that model's
        ``score(X_val, y_val)`` returned, in the iteration order of ``models``.

    Notes
    -----
    Calling this function reseeds NumPy's global random number generator.
    """
    # Seed once up front so estimators that draw from NumPy's global RNG give
    # repeatable results for a fixed ordering of `models`.
    np.random.seed(seed)

    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_val, y_val)
    return model_scores

In [None]:
models = fit_and_score(X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val, models=models)
models

In [None]:
model_comparision = pd.DataFrame()
model_comparision['Model Name'] = models.keys()
model_comparision['Accuracy'] = models.values()
model_comparision


In [None]:
# XGB gave good accuracy in the comparison, so it is refitted here as the
# final model and used to produce the validation predictions.
np.random.seed(42)
ideal_model = XGBClassifier()
ideal_model.fit(X_train, y_train)
# `y_preds` is consumed by the evaluation cell; a bare `y_preds` expression in
# the middle of a cell displays nothing, so it was dropped.
y_preds = ideal_model.predict(X_val)

# Score of the final model on the validation set (shown as the cell output).
ideal_model.score(X_val, y_val)



## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

In [None]:
# Headline metrics for the validation predictions.
print(f"Accuracy : {accuracy_score(y_val, y_preds)*100:.2f}%")
print(f"Precision : {precision_score(y_val, y_preds)}")
print(f"Recall : {recall_score(y_val, y_preds)}")
print(f"F1  : {f1_score(y_val, y_preds)}")
print("-"*100)
print("                       Classification_Report             ")
print(classification_report(y_val,y_preds))
print("-" * 100)

# Confusion matrix as an annotated heatmap.
# fmt='d' prints the cell counts as plain integers; seaborn's default
# annotation format ('.2g') would show counts of 100 or more in scientific
# notation (e.g. 3.5e+02).
conf_matrix = confusion_matrix(y_val, y_preds)
sns.heatmap(conf_matrix,
           annot=True,
           fmt='d',
           cmap='turbo')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label');