In [None]:
# Cell 1: Install Libraries
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost imbalanced-learn

In [None]:
# Cell 2: Main Analysis Script
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, classification_report
from sklearn.impute import SimpleImputer

# Import ML algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# Import ensemble methods
from sklearn.ensemble import VotingClassifier

# Import for handling imbalanced data
from imblearn.over_sampling import SMOTE

# Load the dataset.
# Only the read itself sits inside the try so that unrelated errors are not
# mistaken for a missing file.
try:
    df = pd.read_csv('water_potability.csv')
except FileNotFoundError:
    print("Error: water_potability.csv not found. Please ensure the file is uploaded to the Colab session.")
    # `exit()` is the interactive-shell helper injected by `site`; it is not
    # meant for programs and reports success (status 0) to the caller.
    # Raising SystemExit stops execution here and signals failure instead.
    raise SystemExit(1) from None
else:
    print("Dataset loaded successfully.")

# ---------------------------------
# 1. Raw Data & 2. Exploratory Data Analysis (EDA)
# ---------------------------------
# Peek at the raw frame before anything is modified.
print("\n--- 1. Raw Data ---")
print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset Information:")
df.info()

print("\n--- 2. Exploratory Data Analysis (EDA) ---")
print("\nChecking for Null/Missing Values:")
print(df.isna().sum())

# Fill every missing cell with its column mean. The imputer is fitted on the
# whole frame (all columns, all rows) and the result is rebuilt as a DataFrame
# with the original column names.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)
print("\nMissing values after imputation:")
print(df_imputed.isna().sum())

print("\nDescriptive Statistics:")
print(df_imputed.describe())

print("\nGenerating visualizations...")

# Class balance of the target column.
_, count_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_imputed, x='Potability', ax=count_ax)
count_ax.set_title('Distribution of Water Potability')
count_ax.set_xlabel('Potability (0 = Not Potable, 1 = Potable)')
count_ax.set_ylabel('Count')
plt.show()

# One histogram per column; the rect leaves room for the figure-level title.
df_imputed.hist(bins=20, figsize=(15, 12))
plt.suptitle('Histograms of All Features')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Pairwise correlations between all columns, annotated to two decimals.
correlations = df_imputed.corr()
_, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Features')
plt.show()

# ---------------------------------
# 3. Train-Test Split
# ---------------------------------
print("\n--- 3. Train-Test Split ---")
# Start from the raw frame rather than a frame imputed with whole-dataset
# means, so that no statistic computed from test rows reaches the training
# data. Rows without a label cannot be used for supervised learning, so they
# are dropped instead of having a label invented for them.
labelled = df.dropna(subset=['Potability'])
X = labelled.drop('Potability', axis=1)
y = labelled['Potability'].astype(int)

potability_counts = y.value_counts()
print(f"Potability distribution:\n{potability_counts}")

# Split BEFORE any imputation or resampling: the test set must contain only
# real, untouched observations.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Learn the column means from the training split only and apply them to both
# splits.
train_imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(train_imputer.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(train_imputer.transform(X_test), columns=X.columns, index=X_test.index)

# The split is stratified, so the overall class ratio also describes the
# training split. Oversample the training data only; resampling before the
# split would put synthetic neighbours of test rows into training and
# synthetic rows into the test set, inflating every reported metric.
if abs(potability_counts[0] - potability_counts[1]) / len(y) > 0.1:
    print("\nDataset is imbalanced. Applying SMOTE for upsampling.")
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print(f"New distribution after SMOTE:\n{y_train.value_counts()}")

# ---------------------------------
# 4. Preprocessing and Scaling
# ---------------------------------
print("\n--- 4. Preprocessing and Scaling ---")
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))
print("Data has been scaled using StandardScaler.")

# ---------------------------------
# 6. ML Algorithms
# ---------------------------------
print("\n--- 6. Training ML Algorithms ---")
# Baseline classifiers, keyed by the display name used in the results table.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    # probability=True is what makes predict_proba available on SVC.
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    # `use_label_encoder` is deprecated in XGBoost and only triggers an
    # "unused parameter" warning on current releases, so it is not passed.
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42),
}

# name -> {'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC', 'Model'}
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # AUC needs a score for the positive class; check for predict_proba once
    # and record NaN (not a string) when it is missing so the metric column
    # stays numeric.
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        auc_roc = roc_auc_score(y_test, y_pred_proba)
    else:
        auc_roc = np.nan

    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC-ROC': auc_roc,
        'Model': model,
    }

# ---------------------------------
# 7. Hyperparameter Tuning (Example on Random Forest)
# ---------------------------------
print("\n--- 7. Hyperparameter Tuning (Example on Random Forest) ---")
# Exhaustive search over 2 x 3 = 6 forest configurations, scored by accuracy
# with 3-fold cross-validation on the scaled training data.
# NOTE(review): if SMOTE ran upstream, the training data contains synthetic
# rows, so the cross-validation scores may be optimistic.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
}
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train_scaled, y_train)

# Register the winning forest next to the baseline models.
best_rf = grid_search_rf.best_estimator_
models['Tuned Random Forest'] = best_rf

# Score the tuned forest on the held-out split with the same metric set.
y_pred_tuned_rf = best_rf.predict(X_test_scaled)
y_pred_proba_tuned_rf = best_rf.predict_proba(X_test_scaled)[:, 1]
tuned_rf_scorers = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
)
tuned_rf_row = {metric: scorer(y_test, y_pred_tuned_rf) for metric, scorer in tuned_rf_scorers}
tuned_rf_row['AUC-ROC'] = roc_auc_score(y_test, y_pred_proba_tuned_rf)
tuned_rf_row['Model'] = best_rf
results['Tuned Random Forest'] = tuned_rf_row

# ---------------------------------
# 8 & 10. Ensemble Technique (Voting Classifier)
# ---------------------------------
print("\n--- 8 & 10. Designing Ensemble Technique (Voting Classifier) ---")
# Combine logistic regression, the tuned random forest and XGBoost in a
# soft-voting ensemble.
estimators = [
    ('lr', models['Logistic Regression']),
    ('rf', best_rf),
    ('xgb', models['XGBoost']),
]
voting_clf = VotingClassifier(estimators=estimators, voting='soft')
voting_clf.fit(X_train_scaled, y_train)
models['Ensemble (Voting)'] = voting_clf

# Score the ensemble on the held-out split with the same metric set.
y_pred_voting = voting_clf.predict(X_test_scaled)
y_pred_proba_voting = voting_clf.predict_proba(X_test_scaled)[:, 1]
voting_scorers = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
)
voting_row = {metric: scorer(y_test, y_pred_voting) for metric, scorer in voting_scorers}
voting_row['AUC-ROC'] = roc_auc_score(y_test, y_pred_proba_voting)
voting_row['Model'] = voting_clf
results['Ensemble (Voting)'] = voting_row

# ---------------------------------
# 9 & 11. Model Evaluation and Comparative Analysis
# ---------------------------------
print("\n--- 9 & 11. Model Evaluation and Comparative Analysis ---")
# `results` mixes floats with estimator objects, so the frame built from it is
# object dtype even after the 'Model' column is dropped. idxmax() on an object
# column raises "reduction operation 'argmax' not allowed for this dtype" on
# pandas versions that disallow it, so convert the metric columns to real
# numbers first. Any non-numeric placeholder becomes NaN.
results_df = (
    pd.DataFrame(results).T
    .drop(columns=['Model'])
    .apply(pd.to_numeric, errors='coerce')
)
print(results_df.sort_values(by='F1-Score', ascending=False))

# The model with the highest F1 score on the held-out split gets the detailed
# report and the confusion matrix.
best_model_name = results_df['F1-Score'].idxmax()
best_model = results[best_model_name]['Model']

print(f"\nClassification Report for the Best Model ({best_model_name}):")
y_pred_best = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred_best))

plt.figure(figsize=(6, 5))
cm = confusion_matrix(y_test, y_pred_best)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Potable', 'Potable'], yticklabels=['Not Potable', 'Potable'])
plt.title(f'Confusion Matrix for {best_model_name}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Overlay one ROC curve per model that can produce class probabilities.
plt.figure(figsize=(10, 8))
for name, values in results.items():
    model = values['Model']
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        auc = roc_auc_score(y_test, y_pred_proba)
        plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")

# Diagonal reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend(loc='lower right')
plt.grid()
plt.show()