# Training Machine Learning Models

This notebook trains six machine learning models on the provided dataset, calculates evaluation metrics, and saves the models as pickle files.

In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Load the dataset
data = pd.read_csv('water_potability.csv')
data = data.dropna()

# Split the dataset
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'NaiveBayes': GaussianNB(),
    'KNN': KNeighborsClassifier()
}

# Train models and save metrics
metrics = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    metrics.append({'Model': name, 'Accuracy': acc, 'Precision': prec, 'F1_Score': f1})
    print(f'{name} - Accuracy: {acc}, Precision: {prec}, F1 Score: {f1}')

    # Save the model
    with open(f'{name}.pkl', 'wb') as f:
        pickle.dump(model, f)

# Save metrics to CSV
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('model_metrics.csv', index=False)

print("Models trained and metrics saved.")


RandomForest - Accuracy: 0.6771523178807947, Precision: 0.6607142857142857, F1 Score: 0.5323741007194245


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression - Accuracy: 0.6026490066225165, Precision: 0.8461538461538461, F1 Score: 0.08396946564885496
SVM - Accuracy: 0.5877483443708609, Precision: 0.0, F1 Score: 0.0
DecisionTree - Accuracy: 0.6109271523178808, Precision: 0.528, F1 Score: 0.5290581162324649
NaiveBayes - Accuracy: 0.6059602649006622, Precision: 0.5504587155963303, F1 Score: 0.33519553072625696
KNN - Accuracy: 0.5579470198675497, Precision: 0.45263157894736844, F1 Score: 0.3917995444191344
Models trained and metrics saved.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
