#Question 1: Data Structure and Processing Pipeline

In [2]:
#loading diabetes dataset
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [3]:
class DiabetesDataProcessor:
    def __init__(self):
        data = load_diabetes()
        self.df = pd.DataFrame(data.data, columns=data.feature_names)
        self.df['target'] = data.target
        self.scaler = StandardScaler()

    def prepare_data(self, test_size=0.2, random_state=42):
        X = self.df.drop(columns=['target'])
        y = self.df['target']

        X_scaled = self.scaler.fit_transform(X)

        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=test_size, random_state=random_state)

        return X_train, X_test, y_train, y_test

    def get_feature_stats(self):
        return self.df.describe()

In [4]:
processor = DiabetesDataProcessor()
X_train, X_test, y_train, y_test = processor.prepare_data()
print(processor.get_feature_stats())

                age           sex           bmi            bp            s1  \
count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   
mean  -2.511817e-19  1.230790e-17 -2.245564e-16 -4.797570e-17 -1.381499e-17   
std    4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   
min   -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123988e-01 -1.267807e-01   
25%   -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665608e-02 -3.424784e-02   
50%    5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670422e-03 -4.320866e-03   
75%    3.807591e-02  5.068012e-02  3.124802e-02  3.564379e-02  2.835801e-02   
max    1.107267e-01  5.068012e-02  1.705552e-01  1.320436e-01  1.539137e-01   

                 s2            s3            s4            s5            s6  \
count  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02  4.420000e+02   
mean   3.918434e-17 -5.777179e-18 -9.042540e-18  9.293722e-17  1.130318e-17   
std    4.761905e-02  4.761905e-02  4.761905e-02  4.

#Question 2: Experiment Tracking and Model Development

In [6]:
#installing ML Flow
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Colle

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [10]:
class DiabetesExperiment:
    def __init__(self, data_processor):
        self.data_processor = data_processor
        self.models = {
            'Logistic Regression': LogisticRegression(),
            'Random Forest': RandomForestClassifier()
        }

    def run_experiment(self, X_train, X_test, y_train, y_test):
        for model_name, model in self.models.items():
            with mlflow.start_run(run_name=model_name):

                model.fit(X_train, y_train)

                cv_scores = cross_val_score(model, X_train, y_train, cv=5)

                y_pred = model.predict(X_test)

                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='macro')
                recall = recall_score(y_test, y_pred, average='macro')

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)
                mlflow.log_metric("cv_accuracy_mean", np.mean(cv_scores))

                mlflow.sklearn.log_model(model, model_name)

In [11]:
experiment = DiabetesExperiment(processor)
experiment.run_experiment(X_train, X_test, y_train, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#Question 3: Model Optimization and Testing

In [12]:
from joblib import dump, load
import unittest

In [13]:
class DiabetesModelOptimizer:
    def __init__(self, experiment):
        self.experiment = experiment

    def save_model(self, model, filename="logistic_regression_model.joblib"):
        dump(model, filename)
        print(f"Model saved to {filename}")

    def load_model(self, filename="logistic_regression_model.joblib"):
        return load(filename)

    def run_tests(self):
        class DiabetesModelTest(unittest.TestCase):
            def test_model_saving_and_loading(self):
                model = self.experiment.models['Logistic Regression']
                optimizer = DiabetesModelOptimizer(self.experiment)
                optimizer.save_model(model)
                loaded_model = optimizer.load_model()
                self.assertIsNotNone(loaded_model)
                self.assertEqual(type(model), type(loaded_model))

        unittest.main(argv=[''], exit=False)

In [14]:
optimizer = DiabetesModelOptimizer(experiment)
optimizer.save_model(experiment.models['Logistic Regression'])
optimizer.run_tests()


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK


Model saved to logistic_regression_model.joblib
