In [None]:
# Import necessary libraries
!pip install mlflow
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import mlflow
import mlflow.sklearn

# Question 1: Data Structure and Processing Pipeline
class IrisDataProcessor:
    """Load the Iris dataset and turn it into scaled train/test splits.

    Attributes:
        data: The object returned by ``load_iris()``.
        df: DataFrame holding the feature columns plus a ``target`` column.
        scaler: ``StandardScaler`` instance; fitted by :meth:`prepare_data`.
    """

    def __init__(self):
        """Load the dataset and build the working DataFrame."""
        self.data = load_iris()
        # A DataFrame keeps the feature names attached to the columns.
        self.df = pd.DataFrame(self.data.data, columns=self.data.feature_names)
        self.df['target'] = self.data.target
        self.scaler = StandardScaler()

    def prepare_data(self, test_size=0.2, random_state=42):
        """Split the data, then standardise the features.

        The scaler is fitted on the training rows only and then applied to
        the test rows. Fitting it on the full dataset before splitting would
        leak test-set statistics (mean / variance) into the training features.

        Args:
            test_size: Passed to ``train_test_split``. Defaults to 0.2.
            random_state: Seed passed to ``train_test_split``. Defaults to 42.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` where the ``X_*``
            values are the scaled feature arrays produced by the scaler and
            the ``y_*`` values are the matching slices of the ``target``
            column.
        """
        features = self.df[self.data.feature_names]
        labels = self.df['target']
        # Split first so the test rows never influence the scaler.
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        return X_train, X_test, y_train, y_test

    def get_feature_stats(self):
        """Return ``describe()`` summary statistics for every column of ``df``."""
        return self.df.describe()

Collecting mlflow
  Downloading mlflow-2.17.2-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.2 (from mlflow)
  Downloading mlflow_skinny-2.17.2-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.2->mlflow)
  Downloading databricks_sdk-0.36.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Colle

In [None]:
# Question 2: Experiment Tracking and Model Development
class IrisExperiment:
    """Train candidate classifiers on the Iris splits and track them in MLflow.

    Attributes:
        X_train, X_test, y_train, y_test: Splits returned by
            ``data_processor.prepare_data()``.
        results: Mapping ``{model name: {metric name: value}}`` filled by
            :meth:`run_experiment`; empty until it has been called.
    """

    def __init__(self, data_processor):
        """Fetch the splits from ``data_processor`` and select the MLflow experiment.

        Args:
            data_processor: Object whose ``prepare_data()`` returns
                ``(X_train, X_test, y_train, y_test)``.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = data_processor.prepare_data()
        self.results = {}
        mlflow.set_experiment("Iris Classification Experiment")

    def run_experiment(self):
        """Cross-validate, fit, evaluate and log each candidate model.

        One MLflow run is opened per model. Each run logs the model name as a
        parameter, four metrics (mean CV accuracy, test accuracy,
        macro precision, macro recall) and the fitted model itself.

        Returns:
            The ``results`` mapping, also stored on the instance.
        """
        models = {
            "Logistic Regression": LogisticRegression(),
            # Seeded so repeated runs log comparable metrics.
            "Random Forest": RandomForestClassifier(random_state=42),
        }
        self.results = {}
        for model_name, model in models.items():
            with mlflow.start_run(run_name=model_name):
                # Cross-validation on the training split only.
                cv_scores = cross_val_score(model, self.X_train, self.y_train, cv=5)
                model.fit(self.X_train, self.y_train)
                predictions = model.predict(self.X_test)

                # Calculate metrics on the held-out test split.
                metrics = {
                    "CV Accuracy": float(np.mean(cv_scores)),
                    "Test Accuracy": accuracy_score(self.y_test, predictions),
                    "Precision": precision_score(self.y_test, predictions, average='macro'),
                    "Recall": recall_score(self.y_test, predictions, average='macro'),
                }

                # Log parameters and metrics with MLflow.
                mlflow.log_param("Model Type", model_name)
                for metric_name, metric_value in metrics.items():
                    mlflow.log_metric(metric_name, metric_value)

                # Log the fitted model as a run artifact.
                mlflow.sklearn.log_model(model, model_name)

                self.results[model_name] = metrics
        return self.results

    def log_results(self):
        """Print the stored per-model metrics, then a confirmation message.

        If :meth:`run_experiment` has not been called, only the confirmation
        message is printed.
        """
        # getattr keeps this usable on instances created without __init__.
        for model_name, metrics in getattr(self, "results", {}).items():
            summary = ", ".join(f"{name}: {value:.3f}" for name, value in metrics.items())
            print(f"{model_name} - {summary}")
        print("Experiment results have been logged in MLflow.")

In [None]:
# Question 3: Model Optimization and Testing
class IrisModelOptimizer:
    """Fit a classifier on an experiment's splits and sanity-check its accuracy.

    Attributes:
        model: The estimator being fitted and tested.
        X_train, X_test, y_train, y_test: Splits copied from ``experiment``.
    """

    def __init__(self, experiment, model=None):
        """Copy the data splits from ``experiment`` and set up the model.

        Args:
            experiment: Object exposing ``X_train``, ``X_test``, ``y_train``
                and ``y_test`` attributes.
            model: Optional estimator to use; a fresh ``LogisticRegression()``
                is created when omitted.
        """
        self.model = LogisticRegression() if model is None else model
        self.X_train = experiment.X_train
        self.X_test = experiment.X_test
        self.y_train = experiment.y_train
        self.y_test = experiment.y_test

    def _ensure_fitted(self):
        """Fit the model on the training split if it has not been fitted yet."""
        # Fitted scikit-learn classifiers expose ``classes_``; its absence is
        # used here as the "not fitted" signal.
        if not hasattr(self.model, "classes_"):
            self.model.fit(self.X_train, self.y_train)

    def quantize_model(self):
        """Fit the model and report completion.

        NOTE: this is a placeholder. It only (re)fits the model on the
        training split; no weights are actually converted to a lower
        precision. Real quantization would need an export path such as
        ONNX or TensorFlow Lite.
        """
        self.model.fit(self.X_train, self.y_train)
        print("Model quantization complete.")

    def run_tests(self, min_accuracy=0.7):
        """Check that test-set accuracy exceeds ``min_accuracy``.

        The model is fitted first if needed, so this method no longer
        depends on :meth:`quantize_model` having been called beforehand.

        Args:
            min_accuracy: Exclusive lower bound on accuracy. Defaults to 0.7.

        Returns:
            The accuracy measured on the test split.

        Raises:
            AssertionError: If accuracy is not above ``min_accuracy``.
        """
        self._ensure_fitted()
        predictions = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, predictions)
        assert accuracy > min_accuracy, f"Accuracy should be above {min_accuracy:.0%}"
        print("Unit test passed with accuracy:", accuracy)
        return accuracy