# Step 1: Install mlflow and required libraries. 


# Step 2: Load Data and Perform EDA
You would start by loading the Titanic dataset into a pandas DataFrame and perform basic exploratory data analysis (EDA). This step involves cleaning the data, handling missing values, dropping redundant columns, and encoding categorical variables.

In [15]:
import os
import mlflow
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data
data = pd.read_csv('D:\Post Graduate Program IIITH in Software Engg for Data Science\Part 2 IIITH Software Engg for Data Science\MLflow Exp Titanic\Titanic Dataset.csv')



In [16]:
# Perform EDA and preprocessing (hypothetical functions)
# Assuming 'data' is your pandas DataFrame
columns_to_drop = ['Name', 'Ticket','Cabin','Age'] # Columns Dropped
data = data.drop(columns_to_drop, axis=1)



# Checking for missing values in the dataset
missing_values = data.isnull().sum()
missing_values_percentage = (missing_values / len(data)) * 100

# Displaying the count and percentage of missing values in each column
missing_values_summary = pd.DataFrame({'Missing Values': missing_values, 'Percentage (%)': missing_values_percentage})
missing_values_summary.sort_values(by="Missing Values", ascending=False)


Unnamed: 0,Missing Values,Percentage (%)
Embarked,2,0.224467
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0
Fare,0,0.0


# Step 3: Build and Train Classification Model
Now, build and train a classification model using scikit-learn:

In [14]:
# Assuming 'data' is your DataFrame
encoder = LabelEncoder()
data['Sex'] = encoder.fit_transform(data['Sex'])
data['Embarked'] = encoder.fit_transform(data['Embarked'])



# Now, split the data again into features (X) and target (y)
X = data.drop('Survived', axis=1)
y = data['Survived']

scaler = StandardScaler()
X_train = scaler.fit_transform(data[['PassengerId', 'Pclass', 'Sex', 'Embarked']])



# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, you can fit the model
model = RandomForestClassifier()
model.fit(X_train, y_train)


RandomForestClassifier()

In [17]:
#Train Model with different Parameters
# Assuming 'data' is your DataFrame
encoder = LabelEncoder()
data['Sex'] = encoder.fit_transform(data['Sex'])
data['Embarked'] = encoder.fit_transform(data['Embarked'])



# Now, split the data again into features (X) and target (y)
X = data.drop('Survived', axis=1)
y = data['Survived']

scaler = StandardScaler()
X_train = scaler.fit_transform(data[['PassengerId', 'Pclass', 'Sex', 'Embarked']])



# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Now, you can fit the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [18]:
num_imputer = SimpleImputer(strategy='mean')  # or 'mean'
X_train = pd.DataFrame(num_imputer.fit_transform(X_train), columns=X_train.columns)

cat_imputer = SimpleImputer(strategy='most_frequent')
X_train = pd.DataFrame(cat_imputer.fit_transform(X_train), columns=X_train.columns)

np.isnan(X_train).any()
np.isinf(X_train).any()
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)




  mode = stats.mode(array)


# Step 4: Log Parameters and Metrics using MLflow
Now we can start logging our experiment using MLflow:

In [5]:
# Start an MLflow experiment
mlflow.start_run()

# Log parameters and metrics
mlflow.log_param("n_estimators", 100)

# Instantiate the classifier
clf = RandomForestClassifier()

# Train the classifier
# Make sure X_train and y_train are properly defined and contain your training data
try:
    clf.fit(X_train, y_train)
except Exception as e:
    print("Error during training:", e)
    # Handle or report the error appropriately


# Start an MLflow run
with mlflow.start_run():
    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log parameters (if any)
    # Example: mlflow.log_param("n_estimators", clf.get_params()["n_estimators"])

# Logging the metrics using MLflow
mlflow.log_metrics({'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1})

# Assuming you have a function to calculate metrics
metrics = evaluate_model(clf, X_test, y_test)
mlflow.log_metrics(metrics)

# Log the model
mlflow.sklearn.log_model(clf, "random_forest_classifier")



Exception: Run with UUID bc558e4bb74a45bbbc2b504e0ed925dc is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [11]:
# End the MLflow experiment
mlflow.end_run()

# Step 5: Run and Track the Logged Parameters in the MLflow UI
Run the above script to perform the training and logging. To start the MLflow UI, use the following command in your terminal:
mlflow ui


# Step 6: Modify the Model and Parameters
Now, make some changes to the model or its parameters:

In [19]:
# Start an MLflow experiment
mlflow.start_run()

# Modify the model's parameters
clf = RandomForestClassifier(n_estimators=200, random_state=42)

# Instantiate the classifier
clf = RandomForestClassifier()

# Train the classifier
# Make sure X_train and y_train are properly defined and contain your training data
try:
    clf.fit(X_train, y_train)
except Exception as e:
    print("Error during training:", e)
    # Handle or report the error appropriately


# Start an MLflow run
with mlflow.start_run():
    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log parameters (if any)
    # Example: mlflow.log_param("n_estimators", clf.get_params()["n_estimators"])

# Logging the metrics using MLflow
mlflow.log_metrics({'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1})

# Assuming you have a function to calculate metrics
metrics = evaluate_model(clf, X_test, y_test)
mlflow.log_metrics(metrics)

# Log the model
mlflow.sklearn.log_model(clf, "random_forest_classifier_v2")


Exception: Run with UUID 885d185e42ab48dd8fbcaf4b10be704d is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [20]:
# End the MLflow experiment
mlflow.end_run()

# Step 7: Serve the Model using MLflow
To serve the model using MLflow, you need to first save it in the MLflow Model format:

In [21]:
mlflow.sklearn.save_model(clf, "my_random_forest_model")

MlflowException: Path 'my_random_forest_model' already exists and is not empty