In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
import dagshub

# Repository coordinates are declared once so that the MLflow tracking URI and
# the dagshub.init() call below can never point at two different repositories.
DAGSHUB_OWNER = "santanu211"
DAGSHUB_REPO = "mini_project_mlopps"

# Send every MLflow run logged by this notebook to the repository's
# DagsHub-hosted tracking endpoint.
mlflow.set_tracking_uri(f"https://dagshub.com/{DAGSHUB_OWNER}/{DAGSHUB_REPO}.mlflow")

# NOTE(review): mlflow=True presumably makes dagshub wire up MLflow
# credentials/URI for this repo as well — confirm against the dagshub docs.
dagshub.init(repo_owner=DAGSHUB_OWNER, repo_name=DAGSHUB_REPO, mlflow=True)


In [3]:
# Load data
# NOTE(review): this is an absolute, machine-specific path. It is kept as the
# value so the notebook still runs where it was authored, but it is named so
# that pointing the notebook at another copy of the CSV is a one-line change.
DATA_PATH = "C:/Users/Admin/Music/Student Depression Dataset.csv"
TARGET_COLUMN = "Depression"

raw_df = pd.read_csv(DATA_PATH)

# Preprocessing
# Missing values are replaced with 0 and only numeric columns are kept.
# Built as one chained expression (no inplace mutation) so that re-running
# this cell always starts from the untouched raw frame.
# NOTE(review): fillna(0) also fills missing values in the target column,
# i.e. an unlabeled row would become class 0 — confirm the label has no NaNs.
df = raw_df.fillna(0).select_dtypes(include=['number'])

# Splitting features and labels
# Fail here, with an actionable message, instead of silently producing
# y = None and crashing later inside the training loop.
if TARGET_COLUMN not in df.columns:
    raise ValueError(
        f"Target column '{TARGET_COLUMN}' is missing from the numeric columns of "
        f"{DATA_PATH!r}; it is either absent from the file or not numeric "
        f"(non-numeric columns are dropped during preprocessing)."
    )

X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Define test sizes and algorithms
# Seed for the estimators. Both RandomForestClassifier and
# DecisionTreeClassifier are randomized; without a fixed random_state the
# selected hyperparameters and logged accuracy can change from run to run.
# 42 matches the seed the notebook already passes to train_test_split.
RANDOM_STATE = 42

# Fractions of the data held out for testing; one experiment per value.
test_sizes = [0.2, 0.3]

# Estimator prototypes, keyed by the name used for the MLflow run and for the
# lookup into param_grids below.
algorithms = {
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE)
}

# Hyperparameter grids for tuning
# Keys must match the keys of `algorithms`; each value is the grid handed to
# GridSearchCV for that estimator.
param_grids = {
    'RandomForest': {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]},
    'DecisionTree': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
}


In [6]:
# Start MLflow Tracking
# Run hierarchy: Parent_Run -> one nested run per test size -> one nested run
# per algorithm. Each innermost run logs the best hyperparameters found by
# grid search, the held-out accuracy, and the fitted model.

# The data-loading cell can leave y as None when the target column is not
# found; stop here with a readable message instead of failing deep inside
# scikit-learn after runs have already been opened on the tracking server.
if y is None:
    raise ValueError("Target labels are missing (y is None); cannot train any model.")

mlflow.set_experiment("Test_Size_Experiments")

with mlflow.start_run(run_name="Parent_Run") as parent_run:
    for test_size in test_sizes:
        with mlflow.start_run(run_name=f"Test_Size_{test_size}", nested=True):
            mlflow.log_param("test_size", test_size)

            # Fixed random_state so both algorithms are compared on the same split.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

            for model_name, model in algorithms.items():
                with mlflow.start_run(run_name=model_name, nested=True):
                    # Perform Grid Search (3-fold CV on the training split only)
                    grid_search = GridSearchCV(model, param_grids[model_name], cv=3, scoring='accuracy')
                    grid_search.fit(X_train, y_train)

                    # Get best model and parameters.
                    # GridSearchCV is used with its default refit=True, so
                    # best_estimator_ has already been refitted on the whole
                    # training split; fitting it a second time would only
                    # repeat that work.
                    best_model = grid_search.best_estimator_
                    best_params = grid_search.best_params_

                    # Log best parameters
                    mlflow.log_params(best_params)

                    # Evaluate on the held-out split
                    y_pred = best_model.predict(X_test)

                    # Convert accuracy to a native Python float before logging.
                    # The inputs are passed as-is: squeezing them would turn a
                    # single-row split into a scalar that accuracy_score rejects.
                    accuracy = float(accuracy_score(y_test, y_pred))
                    mlflow.log_metric("accuracy", accuracy)

                    # Log the best trained model
                    mlflow.sklearn.log_model(best_model, artifact_path=f"{model_name}_model")

                    print(f"Test Size: {test_size}, Model: {model_name}, Best Params: {best_params}, Accuracy: {accuracy:.2f}")

# No explicit mlflow.end_run() is needed: every run above is opened with a
# `with` block, which ends the run on exit (including when an error is raised).

2025/03/28 22:42:07 INFO mlflow.tracking.fluent: Experiment with name 'Test_Size_Experiments' does not exist. Creating a new experiment.


Test Size: 0.2, Model: RandomForest, Best Params: {'max_depth': 10, 'n_estimators': 150}, Accuracy: 0.77
🏃 View run RandomForest at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2/runs/d014593988a54808b5e0146c3d33cdb6
🧪 View experiment at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2




Test Size: 0.2, Model: DecisionTree, Best Params: {'max_depth': 10, 'min_samples_split': 2}, Accuracy: 0.75
🏃 View run DecisionTree at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2/runs/e2e01d3e4c9f4563b3a2621fb8106888
🧪 View experiment at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2
🏃 View run Test_Size_0.2 at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2/runs/5a3bca91a74d40cbb7fe5159b167519b
🧪 View experiment at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2




Test Size: 0.3, Model: RandomForest, Best Params: {'max_depth': 10, 'n_estimators': 150}, Accuracy: 0.78
🏃 View run RandomForest at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2/runs/2fd1caa3eec047edb178ff26f13fc9b7
🧪 View experiment at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2




Test Size: 0.3, Model: DecisionTree, Best Params: {'max_depth': 10, 'min_samples_split': 2}, Accuracy: 0.75
🏃 View run DecisionTree at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2/runs/db66667cee134ce4b1d9516192aa5005
🧪 View experiment at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2
🏃 View run Test_Size_0.3 at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2/runs/4d2cf9232160468e83b89bb680c3742a
🧪 View experiment at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2
🏃 View run Parent_Run at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2/runs/f80ab2f9abe947f88223516f635269eb
🧪 View experiment at: https://dagshub.com/santanu211/mini_project_mlopps.mlflow/#/experiments/2
