In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import mlflow
import mlflow.sklearn
import pandas as pd
import joblib
import os
import json

def separate_features_target(df, target_column):
    """Split a DataFrame into a feature matrix and a target vector.

    Args:
        df: DataFrame holding both the feature columns and the target.
        target_column: Label of the column to predict.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``df`` without ``target_column``
        and ``y`` is ``df[target_column]``. ``df`` itself is not modified.
    """
    features = df.drop(columns=target_column)
    target = df[target_column]
    return features, target

def identify_features(X):
    """Partition the columns of ``X`` into numerical and categorical groups.

    The selection is by dtype family rather than by exact dtype, so that
    columns such as ``int32``, ``float32`` or ``category`` are not left out
    of both groups (and therefore never reach the model).

    Args:
        X: Feature DataFrame.

    Returns:
        A ``(numerical_features, categorical_features)`` tuple of column
        indexes, each preserving the column order of ``X``.
    """
    # 'number' covers every integer/float width; timedelta is excluded
    # explicitly because pandas treats it as a numeric subtype.
    numerical_features = X.select_dtypes(
        include=['number'], exclude=['timedelta']
    ).columns
    categorical_features = X.select_dtypes(
        include=['object', 'category']
    ).columns
    return numerical_features, categorical_features

def handle_duplicates(df):
    """Return a copy of ``df`` with fully duplicated rows removed.

    Args:
        df: Input DataFrame; it is left unmodified.

    Returns:
        A new DataFrame as produced by ``DataFrame.drop_duplicates()``
        called with its default arguments.
    """
    return df.drop_duplicates()

def create_preprocessor(numerical_features, categorical_features):
    """Build the column-wise preprocessing stage of the model pipeline.

    Numerical columns are median-imputed and then standardised.
    Categorical columns are imputed with the most frequent value and then
    one-hot encoded; categories not seen during fitting are ignored at
    transform time (``handle_unknown='ignore'``).

    Args:
        numerical_features: Column labels routed to the numerical branch.
        categorical_features: Column labels routed to the categorical branch.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``'num'`` and a ``'cat'``
        branch. No ``remainder`` is configured, so the scikit-learn default
        applies to columns listed in neither argument.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    branches = [
        ('num', Pipeline(steps=numeric_steps), numerical_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ]
    return ColumnTransformer(transformers=branches)

def create_classifier():
    """Return the default estimator: an unfitted ``RandomForestClassifier``.

    The forest is built with scikit-learn's default hyper-parameters; no
    ``random_state`` is set here.
    """
    default_model = RandomForestClassifier()
    return default_model

def create_pipeline(preprocessor, classifier):
    """Chain a preprocessor and a classifier into one scikit-learn Pipeline.

    Args:
        preprocessor: Transformer applied first (step name ``'preprocessor'``).
        classifier: Final estimator (step name ``'classifier'``).

    Returns:
        An unfitted ``Pipeline`` made of those two steps, in that order.
    """
    stages = [
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
    return Pipeline(steps=stages)

def train_pipeline(pipeline, X_train, y_train):
    """Fit ``pipeline`` in place on the training data.

    Args:
        pipeline: Object exposing ``fit(X, y)``.
        X_train: Training features.
        y_train: Training targets.

    Returns:
        None. The fitted state lives on ``pipeline`` itself.
    """
    pipeline.fit(X_train, y_train)
    return None

def evaluate_pipeline(pipeline, X_test, y_test):
    """Score ``pipeline`` on held-out data, print the score and return it.

    Args:
        pipeline: Fitted object exposing ``score(X, y)``.
        X_test: Held-out features.
        y_test: Held-out targets.

    Returns:
        Whatever ``pipeline.score`` returns (mean accuracy for scikit-learn
        classifiers).
    """
    score = pipeline.score(X_test, y_test)
    print(f'Model Accuracy: {score}')
    return score

def save_pipeline(pipeline, file_path):
    """Persist ``pipeline`` to ``file_path`` with joblib.

    Args:
        pipeline: Object to serialise (typically a fitted Pipeline).
        file_path: Destination file passed to ``joblib.dump``.
    """
    joblib.dump(value=pipeline, filename=file_path)

def load_pipeline(file_path):
    """Load an object previously written with ``joblib.dump``.

    NOTE(review): joblib files are pickle-based, so only load files that
    come from a trusted source.

    Args:
        file_path: Path of the joblib file to read.

    Returns:
        The deserialised object.
    """
    restored = joblib.load(filename=file_path)
    return restored

def predict_with_pipeline(pipeline, new_data):
    """Run ``pipeline.predict`` on ``new_data`` and return the predictions.

    Args:
        pipeline: Fitted object exposing ``predict(X)``.
        new_data: Samples to predict, in the layout the pipeline was fitted on.

    Returns:
        The value returned by ``pipeline.predict(new_data)``.
    """
    predictions = pipeline.predict(new_data)
    return predictions


def save_metrics(metrics, folder_path):
    """Write ``metrics`` to ``<folder_path>/metrics.json`` as indented JSON.

    Args:
        metrics: JSON-serialisable object, e.g. a dict of metric names to
            values.
        folder_path: Directory that receives ``metrics.json``. It is created
            (including parents) when it does not exist yet. An empty string
            means the current working directory.

    Raises:
        TypeError: If ``metrics`` contains a value ``json`` cannot serialise.
        OSError: If the directory or the file cannot be created.
    """
    # os.makedirs('') raises, while os.path.join('', name) already resolves
    # to the current directory, so only create non-empty paths.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    target = os.path.join(folder_path, 'metrics.json')
    # Explicit encoding keeps the file identical across platforms.
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=4)


def evaluate_model(pipeline, X_test, y_test):
    """Compute a bundle of evaluation metrics for a fitted pipeline.

    NOTE(review): MAE, MSE and R-squared are regression metrics applied
    here to predicted class labels, so they require numeric labels and are
    of limited meaning for classification. Confirm they are wanted.

    Args:
        pipeline: Fitted object exposing ``predict`` and ``score``.
        X_test: Held-out features.
        y_test: Held-out true targets.

    Returns:
        A 6-tuple ``(accuracy, report, conf_matrix, mae, mse, r_squared)``:
        the value of ``pipeline.score``, the text classification report,
        the confusion matrix, and the three regression-style metrics.
    """
    predicted = pipeline.predict(X_test)
    truth_and_prediction = (y_test, predicted)

    return (
        pipeline.score(X_test, y_test),
        classification_report(*truth_and_prediction),
        confusion_matrix(*truth_and_prediction),
        mean_absolute_error(*truth_and_prediction),
        mean_squared_error(*truth_and_prediction),
        r2_score(*truth_and_prediction),
    )


def train_and_log_with_mlflow(file_path, target_column, classifier=None):
    """Train a classification pipeline on a CSV file and record it in MLflow.

    The CSV is read, duplicate rows are dropped, the data is split 80/20
    (``random_state=42``), a preprocessing + classifier pipeline is fitted on
    the training part and evaluated on the held-out part. One MLflow run is
    created that stores the run parameters, the evaluation metrics, the
    model, and a ``metrics.json`` artifact. The results are also printed.

    Args:
        file_path: Path of the CSV file containing features and target.
        target_column: Name of the column to predict.
        classifier: Optional unfitted estimator used as the final pipeline
            step. Defaults to the estimator from ``create_classifier()``.

    Returns:
        The accuracy on the held-out split, as returned by ``evaluate_model``.
    """
    # Local import: only needed to clear a stale local export directory.
    import shutil

    # Read the dataset and perform initial data processing
    df = handle_duplicates(pd.read_csv(file_path))
    X, y = separate_features_target(df, target_column)
    numerical_features, categorical_features = identify_features(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Create and train the pipeline
    preprocessor = create_preprocessor(numerical_features, categorical_features)
    if classifier is None:
        classifier = create_classifier()
    pipeline = create_pipeline(preprocessor, classifier)
    train_pipeline(pipeline, X_train, y_train)

    # Evaluate the pipeline
    accuracy, report, conf_matrix, mae, mse, r_squared = evaluate_model(
        pipeline, X_test, y_test
    )

    # Plain floats/lists so the summary serialises regardless of the numeric
    # types the metric functions return.
    metrics_dict = {
        'accuracy': float(accuracy),
        'confusion_matrix': conf_matrix.tolist(),
        'mean_absolute_error': float(mae),
        'mean_squared_error': float(mse),
        'r_squared': float(r_squared),
    }

    # Log parameters, metrics and model with MLflow
    with mlflow.start_run():
        # Params describe the run configuration only.
        mlflow.log_params({
            "classifier": classifier.__class__.__name__,
            "timestamp": pd.Timestamp.now().strftime('%Y%m%d%H%M%S'),
        })

        # Evaluation results are metrics, not params: metrics stay numeric
        # in the tracking store, whereas params are stored as strings.
        mlflow.log_metrics({
            "accuracy": float(accuracy),
            "mae": float(mae),
            "mse": float(mse),
            "r_squared": float(r_squared),
        })

        # Log the model as a run artifact
        mlflow.sklearn.log_model(pipeline, "ML_pipeline_with_Mlflow")

        # Also export the model to the local "model" directory. save_model
        # refuses to write into an existing non-empty directory, so a
        # previous export is removed first to keep repeated runs working.
        shutil.rmtree("model", ignore_errors=True)
        mlflow.sklearn.save_model(pipeline, "model")

        # Store the full summary (including the confusion matrix) as a JSON
        # artifact; param values are length-limited and not meant for blobs.
        mlflow.log_dict(metrics_dict, "metrics.json")

    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    print("Confusion Matrix:")
    print(conf_matrix)
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared: {r_squared:.4f}")

    return accuracy

# Example usage. Guarded so that importing this module does not trigger a
# training run; executed as a script or in a notebook cell it still runs.
if __name__ == "__main__":
    file_path = "train_data.csv"
    target_column = "cid"  # Change to your target column
    train_and_log_with_mlflow(file_path, target_column)



Accuracy: 0.8808
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       327
           1       0.83      0.62      0.71       101

    accuracy                           0.88       428
   macro avg       0.86      0.79      0.82       428
weighted avg       0.88      0.88      0.87       428

Confusion Matrix:
[[314  13]
 [ 38  63]]
Mean Absolute Error: 0.1192
Mean Squared Error: 0.1192
R-squared: 0.3391


0.8808411214953271