## A Sklearn pipeline
A scikit-learn pipeline for a small dataset that imputes missing values, standardizes the numerical columns, and one-hot encodes the categorical columns.

In [3]:
"""
This script addresses the problem of classifying music events into 'sold-out' or 'not sold-out' categories 
based on various features such as genre, social media followers, and likes. The pipeline involves data 
preprocessing, including handling missing values, scaling numerical features, and encoding categorical data.
A decision tree classifier is trained on the processed data, and the resulting model is saved for future use.
"""

# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.exceptions import NotFittedError
from sklearn.metrics import precision_score, recall_score

import joblib
from typing import Tuple, Optional 

def create_dataframe() -> pd.DataFrame:
    """
    Build the small in-memory sample dataset used by this script.

    Each row describes one music event. Missing values (np.nan) are included
    on purpose so that the imputation steps of the pipeline have work to do.

    Returns:
        pd.DataFrame: Ten rows with the columns 'Genre',
        'Social_media_followers', 'likes' and the binary target 'Sold_out'.
    """
    genres = ['Rock', 'Metal', 'Bluegrass', 'Rock', np.nan,
              'Rock', 'Rock', np.nan, 'Bluegrass', 'Rock']
    followers = [1000000, 1000000, 2000000, 1310000, 1700000,
                 np.nan, 4100000, 1600000, 2200000, 1000000]
    like_counts = [6000000, 1000000, 5000000, 1610000, 1800000,
                   np.nan, 4800000, 1650000, 2680000, 5000000]
    sold_out_flags = [1, 1, 0, 1, 0, 0, 0, 1, 0, 1]

    # Insertion order of the dict fixes the column order of the frame.
    columns = {
        'Genre': genres,
        'Social_media_followers': followers,
        'likes': like_counts,
        'Sold_out': sold_out_flags,
    }
    return pd.DataFrame(columns)

def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, ColumnTransformer]:
    """
    Split the frame into features/target and build the (unfitted) preprocessor.

    Numerical feature columns are mean-imputed and standardized; categorical
    feature columns are imputed with the most frequent value and one-hot
    encoded. Columns that are neither numerical nor object/category are
    dropped by the transformer (remainder='drop').

    Args:
        df (pd.DataFrame): Input data containing the feature columns and the
            target column 'Sold_out'.

    Returns:
        Tuple: Feature matrix X (df without 'Sold_out'), target vector y
        (the 'Sold_out' column), and the unfitted ColumnTransformer.

    Raises:
        KeyError: If df has no 'Sold_out' column.
    """
    # Separate features and target variable
    X = df.drop(columns='Sold_out')
    y = df['Sold_out']

    # Derive the column lists from X, not df: the target must never be listed
    # as a feature, otherwise the ColumnTransformer would look for a column
    # that X does not contain. 'number' covers integer as well as float
    # features, so integer columns are no longer silently dropped.
    num_cols = X.select_dtypes(include=['number']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Define pipelines for numerical and categorical transformations
    num_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler())
    ])
    cat_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore': categories unseen during fit are encoded
        # as all-zero rows at inference time instead of raising.
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine pipelines using ColumnTransformer
    col_trans = ColumnTransformer([
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ], remainder='drop', n_jobs=-1)

    return X, y, col_trans

def build_pipeline(col_trans: ColumnTransformer) -> Pipeline:
    """
    Chain the given preprocessor with a decision tree classifier.

    Args:
        col_trans (ColumnTransformer): Preprocessing step that is placed in
            front of the classifier.

    Returns:
        Pipeline: Unfitted pipeline consisting of col_trans followed by a
        DecisionTreeClassifier with default settings.
    """
    # make_pipeline names the steps automatically from the estimator classes.
    return make_pipeline(col_trans, DecisionTreeClassifier())

def train_and_evaluate(pipeline: Pipeline, X: pd.DataFrame, y: pd.Series) -> float:
    """
    Train the pipeline on a 70/30 split, report test metrics, and save the model.

    The fitted pipeline is written to 'pipe.joblib' in the current working
    directory. Accuracy, precision and recall on the held-out split are
    printed to stdout.

    Args:
        pipeline (Pipeline): The machine learning pipeline to train and evaluate.
        X (pd.DataFrame): Feature matrix.
        y (pd.Series): Target vector.

    Returns:
        float: Accuracy score on the test set, or -1.0 if splitting, training,
        evaluation or saving failed (the error is printed, not raised).
    """
    try:
        # Split data into training and test sets (fixed seed for a repeatable split)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Train the pipeline
        pipeline.fit(X_train, y_train)

        # Evaluate the model. Predict once and reuse the result for both
        # precision and recall instead of running the pipeline per metric.
        score = pipeline.score(X_test, y_test)
        y_pred = pipeline.predict(X_test)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)

        print(f'Model Accuracy: {score:.2f}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')

        # Save the model
        joblib.dump(pipeline, 'pipe.joblib')
        print("Model saved as 'pipe.joblib'")

        return score

    # NotFittedError must be listed before ValueError because it is a
    # subclass of ValueError. All handlers return the float sentinel -1.0,
    # which compares equal to the previous int -1.
    except NotFittedError as e:
        print(f"Model not fitted yet: {e}")
        return -1.0

    except ValueError as e:
        print(f"ValueError during training: {e}")
        return -1.0

    except Exception as e:
        print(f"An unexpected error occurred during training or evaluation: {e}")
        return -1.0

def load_and_verify_model() -> Optional[Pipeline]:
    """
    Load the model previously saved to 'pipe.joblib'.

    The loaded object is returned as-is; no check of its type or fitted
    state is performed here.

    NOTE: joblib files are pickle-based, so loading one can execute arbitrary
    code. Only load files that this script (or another trusted source) wrote.

    Returns:
        Optional[Pipeline]: The loaded model if successful, or None if the
        file is missing, unreadable, or any other error occurred (the error
        is printed, not raised).
    """
    # Local import keeps this function self-contained; pickle is only needed
    # to name the exception raised for corrupt files.
    import pickle

    try:
        # Attempt to load the model from the joblib file
        model = joblib.load('pipe.joblib')
        print("Loaded model successfully.")
        return model

    except FileNotFoundError as e:
        # Handle case where the file does not exist
        print(f"Model file not found: {e}")
        return None

    except (pickle.UnpicklingError, EOFError) as e:
        # Handle case where the file is corrupt or truncated. The previously
        # referenced joblib.exceptions.InvalidFileException does not exist,
        # so evaluating that clause raised AttributeError instead of
        # handling the error.
        print(f"Invalid model file: {e}")
        return None

    except Exception as e:
        # Catch any other unforeseen errors and print the error message
        print(f"An unexpected error occurred while loading the model: {e}")
        return None

def make_inference(model: Pipeline, new_data: pd.DataFrame) -> np.ndarray:
    """
    Run the trained pipeline on unseen rows and print the predictions.

    Errors are reported on stdout instead of being raised.

    Args:
        model (Pipeline): The trained model pipeline.
        new_data (pd.DataFrame): Rows to predict on.

    Returns:
        np.ndarray: The predictions, or an empty array if prediction failed.
    """
    try:
        predicted = model.predict(new_data)
        print("Predictions:", predicted)
        return predicted
    # Each handler only picks the message; the shared failure exit is below.
    except NotFittedError as err:
        failure_message = f"Model is not fitted: {err}"
    except ValueError as err:
        failure_message = f"ValueError during inference: {err}"
    except Exception as err:
        failure_message = f"An unexpected error occurred during inference: {err}"

    # Reached only when one of the handlers above ran.
    print(failure_message)
    return np.array([])

def main() -> None:
    """
    Run the full workflow: build data, train, save, reload, and predict.

    Steps: create the sample frame, build the preprocessing/classifier
    pipeline, train and evaluate it (which also saves it to disk), reload the
    saved model, score it on the full dataset, and predict on three example
    rows. Any unexpected error is printed rather than raised.
    """
    try:
        # Step 1: Load data
        df = create_dataframe()

        # Step 2: Preprocess data
        X, y, col_trans = preprocess_data(df)

        # Step 3: Build pipeline
        pipeline = build_pipeline(col_trans)

        # Step 4: Train and evaluate model
        score = train_and_evaluate(pipeline, X, y)

        # train_and_evaluate signals failure with a negative sentinel (a real
        # accuracy is never negative). Stop here so that a stale 'pipe.joblib'
        # from an earlier run is not loaded and reported as this run's model.
        if score < 0:
            print("Training failed; skipping model loading and inference.")
            return

        # Step 5: Load and verify saved model
        loaded_pipeline = load_and_verify_model()

        # Step 6: Make inference on new data if the model loaded successfully.
        # Compare against None explicitly: Pipeline defines __len__, so plain
        # truthiness would test the number of steps rather than load success.
        if loaded_pipeline is not None:
            # Example new data for inference
            new_data = pd.DataFrame({
                'Genre': ['Rock', 'Bluegrass', np.nan],
                'Social_media_followers': [1500000, np.nan, 2300000],
                'likes': [1800000, 2100000, np.nan]
            })
            # Scored on the full dataset (training rows included), so this is
            # a sanity check of the reloaded model, not a generalization estimate.
            print("Loaded pipeline verification score:", loaded_pipeline.score(X, y))
            make_inference(loaded_pipeline, new_data)

    except Exception as e:
        # Top-level boundary: report instead of crashing with a traceback.
        print(f"An error occurred in the main function: {e}")

# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Model Accuracy: 1.00
Precision: 1.00
Recall: 1.00
Model saved as 'pipe.joblib'
Loaded model successfully.
Loaded pipeline verification score: 1.0
Predictions: [1 0 0]
