# Preprocess test

# In [1]:


from titanic_survival_package.data_preprocessing import load_dataset, fill_missing_values, preprocess_test_data

def test_load_dataset():
    """
    Checks that `load_dataset` hands back a populated DataFrame.

    The function is called with the path 'train.csv' and the test fails when the
    returned DataFrame reports itself as empty.
    """
    loaded = load_dataset('train.csv')

    # `.empty` is True when the frame holds no data at all
    is_empty = loaded.empty
    assert not is_empty, "Failed to load training data. The DataFrame is empty."

def test_fill_missing_values():
    """
    Checks that `fill_missing_values` leaves no null entries behind.

    The training data is loaded, a copy of it is passed to `fill_missing_values`,
    and the largest per-column null count of the result is required to be zero.
    """
    raw_train = load_dataset('train.csv')

    # Hand over a copy so the freshly loaded frame itself is not passed to the function
    filled = fill_missing_values(raw_train.copy())

    # Count nulls per column, then look at the worst column only
    nulls_per_column = filled.isnull().sum()
    worst_column = nulls_per_column.max()
    assert worst_column == 0, "Missing values were not properly handled."

def test_preprocess_test_data():
    """
    Checks that `preprocess_test_data` returns test data free of null entries.

    Both 'train.csv' and 'test.csv' are loaded; a copy of the test frame is passed to
    `preprocess_test_data` together with the training frame. Any columns that still
    contain nulls are printed for diagnosis, and the test fails unless the largest
    per-column null count is zero.
    """
    reference_df = load_dataset('train.csv')
    raw_test = load_dataset('test.csv')

    # The training frame is supplied as the second argument alongside a copy of the test frame
    processed = preprocess_test_data(raw_test.copy(), reference_df)

    # Per-column null counts; show only the columns that still have gaps
    null_counts = processed.isnull().sum()
    offenders = null_counts[null_counts > 0]
    print(offenders)

    assert null_counts.max() == 0, "Test data still contains missing values after preprocessing."


# Feature Engineering test

# In [2]:
import pandas as pd
from titanic_survival_package.feature_engineering import create_family_size_feature, extract_and_process_titles, add_interaction_terms


def test_create_family_size_feature():
    """
    Checks that `create_family_size_feature` adds a 'FamilySize' column.

    'train.csv' is read with pandas, a copy is passed to `create_family_size_feature`,
    and the returned frame must expose a column named 'FamilySize'. Only the presence
    of the column is verified here, not its values.
    """
    source_df = pd.read_csv('train.csv')  # Adjust path as needed

    # Pass a copy so the frame read from disk is not the object handed over
    result = create_family_size_feature(source_df.copy())

    has_family_size = 'FamilySize' in result.columns
    assert has_family_size, "FamilySize feature not created."

def test_extract_and_process_titles():
    """
    Checks that `extract_and_process_titles` yields a frame with a 'Title' column.

    'train.csv' is read with pandas, a copy is passed to `extract_and_process_titles`,
    and the returned frame must expose a column named 'Title'. Only the presence of
    the column is verified here, not how titles are derived or encoded.
    """
    source_df = pd.read_csv('train.csv')  # Adjust path as needed

    # Pass a copy so the frame read from disk is not the object handed over
    result = extract_and_process_titles(source_df.copy())

    has_title = 'Title' in result.columns
    assert has_title, "Title feature not created or processed."

def test_add_interaction_terms():
    """
    Checks that `add_interaction_terms` adds an 'Age_Pclass' column.

    'train.csv' is read with pandas, a copy is passed to `add_interaction_terms`,
    and the returned frame must expose a column named 'Age_Pclass'. Only the presence
    of the column is verified here, not its values.
    """
    source_df = pd.read_csv('train.csv')  # Adjust path as needed

    # Pass a copy so the frame read from disk is not the object handed over
    result = add_interaction_terms(source_df.copy())

    has_interaction = 'Age_Pclass' in result.columns
    assert has_interaction, "Interaction term 'Age_Pclass' not created."


# Hyperparameter Tuning test

# In [3]:

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

def tune_hyperparameters_grid(model, X, y, parameters, *, cv=5, scoring='accuracy'):
    """
    Performs hyperparameter tuning using Grid Search Cross Validation. This method exhaustively
    searches through all specified parameter combinations in 'parameters'.

    Parameters:
    - model (estimator): The machine learning model/estimator for which hyperparameters are to be optimized.
    - X (array-like): Feature dataset used for training the model.
    - y (array-like): Target values corresponding to 'X'.
    - parameters (dict): Dictionary with parameters names (str) as keys and lists of parameter settings to try as values.
    - cv (optional, keyword-only): Cross-validation setting forwarded unchanged to GridSearchCV.
      Defaults to 5, the value this function previously hard-coded.
    - scoring (optional, keyword-only): Scoring setting forwarded unchanged to GridSearchCV.
      Defaults to 'accuracy', the value this function previously hard-coded.

    Returns:
    - dict: The best parameter setting found on the given data.
    - float: Mean cross-validated score of the best_estimator.
    """
    grid_search = GridSearchCV(model, parameters, cv=cv, scoring=scoring)
    grid_search.fit(X, y)
    # Both values are returned together as a (best_params, best_score) tuple
    return grid_search.best_params_, grid_search.best_score_

def tune_hyperparameters_random(model, X, y, parameters, n_iter=10, *, cv=5, scoring='accuracy', random_state=42):
    """
    Performs hyperparameter tuning using Randomized Search Cross Validation. This method samples
    'n_iter' parameter settings from the specified 'parameters' distributions.

    Parameters:
    - model (estimator): The machine learning model/estimator for which hyperparameters are to be optimized.
    - X (array-like): Feature dataset used for training the model.
    - y (array-like): Target values corresponding to 'X'.
    - parameters (dict): Dictionary where keys are parameter names and values are distributions or lists of parameters to sample.
    - n_iter (int, optional): Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
    - cv (optional, keyword-only): Cross-validation setting forwarded unchanged to RandomizedSearchCV.
      Defaults to 5, the value this function previously hard-coded.
    - scoring (optional, keyword-only): Scoring setting forwarded unchanged to RandomizedSearchCV.
      Defaults to 'accuracy', the value this function previously hard-coded.
    - random_state (optional, keyword-only): Seed setting forwarded unchanged to RandomizedSearchCV.
      Defaults to 42, the value this function previously hard-coded.

    Returns:
    - dict: The best parameter setting found on the given data.
    - float: Mean cross-validated score of the best_estimator.
    """
    random_search = RandomizedSearchCV(
        model,
        parameters,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        random_state=random_state,
    )
    random_search.fit(X, y)
    # Both values are returned together as a (best_params, best_score) tuple
    return random_search.best_params_, random_search.best_score_


# Models test

# In [4]:
from titanic_survival_package.models import ModelTrainer, ModelEvaluator
from sklearn.ensemble import RandomForestClassifier
import numpy as np


def test_model_training_and_evaluation():
    """
    Exercises ModelTrainer and ModelEvaluator with a RandomForestClassifier.

    A ModelTrainer is built from a seeded RandomForestClassifier and a dictionary of
    hyperparameter settings, trained on the first 100 training rows, and the resulting
    model is scored by ModelEvaluator.evaluate_model on the first 20 validation rows.
    The only property asserted is that the returned accuracy is a floating-point
    number (`float` or `np.float64`); its magnitude is not checked.

    NOTE: X_train, y_train, X_val and y_val are not defined inside this function.
    They must already exist in the enclosing scope, otherwise the call raises NameError.
    """
    # Hyperparameter settings handed to ModelTrainer; each value is wrapped in a one-element list
    tuned_settings = {
        'n_estimators': [100],  # Example: Number of trees in the forest
        'max_depth': [None],  # Example: Maximum depth of the trees
        'min_samples_split': [2],  # Example: Minimum number of samples required to split an internal node
        'min_samples_leaf': [1]  # Example: Minimum number of samples required to be at a leaf node
    }

    # Fixed seed on the forest, as in the rest of this file
    forest = RandomForestClassifier(random_state=42)
    trainer = ModelTrainer(forest, tuned_settings)

    # Small slices keep the run quick: 100 rows for fitting, 20 for scoring
    fitted = trainer.train(X_train[:100], y_train[:100])
    score = ModelEvaluator.evaluate_model(fitted, X_val[:20], y_val[:20])

    # Accept either a NumPy float64 or a built-in float
    assert isinstance(score, (np.float64, float)), "Model evaluation did not return accuracy as a floating-point number."

    print("Model training and evaluation test passed.")
