# Classification Template
v1.0

### Imports

In [1824]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("whitegrid")

RANDOM_STATE = 101

### Read Functions

In [1825]:
def show_data(df):
    """
    Display the first two and the last two records of a DataFrame.

    When the DataFrame has four rows or fewer, the head and tail slices
    would overlap and repeat rows, so the whole DataFrame is printed instead.
    """
    if len(df) <= 4:
        print(df)
        return

    print(pd.concat([df.head(2), df.tail(2)]))


def show_missing_data(df):
    """
    Print, for every column, the number of missing values and the
    percentage of rows they represent, columns with most gaps first
    """
    null_mask = df.isnull()
    missing_count = null_mask.sum()
    missing_share = (missing_count / null_mask.count()) * 100

    total = missing_count.sort_values(ascending=False)
    percent = missing_share.sort_values(ascending=False).round(2)

    summary = pd.concat([total, percent], axis=1, keys=["# missing", "% missing"])
    print(summary)


def show_unique_values(df, fields):
    """
    Print the distinct values of each listed field of a DataFrame.
    Fields that are not columns of `df` are reported and skipped.
    """
    for name in fields:
        try:
            column = df[name]
        except KeyError:
            print(f"`{name}` not found in DataFrame")
            continue
        print(f"{name}: {column.unique()}")

### Evaluation Functions

In [1826]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def evaluate_model(classifier, X_test, y_test):
    """
    Predict on `X_test` with a fitted classifier and print, in order, the
    classification report, the confusion matrix and the accuracy score
    computed against `y_test`
    """
    predictions = classifier.predict(X_test)

    print("1) classification_report:\n\n", classification_report(y_test, predictions))
    print("2) confusion_matrix:\n\n", confusion_matrix(y_test, predictions), "\n")
    print("3) accuracy_score:\n\n", accuracy_score(y_test, predictions))

In [1827]:
def evaluate_model_kfold(classifiers, _X_train, _X_train_scaled, y_train, models_to_scale, cv=10):
    """
    Cross-validate every classifier and print a ranking table.

    - `classifiers`: estimators to evaluate
    - `_X_train`: unscaled training features
    - `_X_train_scaled`: scaled training features
    - `y_train`: training target
    - `models_to_scale`: class names (e.g. "SVC") of the estimators that must
      be evaluated on `_X_train_scaled`; all others use `_X_train`
    - `cv`: forwarded to `cross_val_score`

    Each estimator is scored with `cross_val_score` using its default scorer.
    The mean and standard deviation of the fold scores are printed as
    percentages, best mean first. Returns None.
    """
    # Imported locally: no cell of this notebook imports cross_val_score, so
    # relying on a global name fails with NameError on a fresh kernel.
    from sklearn.model_selection import cross_val_score

    results = []

    for classifier in classifiers:
        model_name = type(classifier).__name__
        X_train = _X_train_scaled if model_name in models_to_scale else _X_train

        accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=cv)
        results.append({
            "Model": model_name,
            "Accuracy": accuracies.mean() * 100,
            "SD": accuracies.std() * 100
        })

    # Sort the results by accuracy in descending order
    sorted_results = sorted(results, key=lambda x: x["Accuracy"], reverse=True)

    # Print the results in a table format
    print(f"{'Model':<25} {'Accuracy %':<12} {'SD %':<10}")
    print('-' * 43)
    for result in sorted_results:
        print(f"{result['Model']:<25} {result['Accuracy']:<12.2f} {result['SD']:<10.2f}")


In [1828]:
from sklearn.model_selection import GridSearchCV


def evaluate_model_grid_search_cv(
    classifier, X_train, y_train, params, scoring="accuracy", cv=10, n_jobs=-1
):
    """
    Run an exhaustive grid search over `params` for `classifier` and print
    the best cross-validated score (as a percentage) with its parameters.

    `scoring`, `cv` and `n_jobs` are forwarded unchanged to GridSearchCV.
    """
    search_settings = {
        "estimator": classifier,
        "param_grid": params,
        "scoring": scoring,
        "cv": cv,
        "n_jobs": n_jobs,
    }
    searcher = GridSearchCV(**search_settings)
    searcher.fit(X=X_train, y=y_train)

    top_score = searcher.best_score_
    top_params = searcher.best_params_

    print(f"Model: {type(classifier).__name__}")
    print(f" - Best accuracy: {top_score * 100:,.2f}%")
    print(f" - Best params: {top_params}")

### Write Functions

In [1829]:
from sklearn.impute import SimpleImputer


def update_null_values(df, fields, strategy, fill_value=np.nan):
    """
    Return a copy of `df` whose NaN values in `fields` are imputed.

    `strategy`:  {'constant', 'most_frequent', 'mean', 'median'}
    `fill_value` is only used when `strategy` = "constant"

    On a ValueError the error is printed and the original `df` is returned.
    """
    try:
        imputer = SimpleImputer(
            strategy=strategy, fill_value=fill_value, missing_values=np.nan
        )
        imputed = df.copy()
        # Fitting and transforming happen on the same columns, so one call does both
        imputed[fields] = imputer.fit_transform(df[fields])
    except ValueError as error:
        print(f"❌ Error: {error}")
        return df
    return imputed

In [1830]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


def encode_categorical_data(df, fields, encoder):
    """
    Encode the categorical columns `fields` of `df` and return a new DataFrame.

    `encoder`:
    - "OneHot": one binary column per category, every category kept
      (OneHotEncoder). Encoded columns come first, untouched columns after.
    - "Dummy": one binary column per category, first category dropped
      (pandas get_dummies). Encoded columns are appended after the others.
    - "Label": each field is replaced in place by integer codes 0..N-1
      (LabelEncoder).

    If the encoder name is unknown or a field is not a column of `df`, a
    message is printed and the original `df` is returned unchanged.
    """
    if encoder not in ("OneHot", "Dummy", "Label"):
        print(f"encoder `{encoder}` not found")
        return df

    # Validate up front so that every encoder reports a missing field the same
    # way (ColumnTransformer raises ValueError, pandas raises KeyError).
    requested = [fields] if isinstance(fields, str) else list(fields)
    missing_fields = [field for field in requested if field not in df.columns]
    if missing_fields:
        print(f"❌ Error: {missing_fields} not found in DataFrame")
        return df

    try:
        if encoder == "OneHot":
            # Create a ColumnTransformer, applying OneHotEncoder to specified fields
            ct = ColumnTransformer(
                transformers=[("encoder", OneHotEncoder(), fields)],
                remainder="passthrough",
                # Force a dense result: OneHotEncoder emits a sparse matrix by
                # default, and a sparse stacked output cannot be wrapped in a
                # DataFrame with the column names built below.
                sparse_threshold=0,
            )
            # Apply ColumnTransformer, resulting in an array
            transformed_data = ct.fit_transform(df)
            # Create new column names for the one-hot encoded columns
            encoded_columns = ct.named_transformers_["encoder"].get_feature_names_out(
                fields
            )
            # Combine the new column names with the non-transformed columns
            non_transformed_columns = [col for col in df.columns if col not in fields]
            new_column_names = list(encoded_columns) + non_transformed_columns
            # Stacking mixed-type columns yields an object array; infer_objects
            # gives numeric columns a numeric dtype again.
            df_transformed = pd.DataFrame(
                transformed_data, columns=new_column_names, index=df.index
            ).infer_objects()

        elif encoder == "Dummy":
            # Create dummy variables
            dummies = pd.get_dummies(df[fields], drop_first=True)
            # Drop the original fields and concatenate the dummy variables
            df_transformed = pd.concat([df.drop(fields, axis=1), dummies], axis=1)

        else:  # "Label"
            df_transformed = df.copy()
            # update original target fields with 0-N categorical values
            for field in fields:
                le = LabelEncoder()
                df_transformed[field] = le.fit_transform(df_transformed[field])

        return df_transformed
    except KeyError as e:
        print(f"❌ Error: {e}")
        return df


In [1831]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()


def scale_features(X_train, X_test, fields):
    """
    Standardise the columns `fields` of the train and test sets.

    - Only for non-dummy numerical features
    - For KNN, SVM or Logistic Reg/Linear Reg/NN with Gradient descent optimisation
    - For classification, no need to scale dependent variable

    The module-level scaler `sc` is fitted on the training columns and then
    applied to the test columns. Copies are returned; if anything fails the
    error is printed and the original, unscaled inputs are returned.
    """
    try:
        # Work on copies so the caller's DataFrames stay untouched
        train_scaled = X_train.copy()
        test_scaled = X_test.copy()

        # Fit on train only, then reuse the fitted statistics for test
        train_scaled[fields] = sc.fit_transform(X_train[fields])
        test_scaled[fields] = sc.transform(X_test[fields])
    except Exception as error:
        print(f"❌ Error: {error}")
        return X_train, X_test

    return train_scaled, test_scaled

### Classification Model Functions

@TODO: include random_state in all models

In [1832]:
from sklearn.linear_model import LogisticRegression


def fit_logistic_regression(
    X_train, y_train, c=1.0, solver="lbfgs", penalty="l2", max_iter=1000
):
    """
    Fit a LogisticRegression seeded with RANDOM_STATE and return it.
    `c` is passed to the estimator as its `C` hyperparameter.
    """
    hyperparams = {
        "C": c,
        "penalty": penalty,
        "solver": solver,
        "max_iter": max_iter,
        "random_state": RANDOM_STATE,
    }
    model = LogisticRegression(**hyperparams)
    model.fit(X_train, y_train)
    return model

In [1833]:
from sklearn.neighbors import KNeighborsClassifier


def fit_knn(
    X_train,
    y_train,
    n_neighbors=5,
    weights="uniform",
    algorithm="auto",
    p=2,
    leaf_size=30,
):
    """
    Fit a KNeighborsClassifier with the given hyperparameters and return it
    """
    hyperparams = {
        "n_neighbors": n_neighbors,
        "weights": weights,
        "algorithm": algorithm,
        "p": p,
        "leaf_size": leaf_size,
    }
    model = KNeighborsClassifier(**hyperparams)
    model.fit(X_train, y_train)
    return model

In [1834]:
from sklearn.svm import SVC


def fit_svc(X_train, y_train, kernel="rbf", c=1.0, gamma="scale", degree=3):
    """
    Fit an SVC seeded with RANDOM_STATE and return it.
    `c` is passed to the estimator as its `C` hyperparameter.
    """
    model = SVC(
        C=c,
        kernel=kernel,
        degree=degree,
        gamma=gamma,
        random_state=RANDOM_STATE,
    )
    model.fit(X_train, y_train)
    return model

In [1835]:
from sklearn.naive_bayes import GaussianNB


def fit_nb(X_train, y_train, var_smoothing=1e-9):
    """
    Fit a GaussianNB with the given `var_smoothing` and return it
    """
    model = GaussianNB(var_smoothing=var_smoothing)
    model.fit(X_train, y_train)
    return model

In [1836]:
from sklearn.tree import DecisionTreeClassifier


# TODO: default criterion: gini vs. entropy
def fit_decision_tree(
    X_train,
    y_train,
    criterion="entropy",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    max_leaf_nodes=None,
    splitter="best",
):
    """
    Fit a DecisionTreeClassifier seeded with RANDOM_STATE and return it.
    Every keyword is forwarded to the estimator under the same name.
    """
    hyperparams = {
        "criterion": criterion,
        "splitter": splitter,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "random_state": RANDOM_STATE,
    }
    model = DecisionTreeClassifier(**hyperparams)
    model.fit(X_train, y_train)
    return model

In [1837]:
from sklearn.ensemble import RandomForestClassifier


# TODO: default criterion: gini vs. entropy
def fit_random_forest(
    X_train,
    y_train,
    n_estimators=100,
    criterion="entropy",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    class_weight=None,
):
    """
    Fit a RandomForestClassifier seeded with RANDOM_STATE and return it.
    Every keyword is forwarded to the estimator under the same name.
    """
    hyperparams = {
        "n_estimators": n_estimators,
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
        "bootstrap": bootstrap,
        "class_weight": class_weight,
        "random_state": RANDOM_STATE,
    }
    model = RandomForestClassifier(**hyperparams)
    model.fit(X_train, y_train)
    return model

In [1838]:
from xgboost import XGBClassifier


def fit_xgboost(
    X_train,
    y_train,
    learning_rate=0.3,
    n_estimators=100,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
):
    """
    Fit an XGBClassifier seeded with RANDOM_STATE and return it.

    `eval_metric` is fixed to "logloss" and `use_label_encoder` to False;
    the remaining keywords are forwarded under the same name.
    """
    fixed_settings = {
        "random_state": RANDOM_STATE,
        "use_label_encoder": False,
        "eval_metric": "logloss",
    }
    tunable_settings = {
        "learning_rate": learning_rate,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_child_weight": min_child_weight,
        "gamma": gamma,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
    }
    model = XGBClassifier(**fixed_settings, **tunable_settings)
    model.fit(X_train, y_train)
    return model

In [1839]:
from catboost import CatBoostClassifier


def fit_catboost(
    X_train,
    y_train,
    eval_metric="Logloss",
    learning_rate=0.009,
    depth=6,
    l2_leaf_reg=3,
    iterations=1000,
    border_count=254,
    bootstrap_type="MVS",
    subsample=0.8,
):
    """
    Fit a CatBoostClassifier seeded with RANDOM_STATE and return it.

    Training output is silenced (`verbose=0`); the remaining keywords are
    forwarded under the same name.
    """
    fixed_settings = {"random_state": RANDOM_STATE, "verbose": 0}
    tunable_settings = {
        "eval_metric": eval_metric,
        "learning_rate": learning_rate,
        "depth": depth,
        "l2_leaf_reg": l2_leaf_reg,
        "iterations": iterations,
        "border_count": border_count,
        "bootstrap_type": bootstrap_type,
        "subsample": subsample,
    }
    model = CatBoostClassifier(**fixed_settings, **tunable_settings)
    model.fit(X_train, y_train)
    return model

### Data Collection

In [1840]:
df = pd.read_csv('../src/v1/07_scikit-learn/filez/titanic_train.csv')
show_data(df)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch     Ticket     Fare Cabin Embarked  
0        0  A/5 21171   7.2500   NaN        S  
1        0   PC 17599  71.2833   C85        C  
889      0     111369  30.0000  C148        C  
890      0     370376   7.7500   NaN        Q  


- field_1: description_1. Explanation.
- field_2: description_2. Explanation.
- field_3: description_3. Explanation.
- field_4: description_4 (0 = No, 1 = Yes). Explanation.

### Exploratory Data Analysis

In [1841]:
# Display DataFrame info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [1842]:
# Display number and percentage of missing values
show_missing_data(df)

             # missing  % missing
Cabin              687      77.10
Age                177      19.87
Embarked             2       0.22
PassengerId          0       0.00
Survived             0       0.00
Pclass               0       0.00
Name                 0       0.00
Sex                  0       0.00
SibSp                0       0.00
Parch                0       0.00
Ticket               0       0.00
Fare                 0       0.00


In [1843]:
# Describe statistics on numerical fields
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [1844]:
# show unique values given a list of df fields
show_unique_values(df, ['Sex', 'Embarked'])

Sex: ['male' 'female']
Embarked: ['S' 'C' 'Q' nan]


In [1845]:
# show count for each value of field 'Embarked'
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [1846]:
df.groupby('Pclass')['Age'].mean()

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [1847]:
#@TODO: seaborn charts

### Data cleaning & Preprocessing

    TO-BE-REMOVED
- remove or update null values
- manage outliers
- drop irrelevant fields (i.e.: ids, names, ..)
- correct data entry errors

In [1848]:
# Update null values
# NOTE(review): both imputations below use the full dataset, before the
# train/test split further down - confirm this is acceptable for evaluation.
# 1) Age: fill each missing age with the mean age of the passenger's Pclass
mean_age_per_class = df.groupby('Pclass')['Age'].transform('mean')
df['Age'] = df['Age'].fillna(mean_age_per_class)
# 2) Embarked: fill the missing ports with the constant "S"
#    (the most frequent value in the value_counts output above)
df = update_null_values(df=df, strategy="constant", fields=["Embarked"], fill_value="S")

# Remove unnecessary fields (identifiers, free text, and the mostly-empty Cabin)
df = df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

### Encoding categorical data

In [1849]:
# OneHot encoder - binary values / keep all values
# df = encode_categorical_data(df=df, fields=['Embarked', 'Sex'], encoder='OneHot')
# Dummy encoder - binary values / remove first value
df = encode_categorical_data(df=df, fields=['Embarked', 'Sex'], encoder='Dummy')
# Label encoder - integer values 0-N / keep all values
# df = encode_categorical_data(df=df, fields=['Embarked', 'Sex'], encoder='Label')
df.head(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
0,0,3,22.0,1,0,7.25,False,True,True
1,1,1,38.0,1,0,71.2833,False,False,False


### Splitting dataset into Train/Test set

In [1850]:
from sklearn.model_selection import train_test_split

# Features: every column except the target; target: the "Survived" column
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Hold out 20% of the rows as test set; seeded for reproducibility.
# NOTE(review): the split is not stratified on y - consider `stratify=y`
# if the class balance should be preserved in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

### Feature scaling

In [1851]:
fields_to_scale = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

X_train_scaled, X_test_scaled = scale_features(
    X_train=X_train, X_test=X_test, fields=fields_to_scale
)

### Fit Models

In [1852]:
# LogReg requires scaling to avoid ConvergenceWarning issue with kfold
# KNN & SVC require scaling
lr = fit_logistic_regression(X_train_scaled, y_train)
knn = fit_knn(X_train_scaled, y_train)
svc = fit_svc(X_train_scaled, y_train)
nb = fit_nb(X_train, y_train)
dt = fit_decision_tree(X_train, y_train)
rf = fit_random_forest(X_train, y_train)
xgb = fit_xgboost(X_train, y_train)
cat = fit_catboost(X_train, y_train)

### Evaluate Models

In [1853]:
models = [lr, knn, svc, nb, dt, rf, xgb, cat]
models_to_scale = ["KNeighborsClassifier", "SVC", "LogisticRegression"]
evaluate_model_kfold(models, X_train, X_train_scaled, y_train, models_to_scale)

Model                     Accuracy %   SD %      
-------------------------------------------
CatBoostClassifier        81.75        5.41      
SVC                       81.60        4.82      
RandomForestClassifier    80.33        4.79      
LogisticRegression        80.21        4.86      
XGBClassifier             80.20        3.80      
KNeighborsClassifier      78.37        3.76      
GaussianNB                78.10        5.46      
DecisionTreeClassifier    76.41        4.84      


### Tune Models
*hyperparameters*

In [1854]:
# Logistic Regression

params = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],
    "solver": ["lbfgs", "newton-cg", "sag"],
    "max_iter": [1000],
}

# Feature scaling to avoid ConvergenceWarning issue
evaluate_model_grid_search_cv(lr, X_train_scaled, y_train, params)

Model: LogisticRegression
 - Best accuracy: 80.35%
 - Best params: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}


In [1855]:
# KNN

params = {
    "n_neighbors": [3, 5, 7, 10],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "p": [1, 2],
    "leaf_size": [20, 30, 35, 40, 45],
}

evaluate_model_grid_search_cv(knn, X_train_scaled, y_train, params)

Model: KNeighborsClassifier
 - Best accuracy: 80.90%
 - Best params: {'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}


In [1856]:
# SVC

params = {
    "C": [0.1, 1, 10, 100],
    "kernel": ["rbf", "poly"], # linear
    "gamma": ["scale", "auto", 0.1, 1],
    "degree": [2, 3, 4],
}

evaluate_model_grid_search_cv(svc, X_train_scaled, y_train, params)

"""
Further options:
- with 'poly' and 'sigmoid' kernels, try 'coef0' ranges -> [0, 1] or [0, 10])
- with 'poly' kernel, try more granular 'degree' ranges
- when dataset is imbalanced (i.e., unequal number of instances in each class),
  try 'class_weight' param
"""

Model: SVC
 - Best accuracy: 81.88%
 - Best params: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}


"\nFurther options:\n- with 'poly' and 'sigmoid' kernels, try 'coef0' ranges -> [0, 1] or [0, 10])\n- with 'poly' kernel, try more granular 'degree' ranges\n- when dataset is imbalanced (i.e., unequal number of instances in each class),\n  try 'class_weight' param\n"

In [1857]:
# Gaussian Naive Bayes

params = {"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}

evaluate_model_grid_search_cv(nb, X_train, y_train, params)

Model: GaussianNB
 - Best accuracy: 78.24%
 - Best params: {'var_smoothing': 1e-05}


In [1858]:
# Decision Tree

params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "max_leaf_nodes": [10, 20, 30, None],
    "splitter": ["best", "random"],
}

evaluate_model_grid_search_cv(dt, X_train, y_train, params)

Model: DecisionTreeClassifier
 - Best accuracy: 82.73%
 - Best params: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


In [1859]:
# Random Forest

params = {
    "n_estimators": [50, 100, 200],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 20], # 30, None
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 4], # 2
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
    "class_weight": [None], # "balanced"
}

evaluate_model_grid_search_cv(rf, X_train, y_train, params)

Model: RandomForestClassifier
 - Best accuracy: 83.01%
 - Best params: {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [1860]:
# XGBoost

params = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3],
    "n_estimators": [100, 150, 200],
    "max_depth": [3, 5], #default is 6
    "min_child_weight": [1, 3, 5],
    "gamma": [0, 0.1, 0.2],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.8, 0.9, 1.0],
}

evaluate_model_grid_search_cv(xgb, X_train, y_train, params)

Model: XGBClassifier
 - Best accuracy: 84.41%
 - Best params: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.8}


In [1861]:
# CatBoost

params = {
    "eval_metric": ["Logloss"],
    "learning_rate": [0.01, 0.1, 0.2],
    "depth": [6, 8, 10],
    "l2_leaf_reg": [1, 3, 5],
    "iterations": [100, 200],
    "border_count": [32, 64, 128],
    "bootstrap_type": ["Bernoulli"],
    "subsample": [0.7, 0.9],
}

evaluate_model_grid_search_cv(cat, X_train, y_train, params)

Model: CatBoostClassifier
 - Best accuracy: 82.87%
 - Best params: {'bootstrap_type': 'Bernoulli', 'border_count': 32, 'depth': 6, 'eval_metric': 'Logloss', 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.1, 'subsample': 0.7}


### Re-evaluate Models with tuned parameters

In [1869]:
# Refit every model with the best hyperparameters reported by the grid
# searches above. Keyword arguments keep each value tied to its parameter.
lr = fit_logistic_regression(
    X_train_scaled, y_train, c=0.1, solver="lbfgs", penalty="l2", max_iter=1000
)
knn = fit_knn(
    X_train_scaled,
    y_train,
    n_neighbors=10,
    weights="uniform",
    algorithm="auto",
    p=2,
    leaf_size=20,
)
svc = fit_svc(X_train_scaled, y_train, kernel="poly", c=10, gamma="scale", degree=2)
nb = fit_nb(X_train, y_train, var_smoothing=1e-05)
dt = fit_decision_tree(
    X_train,
    y_train,
    criterion="entropy",
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="log2",
    max_leaf_nodes=30,
    splitter="best",
)
rf = fit_random_forest(
    X_train,
    y_train,
    n_estimators=50,
    criterion="entropy",
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    class_weight=None,
)
xgb = fit_xgboost(
    X_train,
    y_train,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
)
cat = fit_catboost(
    X_train,
    y_train,
    eval_metric="Logloss",
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=5,
    iterations=200,
    border_count=32,
    bootstrap_type="Bernoulli",
    subsample=0.7,
)

# Same k-fold comparison as before, now on the tuned models
models_to_scale = ["KNeighborsClassifier", "SVC", "LogisticRegression"]
models = [lr, knn, svc, nb, dt, rf, xgb, cat]
evaluate_model_kfold(models, X_train, X_train_scaled, y_train, models_to_scale)

Model                     Accuracy %   SD %      
-------------------------------------------
XGBClassifier             84.41        4.94      
RandomForestClassifier    83.01        4.72      
CatBoostClassifier        82.87        4.92      
DecisionTreeClassifier    82.73        3.10      
SVC                       81.88        4.65      
KNeighborsClassifier      80.90        4.56      
LogisticRegression        80.35        4.07      
GaussianNB                78.24        4.78      


### Predictions

In [1863]:
# Values to predict
new_data = pd.DataFrame(
    [
        [2, 23, 0, 0, 13, False, True, True],
        [1, 51, 0, 0, 26.5, False, True, True],
        [3, 29, 0, 0, 9.5, False, True, True],
        [1, 40, 1, 1, 134.5, False, False, True],
        [2, 6, 0, 1, 33, False, True, False],
        [3, 19, 0, 0, 14.5, False, True, True],
        [3, 32, 0, 0, 56.4958, False, True, True],
        [1, 41, 0, 0, 134.5, False, False, False],
        [1, 44, 0, 1, 57.9792, False, False, False],
        [3, 29.699118, 8, 2, 69.5500, False, True, False],
    ],
    columns=[
        "Pclass",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked_Q",
        "Embarked_S",
        "Sex_male",
    ],
)

In [1864]:
def make_prediction(
    new_data: pd.DataFrame, model: object, scaled: bool, fields_to_scale: list
):
    """
    Predict the class of every row of `new_data` and print the result.

    - `new_data`: features to predict, with the same columns as the training set
    - `model`: fitted classifier exposing `predict`
    - `scaled`: True when `model` was trained on scaled features; the columns
      in `fields_to_scale` are then transformed with the module-level scaler
      `sc`, which must already be fitted (see `scale_features`)
    - `fields_to_scale`: columns to scale; ignored when `scaled` is False

    Prints the predictions followed by the model class name. Returns None.
    """
    if scaled and fields_to_scale:
        # Scale on a copy and assign back by column name. This keeps the
        # column order of `new_data` (rebuilding the frame as "scaled columns
        # + remaining columns" would reorder the features whenever
        # `fields_to_scale` is not a leading prefix) and leaves the caller's
        # DataFrame untouched.
        new_data_prepared = new_data.copy()
        new_data_prepared[fields_to_scale] = sc.transform(new_data[fields_to_scale])
    else:
        new_data_prepared = new_data

    new_prediction = model.predict(new_data_prepared)

    print(f"{new_prediction} {type(model).__name__}")

# Reference row: the first five and last five training labels.
# NOTE(review): presumably the rows of `new_data` mirror those training
# samples, which is what makes this row comparable - confirm.
print(f"[{' '.join(map(str, y_train.head(5).tolist() + y_train.tail(5).tolist()))}] y_train")
# Tree-based models were fitted on unscaled features, so no scaling here
make_prediction(new_data, xgb, False, fields_to_scale)
make_prediction(new_data, rf, False, fields_to_scale)
make_prediction(new_data, cat, False, fields_to_scale)
# SVC was fitted on X_train_scaled, so its input must be scaled as well
make_prediction(new_data, svc, True, fields_to_scale)

[0 1 1 1 1 0 1 1 1 0] y_train
[0 0 1 1 1 0 1 1 1 0] XGBClassifier
[0 1 0 0 1 0 0 1 1 0] RandomForestClassifier
[0 0 0 0 1 0 1 1 1 0] CatBoostClassifier
[0 0 0 0 1 0 0 1 1 0] SVC


TODO: assign a new DataFrame after every transformation step (see the naming suggestions below), but beware of the extra memory each copy requires.

1. **Initial Stages**:
   - `df_raw`: The original, unmodified dataset.
   - `df_loaded`: Data after initial loading, possibly from multiple sources.

2. **Cleaning and Preprocessing**:
   - `df_cleaned`: After basic cleaning (removing duplicates, handling missing values).
   - `df_filtered`: Data after filtering based on certain criteria.
   - `df_imputed`: Where missing values have been imputed.
   - `df_deduped`: After removing duplicates.

3. **Feature Engineering**:
   - `df_engineered`: After feature engineering (new features created).
   - `df_transformed`: After applying transformations (log, square root, etc.).
   - `df_normalized`: If the data has been normalized.
   - `df_standardized`: If the data has been standardized.

4. **Encoding and Formatting**:
   - `df_encoded`: After encoding categorical variables (one-hot, label encoding).
   - `df_binned`: After binning continuous variables.
   - `df_pivoted`: If data has been pivoted or reshaped.
   - `df_aggregated`: After aggregation operations (group by, etc.).

5. **Splitting**:
   - `df_train`: Training set.
   - `df_test`: Test set.
   - `df_validate`: Validation set.

6. **Modeling**:
   - `df_predictions`: Contains model predictions.
   - `df_residuals`: Residuals from model predictions.
   - `df_analyzed`: DataFrames used for deeper analysis post-modeling.

7. **Results and Export**:
   - `df_results`: Final results or outputs.
   - `df_export`: Data ready to be exported to a file or database.

8. **Special Cases**:
   - `df_merged`: After merging with another DataFrame.
   - `df_joined`: After joining with another DataFrame.
   - `df_sampled`: If a sample has been taken from the data.
   - `df_segmented`: If the data has been segmented (e.g., by customer type).

Each name corresponds to a common data processing or analysis task and makes it easier to track the purpose of each DataFrame in your workflow. Remember, these are just examples, and the actual names should align with the specific operations and logic of your project.