## `Dependencies`

In [48]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

## `Config`

In [54]:
task = "Classification"

In [44]:
estimator = LogisticRegression()

In [50]:
estimator_grid = {
    "logisticregression__C": [0.01, 0.1, 1.0, 10.0],
    "logisticregression__fit_intercept": [True, False]
}

In [52]:
primary_metric = "mean_squared_error"

## `Data Ingestion`

In [37]:
# Load data
path = "/Users/mlprof/Documents/GitHub/machine-learning-ready/04 ML Pipelines/Titanic/"

data = pd.read_csv("train.csv")
submission = pd.read_csv("test.csv")

In [43]:
display(data.info())
display(submission.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    object 
 4   Age          712 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Embarked     889 non-null    object 
dtypes: float64(1), int64(5), object(2)
memory usage: 62.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Embarked     418 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 23.0+ KB


None

In [42]:
display(data.head(3))
display(submission.head(3))

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,male,22.0,1,0,S
1,2,1,1,female,38.0,1,0,C
2,3,1,3,female,26.0,0,0,S


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked
0,892,3,male,34.5,0,0,Q
1,893,3,female,47.0,1,0,S
2,894,2,male,62.0,0,0,Q


## `Data Pre-Processing`

#### `Data Engineering`

In [40]:
# Drop cols with NaN in train and test set
cols_to_drop = ["Name", "Fare", "Ticket", "Cabin"]

data = data.drop(columns=cols_to_drop)
submission = submission.drop(columns=cols_to_drop)

In [41]:
# Drop remaining rows with NaN in train set
data = data.dropna(subset=["Embarked"])

In [None]:
# Define X and y
X = data.drop(columns=["PassengerId", "Survived"])
y = data.loc[:, "Survived"]

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

#### `Feature Engineering`

for `Classifiers`:

In [47]:
if estimator in [LogisticRegression(), SVC(), KNeighborsClassifier()]:
    
    # ----------------------------------------
    # Numerical columns
    # ----------------------------------------
    numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns

    # Scaler
    standard_scaler = make_column_transformer(
        (StandardScaler(), numerical_cols),
        remainder="passthrough",
        verbose_feature_names_out=True
    )

    # ----------------------------------------
    # Categorical columns
    # ----------------------------------------
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns

    # One Hot Encoder
    one_hot_encoder = make_column_transformer(
        (OneHotEncoder(), categorical_cols),
        remainder="passthrough",
        verbose_feature_names_out=True
    )



## `Data Modeling`

Pipeline

In [101]:
pipeline = make_pipeline(
    standard_scaler,
    one_hot_encoder,  
    estimator  
)

Parameter Grid

In [51]:
# Define transformer params
param_grid = {}

# Add custom estimator params
param_grid.update(estimator_grid)

display(param_grid)

{'logisticregression__C': [0.01, 0.1, 1.0, 10.0],
 'logisticregression__fit_intercept': [True, False]}

Evaluation Metrics

In [58]:
# ----------------------------------------
# Classification
# ----------------------------------------

if task == "Classification":
    metrics = ["precision", "recall"]

    if primary_metric not in metrics:
        message = f""" 
        Error: **{primary_metric}** is not a valid metric for {task} tasks.
        Valid metrics are: {metrics}
        """
        raise Exception(message)
    else:
        pass

# ----------------------------------------
# Regression
# ----------------------------------------


Exception:  
        Error: **mean_squared_error** is not a valid metric for Classification tasks.
        Valid metrics are: ['precision', 'recall']
        