## `Dependencies`

Requirements

In [21]:
# Uncomment to write the currently installed package versions to requirements.txt
#pip freeze > requirements.txt

In [22]:
# Uncomment to install the packages listed in requirements.txt
#pip install -r requirements.txt

Imports

In [23]:
# stdlib
import os

# utils
from colorama import Fore, Back, Style, init
import threadpoolctl
import joblib

import numpy as np
import pandas as pd
import scipy
import matplotlib
import sklearn
# sklearn
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler,
    OrdinalEncoder, OneHotEncoder, TargetEncoder
)
from sklearn.feature_selection import (
    VarianceThreshold,
    SelectKBest,
    SelectPercentile,
    f_classif,
    f_regression,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# xgboost
#from xgboost import XGBClassifier, XGBRegressor

In [24]:
# Report the version of each core library, one per line.
print("\n".join(
    f"{label} version: {module.__version__}"
    for label, module in (
        ("NumPy", np),
        ("Pandas", pd),
        ("Scipy", scipy),
        ("Joblib", joblib),
        ("Matplotlib", matplotlib),
        ("Sklearn", sklearn),
    )
))

NumPy version: 1.26.2
Pandas version: 2.1.4
Scipy version: 1.11.4
Joblib version: 1.3.2
Matplotlib version: 3.8.2
Sklearn version: 1.3.2


## `Config`

In [25]:
# Kind of learning problem; the evaluation-metrics cell branches on this value
task = "Classification"

In [26]:
# Model to tune; it is used as the final step of the pipeline built below
estimator = LogisticRegression()

In [27]:
# Hyper-parameter candidates for the estimator step of the pipeline.
# Keys are written as "<step name>__<parameter name>".
estimator_grid = dict(
    logisticregression__C=[0.01, 0.1, 1.0, 10.0],
    logisticregression__fit_intercept=[True, False],
)

In [28]:
# Metric used to refit the grid search and to rank the CV results
primary_metric = "recall"

## `Data Ingestion`

In [29]:
# Load data
# The data directory can be overridden with the TITANIC_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original
# location remains the default.
path = os.environ.get(
    "TITANIC_DATA_DIR",
    "/Users/mlprof/Documents/GitHub/machine-learning-ready/04 ML Pipelines/Titanic/",
)
# Guarantee a trailing separator so any f"{path}<file>" usage keeps working
path = os.path.join(path, "")

data = pd.read_csv(os.path.join(path, "train.csv"))
submission = pd.read_csv(os.path.join(path, "test.csv"))

In [30]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
data.info()
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


None

In [31]:
# Preview the first three rows of the training and submission frames
display(data.head(3), submission.head(3))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [32]:
# Summary statistics of the training frame
display(data.describe())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## `Data Pre-Processing`

#### `Data Engineering`

In [33]:
# Drop the columns that are not used as features, from both frames.
# Cabin has missing values in both sets and Fare has one in the submission set;
# Name and Ticket have no missing values but are dropped as well.
cols_to_drop = ["Name", "Fare", "Ticket", "Cabin"]

data, submission = [
    frame.drop(columns=cols_to_drop) for frame in (data, submission)
]

In [34]:
# Drop remaining rows with NaN in train set
# Keep only the training rows where every required column is present
required_cols = ["Age", "Embarked"]
data = data.dropna(axis="index", how="any", subset=required_cols)

In [35]:
# Define X and y
# Separate the target from the features; PassengerId is left out of X as well
target_col = "Survived"
y = data[target_col]
X = data.drop(columns=["PassengerId", target_col])

In [36]:
# Train-Test Split
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable
split_options = {"test_size": 0.10, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### `Feature Engineering`

In [37]:
# Start with no transformers; they are registered in a later cell.
# Columns not claimed by any transformer are passed through unchanged.
preprocessor = ColumnTransformer([], remainder="passthrough")

In [38]:
# Group the training columns by dtype so each group can get its own transformer
categorical_frame = X_train.select_dtypes(include=["object", "category"])
numerical_frame = X_train.select_dtypes(include=["int64", "float64"])

categorical_cols = list(categorical_frame.columns)
numerical_cols = list(numerical_frame.columns)

print(f"Categorical Columns: {categorical_cols}")
print(f"Numerical Columns: {numerical_cols}")



Categorical Columns: ['Sex', 'Embarked']
Numerical Columns: ['Pclass', 'Age', 'SibSp', 'Parch']


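For `LogisticRegression`, `SVC` and `KNeighborsClassifier` estimators: min-max scale the numerical columns and one-hot encode the categorical columns.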

In [39]:
# Register the preprocessing steps used for the listed classifier types.
if isinstance(estimator, (LogisticRegression, SVC, KNeighborsClassifier)):

    classifier_transformers = [
        # Numerical columns: scaler
        ("min_max_scaler", MinMaxScaler(), numerical_cols),
        # Categorical columns: one hot encoder
        ("one_hot_encoder", OneHotEncoder(), categorical_cols),
    ]
    registered_names = {name for name, _, _ in classifier_transformers}

    # Replace same-named entries left by a previous run of this cell instead of
    # appending blindly: duplicate transformer names make the ColumnTransformer
    # fail at fit time, so the cell must be safe to re-run. Entries registered
    # under other names are kept.
    preprocessor.transformers = [
        entry
        for entry in preprocessor.transformers
        if entry[0] not in registered_names
    ] + classifier_transformers

## `Data Modeling`

Pipeline

In [40]:
# Show the configured ColumnTransformer before it is placed in the pipeline
print(preprocessor)

ColumnTransformer(remainder='passthrough',
                  transformers=[('min_max_scaler', MinMaxScaler(),
                                 ['Pclass', 'Age', 'SibSp', 'Parch']),
                                ('one_hot_encoder', OneHotEncoder(),
                                 ['Sex', 'Embarked'])])


In [41]:
# Chain preprocessing and the estimator into a single pipeline.
# NOTE(review): the keys in estimator_grid use the "logisticregression__" prefix,
# which relies on make_pipeline naming the step after the estimator class —
# confirm the grid keys whenever the estimator is swapped.
pipeline = make_pipeline(preprocessor, estimator)

Parameter Grid

In [42]:
# Search space: no transformer parameters are tuned, so the grid is a shallow
# copy of the custom estimator parameters.
param_grid = {**estimator_grid}

display(param_grid)

{'logisticregression__C': [0.01, 0.1, 1.0, 10.0],
 'logisticregression__fit_intercept': [True, False]}

Evaluation Metrics

In [43]:
# ----------------------------------------
# Classification
# ----------------------------------------

if task == "Classification":
    # Scorers evaluated by the grid search; primary_metric must be one of them
    metrics = ["precision", "recall"]

    if primary_metric not in metrics:
        message = f""" 
        Error: **{primary_metric}** is not a valid metric for {task} tasks.
        Valid metrics are: {metrics}
        """
        # ValueError is still an Exception, so existing handlers keep working
        raise ValueError(message)

# ----------------------------------------
# Regression (not implemented yet) and any other task
# ----------------------------------------

else:
    # `metrics` is only defined for classification. Fail here with a clear
    # message instead of letting the grid-search cell hit a NameError or reuse
    # a stale `metrics` from an earlier run.
    raise NotImplementedError(
        f"No evaluation metrics are defined for task {task!r}; "
        "only 'Classification' is currently supported."
    )


GridSearch

In [44]:
# Exhaustive search over param_grid with 3-fold CV. Every metric is scored, and
# the final model is refit using the primary metric.
search_options = {
    "param_grid": param_grid,
    "cv": 3,
    "scoring": metrics,
    "refit": primary_metric,
    "n_jobs": -1,
}
grid = GridSearchCV(pipeline, **search_options)

Training

In [45]:
# Run the grid search on the training split only (X_test / y_test stay held out)
grid.fit(X_train, y_train)

In [46]:
# Report the best CV score for the refit metric and the parameters that achieved it
print(f"Best parameters (CV Score={grid.best_score_:.3f}):", grid.best_params_)

Best parameters (CV Score=0.717): {'logisticregression__C': 10.0, 'logisticregression__fit_intercept': True}


In [47]:
# Show the best pipeline found by the search
display(grid.best_estimator_)

CV Results

In [48]:
# Tabulate every parameter combination, best rank on the primary metric first
cv_results = pd.DataFrame(grid.cv_results_)
rank_column = f"rank_test_{primary_metric}"
display(cv_results.sort_values(by=rank_column))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_logisticregression__fit_intercept,params,split0_test_precision,split1_test_precision,split2_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_recall,split1_test_recall,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall
6,0.008902,0.002413,0.003241,0.000126,10.0,True,"{'logisticregression__C': 10.0, 'logisticregre...",0.712644,0.810127,0.819444,0.780738,0.0483,3,0.72093,0.744186,0.686047,0.717054,0.023893,1
7,0.007009,0.003299,0.003254,0.000388,10.0,False,"{'logisticregression__C': 10.0, 'logisticregre...",0.704545,0.810127,0.819444,0.778039,0.052107,4,0.72093,0.744186,0.686047,0.717054,0.023893,1
4,0.005377,0.000624,0.008543,0.006631,1.0,True,"{'logisticregression__C': 1.0, 'logisticregres...",0.688889,0.790123,0.785714,0.754909,0.046718,5,0.72093,0.744186,0.639535,0.70155,0.044868,3
5,0.006347,0.001887,0.005312,0.002437,1.0,False,"{'logisticregression__C': 1.0, 'logisticregres...",0.681319,0.802469,0.776119,0.753302,0.052024,6,0.72093,0.755814,0.604651,0.693798,0.064625,4
2,0.011366,0.002646,0.003991,0.000482,0.1,True,"{'logisticregression__C': 0.1, 'logisticregres...",0.705882,0.765432,0.785714,0.752343,0.03388,7,0.697674,0.72093,0.639535,0.686047,0.034232,5
3,0.013659,0.00431,0.010599,0.003911,0.1,False,"{'logisticregression__C': 0.1, 'logisticregres...",0.705882,0.765432,0.785714,0.752343,0.03388,7,0.697674,0.72093,0.639535,0.686047,0.034232,5
1,0.008841,0.005794,0.011917,0.010942,0.01,False,"{'logisticregression__C': 0.01, 'logisticregre...",0.852459,0.928571,0.9375,0.906177,0.038159,2,0.604651,0.604651,0.523256,0.577519,0.03837,7
0,0.010222,0.00724,0.005805,0.003408,0.01,True,"{'logisticregression__C': 0.01, 'logisticregre...",0.852459,0.945455,0.934783,0.910899,0.041552,1,0.604651,0.604651,0.5,0.569767,0.049333,8
