In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# --- 1. Sample Data Creation (mimicking a typical ML dataset) ---
# Build a small synthetic dataset with plain numeric columns, a nominal
# column, an ordinal column, a numeric column containing NaNs, and a binary
# target.
RANDOM_SEED = 42   # fixes every draw from NumPy's global generator
N_SAMPLES = 100    # number of rows in the synthetic dataset
N_MISSING = 20     # number of NaN entries placed in 'feature5_nan_num'

# Seed before the first random call so a fresh "Restart & Run All" rebuilds
# exactly the same DataFrame.
np.random.seed(RANDOM_SEED)

data = {
    'feature1_num': np.random.rand(N_SAMPLES) * 10,
    'feature2_num': np.random.rand(N_SAMPLES) * 5,
    'feature3_cat': np.random.choice(['A', 'B', 'C'], size=N_SAMPLES),
    'feature4_ord': np.random.choice(['Low', 'Medium', 'High'], size=N_SAMPLES, p=[0.2, 0.5, 0.3]),
    # Valid values first, then a trailing run of NaNs.
    'feature5_nan_num': np.concatenate([
        np.random.rand(N_SAMPLES - N_MISSING) * 20,
        np.full(N_MISSING, np.nan),
    ]),
    'target': np.random.choice([0, 1], size=N_SAMPLES)
}
df = pd.DataFrame(data)
df


Unnamed: 0,feature1_num,feature2_num,feature3_cat,feature4_ord,feature5_nan_num,target
0,5.671302,2.195431,B,Medium,1.408754,1
1,5.309978,2.359990,B,Low,2.895803,0
2,1.817809,0.470005,A,Medium,14.102842,1
3,7.801179,1.046817,B,Low,18.636163,1
4,3.064763,0.498534,A,Low,15.141681,1
...,...,...,...,...,...,...
95,8.969257,1.938489,A,Medium,,1
96,0.293798,2.192780,A,Medium,,1
97,0.096265,1.652768,B,Medium,,1
98,0.619111,3.017372,A,Medium,,1


In [7]:
# Spread the NaNs (initially the trailing rows) randomly through the column.
# A shuffled copy is assigned back instead of shuffling `.values` in place:
# the in-place form only works while `.values` is a writable view of the
# frame's data, and fails on the read-only array pandas returns when
# Copy-on-Write is active.
df['feature5_nan_num'] = np.random.permutation(df['feature5_nan_num'].to_numpy())

print("Sample DataFrame head:")
df.head()

Sample DataFrame head:


Unnamed: 0,feature1_num,feature2_num,feature3_cat,feature4_ord,feature5_nan_num,target
0,5.671302,2.195431,B,Medium,,1
1,5.309978,2.35999,B,Low,6.470319,0
2,1.817809,0.470005,A,Medium,18.987471,1
3,7.801179,1.046817,B,Low,,1
4,3.064763,0.498534,A,Low,13.448532,1


In [8]:
# Count the NaN entries in each column before any imputation is applied.
print("\nMissing values before preprocessing:")
df.isna().sum()


Missing values before preprocessing:


feature1_num         0
feature2_num         0
feature3_cat         0
feature4_ord         0
feature5_nan_num    20
target               0
dtype: int64

In [65]:
# Separate the label column (y) from the predictor columns (X)
y = df['target']
X = df.drop(columns='target')

# --- 2. Train-Test Split ---
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions similar in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=46,
)
print(f"\nShape of X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}, y_test: {y_test.shape}")


Shape of X_train: (80, 5), X_test: (20, 5)
Shape of y_train: (80,), y_test: (20,)


In [66]:
# Relative class frequencies of the train and test labels, shown as a pair
tuple(labels.value_counts(normalize=True) for labels in (y_train, y_test))

(target
 1    0.5375
 0    0.4625
 Name: proportion, dtype: float64,
 target
 1    0.55
 0    0.45
 Name: proportion, dtype: float64)

In [67]:
# --- 3. Preprocessing with Scikit-learn ---
# Group the input columns by the kind of preprocessing they receive.
numerical_features = ['feature1_num', 'feature2_num', 'feature5_nan_num']
categorical_features = ['feature3_cat']
# 'feature4_ord' is ordinal and would ideally go through OrdinalEncoder; to
# keep this demo simple it is one-hot encoded with the categorical column.
# In a real scenario:
# ordinal_features = ['feature4_ord']
# ordinal_categories = [['Low', 'Medium', 'High']]  # explicit order for OrdinalEncoder

# Numerical branch: fill missing values with the median (covers the NaNs in
# 'feature5_nan_num'), then scale with StandardScaler.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category
# (none are missing in this example; kept for robustness on new data), then
# one-hot encode, ignoring categories that were not seen during fitting.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# --- 4. Combine Preprocessing Steps using ColumnTransformer ---
# Each (name, transformer, columns) entry routes the listed DataFrame columns
# through its transformer. Columns not listed are passed through unchanged;
# remainder='drop' would discard them instead.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        # The ordinal column is treated as categorical for now.
        ('cat', categorical_pipeline, [*categorical_features, 'feature4_ord']),
    ],
)
# If 'feature4_ord' were handled by OrdinalEncoder, the extra entry would be:
# ('ord', OrdinalEncoder(categories=ordinal_categories), ordinal_features)

In [68]:
# Display the numerical pipeline (median imputer followed by StandardScaler)
numerical_pipeline

In [69]:
# Display the ColumnTransformer that combines the numerical and categorical pipelines
preprocessor

In [76]:
# --- 5. Create the Full ML Pipeline (Preprocessing + Model) ---
# Chain the column preprocessing with the example model (Logistic Regression)
# so that both are fitted and applied as a single estimator.
ml_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, solver='liblinear')),
    ]
)
ml_pipeline


In [77]:
# List the pipeline's (name, estimator) pairs in the order they are applied
ml_pipeline.steps

[('preprocessor',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('num',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='median')),
                                                   ('scaler', StandardScaler())]),
                                   ['feature1_num', 'feature2_num',
                                    'feature5_nan_num']),
                                  ('cat',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('onehot',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   ['feature3_cat', 'feature4_ord'])])),
 ('classifier', LogisticRegression(random_state=42, solver='liblinear'))]

In [78]:
# --- 6. Train the Model using the Pipeline ---
# One fit call trains the whole chain (preprocessor and classifier) on the
# training partition only.
print("\nTraining the ML pipeline...")
ml_pipeline.fit(X=X_train, y=y_train)
print("Training complete.")


Training the ML pipeline...
Training complete.


In [79]:
# Coefficients of the fitted logistic regression model, looked up by step name
ml_pipeline.named_steps['classifier'].coef_

array([[ 0.29735475, -0.54889967, -0.16285682, -0.00173508,  0.04859864,
         0.05570595,  0.07658506,  0.00856837,  0.01741607]])

In [80]:

# --- 7. Make Predictions on the Test Set ---
# Column 1 of predict_proba holds the probabilities for the positive class.
y_pred_proba = ml_pipeline.predict_proba(X_test)[:, 1]
# Predicted class labels for the same held-out rows.
y_pred = ml_pipeline.predict(X_test)
(y_pred, y_pred_proba)


(array([0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1]),
 array([0.3081241 , 0.52220975, 0.59892399, 0.80839126, 0.57272651,
        0.34565825, 0.48180063, 0.63199293, 0.78538108, 0.29553044,
        0.29374644, 0.3675992 , 0.56853775, 0.65702434, 0.53959661,
        0.58501841, 0.27501264, 0.3082383 , 0.53720903, 0.78103691]))

### Use precision when minimizing false positives is crucial, and recall when minimizing false negatives is critical

![image](https://www.researchgate.net/publication/336402347/figure/fig3/AS:812472659349505@1570719985505/Calculation-of-Precision-Recall-and-Accuracy-in-the-confusion-matrix.ppm)

In [81]:
# --- 8. Evaluate the Model ---
# Overall accuracy first, then the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))


Model Accuracy: 0.4500

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.33      0.35         9
           1       0.50      0.55      0.52        11

    accuracy                           0.45        20
   macro avg       0.44      0.44      0.44        20
weighted avg       0.44      0.45      0.45        20



In [None]:



# The fitted pipeline can be inspected step by step, for example:
# print("\nFitted preprocessor:")
# print(ml_pipeline.named_steps['preprocessor'])
# print("\nClassifier coefficients (if applicable and preprocessed features are accessible):")
# Mapping coefficients back to feature names can be complex after a
# ColumnTransformer; it requires get_feature_names_out().

# --- Example of how preprocessed data looks (optional to show) ---
# X_train_transformed = ml_pipeline.named_steps['preprocessor'].transform(X_train)
# print(f"\nShape of transformed X_train: {X_train_transformed.shape}")
# print("First 5 rows of transformed X_train (can be a sparse matrix or numpy array):")
# print(X_train_transformed[:5])

print("\nScikit-learn pipeline demonstration complete.")

# Further points to discuss:
# - GridSearchCV / RandomizedSearchCV for hyperparameter tuning with pipelines.
# - Saving and loading pipelines (using joblib or pickle).
# - Custom transformers.
# - More complex feature engineering within pipelines.

In [82]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Example 1: Simple Feature Engineering Transformer
class LogTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces values with ``log(value + add_constant)``.

    Parameters
    ----------
    columns : label(s), index(es) or None, default=None
        Which columns to transform. For DataFrame input these are column
        labels; for array input they are positional indices along axis 1.
        ``None`` means every numeric column of a DataFrame, or every element
        of any other input.
    add_constant : number, default=1
        Offset added to the values before the logarithm is taken.

    Notes
    -----
    ``transform`` never modifies its input; it returns a new object.
    Entries where ``value + add_constant`` is not positive come out as NaN or
    -inf, following ``np.log``.
    """

    def __init__(self, columns=None, add_constant=1):
        # Store the arguments unmodified under their own names, as
        # scikit-learn estimators are expected to do.
        self.columns = columns
        self.add_constant = add_constant

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the log transform learns no state."""
        return self

    def transform(self, X):
        """Return a log-transformed copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            Data to transform. With ``columns=None`` any object supporting
            ``X + add_constant`` (e.g. a Series) is also accepted.

        Returns
        -------
        Same container type as ``X``, with the selected columns replaced by
        ``np.log(column + add_constant)``.
        """
        if isinstance(X, pd.DataFrame):
            return self._transform_frame(X)
        return self._transform_array(X)

    def _transform_frame(self, X):
        """Log-transform the selected (or all numeric) columns of a DataFrame copy."""
        if self.columns is None:
            # No explicit selection: apply to all numeric columns.
            target_cols = X.select_dtypes(include=[np.number]).columns
        else:
            target_cols = self.columns

        X_out = X.copy()
        X_out[target_cols] = np.log(X[target_cols] + self.add_constant)
        return X_out

    def _transform_array(self, X):
        """Log-transform the selected columns (or everything) of array-like input."""
        if self.columns is None:
            # np.log already allocates a new result, so no explicit copy is needed.
            return np.log(X + self.add_constant)

        X_out = X.copy()
        # Writing float log values into an integer or boolean array would
        # silently truncate them, so promote such arrays to float first.
        dtype = getattr(X_out, 'dtype', None)
        if dtype is not None and dtype.kind in 'biu':
            X_out = X_out.astype(float)
        X_out[:, self.columns] = np.log(X[:, self.columns] + self.add_constant)
        return X_out

In [83]:
# Show the current DataFrame before the custom transformer is applied
df

Unnamed: 0,feature1_num,feature2_num,feature3_cat,feature4_ord,feature5_nan_num,target
0,5.671302,2.195431,B,Medium,,1
1,5.309978,2.359990,B,Low,6.470319,0
2,1.817809,0.470005,A,Medium,18.987471,1
3,7.801179,1.046817,B,Low,,1
4,3.064763,0.498534,A,Low,13.448532,1
...,...,...,...,...,...,...
95,8.969257,1.938489,A,Medium,,1
96,0.293798,2.192780,A,Medium,6.339578,1
97,0.096265,1.652768,B,Medium,15.350622,1
98,0.619111,3.017372,A,Medium,18.669264,1


In [84]:
# Example of the log transformation done by hand: log(x + 1) on 'feature2_num'
np.log(df['feature2_num']+1)

0     1.161722
1     1.211938
2     0.385266
3     0.716286
4     0.404488
        ...   
95    1.077896
96    1.160892
97    0.975604
98    1.390628
99    1.322553
Name: feature2_num, Length: 100, dtype: float64

In [85]:
# Example 1: Using LogTransformer
# Only 'feature2_num' is selected for the log transformation.
log_transformer = LogTransformer(columns=['feature2_num'])
# fit() learns nothing for this transformer, so transform() is called directly.
data_log = log_transformer.transform(df)
# Display the original frame: transform() works on a copy, so df is unchanged.
df


Unnamed: 0,feature1_num,feature2_num,feature3_cat,feature4_ord,feature5_nan_num,target
0,5.671302,2.195431,B,Medium,,1
1,5.309978,2.359990,B,Low,6.470319,0
2,1.817809,0.470005,A,Medium,18.987471,1
3,7.801179,1.046817,B,Low,,1
4,3.064763,0.498534,A,Low,13.448532,1
...,...,...,...,...,...,...
95,8.969257,1.938489,A,Medium,,1
96,0.293798,2.192780,A,Medium,6.339578,1
97,0.096265,1.652768,B,Medium,15.350622,1
98,0.619111,3.017372,A,Medium,18.669264,1


In [54]:
# Transformed copy: 'feature2_num' now holds log(x + 1) values
data_log

Unnamed: 0,feature1_num,feature2_num,feature3_cat,feature4_ord,feature5_nan_num,target
0,5.671302,1.161722,B,Medium,,1
1,5.309978,1.211938,B,Low,6.470319,0
2,1.817809,0.385266,A,Medium,18.987471,1
3,7.801179,0.716286,B,Low,,1
4,3.064763,0.404488,A,Low,13.448532,1
...,...,...,...,...,...,...
95,8.969257,1.077896,A,Medium,,1
96,0.293798,1.160892,A,Medium,6.339578,1
97,0.096265,0.975604,B,Medium,15.350622,1
98,0.619111,1.390628,A,Medium,18.669264,1


In [55]:
# Write the DataFrame to 'data_sklearn.csv' (relative path), omitting the index column
df.to_csv('data_sklearn.csv', index=False)