In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Toy dataset: two categorical and two numeric features, each with one
# missing value, plus the binary target `bought_product`.
data = {
    "location": ["urban", "rural", "suburban", np.nan, "urban", "rural"],
    "device": ["mobile", "desktop", "mobile", "tablet", np.nan, "mobile"],
    "age": [25, 45, 35, np.nan, 22, 55],
    "income": [50000, 60000, 55000, 52000, np.nan, 58000],
    "bought_product": [1, 0, 1, 0, 1, 0],
}
df = pd.DataFrame(data)

# Features are every column except the target; the target is kept as a Series.
y = df["bought_product"]
X = df.drop(columns=["bought_product"])


In [3]:
# Numeric features: fill missing values with the column mean, then standardize.
numeric_steps = [
    ("imputation_mean", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_preprocessor = Pipeline(steps=numeric_steps)

In [4]:
# Categorical features: missing entries become the literal category "missing",
# then everything is one-hot encoded. Categories not seen during fit are
# ignored at transform time instead of raising.
constant_imputer = SimpleImputer(strategy="constant", fill_value="missing")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_preprocessor = Pipeline(
    [
        ("imputation_constant", constant_imputer),
        ("onehot", one_hot_encoder),
    ]
)


In [5]:
# Route each column group through its matching preprocessing pipeline.
categorical_columns = ["location", "device"]
numerical_columns = ["age", "income"]

preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, categorical_columns),
        ("numerical", numeric_preprocessor, numerical_columns),
    ]
)

In [6]:
# Final estimator: preprocessing followed by a logistic-regression classifier.
# make_pipeline names the steps automatically from the estimator classes.
classifier = LogisticRegression(max_iter=500)
pipe = make_pipeline(preprocessor, classifier)
pipe

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
predictions = pipe.fit(X_train, y_train).predict(X_test)
print("Predictions:", predictions)

Predictions: [1 0]


In [8]:
# Side-by-side view of the held-out rows with their true and predicted labels.
#
# This cell reuses `X_test`, `y_test` and `predictions` from the previous cell
# rather than re-importing `train_test_split` and repeating the identical
# (seeded, deterministic) split / fit / predict sequence.
#
# `assign` returns a new frame, so `X_test` is left untouched, and the chained
# `reset_index` replaces the earlier in-place mutation, which keeps the cell
# idempotent when re-run.
results_df = (
    X_test
    .assign(Actual=y_test.values, Predicted=predictions)
    .reset_index(drop=True)
)

# A bare last expression gives the notebook's rich table rendering.
results_df


  location   device   age   income  Actual  Predicted
0    urban   mobile  25.0  50000.0       1          1
1    rural  desktop  45.0  60000.0       0          0
