In [38]:
from sklearn.model_selection import cross_val_score

In [39]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score



# Toy purchase dataset: two numeric columns (each containing one missing
# value), one categorical column, and the binary target column "bought".
sample_columns = {
    "age": [25, 32, np.nan, 45, 52],
    "income": [30000, 54000, 42000, np.nan, 80000],
    "city": ["Chennai", "Madurai", "Chennai", "Coimbatore", "Salem"],
    "bought": [0, 1, 0, 1, 1],
}
df = pd.DataFrame(sample_columns)

# Split the frame into the target series and the remaining predictor columns.
y = df["bought"]
X = df.drop("bought", axis=1)

# Column groups; each group is routed to its own preprocessing branch below.
numeric_features = ["age", "income"]
categorical_features = ["city"]

# Numeric branch: fill missing values with the column median, then apply
# StandardScaler.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" is set so categories not seen
# during fit do not raise at transform time.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own column list and concatenate the results.
preprocess = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])

# Classifier, with the iteration cap set explicitly to 1000.
model = LogisticRegression(max_iter=1000)

# End-to-end estimator: preprocessing followed by the classifier, so the
# preprocessing steps are fitted only on whatever data is passed to fit().
clf = Pipeline([("preprocess", preprocess), ("model", model)])

# Stratified hold-out split with a fixed seed for repeatability.
split = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

# Fit on the training part and report accuracy on the held-out part.
clf.fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("Test accuracy:", test_accuracy)



Test accuracy: 1.0


In [40]:
# 3-fold cross-validated accuracy of the full pipeline over all rows.
# NOTE(review): the sample target has only two 0-labelled rows, fewer than
# the 3 folds requested here; scikit-learn's stratified splitter presumably
# warns in that case -- confirm, and consider fewer folds for data this small.
scores = cross_val_score(estimator=clf, X=X, y=y, scoring="accuracy", cv=3)
cv_mean, cv_std = scores.mean(), scores.std()
print("CV accuracy:", cv_mean, "+/-", cv_std)

CV accuracy: 0.8333333333333334 +/- 0.23570226039551584


