In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [4]:
df = sns.load_dataset('tips')

X = df.drop('total_bill', axis=1)
y = df['total_bill'] > df['total_bill'].median() 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)



In [5]:
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64',
'float64']).columns.tolist()

In [6]:
numeric_preprocessor = Pipeline([
 ('imputer', SimpleImputer(strategy='mean')),
 ('scaler', StandardScaler())
])

In [7]:
categorical_preprocessor = Pipeline([
 ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
 ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


In [8]:
preprocessor = ColumnTransformer([
 ('num', numeric_preprocessor, numerical_cols),
 ('cat', categorical_preprocessor, categorical_cols)
])

In [9]:
models = {
 'Decision Tree': DecisionTreeClassifier(),
 'Random Forest': RandomForestClassifier(),
 'Logistic Regression': LogisticRegression(max_iter=200)
}


In [10]:
results = {}

for name, model in models.items():
 print(f"\nTraining Model: {name}")

 # Create pipeline
 pipe = Pipeline(steps=[
 ('preprocessor', preprocessor),
 ('classifier', model)
 ])



Training Model: Decision Tree

Training Model: Random Forest

Training Model: Logistic Regression


In [19]:
pipe.fit(X_train, y_train)


train_pred = pipe.predict(X_train)
test_pred = pipe.predict(X_test)

train_acc = accuracy_score(y_train, train_pred)
pipe.fit(X_train, y_train)

 
train_pred = pipe.predict(X_train)
test_pred = pipe.predict(X_test)

train_acc = accuracy_score(y_train, train_pred)

In [20]:
print("\nMODEL PERFORMANCE SUMMARY:")
for name, info in results.items():
 print(f"\n=== {name} ===")
 print(f"Training Accuracy: {info['train_accuracy']:.4f}")
 print(f"Testing Accuracy: {info['test_accuracy']:.4f}")
 print("Confusion Matrix (Test):")
 print(info['confusion_matrix'])
 print("Classification Report (Test):")
 print(info['classification_report'])



MODEL PERFORMANCE SUMMARY:


In [17]:
best_model = max(results.items(), key=lambda x: x[1]['test_accuracy'])
print(f"/nBest Model: {best_model[0]} with Test Accuracy:{best_model[1]['test_accuracy']:.4f}")

ValueError: max() arg is an empty sequence