In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix


In [7]:
df = sns.load_dataset('tips')
X = df.drop('total_bill', axis=1)
y = df['total_bill'] > df['total_bill'].median() 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)

In [6]:
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64',
'float64']).columns.tolist()

In [8]:
numeric_preprocessor = Pipeline([
 ('imputer', SimpleImputer(strategy='mean')),
 ('scaler', StandardScaler())
])

In [9]:
categorical_preprocessor = Pipeline([
 ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
 ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [10]:
preprocessor = ColumnTransformer([
 ('num', numeric_preprocessor, numerical_cols),
 ('cat', categorical_preprocessor, categorical_cols)
])

In [11]:
models = {
 'Decision Tree': DecisionTreeClassifier(),
 'Random Forest': RandomForestClassifier(),
 'Logistic Regression': LogisticRegression(max_iter=200)
}

In [13]:
results = {}
for name, model in models.items():
 print(f"\nTraining Model: {name}")


Training Model: Decision Tree

Training Model: Random Forest

Training Model: Logistic Regression


In [17]:
pipe = Pipeline(steps=[
 ('preprocessor', preprocessor),
 ('classifier', model)
 ])
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,200


In [24]:

 # Predict
 train_pred = pipe.predict(X_train)
 test_pred = pipe.predict(X_test)
 test_acc = accuracy_score(y_test, test_pred)
 train_acc = accuracy_score(y_train, train_pred)

In [25]:
results[name] = {
 'model': pipe,
 'train_accuracy': train_acc,
 'test_accuracy': test_acc,
 'train_predictions': train_pred,
 'test_predictions': test_pred,
 'confusion_matrix': confusion_matrix(y_test, test_pred),
 'classification_report': classification_report(y_test, test_pred)
 }

In [26]:
results[name] = {
 'model': pipe,
 'train_accuracy': train_acc,
 'test_accuracy': test_acc,
 'train_predictions': train_pred,
 'test_predictions': test_pred,
 'confusion_matrix': confusion_matrix(y_test, test_pred),
 'classification_report': classification_report(y_test, test_pred)
 }

In [28]:
print("\nMODEL PERFORMANCE SUMMARY:")
for name, info in results.items():
 print(f"\n=== {name} ===")
 print(f"Training Accuracy: {info['train_accuracy']:.4f}")
 print(f"Testing Accuracy: {info['test_accuracy']:.4f}")
 print("Confusion Matrix (Test):")
 print(info['confusion_matrix'])
 print("Classification Report (Test):")
 print(info['classification_report'])


MODEL PERFORMANCE SUMMARY:

=== Logistic Regression ===
Training Accuracy: 0.7487
Testing Accuracy: 0.7755
Confusion Matrix (Test):
[[23  3]
 [ 8 15]]
Classification Report (Test):
              precision    recall  f1-score   support

       False       0.74      0.88      0.81        26
        True       0.83      0.65      0.73        23

    accuracy                           0.78        49
   macro avg       0.79      0.77      0.77        49
weighted avg       0.78      0.78      0.77        49



In [31]:
best_model = max(results.items(), key=lambda x: x[1]['test_accuracy'])
print(f"\nBest Model: {best_model[0]} with Test Accuracy:{best_model[1]['test_accuracy']:.4f}")


Best Model: Logistic Regression with Test Accuracy:0.7755
