In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the tips dataset from the working directory and preview the first rows.
df = pd.read_csv("tips.csv")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn. impute import SimpleImputer # handling missing value
from sklearn.preprocessing import StandardScaler # feature scaling
from sklearn.preprocessing import OneHotEncoder  # categorical to numerical
from sklearn.compose import ColumnTransformer

In [4]:
# Hold out 20% of the rows for testing. `time` is the target; every other
# column is a feature. The fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns="time"),
    df["time"],
    test_size=0.20,
    random_state=42,
)

In [13]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
is_object_col = (X_train.dtypes == "O").to_numpy()
categorical_col = X_train.columns[is_object_col]
numerical_col = X_train.columns[~is_object_col]

In [14]:
# Show which columns landed in each group (one Index per line).
print(categorical_col, numerical_col, sep="\n")

Index(['sex', 'smoker', 'day'], dtype='object')
Index(['total_bill', 'tip', 'size'], dtype='object')


In [18]:
# Feature engineering automation

# Numerical pipeline: fill missing values with the column median, then
# standardise each feature with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes a category that was not
# seen while fitting encode as all zeros instead of raising in transform(),
# so the fitted preprocessor keeps working on test / new data.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine both pipelines: each column group goes through its own pipeline and
# the outputs are concatenated column-wise.
preprocess = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline, numerical_col),
        ("cat_pipeline", cat_pipeline, categorical_col),
    ]
)

In [19]:
# Learn the imputation, scaling and encoding parameters from the training
# split only, then apply the fitted transformer to both splits.
# NOTE(review): X_train / X_test are overwritten with the transformed output,
# so re-run the train/test split cell before re-running this one.
preprocess.fit(X_train)
X_train = preprocess.transform(X_train)
X_test = preprocess.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [23]:
## Model Training Automation
# Candidate classifiers keyed by display name. The tree-based models are
# seeded so that repeated runs produce the same fitted models and scores.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

In [25]:
def evaluate(X_train, Y_train, X_test, Y_test, models):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, Y_train : training features and labels passed to ``fit``.
    X_test, Y_test : held-out features and labels used for scoring.
    models : dict mapping a display name to an estimator exposing
        ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    dict
        Maps each model name to the ``accuracy_score`` of its predictions on
        ``X_test``, in the iteration order of ``models``.

    Note: the estimators in ``models`` are fitted in place.
    """
    scores = {}
    for name, estimator in models.items():
        estimator.fit(X_train, Y_train)
        predictions = estimator.predict(X_test)
        scores[name] = accuracy_score(Y_test, predictions)
    return scores


In [26]:
# Fit every candidate model and display its test-set accuracy.
evaluate(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
    Y_test=Y_test,
    models=models,
)

{'Random Forest': 0.9591836734693877,
 'Logistic Regression': 1.0,
 'Decision Tree': 0.9387755102040817}