In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the "tips" example dataset through seaborn into a pandas DataFrame.
# NOTE(review): load_dataset presumably fetches the file from seaborn's online
# data repository (or a local cache) — confirm network access for fresh runs.
df= sns.load_dataset('tips')
# Bare last expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [39]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [40]:
# Summary statistics (count, mean, std, min, quartiles, max); the recorded
# output covers the numeric columns total_bill, tip and size.
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [41]:
# List the distinct labels of the 'time' column (used as the target below)
# before it is encoded.
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [42]:
from sklearn.preprocessing import LabelEncoder
# Turn the two 'time' labels into integer codes. The recorded output shows the
# codes 0 and 1 in order of first appearance, and the first rows of the frame
# are 'Dinner', so 'Dinner' maps to 0 and 'Lunch' to 1.
encoder = LabelEncoder()
time_codes = encoder.fit_transform(df['time'])
# Overwrites the original categorical column in place.
df['time'] = time_codes
df['time'].unique()

array([0, 1])

In [43]:
# Separate the encoded target from the remaining feature columns.
y = df['time']
x = df.drop(columns='time')


In [44]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified although the recorded report shows
# uneven class support (74 vs 24) — consider stratify=y.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.40,random_state=42)

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # handles missing values
from sklearn.preprocessing import StandardScaler # feature scaling
from sklearn.preprocessing import OneHotEncoder # categorical to numerical
from sklearn.compose import ColumnTransformer

# Feature columns grouped by the kind of preprocessing each group receives.
numerical_cols = ['total_bill','tip','size']
categorical_cols = ['sex','smoker','day']

In [46]:
## Feature Engineering Automation
## Numerical pipeline: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('feature_scaling', StandardScaler()),
    ]
)

## Categorical pipeline: fill gaps with the most frequent label, then one-hot encode.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' encodes a category that was absent when the
        # encoder was fitted as an all-zero row instead of raising in transform().
        ('onehot_encoding', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [47]:
# Route each column group through its matching sub-pipeline. Together the two
# lists cover every feature column that remains once 'time' is removed.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_pipeline', num_pipeline, numerical_cols),
        ('Categorical_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [48]:
# Fit the preprocessing on the training rows only, then apply the
# already-fitted transformer to the test rows.
# NOTE(review): both names are overwritten with the transformed output, so
# re-running this cell without re-running the split will likely fail because
# the preprocessor selects columns by name — confirm, or keep separate names.
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
## Model Training Automation
# Candidate classifiers keyed by display name. The two tree-based models are
# seeded so repeated runs give the same scores (same seed as the split and the
# grid search); KNN takes no seed.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

In [50]:
from sklearn.metrics import accuracy_score

In [54]:
def evaluate_model(x_train, x_test, y_train, y_test, models):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``fit``.
    x_test, y_test : held-out features and labels used for scoring.
    models : dict mapping a display name to an estimator exposing
        ``fit`` and ``predict``. The estimators are fitted in place.

    Returns
    -------
    dict
        Model name -> accuracy on the test data, in the iteration order of
        ``models``.
    """
    def _fit_and_score(estimator):
        # Train first, then compare the test predictions with the true labels.
        estimator.fit(x_train, y_train)
        predictions = estimator.predict(x_test)
        return accuracy_score(y_test, predictions)

    return {label: _fit_and_score(estimator) for label, estimator in models.items()}



In [67]:
# Fit every candidate model and display the name -> test-accuracy mapping.
evaluate_model(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    models=models,
)

{'Random Forest': 0.9693877551020408,
 'KNN': 0.9693877551020408,
 'Decision Tree': 0.9591836734693877}

In [68]:
# Hyperparameter search space for the random forest (4 * 3 * 2 = 24 combinations).
params = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
)

In [69]:
from sklearn.model_selection import GridSearchCV
# Exhaustive 5-fold grid search over `params` for a seeded random forest,
# ranked by accuracy.
# NOTE(review): x_train was already transformed by a preprocessor fitted on
# all training rows, so each validation fold has influenced the scaling and
# encoding — consider searching over a Pipeline that includes preprocessing.
cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
cv.fit(x_train, y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, ...], 'n_estimators': [100, 200, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [70]:
# Show the parameter combination the grid search selected.
cv.best_params_

{'criterion': 'gini', 'max_depth': 3, 'n_estimators': 100}

In [71]:
# Score the tuned forest on the held-out rows. The recorded search shows
# refit=True, so predict() uses the estimator refit with the best parameters.
y_pred = cv.predict(x_test)
score = accuracy_score(y_test,y_pred)
print(score)

0.9897959183673469


In [72]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the tuned model's test predictions.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99        74
           1       1.00      0.96      0.98        24

    accuracy                           0.99        98
   macro avg       0.99      0.98      0.99        98
weighted avg       0.99      0.99      0.99        98

