In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df=sns.load_dataset('tips')

In [3]:
df['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [4]:
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [5]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
encoder=LabelEncoder()

In [12]:
df['time']=encoder.fit_transform(df['time'])

In [13]:
x=df.drop(labels=['time'],axis=1)

In [14]:
y=df.time

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=69)

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [24]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195 entries, 211 to 54
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  195 non-null    float64 
 1   tip         195 non-null    float64 
 2   sex         195 non-null    category
 3   smoker      195 non-null    category
 4   day         195 non-null    category
 5   size        195 non-null    int64   
dtypes: category(3), float64(2), int64(1)
memory usage: 7.1 KB


In [25]:
cat_col=['sex','smoker','day']
num_col=['total_bill','tip','size']

In [32]:
cat_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encoder",OneHotEncoder())
    ]

)
num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
         ("scaler",StandardScaler())
    ]
)

In [33]:
preprocesor=ColumnTransformer([
    ("num_pipeline",num_pipeline,num_col),
    ("cat_pipeline",cat_pipeline,cat_col)
])

In [34]:
x_train=preprocesor.fit_transform(x_train)
x_test=preprocesor.transform(x_test)

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [37]:
models={
    "random_forest":RandomForestClassifier(),
    "logistic_regression":LogisticRegression(),
    "decision_tree":DecisionTreeClassifier()
}

In [63]:
def eval_model(x_train,x_test,y_train,y_test,models):
    report={}
    for i in range(len(models)):
        
        model=list(models.values())[i]
        model.fit(x_train,y_train)
        print(model.score(x_train,y_train))
        y_pred=model.predict(x_test)
        accuracy=accuracy_score(y_test,y_pred)
        report[list(models.keys())[i]]=accuracy
    return report
    

In [64]:
from sklearn.metrics import accuracy_score

In [65]:
eval_model(x_train,x_test,y_train,y_test,models)

1.0
0.9794871794871794
1.0


{'random_forest': 0.9183673469387755,
 'logistic_regression': 0.8979591836734694,
 'decision_tree': 0.9387755102040817}

In [70]:
params={
    "n_estimators":[50,100,200],
    "criterion":["gini","entropy"],
    "max_depth":[3,5,10]
}

In [71]:
from sklearn.model_selection import RandomizedSearchCV

In [72]:
cv=RandomizedSearchCV(estimator=RandomForestClassifier(),param_distributions=params,cv=5,verbose=3,scoring="accuracy")

In [73]:
cv.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=200;, score=1.000 total time=   0.3s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=200;, score=0.949 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=200;, score=1.000 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=200;, score=0.923 total time=   0.3s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=200;, score=1.000 total time=   0.3s
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=1.000 total time=   0.3s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.3s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=1.000 total time=   0.3s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.3s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.974 total time=   0.3s

In [74]:
cv.best_params_

{'n_estimators': 200, 'max_depth': 5, 'criterion': 'gini'}

In [87]:
best_model=RandomForestClassifier(n_estimators= 200, max_depth= 5, criterion= 'gini')

In [88]:
best_model.fit(x_train,y_train)

In [89]:
y_pred1=model.predict(x_test)
accuracy=accuracy_score(y_test,y_pred1)

In [90]:
print(accuracy)

0.9387755102040817


In [91]:
best_model.score(x_train,y_train)

1.0