# Random Forest Classifier with Pipelining and Hyperparameter Tuning

## Load Dataset

In [1]:
# Load the 'tips' example dataset through seaborn and preview the first rows.
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Check for Missing Values

In [2]:
# Count missing (null) entries in each column.
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

## EDA

In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [4]:
## Our task: predict the `time` column from the other columns.

## Binary classification - Label Encoding on time

In [5]:
from sklearn.preprocessing import LabelEncoder
# Encode the target column as integer class codes.
encoder = LabelEncoder()
# NOTE: this overwrites df['time'] in place, so the df.head() preview shown
# earlier no longer reflects the column's contents once this cell has run.
# encoder.classes_ keeps the original labels if the mapping is needed later.
df['time'] = encoder.fit_transform(df['time'])

In [6]:
# Confirm which distinct codes the encoded target contains.
df['time'].unique()

array([0, 1])

## Split into Independent (X) and Dependent (y) Features

In [7]:
# Feature matrix: every column except the target.
X = df.drop(columns='time')

In [8]:
# Inspect the feature matrix (the `time` target column has been dropped).
X

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.50,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,3
240,27.18,2.00,Female,Yes,Sat,2
241,22.67,2.00,Male,Yes,Sat,2
242,17.82,1.75,Male,No,Sat,2


In [9]:
# Target vector: the integer-encoded `time` column.
y= df['time']
y

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Name: time, Length: 244, dtype: int64

## Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## PIPELINE

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer ## TO HANDLE MISSING VALUES 
from sklearn.preprocessing import StandardScaler ## Feature Scaling
from sklearn.preprocessing import OneHotEncoder ## Categorical to Numerical
from sklearn.compose import ColumnTransformer ## Transforms our columns


In [12]:
# Column groups routed to the categorical and numerical preprocessing pipelines.
categorical_cols = ['sex', 'smoker', 'day']
numerical_cols = ['total_bill', 'tip', 'size']

In [13]:
## Numerical pipeline: median-impute missing values, then standardize.

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # fill missing values
    ('scaler', StandardScaler()),                   # feature scaling
])
    

In [14]:
## Categorical pipeline: impute with the most frequent value, then one-hot encode.

cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # missing values
        # handle_unknown='ignore' encodes a category that was absent during
        # fit as all zeros instead of raising when transform() meets it later
        # (e.g. a `day` value that landed only in the test split).
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),  # categorical to numerical
    ]
)

In [15]:

# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [22]:
# Re-run safety: the assignments below replace the X_train / X_test DataFrames
# with transformed arrays, and the ColumnTransformer selects columns by name,
# so running this cell a second time on its own output would fail. Keep a
# reference to the raw DataFrames whenever the current splits still carry
# column labels, and always transform from those references.
if hasattr(X_train, 'columns'):
    X_train_raw, X_test_raw = X_train, X_test

# Fit on the training split only, then reuse the fitted transformer on the
# test split so no test-set statistics leak into preprocessing.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

## Model Training Automation 

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [25]:
# Candidate classifiers compared below. The tree-based models are randomized,
# so they get a fixed seed (the same value used for the train/test split) to
# keep the comparison reproducible across kernel restarts.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

In [19]:
from sklearn.metrics import accuracy_score

In [26]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    """Fit each candidate model and score it on the held-out test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each model's ``fit``.
    X_test, y_test
        Held-out features and true labels used for scoring.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.

    Returns
    -------
    dict
        Maps each model name to its test-set accuracy, in the iteration
        order of ``models``.
    """
    report = {}
    # Iterate name/estimator pairs directly instead of rebuilding
    # list(models.keys()) / list(models.values()) on every pass.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # scikit-learn metrics take (y_true, y_pred) in that order.
        report[name] = accuracy_score(y_test, y_pred)
    return report
                   

In [27]:
# Test-set accuracy for every candidate model.
evaluate_model(X_train, y_train, X_test, y_test, models)

{'Random Forest': 0.972972972972973,
 'Logistic Regression': 0.9864864864864865,
 'Decision Tree': 0.9459459459459459}

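## CONCLUSION: We will use Random Forest

Note: in the run shown above, Logistic Regression scored slightly higher on the test split (0.986 vs. 0.973, a difference of one test sample). Random Forest is kept because tuning it is the subject of this notebook.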

In [28]:
# Fixed seed so the tuned forest gives the same result on every run.
classifier = RandomForestClassifier(random_state=42)

In [32]:
# Hyperparameter search space for the random forest

parameter = {
    'max_depth': [3, 5, 10, None],
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
}

In [33]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search samples n_iter (default 10) combinations from `parameter`
# and scores each with 5-fold cross-validated accuracy. The seed pins which
# combinations are drawn, so best_params_ is repeatable between runs.
cv = RandomizedSearchCV(
    classifier,
    param_distributions=parameter,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,
)

In [34]:
# Run the search on the preprocessed training split; verbose=3 logs one line per fold.
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.971 total time=   0.3s
[CV 2/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=1.000 total time=   0.3s
[CV 3/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.971 total time=   0.3s
[CV 4/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.912 total time=   0.3s
[CV 5/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.912 total time=   0.3s
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.971 total time=   0.3s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.971 total time=   0.3s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=1.000 total time=   0.3s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.912 total time=   0.3s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.912 tot

In [35]:
# Hyperparameter combination with the best cross-validated score.
cv.best_params_

{'n_estimators': 200, 'max_depth': 5, 'criterion': 'entropy'}

In [36]:
# Predict on the test split through the fitted search object, which delegates
# to the best estimator refit on the training data (RandomizedSearchCV's default refit=True).
y_pred = cv.predict(X_test)

In [37]:
# Test-set accuracy of the tuned model; scikit-learn metrics take (y_true, y_pred).
accuracy_score(y_test, y_pred)

0.972972972972973