## Traning Model for prediction

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,roc_auc_score,recall_score,precision_score
import pandas as pd

In [6]:
data = pd.read_csv('Holiday_cleaned_dataset.csv')

In [8]:
x=data.copy()
x.drop('ProdTaken',axis=1,inplace=True)
y=data['ProdTaken']

In [9]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [12]:
cat_feature = x.select_dtypes(include=['object']).columns
num_feature = x.select_dtypes(exclude=['object']).columns


In [14]:
nun_transformer = StandardScaler()
oneH_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer([
    ('standardizer',nun_transformer,num_feature),
    ('oneHotencoder',oneH_transformer,cat_feature)
])

In [15]:
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [18]:
models = {
    'RandomForestClassifier':RandomForestClassifier()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    print('---------------------------------------------------------')
    print(f'model name: {list(models.keys())[i]}')
    print('---------------------------------------------------------')
    print('Training report')
    print('---------------------------------------------------------')
    print(f'Accuracy: {accuracy_score(y_train,y_train_pred)}')
    print(f'F1 score: {f1_score(y_train , y_train_pred, average='weighted')}')
    print(f'precision_score: {precision_score(y_train,y_train_pred)}')
    print(f'Recall score: {recall_score(y_train,y_train_pred)}')
    print(f'Roc Auc score: {roc_auc_score(y_train,y_train_pred)}')
    print('---------------------------------------------------------')
    print('---------------------------------------------------------')
    print('Testing report')
    print(f'Accuracy: {accuracy_score(y_test,y_test_pred)}')
    print(f'F1 score: {f1_score(y_test,y_test_pred, average='weighted')}')
    print(f'precision_score: {precision_score(y_test,y_test_pred)}')
    print(f'Recall score: {recall_score(y_test,y_test_pred)}')
    print(f'Roc Auc score: {roc_auc_score(y_test,y_test_pred)}')
    print('---------------------------------------------------------')
    print('---------------------------------------------------------')
    print(confusion_matrix(y_test,y_test_pred))
    print(classification_report(y_test,y_test_pred))
    print('---------------------------------------------------------')


---------------------------------------------------------
model name: RandomForestClassifier
---------------------------------------------------------
Training report
---------------------------------------------------------
Accuracy: 1.0
F1 score: 1.0
precision_score: 1.0
Recall score: 1.0
Roc Auc score: 1.0
---------------------------------------------------------
---------------------------------------------------------
Testing report
Accuracy: 0.9274028629856851
F1 score: 0.9211642407299702
precision_score: 0.9838709677419355
Recall score: 0.6387434554973822
Roc Auc score: 0.8181010797181956
---------------------------------------------------------
---------------------------------------------------------
[[785   2]
 [ 69 122]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       787
           1       0.98      0.64      0.77       191

    accuracy                           0.93       978
   macro avg       0.95      0.82      0

## Hyperperameter tunning

In [19]:
from sklearn.model_selection import RandomizedSearchCV

In [20]:
rf_params = {
    "max_depth":[5,8,10,15,None],
    "max_features":[5,7,8,"auto"],
    "min_samples_split":[2,8,15,20],
    "n_estimators":[100,200,500,1000,1500,2000]
}
random_models = [
    ("rf",RandomForestClassifier(),rf_params)
]

In [21]:
for name,model,params in random_models:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1
        )
    random.fit(x_train,y_train)
    
y_pred=random.predict(x_test)
print(accuracy_score(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))



Fitting 3 folds for each of 100 candidates, totalling 300 fits


81 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "e:\codes\course\mlenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\codes\course\mlenv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "e:\codes\course\mlenv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "e:\codes\course\mlenv\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skle

0.9335378323108384
[[782  60]
 [  5 131]]
