## Importing Libraries and Data

In [1]:
import pandas as pd
import numpy as np
train = pd.read_csv("train_for_feature_engineering.csv")
test = pd.read_csv("test_for_feature_engineering.csv")
train.head()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,min_bank,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,ppap_risk,went_on_backorder
0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,No,No,No
1,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,0.99,0.0,No,No,No
2,2.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,Yes,No,No
3,7.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.1,0.13,0.0,No,No,No
4,8.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,,,0.0,Yes,No,No


- We haven't done any outlier treatment.

In [2]:
# Encoding deck_risk feature in both training and testing datasets.
deck_risk_freq_encoding = train["deck_risk"].value_counts(normalize=True).to_dict()
train["deck_risk"] = train["deck_risk"].map(deck_risk_freq_encoding)
test["deck_risk"] = test["deck_risk"].map(deck_risk_freq_encoding)

In [3]:
# Encoding ppap_risk, went_on_backorder features in both training and testing datasets.
ppap_risk_freq_encoding = train['ppap_risk'].value_counts(normalize=True).to_dict()
train['ppap_risk'] = train['ppap_risk'].map(ppap_risk_freq_encoding)
test['ppap_risk'] = test['ppap_risk'].map(ppap_risk_freq_encoding)

train["went_on_backorder"] = train["went_on_backorder"].map({"Yes" : 1, "No" : 0})
test['went_on_backorder'] = test['went_on_backorder'].map({"Yes" : 1, "No" : 0})

In [4]:
# Dropping rows based on went_on_backorder column that has missing values
train.dropna(subset=["went_on_backorder"], inplace=True)
test.dropna(subset=["went_on_backorder"], inplace=True)

In [5]:
# Missing values in both the datasets
pd.concat([pd.DataFrame(train.isnull().sum(), columns=['train']),
          pd.DataFrame(test.isnull().sum(), columns=['test'])], axis=1)

Unnamed: 0,train,test
national_inv,5888,766
lead_time,100893,14724
in_transit_qty,0,0
forecast_3_month,0,0
forecast_6_month,0,0
forecast_9_month,0,0
sales_1_month,0,0
sales_3_month,0,0
sales_6_month,0,0
sales_9_month,0,0


In [6]:
from sklearn.impute import SimpleImputer, MissingIndicator
## Step 1: Creating missing indicator in both training and testing datasets to capture the missingness of the data.
## Step 2: Imputing a Missing values in the lead_time, national_inv, perf_6_month_avg, perf_12_month_avg features in both datasets
indicator = MissingIndicator(error_on_new = True, features = "missing-only")
## Fitting on train dataset
indicator.fit(train)
## Getting feature names
indicator_cols = [col+"_NA" for col in train.columns[indicator.features_]]
## Transforming the dataset to get the missing flag
train_temp = indicator.transform(train)
## Concatenating the transformed dataset with the original train dataset.
train = pd.concat([train.reset_index(drop=True),
                  pd.DataFrame(train_temp, columns = indicator_cols)], axis = 1)
train.head()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,...,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,ppap_risk,went_on_backorder,national_inv_NA,lead_time_NA,perf_6_month_avg_NA,perf_12_month_avg_NA
0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,0.770429,0.879235,0.0,False,True,True,True
1,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.99,0.99,0.0,0.770429,0.879235,0.0,False,False,False,False
2,2.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,0.229571,0.879235,0.0,False,True,True,True
3,7.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.13,0.0,0.770429,0.879235,0.0,False,False,False,False
4,8.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,,,0.0,0.229571,0.879235,0.0,False,True,True,True


In [7]:
## Applying the same things on the test data.
test_temp = indicator.transform(test)
## Concatenating the transformed dataset with the original test dataset.
test = pd.concat([test.reset_index(drop=True),
                  pd.DataFrame(test_temp, columns = indicator_cols)], axis = 1)
test.head()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,...,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,ppap_risk,went_on_backorder,national_inv_NA,lead_time_NA,perf_6_month_avg_NA,perf_12_month_avg_NA
0,62.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,0.229571,0.879235,0.0,False,True,True,True
1,9.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,0.770429,0.120765,0.0,False,True,True,True
2,17.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.92,0.95,0.0,0.770429,0.879235,0.0,False,False,False,False
3,9.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.78,0.75,0.0,0.770429,0.120765,0.0,False,False,False,False
4,2.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.54,0.71,0.0,0.770429,0.879235,0.0,False,False,False,False


In [8]:
cols = train.columns
print(cols)

Index(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
       'forecast_6_month', 'forecast_9_month', 'sales_1_month',
       'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
       'pieces_past_due', 'perf_6_month_avg', 'perf_12_month_avg',
       'local_bo_qty', 'deck_risk', 'ppap_risk', 'went_on_backorder',
       'national_inv_NA', 'lead_time_NA', 'perf_6_month_avg_NA',
       'perf_12_month_avg_NA'],
      dtype='object')


In [9]:
## Imputing Missing values using median 
si = SimpleImputer(strategy = 'median')
## learning the values for the entire dataset
si.fit(train)
## transforming the train dataset
train = pd.DataFrame(si.transform(train), columns = cols)
## transforming the test dataset
test = pd.DataFrame(si.transform(test), columns = cols)

In [10]:
## Missing values in both the datasets
pd.concat([pd.DataFrame(train.isnull().sum(), columns=['train']),
          pd.DataFrame(test.isnull().sum(), columns=['test'])], axis=1)

Unnamed: 0,train,test
national_inv,0,0
lead_time,0,0
in_transit_qty,0,0
forecast_3_month,0,0
forecast_6_month,0,0
forecast_9_month,0,0
sales_1_month,0,0
sales_3_month,0,0
sales_6_month,0,0
sales_9_month,0,0


In [11]:
round(train.describe(), 2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
national_inv,1687860.0,496.55,29615.18,0.0,4.0,15.0,80.0,12334404.0
lead_time,1687860.0,7.88,6.84,0.0,4.0,8.0,8.0,52.0
in_transit_qty,1687860.0,44.05,1342.74,0.0,0.0,0.0,0.0,489408.0
forecast_3_month,1687860.0,178.12,5026.55,0.0,0.0,0.0,4.0,1427612.0
forecast_6_month,1687860.0,344.99,9795.15,0.0,0.0,0.0,12.0,2461360.0
forecast_9_month,1687860.0,506.36,14378.92,0.0,0.0,0.0,20.0,3777304.0
sales_1_month,1687860.0,55.93,1928.2,0.0,0.0,0.0,4.0,741774.0
sales_3_month,1687860.0,175.03,5192.38,0.0,0.0,1.0,15.0,1105478.0
sales_6_month,1687860.0,341.73,9613.17,0.0,0.0,2.0,31.0,2146625.0
sales_9_month,1687860.0,525.27,14838.61,0.0,0.0,4.0,47.0,3205172.0


In [12]:
## Separating features to scale
features_to_scale = ['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
                        'forecast_6_month', 'forecast_9_month', 'sales_1_month',
                       'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
                       'pieces_past_due','local_bo_qty']
## Separating features to not scale
features_not_to_scale = ['deck_risk', 'lead_time_NA', 'national_inv_NA', 'perf_12_month_avg',
                        'perf_12_month_avg_NA', 'perf_6_month_avg', 'perf_6_month_avg_NA', 
                        'ppap_risk', 'went_on_backorder']
## Separating the train data to not scale 
train_not_to_scaled_features = train[features_not_to_scale]
## Separating the test data to not scale 
test_not_to_scaled_features = test[features_not_to_scale]
## Separating the train data to scale 
train_scaled_features = train[features_to_scale]
## Separating the test data to scale 
test_scaled_features = test[features_to_scale]

from sklearn.preprocessing import MinMaxScaler
## Scaling has been done because when we use SMOTE it uses KNN algo internally therefore scaling is necessary there.
scaler = MinMaxScaler().fit(train_scaled_features)
train_scaled_features = pd.DataFrame(scaler.transform(train_scaled_features), columns=features_to_scale)
test_scaled_features = pd.DataFrame(scaler.transform(test_scaled_features), columns=features_to_scale)

In [13]:
## Concatenating the train_scaled_features and train_not_to_scaled_features
train = pd.concat([train_scaled_features, train_not_to_scaled_features], axis=1)
test = pd.concat([test_scaled_features, test_not_to_scaled_features], axis=1)

In [14]:
from datetime import datetime
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import (
    BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier)
from sklearn.model_selection import GridSearchCV

## GridSearchCV + EasyEnsembleClassifier

In [24]:
start = datetime.now()
param_dict = {"n_estimators" : [10, 20, 30, 50, 100]}
gs = GridSearchCV(EasyEnsembleClassifier(n_jobs=-1, random_state=2021, n_estimators=5, sampling_strategy="auto"), 
                 param_grid=param_dict, scoring="roc_auc", cv=5, refit=True, verbose=False)
gs.fit(train.drop(["went_on_backorder"], axis=1), train["went_on_backorder"])
print(gs.best_params_)
print("-"*50)
print(gs.best_score_)
print(f"Time taken to run the cell - {datetime.now() - start}")

{'n_estimators': 100}
--------------------------------------------------
0.8970027928764814
Time taken to run the cell - 1:23:33.221248


In [26]:
test_preds = gs.predict_proba(test.drop(["went_on_backorder"], axis=1))[:,1]
test_auc_score = roc_auc_score(test["went_on_backorder"], test_preds)
print(f"Test AUC - {test_auc_score}")

Test AUC - 0.884067484177593


In [28]:
## Best HyperParameter n_estimators = 100 found using GridSearchCV
def Easy_ensemble_model_with_5_folds(train, test):
    X_train = train.drop(["went_on_backorder"], axis=1)
    X_test = test.drop(["went_on_backorder"], axis=1)
    Y_train = train["went_on_backorder"]
    Y_test = test['went_on_backorder']
    
    skf = StratifiedKFold(n_splits = 5, random_state = 2021, shuffle = True)
    train_auc_scores = []
    valid_auc_scores = []
    
    for fold, (train_index, validation_index) in enumerate(skf.split(X_train, Y_train)):
        x_train, y_train  = X_train.iloc[train_index], Y_train[train_index]
        x_valid, y_valid = X_train.iloc[validation_index], Y_train[validation_index]
        
        model = EasyEnsembleClassifier(n_estimators=100, sampling_strategy="auto", random_state=2021, n_jobs=-1)
        model.fit(x_train, y_train)
        
        train_preds = model.predict_proba(x_train)[:,1]
        valid_preds = model.predict_proba(x_valid)[:,1]
        train_auc = roc_auc_score(y_train, train_preds)
        valid_auc = roc_auc_score(y_valid, valid_preds)
        train_auc_scores.append(train_auc)
        valid_auc_scores.append(valid_auc)
        print(f"Fold : {fold} Done!")
    print(f"Train Mean AUC on 5 Folds data - {np.mean(train_auc_scores)} +/- {np.std(train_auc_scores)}")
    print(f"Validation Mean AUC on 5 Folds data - {np.mean(valid_auc_scores)} +/- {np.std(valid_auc_scores)}")
    
    test_preds = model.predict_proba(X_test)[:,1]
    test_auc_score = roc_auc_score(Y_test, test_preds)
    print(f"Test AUC - {test_auc_score}")

In [29]:
start = datetime.now()
Easy_ensemble_model_with_5_folds(train, test)
print(f"Time taken to run the cell - {datetime.now() - start}")

Fold : 0 Done!
Fold : 1 Done!
Fold : 2 Done!
Fold : 3 Done!
Fold : 4 Done!
Train Mean AUC on 5 Folds data - 0.9005578217767123 +/- 0.000904233855470924
Validation Mean AUC on 5 Folds data - 0.8989786407441042 +/- 0.0029939813844230257
Test AUC - 0.8849664273637308
Time taken to run the cell - 2:19:30.088702


## RUSBoostClassifier + HyperParameter Tuning

In [31]:
start = datetime.now()
param_dict = {"n_estimators" : [10, 20, 30, 50, 100],
              "learning_rate": [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]}
gs = GridSearchCV(RUSBoostClassifier(base_estimator=None ,random_state=2021, n_estimators=5, sampling_strategy="auto"), 
                 param_grid=param_dict, scoring="roc_auc", cv=5, refit=True, verbose=False)
gs.fit(train.drop(["went_on_backorder"], axis=1), train["went_on_backorder"])
print(gs.best_params_)
print("-"*50)
print(gs.best_score_)
print(f"Time taken to run the cell - {datetime.now() - start}")

{'learning_rate': 0.7, 'n_estimators': 100}
--------------------------------------------------
0.8977369475907995
Time taken to run the cell - 2:39:19.511660


In [32]:
test_preds = gs.predict_proba(test.drop(["went_on_backorder"], axis=1))[:,1]
test_auc_score = roc_auc_score(test["went_on_backorder"], test_preds)
print(f"Test AUC - {test_auc_score}")

Test AUC - 0.8843740700453758


In [33]:
## Best HyperParameters n_estimators=100, learning_rate=0.7
def RUS_boost_model_with_5_folds(train, test):
    X_train = train.drop(["went_on_backorder"], axis=1)
    X_test = test.drop(["went_on_backorder"], axis=1)
    Y_train = train["went_on_backorder"]
    Y_test = test['went_on_backorder']
    
    skf = StratifiedKFold(n_splits = 5, random_state = 2021, shuffle = True)
    train_auc_scores = []
    valid_auc_scores = []
    
    for fold, (train_index, validation_index) in enumerate(skf.split(X_train, Y_train)):
        x_train, y_train  = X_train.iloc[train_index], Y_train[train_index]
        x_valid, y_valid = X_train.iloc[validation_index], Y_train[validation_index]
        
        model = RUSBoostClassifier(base_estimator=None, n_estimators=100, learning_rate=0.7,
                                   sampling_strategy='auto', random_state=2021)
        model.fit(x_train, y_train)
        
        train_preds = model.predict_proba(x_train)[:,1]
        valid_preds = model.predict_proba(x_valid)[:,1]
        train_auc = roc_auc_score(y_train, train_preds)
        valid_auc = roc_auc_score(y_valid, valid_preds)
        train_auc_scores.append(train_auc)
        valid_auc_scores.append(valid_auc)
        print(f"Fold : {fold} Done!")
    print(f"Train Mean AUC on 5 Folds data - {np.mean(train_auc_scores)} +/- {np.std(train_auc_scores)}")
    print(f"Validation Mean AUC on 5 Folds data - {np.mean(valid_auc_scores)} +/- {np.std(valid_auc_scores)}")
    
    test_preds = model.predict_proba(X_test)[:,1]
    test_auc_score = roc_auc_score(Y_test, test_preds)
    print(f"Test AUC - {test_auc_score}")

In [34]:
start = datetime.now()
RUS_boost_model_with_5_folds(train, test)
print(f"Time taken to run the cell - {datetime.now() - start}")

Fold : 0 Done!
Fold : 1 Done!
Fold : 2 Done!
Fold : 3 Done!
Fold : 4 Done!
Train Mean AUC on 5 Folds data - 0.9021714048844321 +/- 0.001681296122291629
Validation Mean AUC on 5 Folds data - 0.9001855483875948 +/- 0.002103119039235368
Test AUC - 0.8862751830904113
Time taken to run the cell - 0:16:10.049186


## BalancedRandomForestClassifier + HyperParameter Tuning

In [15]:
start = datetime.now()
param_dict = {"n_estimators" : [10, 20, 30, 50, 100],
               "max_depth" : [3,5,7,9,11,13,15]}
gs = GridSearchCV(BalancedRandomForestClassifier(n_estimators=20, criterion='gini', max_depth=3,
                      sampling_strategy='auto',n_jobs=-1 ,random_state=2021), 
                 param_grid=param_dict, scoring="roc_auc", cv=5, refit=True, verbose=False)
gs.fit(train.drop(["went_on_backorder"], axis=1), train["went_on_backorder"])
print(gs.best_params_)
print("-"*50)
print(gs.best_score_)
print(f"Time taken to run the cell - {datetime.now() - start}")

{'max_depth': 15, 'n_estimators': 100}
--------------------------------------------------
0.9553662718705496
Time taken to run the cell - 0:48:15.707227


In [16]:
test_preds = gs.predict_proba(test.drop(["went_on_backorder"], axis=1))[:,1]
test_auc_score = roc_auc_score(test["went_on_backorder"], test_preds)
print(f"Test AUC - {test_auc_score}")

Test AUC - 0.9204662088803406


In [17]:
## Best HyperParameters max_depth = 15, n_estimators = 100
def Balanced_RF_model_with_5_folds(train, test):
    X_train = train.drop(["went_on_backorder"], axis=1)
    X_test = test.drop(["went_on_backorder"], axis=1)
    Y_train = train["went_on_backorder"]
    Y_test = test['went_on_backorder']
    
    skf = StratifiedKFold(n_splits = 5, random_state = 2021, shuffle = True)
    train_auc_scores = []
    valid_auc_scores = []
    
    for fold, (train_index, validation_index) in enumerate(skf.split(X_train, Y_train)):
        x_train, y_train  = X_train.iloc[train_index], Y_train[train_index]
        x_valid, y_valid = X_train.iloc[validation_index], Y_train[validation_index]
        
        model = BalancedRandomForestClassifier(max_depth = 15, n_estimators = 100, criterion='gini',
                                               sampling_strategy='auto',n_jobs=-1 ,random_state=2021)
        model.fit(x_train, y_train)
        
        train_preds = model.predict_proba(x_train)[:,1]
        valid_preds = model.predict_proba(x_valid)[:,1]
        train_auc = roc_auc_score(y_train, train_preds)
        valid_auc = roc_auc_score(y_valid, valid_preds)
        train_auc_scores.append(train_auc)
        valid_auc_scores.append(valid_auc)
        print(f"Fold : {fold} Done!")
    print(f"Train Mean AUC on 5 Folds data - {np.mean(train_auc_scores)} +/- {np.std(train_auc_scores)}")
    print(f"Validation Mean AUC on 5 Folds data - {np.mean(valid_auc_scores)} +/- {np.std(valid_auc_scores)}")
    
    test_preds = model.predict_proba(X_test)[:,1]
    test_auc_score = roc_auc_score(Y_test, test_preds)
    print(f"Test AUC - {test_auc_score}")

In [19]:
start = datetime.now()
Balanced_RF_model_with_5_folds(train, test)
print(f"Time taken to run the cell - {datetime.now() - start}")

Fold : 0 Done!
Fold : 1 Done!
Fold : 2 Done!
Fold : 3 Done!
Fold : 4 Done!
Train Mean AUC on 5 Folds data - 0.9713029239508497 +/- 0.00014569463465051417
Validation Mean AUC on 5 Folds data - 0.9545956283050808 +/- 0.0014340952186438017
Test AUC - 0.9193096648443534
Time taken to run the cell - 0:05:34.160769


## RandomForestClassifier + HyperParameter Tuning + ClassBalance

In [21]:
import optuna
from sklearn.model_selection import cross_val_score
## Defining the objective function
def objective(trial):
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 150)
    rf_criterion = trial.suggest_categorical("rf_criterion", ["gini", "entropy"])
    rf_max_depth = trial.suggest_int("rf_max_depth", 1,6)
    rf_min_samples_split = trial.suggest_float("rf_min_samples_split", 0.01, 1)
    rf_class_weight = trial.suggest_categorical("rf_class_balance", ["balanced", "balanced_subsample"])
    
    model = RandomForestClassifier(n_estimators = rf_n_estimators,
                                  criterion=rf_criterion,
                                  max_depth=rf_max_depth,
                                  min_samples_split=rf_min_samples_split,
                                  class_weight=rf_class_weight,
                                  random_state=2021,
                                  n_jobs=-1)
    score = cross_val_score(model, train.drop(["went_on_backorder"], axis=1), train["went_on_backorder"], cv = 5,
                           scoring="roc_auc")
    roc_auc = score.mean()
    return roc_auc

### Randomized Search
study = optuna.create_study(direction = "maximize",
                           sampler=optuna.samplers.RandomSampler())
study.optimize(objective, n_trials=10)

[32m[I 2022-08-14 11:35:16,201][0m A new study created in memory with name: no-name-6adf589e-f678-4b13-9784-4f8af2d8cce8[0m
[32m[I 2022-08-14 11:37:39,929][0m Trial 0 finished with value: 0.8299073757503133 and parameters: {'rf_n_estimators': 72, 'rf_criterion': 'gini', 'rf_max_depth': 6, 'rf_min_samples_split': 0.5394192462353075, 'rf_class_balance': 'balanced'}. Best is trial 0 with value: 0.8299073757503133.[0m
[32m[I 2022-08-14 11:38:19,645][0m Trial 1 finished with value: 0.5 and parameters: {'rf_n_estimators': 35, 'rf_criterion': 'gini', 'rf_max_depth': 3, 'rf_min_samples_split': 0.888221214741534, 'rf_class_balance': 'balanced'}. Best is trial 0 with value: 0.8299073757503133.[0m
[32m[I 2022-08-14 11:41:44,200][0m Trial 2 finished with value: 0.5 and parameters: {'rf_n_estimators': 84, 'rf_criterion': 'gini', 'rf_max_depth': 5, 'rf_min_samples_split': 0.9066210918866217, 'rf_class_balance': 'balanced_subsample'}. Best is trial 0 with value: 0.8299073757503133.[0m
[3

In [24]:
print(study.best_params)
print(study.best_value)

{'rf_n_estimators': 59, 'rf_criterion': 'entropy', 'rf_max_depth': 2, 'rf_min_samples_split': 0.4256462531364499, 'rf_class_balance': 'balanced'}
0.8318846797607163


In [26]:
## Best HyperParameters rf_n_estimators = 59, rf_criterion = entropy, rf_max_depth = 2,
  ### rf_min_samples_split = 0.4256462531364499, rf_class_balance = balanced
def RF_model_with_5_folds(train, test):
    X_train = train.drop(["went_on_backorder"], axis=1)
    X_test = test.drop(["went_on_backorder"], axis=1)
    Y_train = train["went_on_backorder"]
    Y_test = test['went_on_backorder']
    
    skf = StratifiedKFold(n_splits = 5, random_state = 2021, shuffle = True)
    train_auc_scores = []
    valid_auc_scores = []
    
    for fold, (train_index, validation_index) in enumerate(skf.split(X_train, Y_train)):
        x_train, y_train  = X_train.iloc[train_index], Y_train[train_index]
        x_valid, y_valid = X_train.iloc[validation_index], Y_train[validation_index]
        
        model = RandomForestClassifier(n_estimators = 59,
                                  criterion="entropy",
                                  max_depth=2,
                                  min_samples_split=0.4256462531364499,
                                  class_weight="balanced",
                                  random_state=2021,
                                  n_jobs=-1)
        model.fit(x_train, y_train)
        
        train_preds = model.predict_proba(x_train)[:,1]
        valid_preds = model.predict_proba(x_valid)[:,1]
        train_auc = roc_auc_score(y_train, train_preds)
        valid_auc = roc_auc_score(y_valid, valid_preds)
        train_auc_scores.append(train_auc)
        valid_auc_scores.append(valid_auc)
        print(f"Fold : {fold} Done!")
    print(f"Train Mean AUC on 5 Folds data - {np.mean(train_auc_scores)} +/- {np.std(train_auc_scores)}")
    print(f"Validation Mean AUC on 5 Folds data - {np.mean(valid_auc_scores)} +/- {np.std(valid_auc_scores)}")
    
    test_preds = model.predict_proba(X_test)[:,1]
    test_auc_score = roc_auc_score(Y_test, test_preds)
    print(f"Test AUC - {test_auc_score}")

In [27]:
start = datetime.now()
RF_model_with_5_folds(train, test)
print(f"Time taken to run the cell - {datetime.now() - start}")

Fold : 0 Done!
Fold : 1 Done!
Fold : 2 Done!
Fold : 3 Done!
Fold : 4 Done!
Train Mean AUC on 5 Folds data - 0.8339189048499971 +/- 0.00034572459127258964
Validation Mean AUC on 5 Folds data - 0.8337395087996986 +/- 0.0014441345880806273
Test AUC - 0.824480006019094
Time taken to run the cell - 0:02:24.464558


## XGBoost + HyperParameter Tuning + ClassBalance

In [29]:
## Estimating the class_weight for XGBoostClassifier
from collections import Counter
import xgboost as xgb
counter = Counter(train["went_on_backorder"])
estimate = counter[0] / counter[1]
print(f'Estimate: {round(estimate,0)}')

Estimate: 148.0


In [48]:
start = datetime.now()
## Defining the objective function
## Randomized Search Cross-Validation
def objective(trial):
    
    n_estimators = trial.suggest_int("n_estimators", 10, 200)
    learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)    
    max_depth = trial.suggest_int("max_depth", 1, 50)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.1, 1.0)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-9, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-9, 100.0)
    gamma = trial.suggest_loguniform("gamma", 1e-9, 0.5)
    scale_pos_weight = trial.suggest_loguniform("scale_pos_weight", 1e-6, 500.0)
    objective = "binary:logistic" 
    
    model = xgb.XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                              min_child_weight=min_child_weight, max_depth=max_depth,
                              subsample=subsample, colsample_bytree=colsample_bytree,
                              colsample_bylevel=colsample_bylevel, reg_lambda=reg_lambda,
                              reg_alpha=reg_alpha, gamma=gamma, scale_pos_weight=scale_pos_weight,
                              random_state=2021, n_jobs=-1)
    score = cross_val_score(model, train.drop(["went_on_backorder"], axis=1), train["went_on_backorder"], cv = 5,
                           scoring="roc_auc")
    roc_auc = score.mean()
    return roc_auc

### Randomized Search
study = optuna.create_study(direction = "maximize",
                           sampler=optuna.samplers.RandomSampler())
study.optimize(objective, n_trials=30)
print("-*-"*40)
print(f"Time taken to run the cell - {datetime.now() - start}")

[32m[I 2022-08-14 14:56:47,239][0m A new study created in memory with name: no-name-4ce36edd-7260-47cb-b96c-f5b31242c609[0m
[32m[I 2022-08-14 15:02:26,054][0m Trial 0 finished with value: 0.8249601641119565 and parameters: {'n_estimators': 143, 'learning_rate': 0.5799135168437762, 'min_child_weight': 2, 'max_depth': 10, 'subsample': 0.6313250919774398, 'colsample_bytree': 0.6459688871965389, 'colsample_bylevel': 0.41208667247134567, 'reg_lambda': 0.0001406950345560307, 'reg_alpha': 2.9745201223709223, 'gamma': 0.03240554192280146, 'scale_pos_weight': 0.0007728641780725432}. Best is trial 0 with value: 0.8249601641119565.[0m
[32m[I 2022-08-14 15:16:37,654][0m Trial 1 finished with value: 0.9204123426316085 and parameters: {'n_estimators': 115, 'learning_rate': 0.06445828641400057, 'min_child_weight': 2, 'max_depth': 4, 'subsample': 0.5511446552084855, 'colsample_bytree': 0.7847383729605939, 'colsample_bylevel': 0.8776387618642794, 'reg_lambda': 9.821297889624241e-06, 'reg_alpha'

[32m[I 2022-08-14 17:29:14,131][0m Trial 16 finished with value: 0.8783452084262933 and parameters: {'n_estimators': 81, 'learning_rate': 0.21009401080206105, 'min_child_weight': 6, 'max_depth': 38, 'subsample': 0.1334261939190146, 'colsample_bytree': 0.9197363697491171, 'colsample_bylevel': 0.43614131884284213, 'reg_lambda': 1.5563410445937694e-09, 'reg_alpha': 0.00015512615536687063, 'gamma': 3.1063468028203614e-07, 'scale_pos_weight': 0.00020816304381333987}. Best is trial 6 with value: 0.9743955390766889.[0m
[32m[I 2022-08-14 17:29:56,845][0m Trial 17 finished with value: 0.6558761471843674 and parameters: {'n_estimators': 13, 'learning_rate': 0.21612317887439783, 'min_child_weight': 4, 'max_depth': 29, 'subsample': 0.9621256941217331, 'colsample_bytree': 0.39429023240888383, 'colsample_bylevel': 0.1968772519187847, 'reg_lambda': 3.9180199508102334e-05, 'reg_alpha': 5.016115610080261e-06, 'gamma': 2.969575812306905e-06, 'scale_pos_weight': 0.0010557250102665098}. Best is trial

-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Time taken to run the cell - 5:53:52.112667


In [49]:
## Best XGBoost HyperParameters and Best AUC Score
print(study.best_params)
print(study.best_value)

{'n_estimators': 126, 'learning_rate': 0.21125728020218873, 'min_child_weight': 2, 'max_depth': 47, 'subsample': 0.6747509344835381, 'colsample_bytree': 0.31890348436409244, 'colsample_bylevel': 0.42973814201972615, 'reg_lambda': 6.5679984796541, 'reg_alpha': 0.11956857281464722, 'gamma': 0.36044347227230367, 'scale_pos_weight': 5.774916874084893}
0.9764235709122525


In [52]:
## Best HyperParameters of XGBoostClassifier are 
def XGB_model_with_5_folds(train, test):
    X_train = train.drop(["went_on_backorder"], axis=1)
    X_test = test.drop(["went_on_backorder"], axis=1)
    Y_train = train["went_on_backorder"]
    Y_test = test['went_on_backorder']
    
    skf = StratifiedKFold(n_splits = 5, random_state = 2021, shuffle = True)
    train_auc_scores = []
    valid_auc_scores = []
    
    for fold, (train_index, validation_index) in enumerate(skf.split(X_train, Y_train)):
        x_train, y_train  = X_train.iloc[train_index], Y_train[train_index]
        x_valid, y_valid = X_train.iloc[validation_index], Y_train[validation_index]
        
        model = xgb.XGBClassifier(**study.best_params, random_state = 2021, n_jobs = -1, objective="binary:logistic",
                                 eval_metric=roc_auc_score)
        model.fit(x_train, y_train)
        
        train_preds = model.predict(x_train)
        valid_preds = model.predict(x_valid)
        train_auc = roc_auc_score(y_train, train_preds)
        valid_auc = roc_auc_score(y_valid, valid_preds)
        train_auc_scores.append(train_auc)
        valid_auc_scores.append(valid_auc)
        print(f"Fold : {fold} Done!")
    print(f"Train Mean AUC on 5 Folds data - {np.mean(train_auc_scores)} +/- {np.std(train_auc_scores)}")
    print(f"Validation Mean AUC on 5 Folds data - {np.mean(valid_auc_scores)} +/- {np.std(valid_auc_scores)}")
    
    test_preds = model.predict(X_test)
    test_auc_score = roc_auc_score(Y_test, test_preds)
    print(f"Test AUC - {test_auc_score}")

In [53]:
start = datetime.now()
XGB_model_with_5_folds(train, test)
print(f"Time taken to run the cell - {datetime.now() - start}")

Fold : 0 Done!
Fold : 1 Done!
Fold : 2 Done!
Fold : 3 Done!
Fold : 4 Done!
Train Mean AUC on 5 Folds data - 0.9231570135459064 +/- 0.0014578767143699073
Validation Mean AUC on 5 Folds data - 0.7038521246363053 +/- 0.0017847745717428664
Test AUC - 0.5302492870803741
Time taken to run the cell - 0:32:07.722000
