In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler


In [48]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Products.csv')

# Show the first 5 rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())

# Summary of the dataset
print("\nDataset Summary:")
print(df.describe())

# Checking the data types and missing values
print("\nData Types and Missing Values:")
print(df.info())


First 5 rows of the dataset:
       sku  national_inv  lead_time  in_transit_qty  forecast_3_month  \
0  1113121             0        8.0               1                 6   
1  1113268             0        8.0               0                 2   
2  1113874            20        2.0               0                45   
3  1114222             0        8.0               0                 9   
4  1114823             0       12.0               0                31   

   forecast_6_month  forecast_9_month  sales_1_month  sales_3_month  \
0                 6                 6              0              4   
1                 3                 4              1              2   
2                99               153             16             42   
3                14                21              5             17   
4                31                31              7             15   

   sales_6_month  ...  pieces_past_due  perf_6_month_avg perf_12_month_avg  \
0              9  ...      

In [49]:
# Checking for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Dropping missing values (if applicable)
df_cleaned = df.dropna()

# Handling categorical features by converting 'Yes'/'No' to binary values (0 and 1)
df_cleaned['went_on_backorder'] = df_cleaned['went_on_backorder'].map({'Yes': 1, 'No': 0})
df_cleaned['potential_issue'] = df_cleaned['potential_issue'].map({'Yes': 1, 'No': 0})
df_cleaned['deck_risk'] = df_cleaned['deck_risk'].map({'Yes': 1, 'No': 0})
df_cleaned['oe_constraint'] = df_cleaned['oe_constraint'].map({'Yes': 1, 'No': 0})
df_cleaned['ppap_risk'] = df_cleaned['ppap_risk'].map({'Yes': 1, 'No': 0})
df_cleaned['stop_auto_buy'] = df_cleaned['stop_auto_buy'].map({'Yes': 1, 'No': 0})
df_cleaned['rev_stop'] = df_cleaned['rev_stop'].map({'Yes': 1, 'No': 0})

print("\nCleaned Data:")
print(df_cleaned.head())



Missing values in each column:
sku                     0
national_inv            0
lead_time            1078
in_transit_qty          0
forecast_3_month        0
forecast_6_month        0
forecast_9_month        0
sales_1_month           0
sales_3_month           0
sales_6_month           0
sales_9_month           0
min_bank                0
potential_issue         0
pieces_past_due         0
perf_6_month_avg        0
perf_12_month_avg       0
local_bo_qty            0
deck_risk               0
oe_constraint           0
ppap_risk               0
stop_auto_buy           0
rev_stop                0
went_on_backorder       0
dtype: int64

Cleaned Data:
       sku  national_inv  lead_time  in_transit_qty  forecast_3_month  \
0  1113121             0        8.0               1                 6   
1  1113268             0        8.0               0                 2   
2  1113874            20        2.0               0                45   
3  1114222             0        8.0               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['went_on_backorder'] = df_cleaned['went_on_backorder'].map({'Yes': 1, 'No': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['potential_issue'] = df_cleaned['potential_issue'].map({'Yes': 1, 'No': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['deck_risk'] = d

In [50]:
# Defining features and target variable
X = df_cleaned.drop(columns=['sku', 'went_on_backorder'])  # Removing 'sku' and 'went_on_backorder' from features
y = df_cleaned['went_on_backorder']

print("\nSelected Features:")
print(X.columns)



Selected Features:
Index(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
       'forecast_6_month', 'forecast_9_month', 'sales_1_month',
       'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
       'potential_issue', 'pieces_past_due', 'perf_6_month_avg',
       'perf_12_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop'],
      dtype='object')


In [51]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining and Testing Data Shapes:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")



Training and Testing Data Shapes:
X_train: (14380, 21), X_test: (3595, 21)
y_train: (14380,), y_test: (3595,)


In [52]:
from sklearn.ensemble import RandomForestClassifier

# Training the model using RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

print("\nModel Training Complete")



Model Training Complete


In [53]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predicting the test data
y_pred = model.predict(X_test)

# Evaluating the model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

# Confusion matrix and classification report
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Model Accuracy: 0.93

Confusion Matrix:
[[3063   99]
 [ 161  272]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3162
           1       0.73      0.63      0.68       433

    accuracy                           0.93      3595
   macro avg       0.84      0.80      0.82      3595
weighted avg       0.92      0.93      0.93      3595



In [54]:
from sklearn.model_selection import GridSearchCV

# Defining the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Performing grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters from the grid search
best_params = grid_search.best_params_
print(f"\nBest Parameters: {best_params}")

# Model with best parameters
best_model = grid_search.best_estimator_

# Re-evaluate the tuned model
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"\nTuned Model Accuracy: {accuracy_best:.2f}")


Fitting 3 folds for each of 27 candidates, totalling 81 fits

Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 300}

Tuned Model Accuracy: 0.92
