In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import mlflow
import mlflow.sklearn

# Load the cleaned dataset
data = pd.read_csv('S:/SJSU/DATA_226/group_project/data/processed/cleaned_improved_dataset.csv')

# 1. Feature Engineering
# Include "SKU" in the categorical columns for one-hot encoding
categorical_columns = ["Product type", "SKU", "Customer demographics", "Shipping carriers", 
                       "Supplier name", "Location", "Inspection results", 
                       "Transportation modes", "Routes", "Season"]

# One-hot encoding for all categorical variables, including SKU
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Extract features and target variable
X = data_encoded.drop(columns=["Revenue generated"])  # Features
y = data_encoded["Revenue generated"]  # Target

# Identify the numeric columns for transformations
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns

# Standardize numerical features
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Cyclical Encoding for Temporal Features
season_mapping = {"Winter": 0, "Spring": 1, "Summer": 2, "Fall": 3}
data_encoded["Season_Cyclical"] = data["Season"].map(season_mapping)
X["Season_Sin"] = np.sin(2 * np.pi * data_encoded["Season_Cyclical"] / 4)
X["Season_Cos"] = np.cos(2 * np.pi * data_encoded["Season_Cyclical"] / 4)
X = X.drop(columns=["Season_Cyclical"], errors='ignore')

# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Start an MLflow run
mlflow.start_run()

# 3. Model Training: Random Forest Regressor
model = RandomForestRegressor(n_estimators=400, random_state=45)
model.fit(X_train, y_train)

# 4. Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Log parameters and metrics to MLflow
mlflow.log_param("n_estimators", 400)
mlflow.log_param("random_state", 45)
mlflow.log_metric("mse", mse)
mlflow.log_metric("r2", r2)

# Log the model
mlflow.sklearn.log_model(model, "random_forest_model")

# End the MLflow run
mlflow.end_run()

# Print metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# 5. Feature Importance Analysis
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

# Print the top 10 important features
print("\nTop 10 Important Features:")
print(feature_importance_df.head(10))




Mean Squared Error: 0.013185391907327442
R-squared: 0.9926454894831901

Top 10 Important Features:
                    Feature  Importance
2   Number of products sold    0.767571
0                     Price    0.219490
7            Shipping costs    0.001033
1              Availability    0.000952
12                    Costs    0.000902
5          Order quantities    0.000884
13            Demand Factor    0.000839
4                Lead times    0.000729
11             Defect rates    0.000702
8        Production volumes    0.000696


In [3]:
from sklearn.model_selection import RandomizedSearchCV

# Correcting the hyperparameter grid to remove the invalid 'auto' option
param_distributions = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [10, 15, 20, 25, 30, None],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["sqrt", "log2", 0.5, 0.75]  # Removed 'auto' and kept valid options
}

# Initialize the Random Forest model
model = RandomForestRegressor(random_state=45)

# Set up RandomizedSearchCV with the corrected parameter grid
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=20,  # Number of parameter settings to sample
    cv=5,  # 5-fold cross-validation
    verbose=2,  # Print progress
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit the RandomizedSearchCV to the training data
random_search.fit(X_train, y_train)

# Extract the best model and hyperparameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Log the best parameters and metrics to MLflow
mlflow.start_run()
mlflow.log_params(best_params)
mlflow.log_metric("mse", mse)
mlflow.log_metric("r2", r2)
mlflow.sklearn.log_model(best_model, "random_forest_best_model")
mlflow.end_run()

# Print the best hyperparameters and metrics
best_params, mse, r2


Fitting 5 folds for each of 20 candidates, totalling 100 fits




({'n_estimators': 100,
  'min_samples_split': 2,
  'min_samples_leaf': 2,
  'max_features': 0.75,
  'max_depth': None},
 np.float64(0.02533564904186372),
 0.9858683535052877)

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import mlflow
import mlflow.sklearn

# Load the cleaned dataset
data = pd.read_csv('S:/SJSU/DATA_226/group_project/data/processed/cleaned_improved_dataset.csv')

# 1. Feature Engineering
# Include "SKU" in the categorical columns for one-hot encoding
categorical_columns = ["Product type", "SKU", "Customer demographics", "Shipping carriers", 
                       "Supplier name", "Location", "Inspection results", 
                       "Transportation modes", "Routes", "Season"]

# One-hot encoding for all categorical variables, including SKU
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Extract features and target variable
X = data_encoded.drop(columns=["Revenue generated"])  # Features
y = data_encoded["Revenue generated"]  # Target

# Identify the numeric columns for transformations
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns

# Standardize numerical features
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Cyclical Encoding for Temporal Features
season_mapping = {"Winter": 0, "Spring": 1, "Summer": 2, "Fall": 3}
data_encoded["Season_Cyclical"] = data["Season"].map(season_mapping)
X["Season_Sin"] = np.sin(2 * np.pi * data_encoded["Season_Cyclical"] / 4)
X["Season_Cos"] = np.cos(2 * np.pi * data_encoded["Season_Cyclical"] / 4)
X = X.drop(columns=["Season_Cyclical"], errors='ignore')

# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Start an MLflow run
mlflow.start_run()

# 3. Model Training: Random Forest Regressor
model = RandomForestRegressor(n_estimators=400, random_state=45)
model.fit(X_train, y_train)

# 4. Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Log parameters and metrics to MLflow
mlflow.log_param("n_estimators", 400)
mlflow.log_param("random_state", 45)
mlflow.log_metric("mse", mse)
mlflow.log_metric("r2", r2)

# Log feature importances to MLflow
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)
mlflow.log_text(feature_importance_df.to_string(), "feature_importance.txt")

# Log the model
mlflow.sklearn.log_model(model, "random_forest_model")

# End the MLflow run
mlflow.end_run()

# Print metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Print the top 10 important features
print("\nTop 10 Important Features:")
print(feature_importance_df.head(10))

# 5. Warehouse Management Predictions
def predict_inventory_management(new_data):
    """
    Predict warehouse metrics using the trained model.
    """
    # One-hot encode new data and align it with training data
    new_data_encoded = pd.get_dummies(new_data, columns=categorical_columns, drop_first=True)
    
    # Identify missing columns and convert the set to a list
    missing_cols = list(set(X.columns) - set(new_data_encoded.columns))
    
    # Add missing columns with default value 0
    new_data_encoded = pd.concat([new_data_encoded, pd.DataFrame(0, index=new_data_encoded.index, columns=missing_cols)], axis=1)
    
    # Reorder columns to match the training set
    new_data_encoded = new_data_encoded[X.columns]
    
    # Standardize numeric columns
    new_data_encoded[numeric_columns] = scaler.transform(new_data_encoded[numeric_columns])
    
    # Make predictions
    predicted_values = model.predict(new_data_encoded)
    return predicted_values


# Example usage for warehouse management
new_data = pd.DataFrame({
    "Product type": ["haircare"],
    "SKU": ["SKU45"],
    "Price": [25.5],
    "Availability": [100],
    "Number of products sold": [50],
    "Customer demographics": ["Female"],
    "Stock levels": [80],
    "Lead times": [5],
    "Order quantities": [30],
    "Shipping times": [3],
    "Shipping carriers": ["Carrier A"],
    "Shipping costs": [12.5],
    "Supplier name": ["Supplier 1"],
    "Location": ["Mumbai"],
    "Production volumes": [500],
    "Manufacturing lead time": [10],
    "Manufacturing costs": [35.0],
    "Inspection results": ["Pass"],
    "Defect rates": [0.5],
    "Transportation modes": ["Road"],
    "Routes": ["Route A"],
    "Costs": [150.0],
    "Season": ["Winter"],
    "Demand Factor": [1.3]
})

# Generate and print predictions
predicted_values = predict_inventory_management(new_data)
print("Predicted Inventory Metrics:", predicted_values)




Mean Squared Error: 0.013185391907327442
R-squared: 0.9926454894831901

Top 10 Important Features:
                    Feature  Importance
2   Number of products sold    0.767571
0                     Price    0.219490
7            Shipping costs    0.001033
1              Availability    0.000952
12                    Costs    0.000902
5          Order quantities    0.000884
13            Demand Factor    0.000839
4                Lead times    0.000729
11             Defect rates    0.000702
8        Production volumes    0.000696
Predicted Inventory Metrics: [7.24355085]


MultiRegressor Output Model

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import mlflow
import mlflow.sklearn

# Load the cleaned dataset
data = pd.read_csv('S:/SJSU/DATA_226/group_project/data/processed/cleaned_improved_dataset.csv')

# 1. Feature Engineering
# Include "SKU" in the categorical columns for one-hot encoding
categorical_columns = ["Product type", "SKU", "Customer demographics", "Shipping carriers", 
                       "Supplier name", "Location", "Inspection results", 
                       "Transportation modes", "Routes", "Season"]

# One-hot encoding for all categorical variables, including SKU
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Extract features
X = data_encoded.drop(columns=["Revenue generated"])  # Features

# Derive multiple target variables for the warehouse use case
data_encoded["Restock Indicator"] = (data_encoded["Stock levels"] < data_encoded["Demand Factor"] * 10).astype(int)
data_encoded["Restock Date"] = data_encoded["Lead times"]  # Example: Use lead times as a proxy for restock date
data_encoded["Restock Quantity"] = np.maximum(data_encoded["Demand Factor"] * 5, 10)
data_encoded["Predicted Costs"] = data_encoded["Costs"] + data_encoded["Shipping costs"] + data_encoded["Manufacturing costs"]

# Define target variables
y = data_encoded[["Restock Indicator", "Restock Date", "Restock Quantity", "Predicted Costs"]]

# Identify the numeric columns for transformations
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns

# Standardize numerical features
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Cyclical Encoding for Temporal Features
season_mapping = {"Winter": 0, "Spring": 1, "Summer": 2, "Fall": 3}
data_encoded["Season_Cyclical"] = data["Season"].map(season_mapping)
X["Season_Sin"] = np.sin(2 * np.pi * data_encoded["Season_Cyclical"] / 4)
X["Season_Cos"] = np.cos(2 * np.pi * data_encoded["Season_Cyclical"] / 4)
X = X.drop(columns=["Season_Cyclical"], errors='ignore')

# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Start an MLflow run
mlflow.start_run()

# 3. Multi-Output Model Training: Random Forest Regressor
base_model = RandomForestRegressor(n_estimators=400, random_state=45)
multi_output_model = MultiOutputRegressor(base_model)
multi_output_model.fit(X_train, y_train)

# 4. Model Evaluation
y_pred = multi_output_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred, multioutput='variance_weighted')

# Log parameters and metrics to MLflow
mlflow.log_param("n_estimators", 400)
mlflow.log_param("random_state", 45)
mlflow.log_metric("mse_restock_indicator", mse[0])
mlflow.log_metric("mse_restock_date", mse[1])
mlflow.log_metric("mse_restock_quantity", mse[2])
mlflow.log_metric("mse_predicted_costs", mse[3])
mlflow.log_metric("r2_score", r2)

# Log the model
mlflow.sklearn.log_model(multi_output_model, "multi_output_random_forest_model")

# End the MLflow run
mlflow.end_run()

# Print metrics
print(f"Mean Squared Errors: {mse}")
print(f"R-squared: {r2}")

# Extended Prediction Function
def predict_inventory_management(new_data):
    """
    Predict warehouse metrics for multiple SKUs using the trained multi-output model.
    """
    # One-hot encode new data and align it with training data
    new_data_encoded = pd.get_dummies(new_data, columns=categorical_columns, drop_first=True)
    
    # Identify and add any missing columns to match the training data
    missing_cols = list(set(X.columns) - set(new_data_encoded.columns))
    for col in missing_cols:
        new_data_encoded[col] = 0  # Add missing columns with default value 0

    # Reorder columns to match the training set
    new_data_encoded = new_data_encoded[X.columns]

    # Standardize numeric columns
    new_data_encoded[numeric_columns] = scaler.transform(new_data_encoded[numeric_columns])

    # Make predictions
    predictions = multi_output_model.predict(new_data_encoded)

    # Format the results for each SKU
    inventory_metrics = []
    for i in range(len(new_data)):
        metrics = {
            "SKU": new_data["SKU"].iloc[i],
            "Restock Indicator": int(predictions[i][0]),
            "Restock Date (days)": int(predictions[i][1]),
            "Restock Quantity": int(predictions[i][2]),
            "Predicted Costs": round(predictions[i][3], 2)
        }
        inventory_metrics.append(metrics)

    return inventory_metrics

# Example usage for multiple SKUs
new_data = pd.DataFrame({
    "Product type": ["haircare", "skincare"],
    "SKU": ["SKU45", "SKU46"],
    "Price": [25.5, 40.0],
    "Availability": [100, 150],
    "Number of products sold": [50, 60],
    "Customer demographics": ["Female", "Male"],
    "Stock levels": [80, 90],
    "Lead times": [5, 7],
    "Order quantities": [30, 40],
    "Shipping times": [3, 4],
    "Shipping carriers": ["Carrier A", "Carrier B"],
    "Shipping costs": [12.5, 15.0],
    "Supplier name": ["Supplier 1", "Supplier 2"],
    "Location": ["Mumbai", "Delhi"],
    "Production volumes": [500, 700],
    "Manufacturing lead time": [10, 12],
    "Manufacturing costs": [35.0, 50.0],
    "Inspection results": ["Pass", "Fail"],
    "Defect rates": [0.5, 1.0],
    "Transportation modes": ["Road", "Air"],
    "Routes": ["Route A", "Route B"],
    "Costs": [150.0, 200.0],
    "Season": ["Winter", "Summer"],
    "Demand Factor": [1.3, 1.5]
})

# Generate and print predictions for multiple SKUs
predicted_values = predict_inventory_management(new_data)
for item in predicted_values:
    print("Predicted Inventory Metrics:", item)




Mean Squared Errors: [  0.           0.           0.         112.59510873]
R-squared: 0.9918138387587876
Predicted Inventory Metrics: {'SKU': 'SKU45', 'Restock Indicator': 1, 'Restock Date (days)': 7, 'Restock Quantity': 10, 'Predicted Costs': np.float64(203.94)}
Predicted Inventory Metrics: {'SKU': 'SKU46', 'Restock Indicator': 1, 'Restock Date (days)': 7, 'Restock Quantity': 10, 'Predicted Costs': np.float64(254.19)}


  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default value 0
  new_data_encoded[col] = 0  # Add missing columns with default 