In [1]:
import pandas as pd
import optuna
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Assuming RandomForest for this example
from sklearn.metrics import mean_squared_error

# Load dataset
data = pd.read_csv("D:\\Dataset\\DataCoSupplyChainDataset.csv", encoding='cp1252')

# Preprocess the dataset (simplified)
X = data[["Days for shipping (real)", "Days for shipment (scheduled)", "Benefit per order", "Sales per customer", "Late_delivery_risk"]]
y = data["Order Profit Per Order"]

# Define objective function
def objective(trial):
    # Suggest hyperparameters for tuning
    n_estimators = trial.suggest_int("n_estimators", 10, 100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    
    # Split data
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Model training
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    model.fit(X_train, y_train)
    
    # Predictions and evaluation
    y_pred = model.predict(X_valid)
    error = mean_squared_error(y_valid, y_pred)
    
    return error

# Run Optuna optimization with multiple seeds
results = []
for seed in range(5):
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=seed))
    study.optimize(objective, 10)  # Customize number of trials as needed
    results.append(study.best_value)

# Calculate median and variance
median_result = np.median(results)
variance_result = np.var(results)

print("Median of best scores:", median_result)
print("Variance of best scores:", variance_result)


[I 2024-11-02 15:48:31,704] A new study created in memory with name: no-name-381f3d1f-7acd-42b3-9718-6cfd109586cd
[I 2024-11-02 15:48:33,356] Trial 0 finished with value: 20.7570865746805 and parameters: {'n_estimators': 59, 'max_depth': 8, 'min_samples_split': 7}. Best is trial 0 with value: 20.7570865746805.
[I 2024-11-02 15:48:34,798] Trial 1 finished with value: 23.316998711150276 and parameters: {'n_estimators': 59, 'max_depth': 6, 'min_samples_split': 7}. Best is trial 0 with value: 20.7570865746805.
[I 2024-11-02 15:48:36,533] Trial 2 finished with value: 21.398520950543357 and parameters: {'n_estimators': 49, 'max_depth': 10, 'min_samples_split': 10}. Best is trial 0 with value: 20.7570865746805.
[I 2024-11-02 15:48:38,013] Trial 3 finished with value: 20.54102528457721 and parameters: {'n_estimators': 44, 'max_depth': 9, 'min_samples_split': 6}. Best is trial 3 with value: 20.54102528457721.
[I 2024-11-02 15:48:39,928] Trial 4 finished with value: 15.146531210823104 and parame

Median of best scores: 14.870863939878248
Variance of best scores: 5.925391903933151


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Load the dataset with a different encoding
data_path = r'D:\Dataset\DataCoSupplyChainDataset.csv'
data = pd.read_csv(data_path, encoding='latin1')  # Try 'latin1' or another encoding if needed

# Check the shape and first few rows of the dataset
print("Dataset shape:", data.shape)
print(data.head())

# Check for missing values
print("Missing values in each column:")
print(data.isnull().sum())

# Preprocessing
data.dropna(inplace=True)  # Example: drop rows with missing values

# Define features and target
columns_to_drop = ['Delivery Status', 'Customer Email', 'Customer Fname', 'Customer Id', 'Customer Lname', 'Order Status']
X = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')  # Drop columns safely
y = data['Days for shipping (real)']  # Ensure this column exists

# Confirm the presence of features
print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Splitting the data into train and test
if not X.empty and not y.empty:  # Ensure both are non-empty
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Function to optimize using Optuna
    def objective(trial):
        model_type = trial.suggest_categorical("model_type", ["XGBoost", "SVM", "RandomForest"])
        
        if model_type == "XGBoost":
            params = {
                'max_depth': trial.suggest_int('max_depth', 1, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                'objective': 'reg:squarederror'
            }
            model = xgb.XGBRegressor(**params)
            
        elif model_type == "SVM":
            params = {
                'C': trial.suggest_float('C', 0.1, 10),
                'epsilon': trial.suggest_float('epsilon', 0.01, 1),
                'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
            }
            model = SVR(**params)
            
        elif model_type == "RandomForest":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                'max_depth': trial.suggest_int('max_depth', 1, 10),
            }
            model = RandomForestRegressor(**params)
        
        # Fit the model
        model.fit(X_train, y_train)
        
        # Predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        return rmse  # Optimize for RMSE

    # Run Optuna study
    study = optuna.create_study(direction='minimize')
    for seed in range(5):
        print(f"Running optimization for seed {seed}")
        np.random.seed(seed)
        study.optimize(objective, n_trials=100)

    # Report median and variance of results
    results = [study.best_value for _ in range(5)]  # Assuming each run gives one best value
    median = np.median(results)
    variance = np.var(results)

    print(f"Median RMSE: {median}, Variance: {variance}")
else:
    print("Features or target are empty. Check your data preprocessing.")


Dataset shape: (7895, 53)
       Type  Days for shipping (real)  Days for shipment (scheduled)  \
0     DEBIT                         3                              4   
1  TRANSFER                         5                              4   
2      CASH                         4                              4   
3     DEBIT                         3                              4   
4   PAYMENT                         2                              4   

   Benefit per order  Sales per customer   Delivery Status  \
0          91.250000          314.640015  Advance shipping   
1        -249.089996          311.359985     Late delivery   
2        -247.779999          309.720001  Shipping on time   
3          22.860001          304.809998  Advance shipping   
4         134.210007          298.250000  Advance shipping   

   Late_delivery_risk  Category Id   Category Name Customer City  ...  \
0                   0           73  Sporting Goods        Caguas  ...   
1                   1 

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import optuna
from pygam import LinearGAM

# Load the dataset with a different encoding
data_path = r'D:\Dataset\DataCoSupplyChainDataset.csv'
data = pd.read_csv(data_path, encoding='latin1')

# Check the shape and first few rows of the dataset
print("Initial dataset shape:", data.shape)
print(data.head())

# Preprocessing
# Drop rows with missing values
data.dropna(inplace=True)
print("Shape after dropping missing values:", data.shape)  # Check shape after dropping

# Define features and target
columns_to_drop = ['Delivery Status', 'Customer Email', 'Customer Fname', 'Customer Id', 'Customer Lname', 'Order Status']
X = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')
y = data['Days for shipping (real)']

# Check if we still have data for training
print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Ensure we have enough samples to split
if X.shape[0] > 0 and y.shape[0] > 0:
    # Splitting the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Function to optimize using Optuna
    def objective(trial):
        model_type = trial.suggest_categorical("model_type", ["XGBoost", "SVM", "RandomForest", "GAM"])
        
        if model_type == "XGBoost":
            params = {
                'max_depth': trial.suggest_int('max_depth', 1, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                'objective': 'reg:squarederror'
            }
            model = xgb.XGBRegressor(**params)
            
        elif model_type == "SVM":
            params = {
                'C': trial.suggest_float('C', 0.1, 10),
                'epsilon': trial.suggest_float('epsilon', 0.01, 1),
                'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
            }
            model = SVR(**params)
            
        elif model_type == "RandomForest":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                'max_depth': trial.suggest_int('max_depth', 1, 10),
            }
            model = RandomForestRegressor(**params)
            
        elif model_type == "GAM":
            model = LinearGAM()  # Adjust parameters as necessary
        
        # Fit the model
        model.fit(X_train, y_train)
        
        # Predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        return rmse  # Optimize for RMSE

    # Run Optuna study for different seeds and collect results
    seed_results = {}

    for seed in range(5):
        print(f"Running optimization for seed {seed}")
        np.random.seed(seed)
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=100)
        
        # Store the best RMSE for this seed
        seed_results[seed] = study.best_value

    # Reporting median and variance
    median_rmse = np.median(list(seed_results.values()))
    variance_rmse = np.var(list(seed_results.values()))

    print(f"Results for 5 seeds: {seed_results}")
    print(f"Median RMSE: {median_rmse}, Variance: {variance_rmse}")
else:
    print("Not enough data to perform train-test split.")


Initial dataset shape: (7895, 53)
       Type  Days for shipping (real)  Days for shipment (scheduled)  \
0     DEBIT                         3                              4   
1  TRANSFER                         5                              4   
2      CASH                         4                              4   
3     DEBIT                         3                              4   
4   PAYMENT                         2                              4   

   Benefit per order  Sales per customer   Delivery Status  \
0          91.250000          314.640015  Advance shipping   
1        -249.089996          311.359985     Late delivery   
2        -247.779999          309.720001  Shipping on time   
3          22.860001          304.809998  Advance shipping   
4         134.210007          298.250000  Advance shipping   

   Late_delivery_risk  Category Id   Category Name Customer City  ...  \
0                   0           73  Sporting Goods        Caguas  ...   
1             

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Load the dataset
# Load the dataset with a different encoding
data_path = r'D:\Dataset\DataCoSupplyChainDataset.csv'
data = pd.read_csv(data_path, encoding='latin1')

# Check the first few rows of the dataset
print(data.head())

# Assuming the dataset has already been preprocessed:
# Select relevant features and the target variable
X = data.drop(columns=['Days for shipping (real)'])  # Drop the target variable and any non-feature columns
y = data['Days for shipping (real)']

# Handling categorical variables if any (example: using one-hot encoding)
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost testing function
def xgboost_testing(X_train, y_train, X_test, y_test, params=None):
    """
    Trains an XGBoost model and evaluates it on the test dataset.

    Parameters:
    X_train (DataFrame): Training features
    y_train (Series): Training target
    X_test (DataFrame): Testing features
    y_test (Series): Testing target
    params (dict): Hyperparameters for the XGBoost model

    Returns:
    dict: A dictionary containing RMSE and R² scores
    """
    
    # If no params are provided, use default values
    if params is None:
        params = {
            'max_depth': 3,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse'
        }

    # Create an XGBoost Regressor
    model = xgb.XGBRegressor(**params)

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Calculate R² score
    r2 = r2_score(y_test, y_pred)

    # Report the results
    results = {
        'RMSE': rmse,
        'R²': r2
    }

    print(f"XGBoost Testing Results:\nRMSE: {rmse}\nR²: {r2}")

    return results

# Run the XGBoost testing
results = xgboost_testing(X_train, y_train, X_test, y_test)

# Print the results
print(results)


       Type  Days for shipping (real)  Days for shipment (scheduled)  \
0     DEBIT                         3                              4   
1  TRANSFER                         5                              4   
2      CASH                         4                              4   
3     DEBIT                         3                              4   
4   PAYMENT                         2                              4   

   Benefit per order  Sales per customer   Delivery Status  \
0          91.250000          314.640015  Advance shipping   
1        -249.089996          311.359985     Late delivery   
2        -247.779999          309.720001  Shipping on time   
3          22.860001          304.809998  Advance shipping   
4         134.210007          298.250000  Advance shipping   

   Late_delivery_risk  Category Id   Category Name Customer City  ...  \
0                   0           73  Sporting Goods        Caguas  ...   
1                   1           73  Sporting Goo

In [8]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load your data
data_path = r'D:\Dataset\DataCoSupplyChainDataset.csv'
data = pd.read_csv(data_path, encoding='latin1')
# Identify numerical and categorical columns
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data.select_dtypes(include=['object']).columns

# Print identified columns for verification
print("Numerical Columns:", num_cols.tolist())
print("Categorical Columns:", cat_cols.tolist())

# Check for non-numeric values in numerical columns
for col in num_cols:
    print(f"Unique values in {col}:", data[col].unique())

# Convert non-numeric data to NaN in numerical columns, if necessary
data[num_cols] = data[num_cols].apply(pd.to_numeric, errors='coerce')

# Impute missing values for numerical columns
num_imputer = SimpleImputer(strategy='mean')
data[num_cols] = num_imputer.fit_transform(data[num_cols])

# Impute missing values for categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')
data[cat_cols] = cat_imputer.fit_transform(data[cat_cols])

# Check the DataFrame shape after imputation
print("Data shape after imputation:", data.shape)

# Select relevant features and the target variable
if 'Days for shipping (real)' in data.columns:
    X = data.drop(columns=['Days for shipping (real)'])  # Drop the target variable
    y = data['Days for shipping (real)']
else:
    raise ValueError("Target variable 'Days for shipping (real)' not found in the DataFrame.")

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the SVM model
model = SVR()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate and print the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Numerical Columns: ['Days for shipping (real)', 'Days for shipment (scheduled)', 'Benefit per order', 'Sales per customer', 'Late_delivery_risk', 'Category Id', 'Customer Id', 'Customer Zipcode', 'Department Id', 'Latitude', 'Longitude', 'Order Customer Id', 'Order Id', 'Order Item Cardprod Id', 'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price', 'Order Item Profit Ratio', 'Order Item Quantity', 'Sales', 'Order Item Total', 'Order Profit Per Order', 'Order Zipcode', 'Product Card Id', 'Product Category Id', 'Product Description', 'Product Price', 'Product Status']
Categorical Columns: ['Type', 'Delivery Status', 'Category Name', 'Customer City', 'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Lname', 'Customer Password', 'Customer Segment', 'Customer State', 'Customer Street', 'Department Name', 'Market', 'Order City', 'Order Country', 'order date (DateOrders)', 'Order Region', 'Order State', 'Order Status', 'Product Image',

ValueError: Columns must be same length as key