In [3]:
import pandas as pd

# Load the dataset
file_path = '/Users/sgzh1/projects/order-fulfillment-forecast-requires/v2/data/learning_data_v2.csv'
data = pd.read_csv(file_path, low_memory=False)

# Display basic information about the dataset
data_info = data.info()
data_head = data.head()

data_info, data_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74974 entries, 0 to 74973
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 74974 non-null  int64  
 1   type               74974 non-null  object 
 2   processing_method  74974 non-null  object 
 3   day                74974 non-null  object 
 4   shift              74974 non-null  object 
 5   details            74974 non-null  int64  
 6   sku_id             74974 non-null  int64  
 7   qty                74974 non-null  int64  
 8   op_pallet_pick     74974 non-null  int64  
 9   op_cont_pick       74974 non-null  int64  
 10  op_cont_deliver    74974 non-null  float64
 11  op_cont_aboard     74974 non-null  int64  
 12  op_load_deliver    74974 non-null  int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 7.4+ MB


(None,
        id type processing_method  day shift  details   sku_id      qty  \
 0  245673    3                 b  THU     D        4  1033531      450   
 1  245673    3                 b  THU     D        4  1032755     1200   
 2  245673    3                 b  THU     D        4  1033157     3600   
 3  245673    3                 b  THU     D        4  1032922     3000   
 4  245717    3                 b  WED     D        8  1032789  2000000   
 
    op_pallet_pick  op_cont_pick  op_cont_deliver  op_cont_aboard  \
 0               1             0              0.0               0   
 1               1             0              0.0               0   
 2               1             0              0.0               0   
 3               1             0              0.0               0   
 4              29             0              0.0               0   
 
    op_load_deliver  
 0                4  
 1                4  
 2                4  
 3                4  
 4             

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode categorical features
label_encoder_type = LabelEncoder()
label_encoder_processing_method = LabelEncoder()
label_encoder_day = LabelEncoder()
label_encoder_shift = LabelEncoder()

data['type_encoded'] = label_encoder_type.fit_transform(data['type'])
data['processing_method_encoded'] = label_encoder_processing_method.fit_transform(data['processing_method'])
data['day_encoded'] = label_encoder_day.fit_transform(data['day'])
data['shift_encoded'] = label_encoder_shift.fit_transform(data['shift'])

# Features and target variables
X = data[['type_encoded', 'processing_method_encoded', 'day_encoded', 'shift_encoded', 'details', 'sku_id', 'qty']]
y = data[['op_pallet_pick', 'op_cont_pick', 'op_cont_deliver', 'op_cont_aboard', 'op_load_deliver']]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((59979, 7), (14995, 7), (59979, 5), (14995, 5))

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Initialize the model
model = MultiOutputRegressor(RandomForestRegressor(random_state=42))

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
mse

array([7.53328422e-04, 2.10904533e-01, 3.32754054e-01, 2.57754787e+02,
       1.00468156e-03])

In [11]:
import joblib

# Save the trained model to a file
model_filename = '/Users/sgzh1/projects/order-fulfillment-forecast-requires/v2/model/modelorder_fulfillment_model.pkl'
joblib.dump(model, model_filename)

# Also save the label encoders to use for encoding during prediction
label_encoders_filename = '/Users/sgzh1/projects/order-fulfillment-forecast-requires/v2/model/modellabel_encoders.pkl'
joblib.dump({
    'type_encoder': label_encoder_type,
    'processing_method_encoder': label_encoder_processing_method,
    'day_encoder' : label_encoder_day,
    'shift_encoder' : label_encoder_shift
}, label_encoders_filename)

model_filename, label_encoders_filename

('/Users/sgzh1/projects/order-fulfillment-forecast-requires/v2/model/modelorder_fulfillment_model.pkl',
 '/Users/sgzh1/projects/order-fulfillment-forecast-requires/v2/model/modellabel_encoders.pkl')