In [1]:
%load_ext cuml.accel
%load_ext cudf.pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

cuML: Accelerator installed.


In [2]:
data = pd.read_csv('clear.csv')
df =pd.DataFrame(data)
df.head()

Unnamed: 0,DateOfService,DateOfOrder,OrderQty,MenuName,MenuPrice,MenuSubsidy,GroupName,CanceledQty,DateOfCancel,Site,SchoolID
0,2020-01-02,2019-12-02,1,14.0,3.05,0.0,2.0,0,,1.0,2.0
1,2020-01-03,2019-12-02,1,14.0,3.05,0.0,2.0,0,,1.0,2.0
2,2020-01-02,2019-12-03,1,14.0,3.05,0.0,2.0,0,,1.0,2.0
3,2020-01-02,2019-12-04,1,14.0,3.05,0.0,2.0,0,,1.0,2.0
4,2020-01-03,2019-12-04,1,14.0,3.05,0.0,2.0,0,,1.0,2.0


In [3]:
df['DateOfService']  = pd.to_datetime(df['DateOfService'] ,errors='coerce')
df['DateOfOrder']    = pd.to_datetime(df['DateOfOrder'], errors='coerce')
df['DateOfCancel']   = pd.to_datetime(df['DateOfCancel'], errors='coerce')

# بازه زمانی ثبت تا تحویل
df['LeadTime'] = (df['DateOfService'] - df['DateOfOrder']).dt.days
# بازه زمانی ثبت تا کنسل شدن
df['CancellationLeadTime'] = (df['DateOfCancel'] - df['DateOfOrder']).dt.days

df['order_year'] = df['DateOfOrder'].dt.year
df['order_month'] = df['DateOfOrder'].dt.month
df['order_day'] = df['DateOfOrder'].dt.day
df['order_weekday'] = df['DateOfOrder'].dt.weekday  # 0=دوشنبه, 6=یکشنبه
df['order_quarter'] = df['DateOfOrder'].dt.quarter
df['order_WeekOfYear'] = df['DateOfOrder'].dt.isocalendar().week  # هفته سال
df['order_dayofyear'] = df['DateOfOrder'].dt.dayofyear    # شماره روز سال


# استخراج از DateOfService
df['service_year'] = df['DateOfService'].dt.year
df['service_month'] = df['DateOfService'].dt.month
df['service_day'] = df['DateOfService'].dt.day
df['service_weekday'] = df['DateOfService'].dt.weekday
df['service_quarter'] = df['DateOfService'].dt.quarter
df['service_WeekOfYear'] = df['DateOfService'].dt.isocalendar().week  # هفته سال
df['service_dayofyear'] = df['DateOfService'].dt.dayofyear    # شماره روز سال


df = df.sort_values(by='DateOfOrder', ascending=True)
df.head()

Unnamed: 0,DateOfService,DateOfOrder,OrderQty,MenuName,MenuPrice,MenuSubsidy,GroupName,CanceledQty,DateOfCancel,Site,...,order_quarter,order_WeekOfYear,order_dayofyear,service_year,service_month,service_day,service_weekday,service_quarter,service_WeekOfYear,service_dayofyear
0,2020-01-02,2019-12-02,1,14.0,3.05,0.0,2.0,0,NaT,1.0,...,4,49,336,2020,1,2,3,1,1,2
1,2020-01-03,2019-12-02,1,14.0,3.05,0.0,2.0,0,NaT,1.0,...,4,49,336,2020,1,3,4,1,1,3
2,2020-01-02,2019-12-03,1,14.0,3.05,0.0,2.0,0,NaT,1.0,...,4,49,337,2020,1,2,3,1,1,2
3,2020-01-02,2019-12-04,1,14.0,3.05,0.0,2.0,0,NaT,1.0,...,4,49,338,2020,1,2,3,1,1,2
4,2020-01-03,2019-12-04,1,14.0,3.05,0.0,2.0,0,NaT,1.0,...,4,49,338,2020,1,3,4,1,1,3


In [4]:
# df.to_csv('clear2_feature.csv',index=False, date_format='%Y-%m-%d')

In [5]:

aggregation_rules = {
    # ویژگی‌های عددی اصلی
    'OrderQty': 'sum',
    'MenuPrice': ['mean', 'sum'],
    'MenuSubsidy': ['mean', 'sum'],
    'LeadTime': ['mean', 'std', 'max'],
    'CancellationLeadTime': ['mean', 'std'],

    # ویژگی‌های دسته‌ای
    'MenuName': 'nunique',
    'SchoolID': 'nunique',
    'GroupName': 'nunique',
    'Site': 'nunique',

    # ویژگی‌های مبتنی بر DateOfOrder (مقدارشان در هر گروه یکسان است)
    'order_year': 'first',
    'order_month': 'first',
    'order_day': 'first',
    'order_weekday': 'first',
    'order_quarter': 'first',
    'order_WeekOfYear': 'first',
    'order_dayofyear': 'first',

    # ویژگی‌های مبتنی بر DateOfService (تنوع آنها در هر روز مهم است)
    'service_weekday': 'nunique',
    'service_day': 'nunique',
    'service_month': 'nunique',

    # ستونی که بعداً برای ساخت هدف استفاده می‌شود
    'CanceledQty': 'sum'
}

# 4. اجرای تجمیع کامل
daily_aggregated_df = df.groupby('DateOfOrder').agg(aggregation_rules)
daily_aggregated_df.columns = ['_'.join(col).strip() for col in daily_aggregated_df.columns.values]

# 5. ساخت متغیر هدف (Target) به روش ایمن
target_df = df.groupby('DateOfCancel')['CanceledQty'].sum().reset_index()
target_df.rename(columns={'DateOfCancel': 'Date', 'CanceledQty': 'CanceledQty_Target'}, inplace=True)

final_df = daily_aggregated_df.reset_index().merge(
    target_df,
    left_on='DateOfOrder',
    right_on='Date',
    how='left'
)
final_df['CanceledQty_Tomorrow'] = final_df['CanceledQty_Target'].shift(-1)
final_df.fillna(0, inplace=True)


# 1. ساخت Lag Features بر اساس ستون هدف
final_df['CanceledQty_lag_1'] = final_df['CanceledQty_Tomorrow'].shift(1)
final_df['CanceledQty_lag_7'] = final_df['CanceledQty_Tomorrow'].shift(7)

# 2. ساخت Rolling Window Features بر اساس ستون هدف
final_df['CanceledQty_rolling_mean_7'] = final_df['CanceledQty_Tomorrow'].shift(1).rolling(window=7).mean()
final_df['CanceledQty_rolling_std_7'] = final_df['CanceledQty_Tomorrow'].shift(1).rolling(window=7).std()

# 3. پر کردن مقادیر خالی نهایی
# ویژگی‌های Lag و Rolling در ابتدای دیتافریم مقادیر NaN ایجاد می‌کنند
final_df.fillna(0, inplace=True)

final_df.drop(['Date', 'CanceledQty_Target'], axis=1, inplace=True)

final_df

Unnamed: 0,DateOfOrder,OrderQty_sum,MenuPrice_mean,MenuPrice_sum,MenuSubsidy_mean,MenuSubsidy_sum,LeadTime_mean,LeadTime_std,LeadTime_max,CancellationLeadTime_mean,...,order_dayofyear_first,service_weekday_nunique,service_day_nunique,service_month_nunique,CanceledQty_sum,CanceledQty_Tomorrow,CanceledQty_lag_1,CanceledQty_lag_7,CanceledQty_rolling_mean_7,CanceledQty_rolling_std_7
0,2019-12-02,2,3.050000,6.10,0.000000,0.00,31.500000,0.707107,32,0.0,...,336,2,2,1,0,0,0,0,0.000000,0.000000
1,2019-12-03,1,3.050000,3.05,0.000000,0.00,30.000000,0.000000,30,0.0,...,337,1,1,1,0,0,0,0,0.000000,0.000000
2,2019-12-04,2,3.050000,6.10,0.000000,0.00,29.500000,0.707107,30,0.0,...,338,2,2,1,0,0,0,0,0.000000,0.000000
3,2019-12-07,13,3.626923,47.15,0.000000,0.00,31.538462,2.295481,33,0.0,...,341,3,4,1,0,0,0,0,0.000000,0.000000
4,2019-12-08,60,2.442667,146.56,1.064833,63.89,31.300000,0.888724,33,0.0,...,342,5,5,1,0,0,0,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,2023-12-17,608,3.112089,1892.15,0.723355,439.80,2.087171,0.948753,5,0.0,...,351,5,5,1,0,0,3,19,6.142857,6.067085
1409,2023-12-18,451,3.076608,1387.55,0.797339,359.60,1.747228,0.812656,4,0.0,...,352,4,4,1,0,0,0,4,3.428571,2.636737
1410,2023-12-19,141,3.168085,446.70,0.679433,95.80,1.510638,0.692996,3,0.0,...,353,3,3,1,0,1,0,5,2.857143,2.911390
1411,2023-12-20,50,3.603000,180.15,0.071000,3.55,1.320000,0.471212,2,0.0,...,354,2,2,1,0,0,1,5,2.285714,2.811541


In [6]:
#final_df.to_csv('clear2_feature_agg.csv',index=False, date_format='%Y-%m-%d')

In [9]:
import xgboost as xgb
import optuna
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error


drop_cols = ['DateOfOrder','CanceledQty_Tomorrow']
X = final_df.drop(columns=drop_cols, axis=1)
Y = final_df['CanceledQty_Tomorrow'].values.ravel()  # y به عنوان array

# لیست numerical_cols (برای scaling)
numerical_cols = numerical_cols = [
    'OrderQty_sum',
    'MenuPrice_mean',
    'MenuPrice_sum',
    'MenuSubsidy_mean',
    'MenuSubsidy_sum',
    'LeadTime_mean',
    'LeadTime_std',
    'LeadTime_max',
    'CancellationLeadTime_mean',
    'CancellationLeadTime_std',
    'MenuName_nunique',
    'SchoolID_nunique',
    'GroupName_nunique',
    'Site_nunique',
    'service_weekday_nunique',
    'service_day_nunique',
    'service_month_nunique',
    'CanceledQty_sum',

    'CanceledQty_lag_1',
    'CanceledQty_lag_7',
    'CanceledQty_rolling_mean_7',
    'CanceledQty_rolling_std_7',
]

def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'early_stopping_rounds': 20 ,
        'device':'cuda',
        'verbosity' : 0,
    }

    tscv = TimeSeriesSplit(n_splits=5)
    test_rmse_scores = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        Y_train, Y_test = Y[train_index], Y[test_index]

        scaler = MinMaxScaler()
        X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

        model = xgb.XGBRegressor(**params)
        model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)],verbose=False )

        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(Y_test, preds))
        test_rmse_scores.append(rmse)

    return np.mean(test_rmse_scores)

print("--- Start of Step 1: Parameter Optimization ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50) # برای نتایج بهتر می‌توانید تعداد را افزایش دهید

best_params = study.best_params
print("\n* Optimization completed.")
print(f"Best RMSE value in test (CV): {study.best_value:.4f}")
print("Best parameters found:", best_params)


# --- مرحله ۲: ارزیابی کامل مدل بهینه ---

print("\n--- Start of stage 2: Full evaluation of the model with the best parameters ---")
train_r2_scores, train_rmse_scores = [], []
test_r2_scores, test_rmse_scores = [], []

tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    Y_train, Y_test = Y[train_index], Y[test_index]

    scaler = MinMaxScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # ایجاد مدل با بهترین پارامترهای یافت‌شده
    model = xgb.XGBRegressor(**best_params)
    model.fit(X_train, Y_train)

    # پیش‌بینی روی داده‌های آموزشی و تست
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    # محاسبه متریک‌ها
    train_r2 = r2_score(Y_train, Y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(Y_train, Y_train_pred))
    test_r2 = r2_score(Y_test, Y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred))

    # مدل Baseline برای مقایسه
    baseline_pred = np.full_like(Y_test, Y_train.mean())
    baseline_r2 = r2_score(Y_test, baseline_pred)
    baseline_rmse = np.sqrt(mean_squared_error(Y_test, baseline_pred))

    # ذخیره نتایج
    train_r2_scores.append(train_r2)
    train_rmse_scores.append(train_rmse)
    test_r2_scores.append(test_r2)
    test_rmse_scores.append(test_rmse)

    # چاپ گزارش برای هر Fold
    print(f"\nFold {fold+1}:")
    print(f"  Train R2 = {train_r2:.4f}, Train RMSE = {train_rmse:.4f}")
    print(f"  Test R2 = {test_r2:.4f}, Test RMSE = {test_rmse:.4f}")
    print(f"  Baseline (mean) R2 = {baseline_r2:.4f}, Baseline RMSE = {baseline_rmse:.4f}")
    print(f"  Mean Y_test = {Y_test.mean():.4f}")

# میانگین نهایی متریک‌ها
print("\n--- The final average of the metrics (overall evaluation of the optimal model) ---")
print(f"Mean Train R2: {np.mean(train_r2_scores):.4f}")
print(f"Mean Train RMSE: {np.mean(train_rmse_scores):.4f}")
print(f"Mean Test R2: {np.mean(test_r2_scores):.4f}")
print(f"Mean Test RMSE: {np.mean(test_rmse_scores):.4f}")

[I 2025-07-23 02:39:54,080] A new study created in memory with name: no-name-542b2f8c-b49a-46f4-aec3-4d8c8a658e57


--- Start of Step 1: Parameter Optimization ---


[I 2025-07-23 02:39:57,488] Trial 0 finished with value: 240.61108317945232 and parameters: {'n_estimators': 890, 'learning_rate': 0.06981584226708164, 'max_depth': 4, 'subsample': 0.8184647939670444, 'colsample_bytree': 0.8407733675408666, 'lambda': 0.1024857280427571, 'alpha': 2.100634777709473}. Best is trial 0 with value: 240.61108317945232.
[I 2025-07-23 02:40:01,848] Trial 1 finished with value: 238.73302010379513 and parameters: {'n_estimators': 836, 'learning_rate': 0.012345777996914152, 'max_depth': 5, 'subsample': 0.7425833688614805, 'colsample_bytree': 0.7712431768369891, 'lambda': 0.2781603280191379, 'alpha': 0.693760843603758}. Best is trial 1 with value: 238.73302010379513.
[I 2025-07-23 02:40:04,346] Trial 2 finished with value: 239.5428384508992 and parameters: {'n_estimators': 749, 'learning_rate': 0.13069926382373118, 'max_depth': 4, 'subsample': 0.7830423262757295, 'colsample_bytree': 0.6733991357258005, 'lambda': 1.0577626037530008, 'alpha': 0.04258974782502902}. Be


* Optimization completed.
Best RMSE value in test (CV): 233.4891
Best parameters found: {'n_estimators': 817, 'learning_rate': 0.0673674981460466, 'max_depth': 7, 'subsample': 0.6650894381711493, 'colsample_bytree': 0.8305841907975253, 'lambda': 1.9434128031176103, 'alpha': 3.3633076561126685}

--- Start of stage 2: Full evaluation of the model with the best parameters ---


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)



Fold 1:
  Train R2 = 1.0000, Train RMSE = 0.2746
  Test R2 = -1.0236, Test RMSE = 82.3832
  Baseline (mean) R2 = -0.0636, Baseline RMSE = 59.7249
  Mean Y_test = 16.4000

Fold 2:
  Train R2 = 1.0000, Train RMSE = 0.2654
  Test R2 = 0.0575, Test RMSE = 135.1862
  Baseline (mean) R2 = -0.0107, Baseline RMSE = 139.9976
  Mean Y_test = 37.4255

Fold 3:
  Train R2 = 1.0000, Train RMSE = 0.2666
  Test R2 = 0.1468, Test RMSE = 127.9404
  Baseline (mean) R2 = -0.0103, Baseline RMSE = 139.2222
  Mean Y_test = 42.0553

Fold 4:
  Train R2 = 1.0000, Train RMSE = 0.2730
  Test R2 = -0.0064, Test RMSE = 111.9694
  Baseline (mean) R2 = -0.0047, Baseline RMSE = 111.8720
  Mean Y_test = 38.6213

Fold 5:
  Train R2 = 1.0000, Train RMSE = 0.2873
  Test R2 = 0.0273, Test RMSE = 767.8936
  Baseline (mean) R2 = -0.0052, Baseline RMSE = 780.6347
  Mean Y_test = 89.2511

--- The final average of the metrics (overall evaluation of the optimal model) ---
Mean Train R2: 1.0000
Mean Train RMSE: 0.2734
Mean Test 

Unnamed: 0,DateOfOrder,OrderQty_sum,MenuPrice_mean,MenuPrice_sum,MenuSubsidy_mean,MenuSubsidy_sum,LeadTime_mean,LeadTime_std,LeadTime_max,CancellationLeadTime_mean,...,order_day_first,order_weekday_first,order_quarter_first,order_WeekOfYear_first,order_dayofyear_first,service_weekday_nunique,service_day_nunique,service_month_nunique,CanceledQty_sum,CanceledQty_Tomorrow
0,2019-12-02,2,3.050000,6.10,0.000000,0.00,31.500000,0.707107,32,0.0,...,2,0,4,49,336,2,2,1,0,0
1,2019-12-03,1,3.050000,3.05,0.000000,0.00,30.000000,0.000000,30,0.0,...,3,1,4,49,337,1,1,1,0,0
2,2019-12-04,2,3.050000,6.10,0.000000,0.00,29.500000,0.707107,30,0.0,...,4,2,4,49,338,2,2,1,0,0
3,2019-12-07,13,3.626923,47.15,0.000000,0.00,31.538462,2.295481,33,0.0,...,7,5,4,49,341,3,4,1,0,0
4,2019-12-08,60,2.442667,146.56,1.064833,63.89,31.300000,0.888724,33,0.0,...,8,6,4,49,342,5,5,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,2023-12-17,608,3.112089,1892.15,0.723355,439.80,2.087171,0.948753,5,0.0,...,17,6,4,50,351,5,5,1,0,0
1409,2023-12-18,451,3.076608,1387.55,0.797339,359.60,1.747228,0.812656,4,0.0,...,18,0,4,51,352,4,4,1,0,0
1410,2023-12-19,141,3.168085,446.70,0.679433,95.80,1.510638,0.692996,3,0.0,...,19,1,4,51,353,3,3,1,0,1
1411,2023-12-20,50,3.603000,180.15,0.071000,3.55,1.320000,0.471212,2,0.0,...,20,2,4,51,354,2,2,1,0,0
