In [1]:
%load_ext cuml.accel
%load_ext cudf.pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

cuML: Accelerator installed.


In [2]:
data = pd.read_csv('clear.csv')
df =pd.DataFrame(data)
df.head()

Unnamed: 0,DateOfService,DateOfOrder,OrderQty,MenuName,MenuPrice,MenuSubsidy,GroupName,Site,SchoolID
0,2020-01-02,2019-12-02,1,14.0,3.05,0.0,2.0,1.0,2.0
1,2020-01-03,2019-12-02,1,14.0,3.05,0.0,2.0,1.0,2.0
2,2020-01-02,2019-12-03,1,14.0,3.05,0.0,2.0,1.0,2.0
3,2020-01-02,2019-12-04,1,14.0,3.05,0.0,2.0,1.0,2.0
4,2020-01-03,2019-12-04,1,14.0,3.05,0.0,2.0,1.0,2.0


In [3]:
df['DateOfService']  = pd.to_datetime(df['DateOfService'] ,errors='coerce')
df['DateOfOrder']    = pd.to_datetime(df['DateOfOrder'], errors='coerce')
df['days_to_service'] = (df['DateOfService'] - df['DateOfOrder']).dt.days  # تفاوت به روز
df.head()

Unnamed: 0,DateOfService,DateOfOrder,OrderQty,MenuName,MenuPrice,MenuSubsidy,GroupName,Site,SchoolID,days_to_service
0,2020-01-02,2019-12-02,1,14.0,3.05,0.0,2.0,1.0,2.0,31
1,2020-01-03,2019-12-02,1,14.0,3.05,0.0,2.0,1.0,2.0,32
2,2020-01-02,2019-12-03,1,14.0,3.05,0.0,2.0,1.0,2.0,30
3,2020-01-02,2019-12-04,1,14.0,3.05,0.0,2.0,1.0,2.0,29
4,2020-01-03,2019-12-04,1,14.0,3.05,0.0,2.0,1.0,2.0,30


In [4]:
df.columns

Index(['DateOfService', 'DateOfOrder', 'OrderQty', 'MenuName', 'MenuPrice',
       'MenuSubsidy', 'GroupName', 'Site', 'SchoolID', 'days_to_service'],
      dtype='object')

In [5]:
# استخراج از DateOfOrder
df['order_year'] = df['DateOfOrder'].dt.year
df['order_month'] = df['DateOfOrder'].dt.month
df['order_day'] = df['DateOfOrder'].dt.day
df['order_weekday'] = df['DateOfOrder'].dt.weekday  # 0=دوشنبه, 6=یکشنبه
df['order_quarter'] = df['DateOfOrder'].dt.quarter
df['order_WeekOfYear'] = df['DateOfOrder'].dt.isocalendar().week  # هفته سال
df['order_dayofyear'] = df['DateOfOrder'].dt.dayofyear    # شماره روز سال


# استخراج از DateOfService
df['service_year'] = df['DateOfService'].dt.year
df['service_month'] = df['DateOfService'].dt.month
df['service_day'] = df['DateOfService'].dt.day
df['service_weekday'] = df['DateOfService'].dt.weekday
df['service_quarter'] = df['DateOfService'].dt.quarter
df['service_WeekOfYear'] = df['DateOfService'].dt.isocalendar().week  # هفته سال
df['service_dayofyear'] = df['DateOfService'].dt.dayofyear    # شماره روز سال

df.head()

Unnamed: 0,DateOfService,DateOfOrder,OrderQty,MenuName,MenuPrice,MenuSubsidy,GroupName,Site,SchoolID,days_to_service,...,order_quarter,order_WeekOfYear,order_dayofyear,service_year,service_month,service_day,service_weekday,service_quarter,service_WeekOfYear,service_dayofyear
0,2020-01-02,2019-12-02,1,14.0,3.05,0.0,2.0,1.0,2.0,31,...,4,49,336,2020,1,2,3,1,1,2
1,2020-01-03,2019-12-02,1,14.0,3.05,0.0,2.0,1.0,2.0,32,...,4,49,336,2020,1,3,4,1,1,3
2,2020-01-02,2019-12-03,1,14.0,3.05,0.0,2.0,1.0,2.0,30,...,4,49,337,2020,1,2,3,1,1,2
3,2020-01-02,2019-12-04,1,14.0,3.05,0.0,2.0,1.0,2.0,29,...,4,49,338,2020,1,2,3,1,1,2
4,2020-01-03,2019-12-04,1,14.0,3.05,0.0,2.0,1.0,2.0,30,...,4,49,338,2020,1,3,4,1,1,3


In [6]:
# اول مرتب‌سازی برای حفظ ترتیب سری زمانی (درون گروه‌ها، بر اساس DateOfOrder)
df = df.sort_values(['SchoolID', 'Site', 'MenuName', 'DateOfOrder'])  # تغییر به DateOfOrder، با گروه‌ها اول

# شیفت گروهی به گذشته (lag=1) برای OrderQty و categoricalها
# این کار lagged features می‌سازه بدون به‌هم‌ریختن ترتیب یا overwrite
df['OrderQty_lag1'] = df.groupby(['SchoolID', 'Site', 'MenuName'])['OrderQty'].shift(1)  # OrderQty روز/سفارش گذشته
df['SchoolID_lag1'] = df.groupby(['SchoolID', 'Site', 'MenuName'])['SchoolID'].shift(1)  # SchoolID گذشته (اگر ثابت باشه، redundantه – می‌تونی drop کنی)
df['Site_lag1'] = df.groupby(['SchoolID', 'Site', 'MenuName'])['Site'].shift(1)  # Site گذشته
df['MenuName_lag1'] = df.groupby(['SchoolID', 'Site', 'MenuName'])['MenuName'].shift(1)  # MenuName گذشته (مفید اگر تغییر کنه)

# شیفت تارگت به آینده (برای forecasting)
df['target'] = df.groupby(['SchoolID', 'Site', 'MenuName'])['OrderQty'].shift(-1)  # OrderQty آینده (افق ۱ روز، بر اساس ترتیب DateOfOrder)

# drop ردیف‌هایی که lag یا target NaN دارن (معمولاً اول/آخر هر گروه – برای جلوگیری از NaN در مدل)
df = df.dropna(subset=['OrderQty_lag1', 'target'])  # یا subset دلخواه، مثل اضافه کردن 'SchoolID_lag1' اگر لازم باشه

print(df.columns)
df.head()


Index(['DateOfService', 'DateOfOrder', 'OrderQty', 'MenuName', 'MenuPrice',
       'MenuSubsidy', 'GroupName', 'Site', 'SchoolID', 'days_to_service',
       'order_year', 'order_month', 'order_day', 'order_weekday',
       'order_quarter', 'order_WeekOfYear', 'order_dayofyear', 'service_year',
       'service_month', 'service_day', 'service_weekday', 'service_quarter',
       'service_WeekOfYear', 'service_dayofyear', 'OrderQty_lag1',
       'SchoolID_lag1', 'Site_lag1', 'MenuName_lag1', 'target'],
      dtype='object')


Unnamed: 0,DateOfService,DateOfOrder,OrderQty,MenuName,MenuPrice,MenuSubsidy,GroupName,Site,SchoolID,days_to_service,...,service_day,service_weekday,service_quarter,service_WeekOfYear,service_dayofyear,OrderQty_lag1,SchoolID_lag1,Site_lag1,MenuName_lag1,target
32,2020-01-07,2019-12-08,1,14.0,2.95,0.0,124.0,1.0,0.0,30,...,7,1,1,2,7,1,0.0,1.0,14.0,1
33,2020-01-08,2019-12-08,1,14.0,2.95,0.0,124.0,1.0,0.0,31,...,8,2,1,2,8,1,0.0,1.0,14.0,1
34,2020-01-09,2019-12-08,1,14.0,2.95,0.0,124.0,1.0,0.0,32,...,9,3,1,2,9,1,0.0,1.0,14.0,1
35,2020-01-10,2019-12-08,1,14.0,2.95,0.0,124.0,1.0,0.0,33,...,10,4,1,2,10,1,0.0,1.0,14.0,1
86,2020-01-07,2019-12-09,1,14.0,2.95,0.0,124.0,1.0,0.0,29,...,7,1,1,2,7,1,0.0,1.0,14.0,1


In [7]:
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error


drop_cols = ['target', 'OrderQty', 'DateOfService', 'DateOfOrder']
X = df.drop(columns=drop_cols, axis=1)
Y = df['target'].values.ravel()  # y به عنوان array

# لیست numerical_cols (برای scaling)
numerical_cols = ['MenuPrice','MenuSubsidy','days_to_service', 'order_year', 'order_month',
                  'order_day', 'order_weekday',
                  'order_quarter', 'order_WeekOfYear', 'order_dayofyear', 'service_year',
                  'service_month', 'service_day', 'service_weekday', 'service_quarter',
                  'service_WeekOfYear', 'service_dayofyear', 'OrderQty_lag1' , ]

# TimeSeriesSplit با ۵ split
tscv = TimeSeriesSplit(n_splits=5)

# لیست برای ذخیره metrics
train_r2_scores = []
train_rmse_scores = []
test_r2_scores = []
test_rmse_scores = []

# loop روی هر fold
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    # split data
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    Y_train, Y_test = Y[train_index], Y[test_index]

    # scaling داخل fold (فقط روی train fit، روی test transform – جلوگیری از leakage)
    scaler = MinMaxScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # ایجاد DMatrix برای XGBoost
    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dtest = xgb.DMatrix(X_test, label=Y_test)

    # پارامترهای XGBoost (اصلاح‌شده برای GPU)
    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'device': 'cuda',
        'eval_metric': 'rmse',
        'learning_rate': 0.05,
        'max_depth': 5,
        'subsample': 0.8,
    }

    # train مدل
    model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtest, 'test')], verbose_eval=False)

    # predict روی train و test
    Y_train_pred = model.predict(dtrain)
    Y_test_pred = model.predict(dtest)

    # ارزیابی train
    train_r2 = r2_score(Y_train, Y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(Y_train, Y_train_pred))

    # ارزیابی test
    test_r2 = r2_score(Y_test, Y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred))

    # baseline ساده برای مقایسه (میانگین Y_train برای همه پیش‌بینی)
    baseline_pred = np.full_like(Y_test, Y_train.mean())
    baseline_r2 = r2_score(Y_test, baseline_pred)
    baseline_rmse = np.sqrt(mean_squared_error(Y_test, baseline_pred))

    train_r2_scores.append(train_r2)
    train_rmse_scores.append(train_rmse)
    test_r2_scores.append(test_r2)
    test_rmse_scores.append(test_rmse)

    print(f"Fold {fold+1}:")
    print(f"  Train R2 = {train_r2:.4f}, Train RMSE = {train_rmse:.4f}")
    print(f"  Test R2 = {test_r2:.4f}, Test RMSE = {test_rmse:.4f}")
    print(f"  Baseline (mean) R2 = {baseline_r2:.4f}, Baseline RMSE = {baseline_rmse:.4f}")
    print(f"  Mean Y_test = {Y_test.mean():.4f}")  # برای چک scale

# میانگین metrics برای ارزیابی کلی
print("\nMean Train R2:", np.mean(train_r2_scores))
print("Mean Train RMSE:", np.mean(train_rmse_scores))
print("Mean Test R2:", np.mean(test_r2_scores))
print("Mean Test RMSE:", np.mean(test_rmse_scores))


Fold 1:
  Train R2 = 0.5097, Train RMSE = 0.0520
  Test R2 = -0.0076, Test RMSE = 0.0682
  Baseline (mean) R2 = -0.0000, Baseline RMSE = 0.0679
  Mean Y_test = 1.0004
Fold 2:
  Train R2 = 0.3439, Train RMSE = 0.0577
  Test R2 = 0.0022, Test RMSE = 0.1187
  Baseline (mean) R2 = -0.0000, Baseline RMSE = 0.1189
  Mean Y_test = 1.0006
Fold 3:
  Train R2 = 0.1678, Train RMSE = 0.0821
  Test R2 = -1.6305, Test RMSE = 0.0434
  Baseline (mean) R2 = -0.0000, Baseline RMSE = 0.0268
  Mean Y_test = 1.0001
Fold 4:
  Train R2 = 0.1713, Train RMSE = 0.0720
  Test R2 = -0.0032, Test RMSE = 0.1655
  Baseline (mean) R2 = -0.0000, Baseline RMSE = 0.1653
  Mean Y_test = 1.0006
Fold 5:
  Train R2 = 0.3190, Train RMSE = 0.0844
  Test R2 = -0.0149, Test RMSE = 0.0319
  Baseline (mean) R2 = -0.0000, Baseline RMSE = 0.0316
  Mean Y_test = 1.0001

Mean Train R2: 0.3023263454437256
Mean Train RMSE: 0.06962519611795427
Mean Test R2: -0.3307978868484497
Mean Test RMSE: 0.08554627013877225


In [8]:
round(df.describe(), 2)

Unnamed: 0,DateOfService,DateOfOrder,OrderQty,MenuName,MenuPrice,MenuSubsidy,GroupName,Site,SchoolID,days_to_service,...,service_day,service_weekday,service_quarter,service_WeekOfYear,service_dayofyear,OrderQty_lag1,SchoolID_lag1,Site_lag1,MenuName_lag1,target
count,3887612,3887612,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,...,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0,3887612.0
mean,2022-07-22 15:26:36.992702720,2022-06-29 13:23:29.010982400,1.0,17.25,2.59,1.08,78.86,0.59,43.44,23.09,...,16.04,1.77,2.67,29.66,204.48,1.0,43.44,0.59,17.25,1.0
min,2020-01-02 00:00:00,2019-12-02 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,2021-11-19 00:00:00,2021-10-13 00:00:00,1.0,14.0,0.0,0.0,33.0,0.0,15.0,13.0,...,9.0,0.0,1.0,13.0,88.0,1.0,15.0,0.0,14.0,1.0
50%,2022-10-31 00:00:00,2022-10-14 00:00:00,1.0,21.0,3.45,0.0,84.0,0.0,44.0,23.0,...,16.0,2.0,3.0,35.0,243.0,1.0,44.0,0.0,21.0,1.0
75%,2023-07-25 00:00:00,2023-07-04 00:00:00,1.0,21.0,3.8,2.85,127.0,1.0,61.0,34.0,...,23.0,3.0,4.0,44.0,306.0,1.0,61.0,1.0,21.0,1.0
max,2023-12-22 00:00:00,2023-12-21 00:00:00,84.0,23.0,4.95,5.45,164.0,2.0,134.0,77.0,...,31.0,6.0,4.0,52.0,358.0,84.0,134.0,2.0,23.0,84.0
std,,,0.09,5.88,1.66,1.64,48.85,0.72,32.03,13.04,...,8.63,1.29,1.19,15.97,112.04,0.09,32.03,0.72,5.88,0.09


In [5]:
import featuretools as ft
from woodwork.logical_types import Categorical, Datetime, Double , Integer


df = df.reset_index(drop=True)
df['id'] = df.index



# ایجاد EntitySet
es = ft.EntitySet(id='my_data')

# اضافه کردن dataframe به EntitySet با مشخص کردن نوع داده‌ها (logical types)
logical_types = {
    'DateOfService': Datetime,
    'DateOfOrder': Datetime,
    'MenuName' : Categorical,
    'MenuPrice' : Double,
    'MenuSubsidy' : Double,
    'GroupName' : Categorical,
    'Site' : Categorical,
    'SchoolID' : Categorical,
    'days_to_service' : Integer,

    'OrderQty' : Integer, # target
}

es = es.add_dataframe(
    dataframe_name='data',
    dataframe=df,
    index='id',  # اگر ایندکس نداری، می‌تونی بسازم
    logical_types=logical_types  # اینجا نوع داده‌ها رو مشخص کردیم
)


# مشخص کردن primitives برای categorical و datetime خاص

#agg_primitives=['mean', 'std', 'Kurtosis']

trans_primitives = [
    'year', 'month', 'day', 'is_weekend','sine','cosine',
]

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='data',
    trans_primitives=trans_primitives,
    #agg_primitives=agg_primitives,
    max_depth=1,  # عمق فیچرها
    ignore_columns={'data':['OrderQty']},
    verbose=True
)



# new_feature_columns = feature_matrix.columns.difference(df.columns)
#
# # ویژگی‌های جدید را به دیتافریم اصلی الحاق می‌کنیم
# df = df.join(feature_matrix[new_feature_columns])
#
# # در نهایت ستون‌های تاریخ و شناسه موقت را حذف می‌کنیم
# df.drop(columns=['DateOfService', 'DateOfOrder', 'id'], inplace=True)
df.head()

Built 24 features
Elapsed: 08:59 | Progress:  95%|█████████▌


KeyboardInterrupt: 