In [1]:
import mlflow

In [2]:
# Make 'test_experiment' the active MLflow experiment; as the log line in the cell
# output shows, MLflow creates it when it does not exist yet.
mlflow.set_experiment("test_experiment")

2025/06/28 05:42:44 INFO mlflow.tracking.fluent: Experiment with name 'test_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///d:/eSewa%20Internship/final_project/modeling/mlruns/215093072945015715', creation_time=1751068664932, experiment_id='215093072945015715', last_update_time=1751068664932, lifecycle_stage='active', name='test_experiment', tags={}>

In [29]:
# Switch the active MLflow experiment to 'training_experiment'.
# NOTE(review): this cell's execution count (29) is out of sequence with its neighbours,
# and the training cell further down calls set_experiment('training_experiment') itself,
# so this cell looks redundant - confirm and consider removing it.
mlflow.set_experiment('training_experiment')


<Experiment: artifact_location='file:///d:/eSewa%20Internship/final_project/modeling/mlruns/989745218648407383', creation_time=1751070565932, experiment_id='989745218648407383', last_update_time=1751070565932, lifecycle_stage='active', name='training_experiment', tags={}>

In [3]:
# Tiny hand-written dataset: three samples with two features each, and one target per sample.
# Presumably only meant to exercise the MLflow logging cell below - TODO confirm.
X_train = [[1, 2], [3, 4], [5, 6]]
y_train = [1, 2, 3]

In [4]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor


# Enable autologging for parameters and training metrics, but leave model logging to the
# explicit log_model() call below: with the default log_models=True the fitted estimator
# would be recorded twice for the same run.
mlflow.autolog(log_models=False)
mlflow.set_experiment('training_experiment')

with mlflow.start_run(run_name='training_model'):

    # random_state is pinned so that re-running the cell reproduces the same forest.
    model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    # Store the fitted estimator under the run's "model" artifact path.
    mlflow.sklearn.log_model(model, "model")

2025/06/28 11:12:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [5]:
# `mlflow ui` runs a server in the foreground, so `!mlflow ui` blocks this cell (and any
# "Run All") until the kernel is interrupted. Start it as a background process of the
# kernel's own interpreter instead; call `mlflow_ui_process.terminate()` to stop it.
import subprocess
import sys

mlflow_ui_process = subprocess.Popen([sys.executable, "-m", "mlflow", "ui"])

^C


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# --------------------------- #
# Step 1: Load and Aggregate
# --------------------------- #
def load_and_aggregate(df):
    """Aggregate raw transactions to one row per (day, category) with no missing days.

    Steps:
      1. Parse ``transaction_date`` and truncate it to midnight so it lines up with
         the daily grid built below.
      2. Group by day and category: ``amount`` is summed, ``is_festival`` and
         ``is_holiday`` take the group max, ``event_name`` the last non-null value.
      3. Reindex onto the full category x day grid between the first and last
         observed day. Rows created by the reindex get ``amount`` 0.

    For rows created by the reindex, the calendar columns are copied from the
    other transactions of the same day; days with no transactions at all get
    ``False`` flags and a null ``event_name``.
    NOTE(review): this assumes the flags/event describe the calendar day rather
    than the category - confirm against the data source.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``transaction_date``, ``category``, ``amount``,
        ``is_festival``, ``is_holiday`` and ``event_name``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``transaction_date, category, amount, is_festival, is_holiday,
        event_name``, sorted by date then category, with a fresh RangeIndex.
    """
    key_cols = ['category', 'transaction_date']
    output_cols = ['transaction_date', 'category', 'amount',
                   'is_festival', 'is_holiday', 'event_name']
    calendar_agg = {'is_festival': 'max', 'is_holiday': 'max', 'event_name': 'last'}

    # Work on a copy so the caller's frame keeps its original date column.
    df = df.copy()
    df['transaction_date'] = pd.to_datetime(df['transaction_date']).dt.normalize()

    df_daily = df.groupby(['transaction_date', 'category']).agg(
        {'amount': 'sum', **calendar_agg}
    ).reset_index()
    if df_daily.empty:
        # No rows means no date range to build a grid from.
        return df_daily

    observed = df_daily.set_index(key_cols)
    full_date_range = pd.date_range(start=df_daily['transaction_date'].min(),
                                    end=df_daily['transaction_date'].max())
    complete_index = pd.MultiIndex.from_product(
        [df_daily['category'].unique(), full_date_range],
        names=key_cols
    )
    df_full = observed.reindex(complete_index)
    # Rows that exist only because of the reindex (no transactions that day).
    is_gap = ~df_full.index.isin(observed.index)

    df_full['amount'] = df_full['amount'].fillna(0)

    if is_gap.any():
        per_date = df.groupby('transaction_date').agg(calendar_agg)
        gap_dates = df_full.index.get_level_values('transaction_date')[is_gap]
        for col in calendar_agg:
            df_full.loc[is_gap, col] = per_date[col].reindex(gap_dates).to_numpy()

    for flag in ('is_festival', 'is_holiday'):
        # Anything still missing is a day with no transactions in any category.
        filled = df_full[flag].where(df_full[flag].notna(), False)
        df_full[flag] = filled.astype(observed[flag].dtype)

    return (
        df_full.reset_index()
        .sort_values(['transaction_date', 'category'])
        .reset_index(drop=True)[output_cols]
    )

# --------------------------- #
# Step 2: Feature Engineering
# --------------------------- #
def engineer_features(df):
    """Add event, calendar, lag and rolling-mean features to the daily frame.

    The frame is sorted by category and date first (``sort_values`` returns a new
    frame, so the caller's object is left untouched). Added columns:

    * ``event_type``: 'disaster' for the event names 'Earthquake'/'Recession',
      'festival' for any other non-null ``event_name``, 'none' otherwise.
    * ``pre_event_window`` / ``post_event_window``: True when the next / previous
      row of the same category has an event.
    * sine/cosine encodings of weekday (period 7), month (period 12) and day of
      year (period 365).
    * ``lag_1``, ``lag_7``, ``lag_14``: ``amount`` shifted within each category.
    * ``rolling_mean_7``, ``rolling_mean_14``: mean of the previous 7 / 14
      amounts of the same category (the current row is excluded).

    The first rows of every category have NaN lag / rolling values.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``category``, ``transaction_date`` (datetime), ``amount`` and
        ``event_name``.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with the new columns; the helper columns
        ``day_of_week`` and ``month`` are dropped.
    """
    df = df.sort_values(['category', 'transaction_date'])
    df['event_type'] = df['event_name'].apply(
        lambda x: 'disaster' if x in ['Earthquake', 'Recession'] else 'festival' if pd.notna(x) else 'none'
    )
    # shift() leaves NaN at each category's edge; isin() maps those to False.
    event_by_category = df.groupby('category')['event_type']
    df['pre_event_window'] = event_by_category.shift(-1).isin(['festival', 'disaster'])
    df['post_event_window'] = event_by_category.shift(1).isin(['festival', 'disaster'])

    # Cyclical features
    df['day_of_week'] = df['transaction_date'].dt.weekday
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    df['month'] = df['transaction_date'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    doy = df['transaction_date'].dt.dayofyear
    df['day_of_year_sin'] = np.sin(2 * np.pi * doy / 365)
    df['day_of_year_cos'] = np.cos(2 * np.pi * doy / 365)

    # Lag + rolling
    amount_by_category = df.groupby('category')['amount']
    for lag in [1, 7, 14]:
        df[f'lag_{lag}'] = amount_by_category.shift(lag)
    for window in [7, 14]:
        # transform() runs the shift + rolling inside each category and keeps the
        # frame's own index, so a window never spans two categories and the
        # result lines up with the sorted rows on assignment.
        df[f'rolling_mean_{window}'] = amount_by_category.transform(
            lambda s, w=window: s.shift(1).rolling(w).mean()
        )

    return df.drop(columns=['day_of_week', 'month'])

# --------------------------- #
# Step 3: Encoding
# --------------------------- #
def encode_features(train_df, test_df):
    """Target-encode ``category`` and one-hot encode ``event_type``.

    Both encodings are fitted on ``train_df`` only and then applied to
    ``test_df``. The inputs are copied first, so the caller's frames keep their
    original ``category`` labels and ``event_type`` column.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Need the columns ``category``, ``amount`` and ``event_type``.

    Returns
    -------
    tuple
        ``(train_df, test_df, category_target_mean, global_mean, encoder)``:
        the two encoded frames, the per-category mean ``amount`` of the training
        data, the overall mean ``amount`` of the training data (used for test
        categories that were not seen in training), and the fitted encoder.
    """
    # Copy before assigning columns: the originals may be slices of a larger
    # frame, and callers may still need the raw category labels afterwards.
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Target encoding
    category_target_mean = train_df.groupby('category')['amount'].mean()
    global_mean = train_df['amount'].mean()

    train_df['category'] = train_df['category'].map(category_target_mean)
    test_df['category'] = test_df['category'].map(category_target_mean).fillna(global_mean)

    # One-hot encoding for event_type; levels unseen in training encode as all zeros.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(train_df[['event_type']])
    ohe_cols = encoder.get_feature_names_out(['event_type'])

    def _with_event_dummies(frame):
        # Replace event_type by its indicator columns, keeping the frame's index.
        dummies = pd.DataFrame(
            encoder.transform(frame[['event_type']]),
            columns=ohe_cols,
            index=frame.index,
        )
        return pd.concat([frame.drop(columns=['event_type']), dummies], axis=1)

    return (
        _with_event_dummies(train_df),
        _with_event_dummies(test_df),
        category_target_mean,
        global_mean,
        encoder,
    )

# --------------------------- #
# Step 4: Train Model
# --------------------------- #
def train_model(train_df):
    """Fit an XGBoost regressor that predicts ``amount`` from the other columns.

    Every column except ``transaction_date``, ``amount`` and ``event_name`` is
    used as a feature.

    Returns
    -------
    tuple
        ``(model, feature_cols)``: the fitted regressor and the Index of feature
        column names it was trained on.
    """
    non_feature_cols = ['transaction_date', 'amount', 'event_name']
    feature_cols = train_df.columns.difference(non_feature_cols)

    regressor = xgb.XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.1)
    regressor.fit(train_df[feature_cols], train_df['amount'])
    return regressor, feature_cols

# --------------------------- #
# Step 5: Forecast Function
# --------------------------- #
def forecast_next_7_days(category_name, model, original_df, cat_mean_map, global_mean, encoder, feature_cols):
    """Forecast ``amount`` for the 7 days after a category's last recorded date.

    The forecast is recursive: each predicted amount is appended to the history
    and feeds the lag / rolling-mean features of the following day. Every
    forecast day is treated as an ordinary day (no festival, no holiday, event
    type 'none').

    Parameters
    ----------
    category_name : label of the category to forecast.
    model : fitted regressor exposing ``predict``.
    original_df : pandas.DataFrame
        History with ``category`` (raw labels), ``transaction_date`` and
        ``amount``. It is not modified.
    cat_mean_map : mapping with ``.get`` from category label to its encoded value.
    global_mean : encoded value used when the category is not in ``cat_mean_map``.
    encoder : fitted one-hot encoder for the ``event_type`` column.
    feature_cols : column names, in the order the model expects.

    Returns
    -------
    pandas.DataFrame
        One row per forecast day holding the feature values used plus
        ``predicted_amount``.

    Raises
    ------
    ValueError
        If ``original_df`` has no rows for ``category_name``.
    """
    df_cat = original_df[original_df['category'] == category_name].sort_values('transaction_date')
    if df_cat.empty:
        raise ValueError(f"no history rows found for category {category_name!r}")

    last_date = df_cat['transaction_date'].max()
    # Plain list of amounts in date order; predictions are appended as we go.
    history = df_cat['amount'].tolist()
    category_code = cat_mean_map.get(category_name, global_mean)

    # The event type is 'none' on every forecast day, so encode it once. A named
    # frame matches how the encoder is fitted elsewhere in this notebook.
    ohe_names = encoder.get_feature_names_out(['event_type'])
    ohe_values = np.asarray(encoder.transform(pd.DataFrame({'event_type': ['none']})))[0]
    none_event = dict(zip(ohe_names, ohe_values))

    def _lag(k):
        # Amount k days back, or the history mean when the history is too short.
        return history[-k] if len(history) >= k else pd.Series(history).mean()

    def _trailing_mean(window):
        # Mean of the last `window` amounts, with the same short-history fallback.
        if len(history) >= window:
            return float(np.mean(history[-window:]))
        return pd.Series(history).mean()

    forecasts = []
    for step in range(1, 8):
        date = last_date + timedelta(days=step)
        weekday_angle = 2 * np.pi * date.weekday() / 7
        month_angle = 2 * np.pi * date.month / 12
        doy_angle = 2 * np.pi * date.timetuple().tm_yday / 365
        row = {
            'transaction_date': date,
            'category': category_code,
            'is_festival': False,
            'is_holiday': False,
            'pre_event_window': False,
            'post_event_window': False,
            'day_of_week_sin': np.sin(weekday_angle),
            'day_of_week_cos': np.cos(weekday_angle),
            'month_sin': np.sin(month_angle),
            'month_cos': np.cos(month_angle),
            'day_of_year_sin': np.sin(doy_angle),
            'day_of_year_cos': np.cos(doy_angle),
            'lag_1': history[-1],
            'lag_7': _lag(7),
            'lag_14': _lag(14),
            'rolling_mean_7': _trailing_mean(7),
            'rolling_mean_14': _trailing_mean(14),
        }
        row.update(none_event)

        X = pd.DataFrame([row])[feature_cols]
        row['predicted_amount'] = model.predict(X)[0]
        forecasts.append(row)

        # Add prediction to history
        history.append(float(row['predicted_amount']))

    return pd.DataFrame(forecasts)

# --------------------------- #
# 🚀 Execute Pipeline
# --------------------------- #
def run_pipeline(raw_df, category_name):
    """Run aggregation, feature engineering, encoding, training and forecasting.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Raw transactions, passed to ``load_and_aggregate``.
    category_name : label of the category to forecast.

    Returns
    -------
    pandas.DataFrame
        The 7-day forecast produced by ``forecast_next_7_days``.
    """
    df = load_and_aggregate(raw_df)
    df = engineer_features(df)

    # Drop only the warm-up rows whose lag / rolling features are undefined.
    # A blanket dropna() would also discard every row with a null event_name,
    # which engineer_features treats as an ordinary no-event day.
    history_cols = [c for c in df.columns if c.startswith(('lag_', 'rolling_mean_'))]
    df = df.dropna(subset=history_cols)

    # engineer_features leaves the rows ordered by category; order them by date
    # so the unshuffled split holds out the most recent 20% of rows instead of
    # the last categories.
    df = df.sort_values(['transaction_date', 'category']).reset_index(drop=True)

    train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False)
    train_df, test_df, cat_mean_map, global_mean, encoder = encode_features(train_df, test_df)

    model, feature_cols = train_model(train_df)
    forecast_df = forecast_next_7_days(category_name, model, df, cat_mean_map, global_mean, encoder, feature_cols)
    return forecast_df
