### Feature Engineering
Feature engineering is the process of creating new features or transforming existing ones to improve 
the performance of machine learning models. 

##### Definitions of the feature-engineering functions

In [None]:
# Time Based Features
def create_time_based_features(df):
    """Placeholder for calendar features derived from the purchase timestamp.

    Currently a no-op: every feature assignment below is commented out, so
    the input is handed back untouched.

    Args:
        df: Input DataFrame. The disabled code reads its
            ``order_purchase_timestamp`` column through the ``.dt`` accessor.

    Returns:
        The same ``df`` object, unmodified.
    """
    # TODO: Use smth similar to pd.to_datetime(order_items['order_purchase_timestamp']).dt.floor('D') - avoid periods

    # df['Purchase Year-Month'] = df['order_purchase_timestamp'].dt.to_period('M')
    # df['Purchase Year'] = df['order_purchase_timestamp'].dt.to_period('Y')
    # df['Purchase Month'] = df['order_purchase_timestamp'].dt.month
    # df['Purchase Day Name'] = df['order_purchase_timestamp'].dt.day_name()
    # df['Purchase Day'] = df['order_purchase_timestamp'].dt.day
    # df['Purchase Week of Year'] = df['order_purchase_timestamp'].dt.isocalendar().week
    # df['Purchase Is Weekend'] = df['order_purchase_timestamp'].dt.dayofweek.isin([5,6]).astype(int)
    return df

# Lag Features
def create_lag_features(df, column, lags, group_col='product_category_name_english'):
    """Add per-group lagged copies of ``column`` to ``df``.

    For every ``lag`` a new column ``f'{column}_lag_{lag}'`` is written that
    holds the value of ``column`` found ``lag`` rows earlier within the same
    group. Lags are counted in rows, not in time units, so the frame should
    already be in the desired (e.g. chronological) order.

    Args:
        df: DataFrame to extend. It is modified in place.
        column: Name of the column to lag.
        lags: Iterable of integer row offsets passed to ``shift``.
        group_col: Column whose values define the groups. Defaults to the
            product category column used throughout this notebook.

    Returns:
        The same ``df`` object with the lag columns added.
    """
    # The grouping does not depend on the lag, so build it once.
    grouped = df.groupby(group_col)[column]
    for lag in lags:
        df[f'{column}_lag_{lag}'] = grouped.shift(lag)
    return df

# Rolling Statistics
def create_rolling_features(df, column, windows, group_col='product_category_name_english'):
    """Add per-group rolling mean and standard deviation of ``column``.

    For every ``window`` two columns are written:
    ``f'{column}_rolling_mean_{window}'`` and
    ``f'{column}_rolling_std_{window}'``. Windows are applied over rows in
    their current order within each group.

    Args:
        df: DataFrame to extend. It is modified in place.
        column: Name of the column to aggregate.
        windows: Iterable of window sizes passed to ``rolling(window=...)``.
        group_col: Column whose values define the groups. Defaults to the
            product category column used throughout this notebook.

    Returns:
        The same ``df`` object with the rolling columns added.
    """
    grouped = df.groupby(group_col)[column]
    for window in windows:
        # Build the grouped window once and reuse it for both statistics.
        rolling = grouped.rolling(window=window)
        # The grouped result is indexed by (group, original index); dropping
        # the group level lets the assignment align on the original index.
        df[f'{column}_rolling_mean_{window}'] = rolling.mean().reset_index(level=0, drop=True)
        df[f'{column}_rolling_std_{window}'] = rolling.std().reset_index(level=0, drop=True)
    return df

# Exponential Moving Average
def create_ema_features(df, column, spans, group_col='product_category_name_english'):
    """Add per-group exponentially weighted moving averages of ``column``.

    For every ``span`` a column ``f'{column}_ema_{span}'`` is written with the
    result of ``ewm(span=span).mean()`` computed inside each group, over rows
    in their current order.

    Args:
        df: DataFrame to extend. It is modified in place.
        column: Name of the column to smooth.
        spans: Iterable of span values passed to ``ewm(span=...)``.
        group_col: Column whose values define the groups. Defaults to the
            product category column used throughout this notebook.

    Returns:
        The same ``df`` object with the EMA columns added.
    """
    # The grouping does not depend on the span, so build it once.
    grouped = df.groupby(group_col)[column]
    for span in spans:
        ema = grouped.ewm(span=span).mean()
        # Drop the group level so the assignment aligns on the original index.
        df[f'{column}_ema_{span}'] = ema.reset_index(level=0, drop=True)
    return df

# Fourier Terms for Seasonality
def fourier_series(dates, period, order):
    """Build sine/cosine seasonality terms for a sequence of dates.

    Time is measured in days since 1970-01-01. For each harmonic
    ``n = 1..order`` two columns are produced:
    ``f'fourier_cos_{period}_{n}'`` and ``f'fourier_sin_{period}_{n}'``,
    evaluated at ``2 * pi * n * t / period``.

    Args:
        dates: Datetime ``Series`` or ``DatetimeIndex``. Values must be
            timezone-naive because they are subtracted from a naive epoch.
        period: Length of the seasonal cycle, in days.
        order: Number of harmonics to generate.

    Returns:
        DataFrame with one row per date and ``2 * order`` float columns. When
        ``dates`` is a Series its index is kept so the result can be aligned
        with the source frame; otherwise a default integer index is used.
    """
    t = (dates - pd.Timestamp("1970-01-01")) / pd.Timedelta('1D')
    base_angle = 2 * np.pi * np.asarray(t, dtype=float) / period
    index = dates.index if isinstance(dates, pd.Series) else None

    # Collect every harmonic into ONE column mapping. Handing pandas a
    # sequence of dicts instead would make each dict a row record.
    columns = {}
    for n in range(1, order + 1):
        columns[f'fourier_cos_{period}_{n}'] = np.cos(n * base_angle)
        columns[f'fourier_sin_{period}_{n}'] = np.sin(n * base_angle)
    return pd.DataFrame(columns, index=index)

# Price-related Features
def price_related_features(df):
    """Add a per-category rolling price mean and the price relative to it.

    Writes two columns into ``df`` (in place):
    ``price_rolling_mean`` is the mean of ``price`` over the current and
    previous six rows of the same ``product_category_name_english``;
    ``price_relative_to_mean`` is ``price`` divided by that mean.

    Returns:
        The same ``df`` object with both columns added.
    """
    prices_by_category = df.groupby('product_category_name_english')['price']
    seven_row_mean = prices_by_category.rolling(window=7).mean()
    # Drop the category level so the values align on the original row index.
    df['price_rolling_mean'] = seven_row_mean.reset_index(level=0, drop=True)
    df['price_relative_to_mean'] = df['price'].div(df['price_rolling_mean'])
    return df

# Sold Items Quantity Features
def quantity_related_features(df):
    # Sort the dataframe by timestamp and category
    df = df.sort_values(['product_category_name_english', 'order_purchase_timestamp'])
    # Calculate rolling count of sold items quantity
    df['quantity_rolling_count'] = df.groupby('product_category_name_english')['order_item_id'].transform(
        lambda x: x.rolling(window='7D').count()
    )
    # Calculate daily quantity
    df['daily_quantity'] = df.groupby(['product_category_name_english', df['order_purchase_timestamp'].dt.date])['order_item_id'].transform('count')
    # Calculate quantity relative to rolling count
    df['quantity_relative_to_count'] = df['daily_quantity'] / df['quantity_rolling_count']
    # Calculate cumulative sum of sold items within each category
    df['cumulative_quantity'] = df.groupby('product_category_name_english')['daily_quantity'].cumsum()
    return df

##### Final data pipeline that chains the functions defined above

In [None]:
def features_engineering_pipeline(
    df,
    column='price',
    lags=(1, 7, 14),
    windows=(7, 30),
    spans=(7, 30),
    fourier_periods=(7, 365.25),
    fourier_order=2,
):
    """Run every feature-engineering step on ``df`` and return the result.

    Steps, in order: time-based features, lag features, rolling statistics,
    exponential moving averages, Fourier seasonality terms, price features
    and sold-quantity features.

    Args:
        df: DataFrame with at least ``product_category_name_english``,
            ``order_purchase_timestamp``, ``price`` and ``order_item_id``.
        column: Column fed to the lag, rolling and EMA steps.
        lags: Row offsets for ``create_lag_features``.
        windows: Window sizes for ``create_rolling_features``.
        spans: Span values for ``create_ema_features``.
        fourier_periods: Seasonal periods, in days, for ``fourier_series``.
        fourier_order: Number of harmonics generated per period.

    The default values are starting points chosen for this review, not tuned
    settings - adjust them to the modelling task.

    Returns:
        A new DataFrame, sorted by category and purchase timestamp, carrying
        all generated feature columns. The caller's frame is not modified.
    """
    # Lag, rolling and EMA features are computed over row order inside each
    # category, so put the rows in chronological order first. sort_values
    # returns a copy, which also keeps the in-place steps below away from the
    # caller's frame.
    df = df.sort_values(['product_category_name_english', 'order_purchase_timestamp'])

    df = create_time_based_features(df)
    df = create_lag_features(df, column, lags)
    df = create_rolling_features(df, column, windows)
    df = create_ema_features(df, column, spans)

    for period in fourier_periods:
        terms = fourier_series(df['order_purchase_timestamp'], period, fourier_order)
        # One row of terms per input row: attach by position so a non-unique
        # frame index cannot duplicate or misalign rows.
        for name in terms.columns:
            df[name] = terms[name].to_numpy()

    df = price_related_features(df)
    df = quantity_related_features(df)
    return df