In [103]:
import pandas as pd

In [104]:
# Raw sales export; kept untouched under `data`.
data = pd.read_csv("data/sales_data.csv")

# Working copy without the columns that are dropped for the aggregation below.
df = data.drop(
    ['order_number', 'customer_type', 'unit_sale_price', 'revenue'],
    axis='columns',
)

Create weekly and monthly sales DataFrames by aggregating the historical order quantities per SKU and warehouse.

In [105]:
# Derive the calendar year/month and the ISO week number of every order.
df['order_date'] = pd.to_datetime(df['order_date'])
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month
df['week'] = df['order_date'].dt.isocalendar().week

# Weekly totals per SKU/warehouse.
# ISO week numbers do not line up with the calendar year (e.g. 2021-01-01 falls
# in ISO week 53), so ordering rows by (year, month, week) can place a bucket
# out of chronological order. The order-dependent features computed from this
# frame (shift / rolling / cumsum per SKU and warehouse) need true time order,
# so rows are sorted by the earliest order date inside each bucket instead.
# The helper column is dropped again, leaving the output columns unchanged.
weekly_sales = (
    df.groupby(['sku_id', 'warehouse_id', 'year', 'month', 'week'], as_index=False)
    .agg(
        order_quantity=('order_quantity', 'sum'),
        _first_order_date=('order_date', 'min'),
    )
    .sort_values(by='_first_order_date', kind='stable')
    .drop(columns='_first_order_date')
)

# Monthly totals per SKU/warehouse; (year, month) is already chronological.
monthly_sales = (
    df.groupby(['sku_id', 'warehouse_id', 'year', 'month'])['order_quantity']
    .sum()
    .reset_index()
    .sort_values(by=['year', 'month'])
)

Build the prediction dataset for the model. The SKU/warehouse pairs to predict for are taken from the inventory file.

In [106]:
# Only the SKU/warehouse identifiers are kept from the inventory file.
inventory = pd.read_csv('data/inventory.csv').loc[:, ['sku_id', 'warehouse_id']]

## Weekly prediction dataset

In [107]:
# One placeholder row per inventory SKU/warehouse for the week to be predicted
# (year 2023, month 8, week 31). The quantity is set to 0 as a stand-in value.
weekly_extra_data = inventory.assign(
    year=2023,
    month=8,
    week=31,
    order_quantity=0,
)

In [108]:
# Append the placeholder rows after the historical rows and renumber the index.
# Note: this rebinds `weekly_sales`, so running the cell again appends the
# placeholder rows a second time.
weekly_sales = pd.concat(
    (weekly_sales, weekly_extra_data),
    axis=0,
    ignore_index=True,
)

In [109]:
# Order quantities grouped per SKU/warehouse; every feature below is computed
# within a group and follows the current row order of `weekly_sales`.
weekly_qty = weekly_sales.groupby(['sku_id', 'warehouse_id'])['order_quantity']

# NOTE(review): the rolling mean and the cumulative sum both include the
# current row's own order_quantity - confirm that this is intended.
weekly_features = {
    'lag_1': weekly_qty.shift(1),
    'lag_2': weekly_qty.shift(2),
    'lag_7': weekly_qty.shift(7),
    'rolling_avg_3_weeks': weekly_qty.transform(lambda qty: qty.rolling(window=3).mean()),
    'cumulative_sum': weekly_qty.cumsum(),
}
for feature_name, feature_values in weekly_features.items():
    weekly_sales[feature_name] = feature_values

# Keep only rows where every feature is available (rows without enough
# preceding history in their group have NaN lags / rolling mean).
sku_weekly_sales = weekly_sales.dropna(subset=list(weekly_features))

In [110]:
# Select the rows for the forecast period and drop the stand-in target column.
# The placeholder rows are created with year=2023, month=8, week=31. Filtering
# on year and week alone would also pick up historical rows from the part of
# ISO week 31 that falls in July (2023-07-31), so the month is matched as well.
weekly_prediction_data = (
    sku_weekly_sales
    .query("year==2023 and month==8 and week==31")
    .drop(columns=['order_quantity'])
)
weekly_prediction_data.to_csv('data/weekly_prediction_data.csv', index=False)

## Monthly prediction dataset

In [111]:
# One placeholder row per inventory SKU/warehouse for the month to be predicted
# (year 2023, month 8). The quantity is set to 0 as a stand-in value.
monthly_extra_data = inventory.assign(
    year=2023,
    month=8,
    order_quantity=0,
)

In [112]:
# Append the placeholder rows after the historical rows and renumber the index.
# Note: this rebinds `monthly_sales`, so running the cell again appends the
# placeholder rows a second time.
monthly_sales = pd.concat(
    (monthly_sales, monthly_extra_data),
    axis=0,
    ignore_index=True,
)

In [113]:
# Previous-row and two-rows-back quantities within each SKU/warehouse group.
# (The following cell assigns these same two columns again.)
monthly_lag_source = monthly_sales.groupby(['sku_id', 'warehouse_id'])['order_quantity']
for periods_back in (1, 2):
    monthly_sales[f'lag_{periods_back}'] = monthly_lag_source.shift(periods_back)

In [115]:
# Order quantities grouped per SKU/warehouse; every feature below is computed
# within a group and follows the current row order of `monthly_sales`.
monthly_qty = monthly_sales.groupby(['sku_id', 'warehouse_id'])['order_quantity']

# NOTE(review): the rolling mean and the cumulative sum both include the
# current row's own order_quantity - confirm that this is intended.
monthly_features = {
    'lag_1': monthly_qty.shift(1),
    'lag_2': monthly_qty.shift(2),
    'rolling_avg_3_months': monthly_qty.transform(lambda qty: qty.rolling(window=3).mean()),
    'cumulative_sum': monthly_qty.cumsum(),
}
for feature_name, feature_values in monthly_features.items():
    monthly_sales[feature_name] = feature_values

# Keep only rows where every feature is available (rows without enough
# preceding history in their group have NaN lags / rolling mean).
sku_monthly_sales = monthly_sales.dropna(subset=list(monthly_features))

In [117]:
# Select the rows for the forecast month, drop the stand-in target column and
# write the result out for the prediction step.
monthly_prediction_data = (
    sku_monthly_sales
    .query("year==2023 and month==8")
    .drop(columns=['order_quantity'])
)
monthly_prediction_data.to_csv('data/monthly_prediction_data.csv', index=False)