In [None]:
import pandas as pd

In [None]:
# Read the workbook from the current working directory into `df`
# and show its first rows as the cell output.
df = pd.read_excel(io="Online Retail.xlsx")
df.head(n=5)

In [None]:
## Data Cleaning
# Keep only rows that satisfy all three conditions:
#   * Description, CustomerID and InvoiceDate are all present,
#   * InvoiceNo does not start with 'C' (presumably cancellations -- confirm
#     against the dataset documentation),
#   * Quantity and UnitPrice are both strictly positive.
# The conditions are combined into one mask and the selection is copied, so the
# column assignments that follow write to an independent frame rather than to a
# filtered slice (assigning onto a slice is what raises SettingWithCopyWarning),
# and the frame returned by read_excel is no longer mutated in place.
required_cols = ['Description','CustomerID','InvoiceDate']
has_required = df[required_cols].notna().all(axis=1)
is_c_invoice = df['InvoiceNo'].astype(str).str.startswith('C')
is_positive = (df['Quantity']>0) & (df["UnitPrice"]>0)

df = df.loc[has_required & ~is_c_invoice & is_positive].copy()
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [None]:
## Time based feature engineering
# Every calendar feature is read off the InvoiceDate timestamps through the
# same .dt accessor.
invoice_dt = df['InvoiceDate'].dt
df['Year'] = invoice_dt.year
df['Month'] = invoice_dt.month
df['Day'] = invoice_dt.day
df['DayOfWeek'] = invoice_dt.dayofweek          # Monday=0 ... Sunday=6
df['Hour'] = invoice_dt.hour
df['WeekOfYear'] = invoice_dt.isocalendar().week  # ISO week number

# Left-closed six-hour buckets: [0,6) [6,12) [12,18) [18,24).
hour_edges = [0,6,12,18,24]
part_of_day = ['Night','Morning','Afternoon','Evening']
df['TimeOfDay'] = pd.cut(df['Hour'], bins=hour_edges, labels=part_of_day, right=False)

# DayOfWeek only takes the values 0..6, so ">= 5" selects exactly 5 and 6.
df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

In [None]:
## Overall Stats and Demand
# Three per-product aggregates are computed over every row currently in df and
# then attached to each row, keyed on Description.

# Product Sales: total units per product
product_sales = (
    df.groupby('Description', as_index=False)['Quantity']
    .sum()
    .rename(columns={'Quantity': 'TotalProductSales'})
)

# Product Revenue: line revenue per row, then total per product
df['Revenue'] = df['Quantity'].mul(df['UnitPrice'])
product_revenue = (
    df.groupby('Description', as_index=False)['Revenue']
    .sum()
    .rename(columns={'Revenue': 'TotalProductRevenue'})
)

# Average Unit Price: mean of the per-row unit prices of each product
product_avg_price = (
    df.groupby('Description', as_index=False)['UnitPrice']
    .mean()
    .rename(columns={'UnitPrice': 'AvgProductPrice'})
)

# One row per Description carrying all three statistics.
product_stats = pd.merge(
    pd.merge(product_sales, product_revenue, on='Description'),
    product_avg_price,
    on='Description',
)

# Default (inner) merge: broadcast the product-level values onto the rows.
df = pd.merge(df, product_stats, on='Description')

In [None]:
## Rolling 7-day demand per product
# Chronological order with a fresh, unique integer index:
#   * the time-based window below needs InvoiceDate to be non-decreasing inside
#     every product group,
#   * a stable sort keeps rows that share a timestamp in a deterministic order,
#   * drop=True avoids adding the old index to df as a spurious 'index' column.
df = df.sort_values(by='InvoiceDate', kind='stable').reset_index(drop=True)

# A '7D' window needs a datetime axis. df has an integer index here, so the
# axis is supplied with on='InvoiceDate'. The result is indexed by
# (Description, original row label); dropping the Description level leaves a
# Series keyed by row label that pandas aligns back onto df one-to-one.
# Merging on (InvoiceDate, Description) instead would multiply rows whenever a
# product occurs more than once at the same timestamp.
rolling_sales = (
    df.groupby('Description')
    .rolling('7D', on='InvoiceDate')['Quantity']
    .sum()
    .reset_index(level=0, drop=True)
    .rename('Rolling7D_Sales')
)
df['Rolling7D_Sales'] = rolling_sales

# Share of the product's total sales that fell in the trailing 7-day window;
# epsilon guards the division against a zero denominator.
epsilon = 1e-5
df['Relative7D_Demand'] = df['Rolling7D_Sales'] / (df['TotalProductSales'] + epsilon)

In [None]:
## Data Preprocessing for Model Training
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Seed for the hold-out split so the train/test partition is the same on every run.
RANDOM_STATE = 42

# Integer-encode the three categorical columns; the fitted encoders are kept so
# the codes can be mapped back to their labels later.
le_desc = LabelEncoder()
le_country = LabelEncoder()
le_timeOfDay = LabelEncoder()
df['Description_Encoded'] = le_desc.fit_transform(df['Description'])
df['Country_Encoded'] = le_country.fit_transform(df['Country'])
df['TimeOfDay_Encoded'] = le_timeOfDay.fit_transform(df['TimeOfDay'])

# Columns withheld from the feature matrix:
#   * identifiers and the raw columns that were replaced by encoded versions,
#   * 'UnitPrice' -- the target itself,
#   * 'Revenue'   -- equals Quantity * UnitPrice, so together with Quantity it
#                    lets a model reconstruct the target exactly.
# NOTE(review): 'AvgProductPrice' and 'TotalProductRevenue' are also derived from
# UnitPrice over all rows (test rows included); consider computing them from the
# training rows only.
cols_to_drop = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Country','InvoiceDate', 'TimeOfDay', 'UnitPrice', 'Revenue']
x = df.drop(columns=cols_to_drop)
# A bare 'index' column is a leftover of an earlier reset_index(), not a real
# feature; remove it when present.
x = x.drop(columns=['index'], errors='ignore')
y = df['UnitPrice']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=RANDOM_STATE)

In [None]:
## Model Evaluation using KFolds

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

def evaluateModel(model, xtrain, xtest, ytrain, ytest):
    """Fit `model` on the training split and return its score on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``; it is fitted in place.
    xtrain, ytrain : features and target passed to ``fit``.
    xtest, ytest : features and target passed to ``score``.

    Returns
    -------
    Whatever ``model.score(xtest, ytest)`` returns.
    """
    model.fit(xtrain, ytrain)
    holdout_score = model.score(xtest, ytest)
    return holdout_score
  
# Shuffled, seeded folds. The rows of x follow df, which is sorted by
# InvoiceDate, so unshuffled KFold would cut the data into five consecutive time
# blocks and score every model on a period it never saw. Shuffling makes the
# folds comparable to the random hold-out split used for the final model.
# NOTE(review): if forward-in-time validation is what is wanted instead, use
# sklearn.model_selection.TimeSeriesSplit here AND for the hold-out split.
CV_SEED = 42
kf = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

# One zero-argument factory per candidate, so each fold trains a fresh estimator
# with the same seed. verbose=0 silences CatBoost's per-iteration log.
candidates = {
    'rf': lambda: RandomForestRegressor(random_state=CV_SEED),
    'xgb': lambda: XGBRegressor(random_state=CV_SEED),
    'lgm': lambda: LGBMRegressor(random_state=CV_SEED),
    'cb': lambda: CatBoostRegressor(random_seed=CV_SEED, verbose=0),
}
fold_scores = {key: [] for key in candidates}

for train_idx , test_idx in kf.split(x) :
    xtrain, xtest = x.iloc[train_idx], x.iloc[test_idx]
    ytrain, ytest = y.iloc[train_idx], y.iloc[test_idx]
    for key, make_model in candidates.items():
        # float() turns numpy scalars into plain floats so the lists print cleanly.
        fold_scores[key].append(float(evaluateModel(make_model(), xtrain, xtest, ytrain, ytest)))

# Per-model score lists under their original names.
rf_scores = fold_scores['rf']
xgb_scores = fold_scores['xgb']
lgm_scores = fold_scores['lgm']
cb_scores = fold_scores['cb']

print("\nRandom Forest : ", rf_scores )
print("\nXGBoost : ", xgb_scores )
print("\nLightgbm : ", lgm_scores )
print("\nCatboost : ", cb_scores )

# Scores recorded from an earlier run that used unshuffled folds and unseeded
# models; a re-run of this cell will print different values.
# Random Forest :  [0.7346024842370784, 0.5386385350957635, 0.9428367193665621, 0.8220178901377679, 0.7869319145447938]
# XGBoost :  [-19.84647015973839, 0.2709374971644768, -7.801290549891426e-05, 0.4110827006239163, -6.317344650602467]
# Lightgbm :  [-0.1323997735987168, 0.4576728316241595, 0.9237278425450924, 0.6281390681396444, 0.8734114246133489]
# Catboost :  [np.float64(-0.9620748188968691), np.float64(0.41323079092446624), np.float64(-0.46392853097161124), np.float64(0.524087803816853), np.float64(0.6390182647258191)]

In [None]:
## Finalizing Model
# Train the model that gets saved on the training part of the hold-out split.
# A fixed random_state makes the fitted forest, and therefore the saved
# artifact, reproducible from one run to the next.
finalModel = RandomForestRegressor(random_state=42)
finalModel.fit(x_train,y_train)

In [None]:
## Model Performance
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Score the fitted model on the held-out rows.
predictions = finalModel.predict(x_test)

# (label, value) pairs in print order; RMSE is the square root of the MSE.
holdout_metrics = [
    ('R2 Score:', r2_score(y_test, predictions)),
    ('MAE:', mean_absolute_error(y_test, predictions)),
    ('RMSE:', np.sqrt(mean_squared_error(y_test, predictions))),
]
for metric_label, metric_value in holdout_metrics:
    print(metric_label, metric_value)

# Values recorded from an earlier run:
# R2 Score: 0.9011171262367133
# MAE: 0.07493807452535428
# RMSE: 7.178960416817223

In [None]:
## Save Model

import joblib

# Serialize the fitted model into the working directory. Left as the cell's
# last expression so the return value of dump() is still displayed.
joblib.dump(value=finalModel, filename="DynamicPricingEngine.pkl")