In [7]:
import pandas as pd
import pycaret.classification as clf
import pycaret.regression as reg


In [8]:

# Constants
profit_margin = 0.15  # 15% Profit on Products

# Load data
df = pd.read_csv('data/transaction_data.csv')


In [9]:

# Ensure the 'TransactionTime' column is in datetime format
df['TransactionTime'] = pd.to_datetime(df['TransactionTime'])
df['sales_value'] = df['NumberOfItemsPurchased'] * df['CostPerItem']

# Set the prediction window for the next 90 days
prediction_window_days = 90
latest_date = df['TransactionTime'].max()
prediction_cutoff = latest_date - pd.to_timedelta(prediction_window_days, unit="d")

# Train-Test Split
temporal_in_df = df[df['TransactionTime'] < prediction_cutoff]
temporal_out_df = df[(df['TransactionTime'] > prediction_cutoff) & 
                      (df['UserId'].isin(temporal_in_df['UserId']))]



In [10]:

# FEATURE ENGINEERING

# Target Creation - For the next 90 days
targets_df = (
    temporal_out_df.groupby("UserId")["sales_value"]  # Select only the numeric column
    .sum()
    .rename("sales_90_value")
    .reset_index()  # Reset index to flatten the DataFrame
)


In [11]:

# Add a flag column to indicate these users made purchases in the 90-day period
targets_df["sales_90_flag"] = 1

# Recency Feature Creation - Time since last purchase date for each customer
max_date = temporal_in_df["TransactionTime"].max()
recency_features_df = temporal_in_df[["UserId", "TransactionTime"]] \
    .groupby("UserId") \
    .apply(lambda x: int((max_date - x["TransactionTime"].max()) / pd.to_timedelta(1, "day"))) \
    .to_frame(name="recency")


In [12]:

# Frequency Feature Creation - Count of purchases for each customer
frequency_features_df = temporal_in_df[["UserId", "TransactionTime"]] \
    .groupby("UserId") \
    .count() \
    .rename(columns={"TransactionTime": "frequency"})

# Monetary Feature Creation - Sum and Mean of sales for each customer
monetary_features_df = temporal_in_df[["UserId", "sales_value"]] \
    .groupby("UserId") \
    .agg(sales_value_sum=("sales_value", "sum"), 
         sales_value_mean=("sales_value", "mean"))


In [13]:

# Transactions in the Last Month (28 days)
cutoff_28d = prediction_cutoff - pd.to_timedelta(28, unit="d")
transactions_last_month_df = temporal_in_df[['UserId', 'TransactionTime']] \
    .drop_duplicates() \
    .query("TransactionTime > @cutoff_28d") \
    .groupby("UserId") \
    .size() \
    .to_frame(name='transactions_last_month')


In [14]:

# Transactions in the Last 2 Weeks (14 days)
cutoff_14d = prediction_cutoff - pd.to_timedelta(14, unit="d")
transactions_last_2weeks_df = temporal_in_df[["UserId", "TransactionTime"]] \
    .drop_duplicates() \
    .query("TransactionTime > @cutoff_14d") \
    .groupby("UserId") \
    .size() \
    .to_frame(name='transactions_last_2weeks')


In [16]:
# Spend in the Last 2 Weeks
sales_last_2weeks_df = (
    temporal_in_df[["UserId", "TransactionTime", "sales_value"]]
    .drop_duplicates()
    .query("TransactionTime > @cutoff_14d")
    .groupby("UserId", as_index=False)["sales_value"]  # Restrict to numeric column
    .sum()
    .rename(columns={"sales_value": "sales_value_last_2weeks"})
)


In [17]:

# Combine all features into a single DataFrame
features_df = recency_features_df \
    .merge(frequency_features_df, on="UserId", how="left") \
    .merge(monetary_features_df, on="UserId", how="left") \
    .merge(transactions_last_month_df, on="UserId", how="left") \
    .merge(transactions_last_2weeks_df, on="UserId", how="left") \
    .merge(sales_last_2weeks_df, on="UserId", how="left") \
    .merge(targets_df, on="UserId", how="left").fillna(0)


In [19]:

# Display the combined feature set
print("Combined Features:\n", features_df.head())

# PREDICTIVE MODELING

# Initialize the PyCaret classification setup for predicting sales in the next 90 days
clf_setup = clf.setup(
    data=features_df,
    target='sales_90_flag',
    session_id=123
)



Combined Features:
    UserId  recency  frequency  sales_value_sum  sales_value_mean  \
0      -1        0     266500      12065905.38         45.275442   
1  259266      325          4             0.00          0.000000   
2  259287        1        302         29856.36         98.862119   
3  259308       74         62         14963.04        241.339355   
4  259329       18        146         14571.84         99.807123   

   transactions_last_month  transactions_last_2weeks  sales_value_last_2weeks  \
0                      0.0                       0.0                      0.0   
1                      0.0                       0.0                      0.0   
2                      0.0                       0.0                      0.0   
3                      0.0                       0.0                      0.0   
4                      0.0                       0.0                      0.0   

   sales_90_value  sales_90_flag  
0       -66633.60            1.0  
1            0

Unnamed: 0,Description,Value
0,Session id,123
1,Target,sales_90_flag
2,Target type,Binary
3,Original data shape,"(4368, 10)"
4,Transformed data shape,"(4368, 10)"
5,Transformed train set shape,"(3057, 10)"
6,Transformed test set shape,"(1311, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


In [20]:


# Compare and select the best model
best_clf_model = clf.compare_models()

# Train the best model
final_clf_model = clf.finalize_model(best_clf_model)

# Display the final classification model
print("Final Classification Model:\n", final_clf_model)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9997,1.0,0.975,1.0,0.9857,0.9856,0.9865,0.09
gbc,Gradient Boosting Classifier,0.999,0.9743,0.935,1.0,0.9635,0.963,0.965,0.054
ada,Ada Boost Classifier,0.9987,0.9723,0.935,0.9833,0.9544,0.9537,0.9562,0.026
dt,Decision Tree Classifier,0.9984,0.9672,0.935,0.9633,0.9433,0.9425,0.9455,0.005
rf,Random Forest Classifier,0.998,0.9885,0.87,1.0,0.9242,0.9232,0.9287,0.032
et,Extra Trees Classifier,0.998,0.9866,0.87,1.0,0.9194,0.9185,0.9263,0.022
xgboost,Extreme Gradient Boosting,0.9977,0.9826,0.845,1.0,0.9052,0.9041,0.9127,0.011
nb,Naive Bayes,0.9951,0.9979,0.935,0.8019,0.8553,0.8529,0.8596,0.005
lr,Logistic Regression,0.9928,0.8896,0.63,0.855,0.7075,0.7041,0.7214,0.333
ridge,Ridge Classifier,0.9863,0.8846,0.105,0.2,0.1289,0.1279,0.1383,0.005


Final Classification Model:
 Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['UserId', 'recency', 'frequency',
                                             'sales_value_sum',
                                             'sales_value_mean',
                                             'transactions_last_month',
                                             'transactions_last_2weeks',
                                             'sales_value_last_2weeks',
                                             'sales_90_value'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_e...
                 LGBMClassifier(boosting_type='g

In [22]:

# For regression modeling on 'sales_90_value' (predicted sales amount in 90 days)
reg_setup = reg.setup(
    data=features_df,
    target='sales_90_value',
    
    session_id=123
)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,sales_90_value
2,Target type,Regression
3,Original data shape,"(4368, 10)"
4,Transformed data shape,"(4368, 10)"
5,Transformed train set shape,"(3057, 10)"
6,Transformed test set shape,"(1311, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


In [23]:

# Compare and select the best regression model
best_reg_model = reg.compare_models()

# Train the best regression model
final_reg_model = reg.finalize_model(best_reg_model)

# Display the final regression model
print("Final Regression Model:\n", final_reg_model)



Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ada,AdaBoost Regressor,123.0928,14978104.1457,1865.2902,0.2376,0.5701,1.2604,0.014
dummy,Dummy Regressor,246.9485,16990197.0344,2142.8346,-0.0193,4.5849,0.9625,0.005
huber,Huber Regressor,149.7719,17017959.1917,2163.9208,-0.1224,0.9417,1.0001,0.005
lightgbm,Light Gradient Boosting Machine,237.014,16910744.9904,2315.2192,-0.892,2.2472,1.8681,0.101
et,Extra Trees Regressor,210.394,29956548.6849,3265.7431,-4.8701,0.2135,1.9243,0.019
knn,K Neighbors Regressor,240.3367,20499132.3922,2828.0513,-8.6598,1.7813,1.0149,0.006
xgboost,Extreme Gradient Boosting,189.9135,31075487.7637,3121.7407,-14.0416,0.3356,1.7776,0.013
gbr,Gradient Boosting Regressor,279.4975,41356302.3891,4343.0984,-29.1974,1.0565,1.7389,0.047
rf,Random Forest Regressor,223.1085,27041175.1762,3499.5265,-31.5742,0.3824,1.5329,0.03
omp,Orthogonal Matching Pursuit,309.8856,27290424.0306,3112.3894,-41.9608,4.6275,0.9682,0.005


Final Regression Model:
 Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['UserId', 'recency', 'frequency',
                                             'sales_value_sum',
                                             'sales_value_mean',
                                             'transactions_last_month',
                                             'transactions_last_2weeks',
                                             'sales_value_last_2weeks',
                                             'sales_90_flag'],
                                    transformer=SimpleImputer(add_indicator=False,
                                                              copy=True,
                                                              fill_value=None,
                                                              keep_em...
                                                    