In [None]:
import pandas as pd
import pycaret.classification as clf
import pycaret.regression as reg


In [None]:

# Configuration
profit_margin = 0.15  # 15% profit on products

# Read the raw transaction log from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/transaction_data.csv')


In [None]:

# Parse the timestamp column so it can be compared against the cutoff below,
# and derive the monetary value of every row (quantity x unit cost).
df['TransactionTime'] = pd.to_datetime(df['TransactionTime'])
df['sales_value'] = df['NumberOfItemsPurchased'] * df['CostPerItem']

# The last `prediction_window_days` days of the data act as the "future"
# window whose spend we try to predict.
prediction_window_days = 90
latest_date = df['TransactionTime'].max()
prediction_cutoff = latest_date - pd.to_timedelta(prediction_window_days, unit="d")

# Temporal train/test split.
#   * temporal_in_df  - history strictly before the cutoff (feature source)
#   * temporal_out_df - everything at or after the cutoff (target source),
#     restricted to users that also appear in the history.
# The holdout side uses `>=` so that rows stamped exactly at the cutoff are
# not silently dropped from both partitions.
temporal_in_df = df[df['TransactionTime'] < prediction_cutoff]
temporal_out_df = df[(df['TransactionTime'] >= prediction_cutoff) &
                      (df['UserId'].isin(temporal_in_df['UserId']))]



In [None]:

# FEATURE ENGINEERING

# Target: total spend per user inside the 90-day holdout window.
# `as_index=False` keeps UserId as a regular column in the result.
targets_df = (
    temporal_out_df
    .groupby("UserId", as_index=False)["sales_value"]
    .sum()
    .rename(columns={"sales_value": "sales_90_value"})
)


In [None]:

# Every user present in targets_df purchased during the 90-day window, so
# they all receive a positive flag.
targets_df["sales_90_flag"] = 1

# Recency: whole days between each user's last purchase in the history window
# and the most recent purchase by anyone in that window.
# Computed with a vectorised groupby-max instead of a per-group Python lambda
# (`groupby.apply`), which is slower and raises on groups whose timestamps
# are all missing.
max_date = temporal_in_df["TransactionTime"].max()
recency_features_df = (
    (max_date - temporal_in_df.groupby("UserId")["TransactionTime"].max())
    .dt.days  # differences are never negative, so this equals int(days)
    .to_frame(name="recency")
)


In [None]:

# Frequency: number of history rows (non-null timestamps) per user.
# NOTE(review): this counts rows, not distinct transactions - confirm that is
# the intended definition of "purchases".
frequency_features_df = (
    temporal_in_df
    .groupby("UserId")["TransactionTime"]
    .count()
    .to_frame(name="frequency")
)

# Monetary: total and average row value per user over the history window.
monetary_features_df = (
    temporal_in_df
    .groupby("UserId")["sales_value"]
    .agg(["sum", "mean"])
    .rename(columns={"sum": "sales_value_sum", "mean": "sales_value_mean"})
)


In [None]:

# Distinct (user, timestamp) pairs in the 28 days leading up to the cutoff,
# counted per user.
cutoff_28d = prediction_cutoff - pd.Timedelta(days=28)
transactions_last_month_df = (
    temporal_in_df
    .loc[lambda frame: frame["TransactionTime"] > cutoff_28d,
         ["UserId", "TransactionTime"]]
    .drop_duplicates()
    .groupby("UserId")
    .size()
    .to_frame(name='transactions_last_month')
)


In [None]:

# Distinct (user, timestamp) pairs in the 14 days leading up to the cutoff,
# counted per user.
cutoff_14d = prediction_cutoff - pd.Timedelta(days=14)
transactions_last_2weeks_df = (
    temporal_in_df
    .loc[lambda frame: frame["TransactionTime"] > cutoff_14d,
         ["UserId", "TransactionTime"]]
    .drop_duplicates()
    .groupby("UserId")
    .size()
    .to_frame(name='transactions_last_2weeks')
)


In [None]:
# Spend in the Last 2 Weeks
# Sum every row's value per user for the 14 days before the cutoff.
# Rows are deliberately NOT de-duplicated here: two different items bought at
# the same timestamp for the same amount are indistinguishable on
# (UserId, TransactionTime, sales_value), so de-duplicating on those columns
# would silently drop real spend.
sales_last_2weeks_df = (
    temporal_in_df[["UserId", "TransactionTime", "sales_value"]]
    .query("TransactionTime > @cutoff_14d")
    .groupby("UserId", as_index=False)["sales_value"]  # UserId stays a column
    .sum()
    .rename(columns={"sales_value": "sales_value_last_2weeks"})
)


In [None]:

# Assemble the modelling table: start from recency (one row per history user)
# and left-join every other feature block, then the targets, on UserId.
features_df = recency_features_df
for _feature_block in (
    frequency_features_df,
    monetary_features_df,
    transactions_last_month_df,
    transactions_last_2weeks_df,
    sales_last_2weeks_df,
    targets_df,
):
    features_df = features_df.merge(_feature_block, on="UserId", how="left")

# Users missing from a block get NaN from the left join; treat those as zero.
features_df = features_df.fillna(0)


In [None]:

# Display the combined feature set
print("Combined Features:\n", features_df.head())

# PREDICTIVE MODELING

# Classification experiment: did the customer purchase in the next 90 days?
# 'sales_90_value' is measured over that same future window, so leaving it in
# the feature matrix leaks the answer to the model; 'UserId' is an identifier,
# not a behavioural signal. Both are removed before handing the data to
# PyCaret (errors='ignore' keeps this safe if a column is absent).
clf_setup = clf.setup(
    data=features_df.drop(columns=['UserId', 'sales_90_value'], errors='ignore'),
    target='sales_90_flag',
    session_id=123  # fixed seed for reproducible experiments
)



In [None]:


# Let PyCaret compare its candidate classifiers and keep the one it returns
best_clf_model = clf.compare_models()

# Finalize the selected classifier
final_clf_model = clf.finalize_model(estimator=best_clf_model)

# Show which classifier was produced
print("Final Classification Model:\n", final_clf_model)


In [None]:

# Regression experiment: how much will the customer spend in the next 90 days
# ('sales_90_value')?
# 'sales_90_flag' is derived from the same future window as the target, so it
# would leak the outcome into the features; 'UserId' is an identifier, not a
# behavioural signal. Both are removed before handing the data to PyCaret
# (errors='ignore' keeps this safe if a column is absent).
reg_setup = reg.setup(
    data=features_df.drop(columns=['UserId', 'sales_90_flag'], errors='ignore'),
    target='sales_90_value',
    session_id=123  # fixed seed for reproducible experiments
)


In [None]:

# Let PyCaret compare its candidate regressors and keep the one it returns
best_reg_model = reg.compare_models()

# Finalize the selected regressor
final_reg_model = reg.finalize_model(estimator=best_reg_model)

# Show which regressor was produced
print("Final Regression Model:\n", final_reg_model)

