In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import lifetimes as lf
from lifetimes.plotting import plot_probability_alive_matrix

# Load the raw transaction log. A forward-slash relative path resolves on
# Windows, macOS and Linux alike; the previous backslash-separated raw string
# only resolved on Windows.
df = pd.read_csv('data/transaction_data.csv')

# Show column dtypes and non-null counts of the raw data
df.info()

# Drop rows where UserId is -1. The explicit .copy() yields an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning
# or depend on pandas' view-vs-copy behaviour.
df = df[df['UserId'] != -1].copy()

# Parse the timestamps (once) and derive the monetary value of each row
df['TransactionTime'] = pd.to_datetime(df['TransactionTime'])
df['SalesValue'] = df['NumberOfItemsPurchased'] * df['CostPerItem']

# Sum 'SalesValue' per calendar month.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME'; switch once the minimum supported pandas version allows it.
df_resampled = df.resample('M', on='TransactionTime')['SalesValue'].sum().reset_index()

# A bare `.head()` in the middle of a cell is evaluated and thrown away, so
# print the preview explicitly.
print(df_resampled.head())

# Keep only the monthly totals whose timestamp lies between the two dates
df_filtered = df_resampled[
    (df_resampled['TransactionTime'] >= '2018-01-01')
    & (df_resampled['TransactionTime'] <= '2019-04-01')
]
print(df_filtered.head())

# Latest timestamp inside the filtered window; the summary cell further down
# passes it as observation_period_end.
last_date = df_filtered['TransactionTime'].max()
last_date

In [None]:

# Line chart of the monthly sales totals inside the filtered window
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    df_filtered['TransactionTime'],
    df_filtered['SalesValue'],
    label='Sales Value',
    color='blue',
)
ax.set_title('Sales Value from Jan 2018 to Apr 2019')
ax.set_xlabel('Transaction Time')
ax.set_ylabel('Sales Value')
ax.grid(True)
ax.legend()
# Slanted date labels are easier to read
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:

# Build the per-customer frequency / recency / T / monetary_value table.
# The input is the transaction-level frame `df`, not the monthly resample.
summary_options = {
    'customer_id_col': 'UserId',
    'datetime_col': 'TransactionTime',
    'monetary_value_col': 'SalesValue',
    'observation_period_end': last_date,
}
summary_df = lf.utils.summary_data_from_transaction_data(df, **summary_options)

# Preview the first rows of the summary table
print(summary_df.head())


In [None]:
# Keep only customers whose average repeat-purchase value is positive.
# .copy() detaches the result from the unfiltered frame so that columns added
# to summary_df later are written to an independent DataFrame instead of a
# slice (which would raise SettingWithCopyWarning).
# NOTE(review): lifetimes appears to report monetary_value == 0 for customers
# without repeat purchases, so this presumably removes every frequency == 0
# customer as well -- confirm that is intended before fitting BG/NBD on it.
summary_df = summary_df[summary_df['monetary_value'] > 0].copy()


In [None]:
from lifetimes import BetaGeoFitter

# BG/NBD model: predicts how many repeat purchases a customer will make in
# the future.
# The penalizer coefficient regularizes the fit to prevent overfitting; if the
# model does not converge, increasing it might help.
PENALIZER_COEF = 0.15
bgf = BetaGeoFitter(penalizer_coef=PENALIZER_COEF)

In [None]:
# Train the BG/NBD model on the per-customer summary columns:
#   frequency - number of repeat purchases made by the customer
#   recency   - time between the customer's first and most recent purchase
#   T         - how long the customer has been observed (first purchase to
#               the end of the observation period)
bgf.fit(
    frequency=summary_df['frequency'],
    recency=summary_df['recency'],
    T=summary_df['T'],
)

# Fitted parameter estimates
print(bgf.summary)


In [None]:
# Heatmap of P(alive) over the frequency / recency grid for the fitted model
plot_probability_alive_matrix(model=bgf)
plt.show()

In [None]:
# This plot shows the probability that a customer is still "alive" (i.e., has not churned) based on their recency and frequency using the BG/NBD model.
'''
Explanation:
X-axis (Customer’s Historical Frequency):

This represents how many repeat purchases the customer has made historically.
As frequency increases, it generally indicates a more engaged customer.
Y-axis (Customer’s Recency):

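This represents the customer’s age at their most recent purchase, i.e. the time between their first and their latest purchase.
Higher values mean the latest purchase happened close to the end of the observation period (a recent purchase), while lower values mean the customer has not purchased for a long time.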
Color Scale (Probability of Being Alive):

The color gradient represents the probability that a customer is still "alive" and will make repeat purchases in the future.
Yellow (1.0): High probability of the customer still being active.
Purple (0.0): Low probability of the customer still being active (i.e., more likely to have churned).
Insights:
Customers with high recency (their latest purchase was recent) and high frequency (frequent buyers) are very likely to still be "alive" and continue making purchases (yellow area in the bottom right).
Customers with high frequency but low recency (they bought often and then stopped a long time ago) are the least likely to still be alive (purple area in the top right).'''

In [None]:
import numpy as np

# Forecast horizons, in days (the original comments describe t in days).
HORIZON_90_DAYS = 90
# The "one year" column previously used t=360 (with a copy-pasted
# "next 90 days" comment); 365 days matches the column name.
HORIZON_ONE_YEAR_DAYS = 365


def _floored_expected_purchases(model, customers, horizon_days):
    """Return the expected purchases per customer over ``horizon_days``, floored.

    Parameters
    ----------
    model : fitted model exposing
        ``conditional_expected_number_of_purchases_up_to_time``.
    customers : DataFrame with 'frequency', 'recency' and 'T' columns.
    horizon_days : length of the forecast window passed to the model as ``t``.

    Returns
    -------
    The model's expectation rounded down to whole purchases and cast to int.
    """
    expected = model.conditional_expected_number_of_purchases_up_to_time(
        t=horizon_days,
        frequency=customers['frequency'],
        recency=customers['recency'],
        T=customers['T'],
    )
    return np.floor(expected).astype(int)


# .assign returns a new frame, so no column is written onto a filtered slice
# of the summary table (avoids SettingWithCopyWarning).
summary_df = summary_df.assign(
    predicted_purchases_90=_floored_expected_purchases(bgf, summary_df, HORIZON_90_DAYS),
    predicted_purchases_oneyear=_floored_expected_purchases(bgf, summary_df, HORIZON_ONE_YEAR_DAYS),
)

# Displaying the first few rows
print(summary_df[['frequency', 'recency', 'T', 'predicted_purchases_90','predicted_purchases_oneyear']].head())



In [None]:
# Export summary_df to a CSV file.
# The summary table built by lifetimes carries the customer id (UserId) in its
# index; writing with index=False dropped it, leaving rows that could not be
# tied back to a customer. Keep the index so the id is exported as a column.
summary_df.to_csv('summary_features.csv', index=True)
