<a href="https://colab.research.google.com/github/sandeep-006/Customer_lifetime_value_pred/blob/main/rfm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# 1. Load the dataset
df = pd.read_csv('validated_data.csv')
df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors='coerce')
df['customer_since'] = pd.to_datetime(df['customer_since'], errors='coerce')

# Save the segregated files
df.to_csv('validated_data.csv', index=False)

# --- SECTION 2: RFM CALCULATION ---
# We use a 'snapshot date' (1 day after the latest transaction) as the reference point
snapshot_date = df['transaction_date'].max() + pd.Timedelta(days=1)

# Group by customer to calculate Recency, Frequency, and Monetary values
rfm = df.groupby('customer_id').agg({
    'transaction_date': lambda x: (snapshot_date - x.max()).days, # Recency
    'transaction_id': 'count',                                   # Frequency
    'final_amount': 'sum'                                        # Monetary
}).rename(columns={
    'transaction_date': 'Recency',
    'transaction_id': 'Frequency',
    'final_amount': 'Monetary'
})

# Add categorical attributes (Loyalty and Avg Tenure) to the RFM table
customer_meta = df.groupby('customer_id').agg({
    'loyalty_status': 'first',
    'customer_tenure_days': 'mean'
})

rfm_table = rfm.join(customer_meta).reset_index()

# --- SECTION 3: NEXT MONTH EXPENDITURE ESTIMATION ---
# For a simple projection, we calculate the Average Monthly Spend per customer
# (Monetary / Total months they've been active)
rfm_table['avg_monthly_spend'] = rfm_table['Monetary'] / (rfm_table['customer_tenure_days'] / 30)

# We can use this 'avg_monthly_spend' as a baseline prediction for next month
rfm_table['predicted_next_month_spend'] = rfm_table['avg_monthly_spend'].round(2)

# Save the master RFM & Prediction table
rfm_table.to_csv('customer_rfm_summary.csv', index=False)

print(f"Successfully created 'validated_data.csv' and 'invalid_data.csv'")
print(f"RFM Summary with predictions created for {len(rfm_table)} unique customers.")
display(rfm_table.head())

Successfully created 'validated_data.csv' and 'invalid_data.csv'
RFM Summary with predictions created for 200 unique customers.


Unnamed: 0,customer_id,Recency,Frequency,Monetary,loyalty_status,customer_tenure_days,avg_monthly_spend,predicted_next_month_spend
0,C001,274,48,113608.48,Platinum,483.458333,7049.73762,7049.74
1,C002,203,41,67180.76,Silver,386.609756,5213.067617,5213.07
2,C003,258,50,127786.13,Silver,392.22,9774.06532,9774.07
3,C004,349,22,34541.55,Bronze,433.045455,2392.927784,2392.93
4,C005,287,23,62393.93,Gold,716.782609,2611.416456,2611.42


In [3]:
rfm_table

Unnamed: 0,customer_id,Recency,Frequency,Monetary,loyalty_status,customer_tenure_days,avg_monthly_spend,predicted_next_month_spend
0,C001,274,48,113608.48,Platinum,483.458333,7049.737620,7049.74
1,C002,203,41,67180.76,Silver,386.609756,5213.067617,5213.07
2,C003,258,50,127786.13,Silver,392.220000,9774.065320,9774.07
3,C004,349,22,34541.55,Bronze,433.045455,2392.927784,2392.93
4,C005,287,23,62393.93,Gold,716.782609,2611.416456,2611.42
...,...,...,...,...,...,...,...,...
195,C196,243,21,52615.56,Bronze,230.619048,6844.477142,6844.48
196,C197,107,12,28680.72,Gold,227.166667,3787.622597,3787.62
197,C198,395,11,31553.15,Platinum,634.454545,1491.981588,1491.98
198,C199,293,45,97224.64,Gold,710.644444,4104.357985,4104.36


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [6]:

# 1. Load the training data
# Ensure 'clv_training_data.csv' is in your Colab file explorer
df = pd.read_csv('customer_rfm_summary.csv')

# --- SECTION 1: FEATURE ENHANCEMENT ---
# We calculate 'Purchase Velocity' to show the model how often they buy per month
df['purchase_velocity'] = df['Frequency'] / (df['customer_tenure_days'] / 30)

# Encode 'loyalty_status' into numbers (0, 1, 2, 3)
le = LabelEncoder()
df['loyalty_encoded'] = le.fit_transform(df['loyalty_status'])


In [9]:

# Define our Features (X) and Target (y)
# Target is 'target_monthly_expenditure' which we normalized to 30 days
features = ['Recency', 'Frequency', 'Monetary', 'customer_tenure_days', 'purchase_velocity', 'loyalty_encoded']
X = df[features]
y = df['avg_monthly_spend']


In [10]:

# --- SECTION 2: TRAIN/TEST SPLIT ---
# We use 80% of data for training and 20% for testing the accuracy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- SECTION 3: MODEL TRAINING ---
# Random Forest is robust against outliers and missing feature scaling
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# --- SECTION 4: PREDICTION & EVALUATION ---
y_pred = model.predict(X_test)

# Calculate Accuracy Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"--- Model Performance Metrics ---")
print(f"Mean Absolute Error: ${mae:.2f} (Average error in prediction)")
print(f"R-squared Score: {r2:.2f}")

# --- SECTION 5: FEATURE IMPORTANCE ---
importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
print("\n--- Key Drivers of Future Spend ---")
print(importances)

# --- SECTION 6: SAMPLE PREDICTION ---
# Let's see the first 5 customers in the test set
results = pd.DataFrame({'Actual Spend': y_test, 'Predicted Spend': y_pred})
print("\n--- Sample Predictions (Next 30 Days Spend) ---")
display(results.head())

--- Model Performance Metrics ---
Mean Absolute Error: $271.76 (Average error in prediction)
R-squared Score: 0.96

--- Key Drivers of Future Spend ---
purchase_velocity       0.884972
Monetary                0.064738
customer_tenure_days    0.022232
Recency                 0.014048
Frequency               0.010260
loyalty_encoded         0.003750
dtype: float64

--- Sample Predictions (Next 30 Days Spend) ---


Unnamed: 0,Actual Spend,Predicted Spend
95,3475.489011,3491.633739
15,8110.344508,8732.182154
30,3111.795782,4389.193367
158,3445.044245,3001.523346
128,5865.078585,5865.697381


In [11]:
import joblib
from google.colab import files

# 1. Save the model to a file in the Colab environment
model_filename = 'rf_expenditure_model.joblib'
joblib.dump(model, model_filename)

# 2. Save the LabelEncoder (Crucial for predicting new data later)
joblib.dump(le, 'loyalty_encoder.joblib')

print(f"Model saved as {model_filename}")

# 3. Trigger a browser download to your local computer
files.download(model_filename)
files.download('loyalty_encoder.joblib')

Model saved as rf_expenditure_model.joblib


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
import joblib
from google.colab import files

# Even though it's a Random Forest, we can name the file .pth
model_filename = 'rf_expenditure_model.pth'

# Use joblib to save the scikit-learn model
joblib.dump(model, model_filename)

print(f"Successfully saved Scikit-Learn model as {model_filename}")

# Download it to your computer
files.download(model_filename)import joblib
from google.colab import files

# Even though it's a Random Forest, we can name the file .pth
model_filename = 'rf_expenditure_model.pth'

# Use joblib to save the scikit-learn model
joblib.dump(model, model_filename)

print(f"Successfully saved Scikit-Learn model as {model_filename}")

# Download it to your computer
files.download(model_filename)

SyntaxError: invalid syntax (ipython-input-112530613.py, line 13)