<a href="https://colab.research.google.com/github/sandeep-006/Customer_lifetime_value_pred/blob/main/rfm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# 1. Load the dataset
df = pd.read_csv('validated_data.csv')
df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors='coerce')
df['customer_since'] = pd.to_datetime(df['customer_since'], errors='coerce')

# Save the segregated files
df.to_csv('validated_data.csv', index=False)

# --- SECTION 2: RFM CALCULATION ---
# We use a 'snapshot date' (1 day after the latest transaction) as the reference point
snapshot_date = df['transaction_date'].max() + pd.Timedelta(days=1)

# Group by customer to calculate Recency, Frequency, and Monetary values
rfm = df.groupby('customer_id').agg({
    'transaction_date': lambda x: (snapshot_date - x.max()).days, # Recency
    'transaction_id': 'count',                                   # Frequency
    'final_amount': 'sum'                                        # Monetary
}).rename(columns={
    'transaction_date': 'Recency',
    'transaction_id': 'Frequency',
    'final_amount': 'Monetary'
})

# Add categorical attributes (Loyalty and Avg Tenure) to the RFM table
customer_meta = df.groupby('customer_id').agg({
    'loyalty_status': 'first',
    'customer_tenure_days': 'mean'
})

rfm_table = rfm.join(customer_meta).reset_index()

# --- SECTION 3: NEXT MONTH EXPENDITURE ESTIMATION ---
# For a simple projection, we calculate the Average Monthly Spend per customer
# (Monetary / Total months they've been active)
rfm_table['avg_monthly_spend'] = rfm_table['Monetary'] / (rfm_table['customer_tenure_days'] / 30)

# We can use this 'avg_monthly_spend' as a baseline prediction for next month
rfm_table['predicted_next_month_spend'] = rfm_table['avg_monthly_spend'].round(2)

# Save the master RFM & Prediction table
rfm_table.to_csv('customer_rfm_summary.csv', index=False)

print(f"Successfully created 'validated_data.csv' and 'invalid_data.csv'")
print(f"RFM Summary with predictions created for {len(rfm_table)} unique customers.")
display(rfm_table.head())

Successfully created 'validated_data.csv' and 'invalid_data.csv'
RFM Summary with predictions created for 200 unique customers.


Unnamed: 0,customer_id,Recency,Frequency,Monetary,loyalty_status,customer_tenure_days,avg_monthly_spend,predicted_next_month_spend
0,C001,274,48,113608.48,Platinum,483.458333,7049.73762,7049.74
1,C002,203,41,67180.76,Silver,386.609756,5213.067617,5213.07
2,C003,258,50,127786.13,Silver,392.22,9774.06532,9774.07
3,C004,349,22,34541.55,Bronze,433.045455,2392.927784,2392.93
4,C005,287,23,62393.93,Gold,716.782609,2611.416456,2611.42


In [3]:
rfm_table

Unnamed: 0,customer_id,Recency,Frequency,Monetary,loyalty_status,customer_tenure_days,avg_monthly_spend,predicted_next_month_spend
0,C001,274,48,113608.48,Platinum,483.458333,7049.737620,7049.74
1,C002,203,41,67180.76,Silver,386.609756,5213.067617,5213.07
2,C003,258,50,127786.13,Silver,392.220000,9774.065320,9774.07
3,C004,349,22,34541.55,Bronze,433.045455,2392.927784,2392.93
4,C005,287,23,62393.93,Gold,716.782609,2611.416456,2611.42
...,...,...,...,...,...,...,...,...
195,C196,243,21,52615.56,Bronze,230.619048,6844.477142,6844.48
196,C197,107,12,28680.72,Gold,227.166667,3787.622597,3787.62
197,C198,395,11,31553.15,Platinum,634.454545,1491.981588,1491.98
198,C199,293,45,97224.64,Gold,710.644444,4104.357985,4104.36


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [6]:

# 1. Load the training data
# Ensure 'clv_training_data.csv' is in your Colab file explorer
df = pd.read_csv('customer_rfm_summary.csv')

# --- SECTION 1: FEATURE ENHANCEMENT ---
# We calculate 'Purchase Velocity' to show the model how often they buy per month
df['purchase_velocity'] = df['Frequency'] / (df['customer_tenure_days'] / 30)

# Encode 'loyalty_status' into numbers (0, 1, 2, 3)
le = LabelEncoder()
df['loyalty_encoded'] = le.fit_transform(df['loyalty_status'])


In [33]:

# Define our Features (X) and Target (y)
# Target is 'target_monthly_expenditure' which we normalized to 30 days
features = ['Recency', 'Frequency', 'Monetary', 'customer_tenure_days', 'purchase_velocity', 'loyalty_encoded']
X = df[features]
y = df['avg_monthly_spend']

# --- SECTION 2: TRAIN/TEST SPLIT ---
# We use 80% of data for training and 20% for testing the accuracy
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.2, random_state=42)

###RandomForest

In [34]:


# --- SECTION 3: DATA NORMALIZATION ---
# We fit the scaler ONLY on training data to avoid data leakage
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Save the scaler (you will need this for future predictions!)
joblib.dump(scaler, 'feature_scaler.joblib')

# --- SECTION 4: MODEL TRAINING ---
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# --- SECTION 5: PREDICTION & EVALUATION ---
y_pred = model.predict(X_test)

print(f"--- Normalized Model Metrics ---")
print(f"R-squared Score: {r2_score(y_test, y_pred):.2f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}")

# --- SECTION 6: SAMPLE PREDICTIONS WITH CUSTOMER ID ---
# We use X_test_raw.index to get back the original Customer IDs
results = pd.DataFrame({
    'Customer_ID': X_test_raw.index,
    'Actual Spend': y_test.values,
    'Predicted Spend': y_pred
}).set_index('Customer_ID')

print("\n--- Next 30-Day Predictions ---")
display(results.head(10))

# --- SECTION 7: SAVE EVERYTHING ---
joblib.dump(model, 'rf_normalized_model.joblib')
joblib.dump(le, 'loyalty_encoder.joblib')

--- Normalized Model Metrics ---
R-squared Score: 0.96
Mean Absolute Error: 269.52

--- Next 30-Day Predictions ---


Unnamed: 0_level_0,Actual Spend,Predicted Spend
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C096,3475.489011,3491.633739
C016,8110.344508,8732.182154
C031,3111.795782,4389.193367
C159,3445.044245,3001.523346
C129,5865.078585,5857.997572
C116,1067.386826,1243.513734
C070,2010.490573,2489.82589
C171,8007.906303,7514.247601
C175,1365.535344,1293.048692
C046,1305.275286,1727.878026


['loyalty_encoder.joblib']

###XGBoost

In [22]:
import xgboost as xg
from sklearn.metrics import mean_absolute_error, r2_score

# 1. Initialize the XGBoost Regressor
# 'n_estimators' is the number of boosting rounds
xgb_model = xg.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# 2. Fit the model
xgb_model.fit(X_train, y_train)
# 3. Predict and Evaluate
y_pred_xgb = xgb_model.predict(X_test)
print(f"XGBoost R2 Score: {r2_score(y_test, y_pred_xgb):.2f}")
print(f"XGBoost MAE: ${mean_absolute_error(y_test, y_pred_xgb):.2f}")

XGBoost R2 Score: 0.97
XGBoost MAE: $233.47


In [27]:
xg_y_pred = xgb_model.predict(X_test)

In [26]:

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# Linear models work better if features are on the same scale (0 to 1)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. Initialize Ridge
ridge_model = Ridge(alpha=1.0)

# 2. Fit and Predict
ridge_model.fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)

print(f"Ridge R2 Score: {r2_score(y_test, y_pred_ridge):.2f}")

Ridge R2 Score: 0.99


In [15]:
results

Unnamed: 0,Actual Spend,Predicted Spend
95,3475.489011,3491.633739
15,8110.344508,8732.182154
30,3111.795782,4389.193367
158,3445.044245,3001.523346
128,5865.078585,5865.697381
115,1067.386826,1232.840385
69,2010.490573,2489.82589
170,8007.906303,7514.247601
174,1365.535344,1293.048692
45,1305.275286,1727.878026


In [11]:
import joblib
from google.colab import files

# 1. Save the model to a file in the Colab environment
model_filename = 'rf_expenditure_model.joblib'
joblib.dump(model, model_filename)

# 2. Save the LabelEncoder (Crucial for predicting new data later)
joblib.dump(le, 'loyalty_encoder.joblib')

print(f"Model saved as {model_filename}")

# 3. Trigger a browser download to your local computer
files.download(model_filename)
files.download('loyalty_encoder.joblib')

Model saved as rf_expenditure_model.joblib


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
import joblib
from google.colab import files

# Even though it's a Random Forest, we can name the file .pth
model_filename = 'xgb_model.pth'

# Use joblib to save the scikit-learn model
joblib.dump(xgb_model, model_filename)

print(f"Successfully saved Scikit-Learn model as {model_filename}")

# Download it to your computer
files.download(model_filename)

Successfully saved Scikit-Learn model as xgb_model.pth


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:


# 1. Load the model and encoder
# Ensure these files are in your Colab file list
model = joblib.load('rf_expenditure_model.pth')
le = joblib.load('loyalty_encoder.joblib')

# 2. Define the input for ONE customer
# Example: A Gold member who joined 500 days ago, shopped 10 times (total $5000),
# and last shopped 5 days ago.
new_customer_data = {
    'Recency': 5,
    'Frequency': 10,
    'Monetary': 5000.0,
    'customer_tenure_days': 500,
    'loyalty_status': 'Gold'
}

# 3. Pre-process the input (Calculate Velocity and Encode Loyalty)
tenure_months = new_customer_data['customer_tenure_days'] / 30
velocity = new_customer_data['Frequency'] / tenure_months

# Transform into the format the model expects
input_df = pd.DataFrame([{
    'Recency': new_customer_data['Recency'],
    'Frequency': new_customer_data['Frequency'],
    'Monetary': new_customer_data['Monetary'],
    'customer_tenure_days': new_customer_data['customer_tenure_days'],
    'purchase_velocity': velocity,
    'loyalty_encoded': le.transform([new_customer_data['loyalty_status']])[0]
}])

# 4. Get the Prediction
prediction = model.predict(input_df)

print(f"--- Prediction Result ---")
print(f"Customer ID: Manual Input")
print(f"Expected Spend in Next 30 Days: {prediction[0]:.2f}")

--- Prediction Result ---
Customer ID: Manual Input
Expected Spend in Next 30 Days: 1262.14
