In [1]:
import pandas as pd

### 1. Linear Regression

In [36]:
# Load the merchant-level RFM table from the working directory. Later cells
# read its quartile columns and `overall_northstar_per_merchant`.
df = pd.read_csv(filepath_or_buffer='rfm.csv')

In [11]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,merchant_name,anonymized_merchant,overall_northstar_per_merchant,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,weighted_rfm
0,Rook Coffee,Merchant 1,0.587156,65,5268548,42125106.19,4,4,4,444,1.0
1,Hometown Coffee & Juice,Merchant 2,0.366252,65,219183,4250374.48,4,4,4,444,1.0
2,Mad Goat Coffee,Merchant 3,0.319755,65,122985,1051148.08,4,4,4,444,1.0
3,Lovebird,Merchant 4,0.283589,65,135275,5158357.41,4,4,4,444,1.0
4,Convoy Commune,Merchant 5,0.2774,65,114831,1214620.55,4,4,4,444,1.0


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [19]:
# Features are the three RFM quartile columns; the target is the
# per-merchant north-star metric.
feature_columns = ['r_quartile', 'f_quartile', 'm_quartile']
target_column = 'overall_northstar_per_merchant'
X = df[feature_columns]
y = df[target_column]

In [20]:
# Learn the per-column scaling statistics from X, then apply them to X,
# so the fitted coefficients are on a common scale.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [21]:
# Ordinary least-squares fit of the north-star metric on the standardized
# quartile features.
model = LinearRegression()
model.fit(X=X_scaled, y=y)

In [22]:
# Label each fitted coefficient. `model.coef_` follows the column order of X
# (r_quartile, f_quartile, m_quartile).
weights = model.coef_
weight_labels = ('recency_weight', 'frequency_weight', 'monetary_weight')
weight_dict = dict(zip(weight_labels, weights))

In [23]:
# Print the weights: the linear-regression coefficients keyed by
# recency / frequency / monetary, as assembled in `weight_dict`.
print(weight_dict)

{'recency_weight': 0.017671057817652238, 'frequency_weight': 0.013075004745622176, 'monetary_weight': 0.05866709472581902}


In [24]:
# Refit the regression on the raw (unstandardized) quartile columns.
# NOTE: this rebinds `weights` to the raw-scale coefficients, but the
# previously built `weight_dict` is not updated and still holds the
# coefficients from the standardized fit.
X = df.loc[:, ['r_quartile', 'f_quartile', 'm_quartile']]
y = df['overall_northstar_per_merchant']
reg = LinearRegression()
reg.fit(X, y)
weights = reg.coef_

In [25]:
# Ratio of the monetary weight to the largest existing `weighted_rfm` value.
# NOTE(review): at this point in the file `weighted_rfm` is still the column
# that came with the CSV; the next cell overwrites it, so re-running this cell
# afterwards yields a different number.
monetary_weight = weight_dict['monetary_weight']
monetary_weight / df['weighted_rfm'].max()

0.05866709472581905

In [26]:
# Weighted RFM score: each quartile column multiplied by its regression
# weight, then summed. This overwrites any existing `weighted_rfm` column.
# NOTE(review): the weights in `weight_dict` were fit on standardized
# features (X_scaled) but are applied to the raw quartile values here --
# confirm this is intended.
recency_term = weight_dict['recency_weight'] * df['r_quartile']
frequency_term = weight_dict['frequency_weight'] * df['f_quartile']
monetary_term = weight_dict['monetary_weight'] * df['m_quartile']
df['weighted_rfm'] = recency_term + frequency_term + monetary_term

In [27]:
# Divide every weighted score by the column maximum so the top score maps to 1.
max_weighted_rfm = df['weighted_rfm'].max()
df['normalized_weighted_rfm'] = df['weighted_rfm'] / max_weighted_rfm

In [28]:
# Display the frame, which now carries the regression-weighted `weighted_rfm`
# and `normalized_weighted_rfm` columns assigned above.
df

Unnamed: 0,merchant_name,anonymized_merchant,overall_northstar_per_merchant,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,weighted_rfm,normalized_weighted_rfm
0,Rook Coffee,Merchant 1,0.587156,65,5260282,42078793.38,4,4,4,444,0.357653,1.0
1,Hometown Coffee & Juice,Merchant 2,0.366252,65,218800,4241514.36,4,4,4,444,0.357653,1.0
2,Mad Goat Coffee,Merchant 3,0.319755,65,122874,1050563.99,4,4,4,444,0.357653,1.0
3,Lovebird,Merchant 4,0.283589,65,135149,5155160.42,4,4,4,444,0.357653,1.0
4,Convoy Commune,Merchant 5,0.2774,65,114743,1212714.79,4,4,4,444,0.357653,1.0
5,Purple Bowl,Merchant 6,0.238708,65,113847,1869957.36,4,4,4,444,0.357653,1.0
6,Zestia,Merchant 7,0.211743,65,76582,2078071.55,4,4,4,444,0.357653,1.0
7,Stone Tower Brews,Merchant 8,0.180937,65,103110,1627724.71,4,4,4,444,0.357653,1.0
8,Crestline Bagel,Merchant 9,0.179305,65,116274,2001317.19,4,4,4,444,0.357653,1.0
9,Juice n Bowls,Merchant 10,0.17839,65,66078,962582.83,3,3,4,334,0.326907,0.914034


### 2. Random Forest

In [37]:
from sklearn.ensemble import RandomForestRegressor

# Features: the three RFM quartile columns. Target: the per-merchant
# north-star metric.
rfm_columns = ['r_quartile', 'f_quartile', 'm_quartile']
X = df[rfm_columns]
y = df['overall_northstar_per_merchant']

# Fit a 100-tree forest; random_state is pinned so repeated runs build the
# same trees.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Unpack the per-feature importances (same order as the columns of X) and
# label them.
importances = model.feature_importances_
recency_imp, frequency_imp, monetary_imp = importances
importance_dict = {
    'recency_importance': recency_imp,
    'frequency_importance': frequency_imp,
    'monetary_importance': monetary_imp,
}

# Show the labelled importances.
print(importance_dict)

{'recency_importance': 0.04407500654689107, 'frequency_importance': 0.0636258835605861, 'monetary_importance': 0.8922991098925228}


In [38]:
# Weighted RFM score using the feature importances as weights: scale each
# quartile column by its importance and add the three contributions.
# Overwrites the existing `weighted_rfm` column.
recency_part = importance_dict['recency_importance'] * df['r_quartile']
frequency_part = importance_dict['frequency_importance'] * df['f_quartile']
monetary_part = importance_dict['monetary_importance'] * df['m_quartile']
df['weighted_rfm'] = recency_part + frequency_part + monetary_part

In [39]:
# Express each weighted score as a fraction of the largest weighted score.
df['normalized_weighted_rfm'] = df['weighted_rfm'].div(df['weighted_rfm'].max())

In [40]:
# Largest weighted score, i.e. the divisor used for normalization
# (4.0 in the recorded run).
max_weighted_rfm = df['weighted_rfm'].max()
max_weighted_rfm

4.0

In [41]:
# Display the frame with `weighted_rfm` and `normalized_weighted_rfm`
# recomputed from the Random Forest feature importances.
df

Unnamed: 0,merchant_name,anonymized_merchant,overall_northstar_per_merchant,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,weighted_rfm,normalized_weighted_rfm
0,Rook Coffee,Merchant 1,0.587156,65,5260282,42078793.38,4,4,4,444,4.0,1.0
1,Hometown Coffee & Juice,Merchant 2,0.366252,65,218800,4241514.36,4,4,4,444,4.0,1.0
2,Mad Goat Coffee,Merchant 3,0.319755,65,122874,1050563.99,4,4,4,444,4.0,1.0
3,Lovebird,Merchant 4,0.283589,65,135149,5155160.42,4,4,4,444,4.0,1.0
4,Convoy Commune,Merchant 5,0.2774,65,114743,1212714.79,4,4,4,444,4.0,1.0
5,Purple Bowl,Merchant 6,0.238708,65,113847,1869957.36,4,4,4,444,4.0,1.0
6,Zestia,Merchant 7,0.211743,65,76582,2078071.55,4,4,4,444,4.0,1.0
7,Stone Tower Brews,Merchant 8,0.180937,65,103110,1627724.71,4,4,4,444,4.0,1.0
8,Crestline Bagel,Merchant 9,0.179305,65,116274,2001317.19,4,4,4,444,4.0,1.0
9,Juice n Bowls,Merchant 10,0.17839,65,66078,962582.83,3,3,4,334,3.892299,0.973075


### 3. XGBoost

In [30]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [31]:
# Select the three RFM quartile columns as features and the per-merchant
# north-star metric as the target.
X = df.loc[:, ['r_quartile', 'f_quartile', 'm_quartile']]
y = df.loc[:, 'overall_northstar_per_merchant']

In [32]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to DMatrix for XGBoost. No feature names are passed, so the
# importances below are looked up under XGBoost's default names f0/f1/f2,
# which follow the column order of X (r_quartile, f_quartile, m_quartile).
data_dmatrix = xgb.DMatrix(data=X_scaled, label=y)

# Train the XGBoost model: squared-error regression, depth-3 trees,
# learning rate 0.1, 100 boosting rounds.
params = {"objective": "reg:squarederror", "max_depth": 3, "eta": 0.1}
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=100)

# Get the feature importances. A feature missing from the returned dict is
# treated as having importance 0 via `.get(..., 0)`.
importances = xg_reg.get_score(importance_type='weight')
importance_dict = {
    'recency_importance': importances.get('f0', 0),
    'frequency_importance': importances.get('f1', 0),
    'monetary_importance': importances.get('f2', 0)
}

# Normalize the importances to sum to 1. If the booster never split on any
# feature, the total is 0 and the division below would fail with an opaque
# ZeroDivisionError, so fail early with an explicit message instead.
total_importance = sum(importance_dict.values())
if total_importance == 0:
    raise ValueError(
        "XGBoost reported no importance for any RFM feature; "
        "cannot derive normalized importance weights."
    )
importance_dict = {k: v / total_importance for k, v in importance_dict.items()}

In [33]:
# Show the normalized importances that serve as weights below.
print(importance_dict)

# Weighted RFM score: importance-weighted sum of the three quartile columns.
# Overwrites the existing `weighted_rfm` column.
w_recency = importance_dict['recency_importance']
w_frequency = importance_dict['frequency_importance']
w_monetary = importance_dict['monetary_importance']
df['weighted_rfm'] = (
    w_recency * df['r_quartile']
    + w_frequency * df['f_quartile']
    + w_monetary * df['m_quartile']
)

{'recency_importance': 0.28287841191067, 'frequency_importance': 0.34987593052109184, 'monetary_importance': 0.36724565756823824}


In [34]:
# Scale the weighted scores relative to their maximum value.
weighted_scores = df['weighted_rfm']
df['normalized_weighted_rfm'] = weighted_scores / weighted_scores.max()

In [35]:
# Display the frame with `weighted_rfm` and `normalized_weighted_rfm`
# recomputed from the XGBoost feature importances.
df

Unnamed: 0,merchant_name,anonymized_merchant,overall_northstar_per_merchant,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,weighted_rfm,normalized_weighted_rfm
0,Rook Coffee,Merchant 1,0.587156,65,5260282,42078793.38,4,4,4,444,4.0,1.0
1,Hometown Coffee & Juice,Merchant 2,0.366252,65,218800,4241514.36,4,4,4,444,4.0,1.0
2,Mad Goat Coffee,Merchant 3,0.319755,65,122874,1050563.99,4,4,4,444,4.0,1.0
3,Lovebird,Merchant 4,0.283589,65,135149,5155160.42,4,4,4,444,4.0,1.0
4,Convoy Commune,Merchant 5,0.2774,65,114743,1212714.79,4,4,4,444,4.0,1.0
5,Purple Bowl,Merchant 6,0.238708,65,113847,1869957.36,4,4,4,444,4.0,1.0
6,Zestia,Merchant 7,0.211743,65,76582,2078071.55,4,4,4,444,4.0,1.0
7,Stone Tower Brews,Merchant 8,0.180937,65,103110,1627724.71,4,4,4,444,4.0,1.0
8,Crestline Bagel,Merchant 9,0.179305,65,116274,2001317.19,4,4,4,444,4.0,1.0
9,Juice n Bowls,Merchant 10,0.17839,65,66078,962582.83,3,3,4,334,3.367246,0.841811
