In [1]:
!pip install pandas pyarrow fastparquet



In [2]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import os

In [3]:
folder = Path(r"C:/Users/Dasha/Documents/project ML/enriched_data") 
files = list(folder.glob("part-*.parquet"))

df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)


df = df.astype({
    "department": "str",
    "product_id": "int",
    "order_id": "int",
    "add_to_cart_order": "int",
    "reordered": "int",
    "department_id": "int",
    "aisle_id": "int", 
    "product_name": "str", 
    "avg_price": "int"    
})

df.head()

Unnamed: 0,department,product_id,order_id,add_to_cart_order,reordered,department_id,aisle_id,product_name,aisle,avg_price
0,pantry,30064,188723,2,0,13,17,Double Acting Baking Powder,baking ingredients,2
1,produce,38557,124150,9,0,4,24,Citrus Mandarins Organic,fresh fruits,1
2,produce,13176,539689,2,1,4,24,Bag of Organic Bananas,fresh fruits,1
3,babies,2611,544204,1,1,18,92,Gluten Free SpongeBob Spinach Littles,baby food formula,2
4,pantry,18441,4707,1,0,13,72,Organic Ketchup,condiments,2


In [4]:
base_path = r"C:/Users/Dasha/Documents/project ML/raw_data"

orders = pd.read_csv(os.path.join(base_path, "orders.csv"))
# order_products = pd.read_csv(os.path.join(BASE_PATH, "order_products__prior.csv"))
# order_products_train = pd.read_csv(os.path.join(BASE_PATH, "order_products__train.csv"))
# products = pd.read_csv(os.path.join(BASE_PATH, "products.csv"))
# aisles = pd.read_csv(os.path.join(BASE_PATH, "aisles.csv"))
# departments = pd.read_csv(os.path.join(BASE_PATH, "departments.csv"))

In [5]:
# calculating the metrics

df_joined = df.merge(orders, on = "order_id", how = "left")
df_joined.head()

rfm_dataset = df_joined.groupby('user_id').agg({
    'days_since_prior_order': 'last', 
    'order_id': 'count', 
    'avg_price': 'sum'
}).rename(columns = {
    'days_since_prior_order': 'recency', 
    'order_id': 'frequency', 
    'avg_price': 'monetary'
})

rfm_dataset.head()

Unnamed: 0_level_0,recency,frequency,monetary
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,19.0,7,13
2,13.0,14,24
3,7.0,10,17
4,,1,3
5,10.0,4,4


In [6]:
r = pd.qcut(rfm_dataset['recency'].rank(method='first'), 
            q=5, 
            labels=range(5, 0, -1))

f = pd.qcut(rfm_dataset['frequency'], 
            q=5, 
            labels=range(1, 6),  
            duplicates='drop')

m = pd.qcut(rfm_dataset['monetary'], 
            q=5, 
            labels=range(1, 6),  # Fixed function name
            duplicates='drop')

rfm = rfm_dataset.assign(R=r.values, F=f.values, M=m.values)

In [8]:
# temporary solution for NaN values
rfm[['R', 'F', 'M']].isna().sum()
rfm[['R', 'F', 'M']] = rfm[['R', 'F', 'M']].fillna(1).astype(int)


In [9]:
rfm['rfm_group'] = rfm[['R', 'F', 'M']].apply(lambda v:'-'.join(v.astype(str)), axis = 1)
rfm['rfm_score_total'] = rfm[['R', 'F', 'M']].sum(axis=1)

In [10]:
rfm.sort_values(by='rfm_score_total', ascending=False).head()


Unnamed: 0_level_0,recency,frequency,monetary,R,F,M,rfm_group,rfm_score_total
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
135347,1.0,34,53,5,5,5,5-5-5,15
25926,4.0,91,160,5,5,5,5-5-5,15
50,5.0,55,77,5,5,5,5-5-5,15
25937,4.0,27,49,5,5,5,5-5-5,15
55768,3.0,33,49,5,5,5,5-5-5,15
