In [1]:
import pandas as pd
import numpy as np

# --- 1. Load Data ---

In [2]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/clv project rebuild/data/cleaned.csv", parse_dates=['InvoiceDate'])

In [5]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
397919,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
397920,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
397921,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
397922,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [3]:
# Split Date: We use 3 months of data for the target (Oct, Nov, Dec 2011)
SPLIT_DATE = df['InvoiceDate'].max() - pd.DateOffset(months=3)

# --- 2. Create Train (Features) & Test (Target) Split ---

In [4]:
# Everything BEFORE split_date is what we know (Features)
df_features = df[df['InvoiceDate'] < SPLIT_DATE]

# Everything AFTER split_date is what we want to predict (Target)
df_target = df[df['InvoiceDate'] >= SPLIT_DATE]

# Creating Total Spend in df_features
df_features['TotalSpend'] = df_features['UnitPrice']* df_features['Quantity']
print(f"Feature Set Rows: {len(df_features)}")
print(f"Target Set Rows: {len(df_target)}")

Feature Set Rows: 235703
Target Set Rows: 162221


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features['TotalSpend'] = df_features['UnitPrice']* df_features['Quantity']


# --- 3. Feature Engineering (X) ---

In [7]:
# We calculate features on the 'past' data
snapshot_date = df_features['InvoiceDate'].max() + pd.DateOffset(1)

In [9]:
# Features
features = df_features.groupby('CustomerID').agg({
    'Quantity' : 'sum', # Total qty
    'UnitPrice' : 'mean', # Avg unit price
    'TotalSpend': 'sum' # Monetary
}).rename(columns = {
    'Quantity': 'TotalQty',
    'UnitPrice': 'AvgUnitPrice',
    'TotalSpend': 'Monetary_Value'
})

In [10]:
features

Unnamed: 0_level_0,TotalQty,AvgUnitPrice,Monetary_Value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,74215,1.040000,77183.60
12347.0,1590,2.797661,2790.86
12348.0,2124,4.864643,1487.24
12350.0,197,3.841176,334.40
12352.0,254,27.449474,1561.81
...,...,...,...
18280.0,45,4.765000,180.60
18281.0,54,5.622857,80.82
18282.0,75,5.552857,100.21
18283.0,742,1.685257,1120.67


# --- 4. Target Creation (y) ---

In [11]:
# Making the future Clv (Total Spend)
df_target['TotalSpend'] = df_target['UnitPrice']* df_target['Quantity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_target['TotalSpend'] = df_target['UnitPrice']* df_target['Quantity']


In [12]:
target = df_target.groupby("CustomerID")['TotalSpend'].sum().rename('CLV_target_3M')

In [13]:
## Now Combining the clv and features
final = features.merge(target, on = 'CustomerID', how = 'left')

## If the clv of CLV is 0 means they havent spent in last 3 months .
## We should keep them insted of removing which shows customer churn as well as clv

In [14]:
final['CLV_target_3M'] = final['CLV_target_3M'].fillna(0)

In [15]:
print(f"Final Dataset Shape: {final.shape}")
print(final.head())

Final Dataset Shape: (3365, 4)
            TotalQty  AvgUnitPrice  Monetary_Value  CLV_target_3M
CustomerID                                                       
12346.0        74215      1.040000        77183.60           0.00
12347.0         1590      2.797661         2790.86        1519.14
12348.0         2124      4.864643         1487.24         310.00
12350.0          197      3.841176          334.40           0.00
12352.0          254     27.449474         1561.81         944.23


In [None]:
# Saving the final data
final.to_csv('.../final_data.csv')

In [17]:
final.columns

Index(['TotalQty', 'AvgUnitPrice', 'Monetary_Value', 'CLV_target_3M'], dtype='object')

In [19]:
# Train Test Spilt

from sklearn.model_selection import train_test_split

X = final[['TotalQty', 'AvgUnitPrice', 'Monetary_Value']]
y = final['CLV_target_3M']
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 0.2 , random_state=42)

['/content/drive/MyDrive/Colab Notebooks/clv project rebuild/model_data/y_test.pkl']