In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import requests
from io import StringIO

In [12]:
# Download the Online Retail dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
response = requests.get(url)
if response.status_code == 200:
    print("Data downloaded successfully")
    with open('assets/Online Retail.xlsx', 'wb') as f:  
        f.write(response.content)
retail_data = pd.read_excel('assets/Online Retail.xlsx')
retail_data.to_csv('assets/Online Retail.csv', index=False)
df = pd.read_csv('assets/Online Retail.csv')
df.head()

Data downloaded successfully


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [19]:
# Data preprocessing
def preprocess_data(df):
    print("Cleaning the data...")
    # Remove rows with missing values and negative quantities or prices
    df = df.dropna()
    df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
    
    # Convert InvoiceDate to datetime
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    
    # Calculate total amount per transaction
    df['TotalAmount'] = df['Quantity'] * df['UnitPrice']
    
    print("Creating customer sequences...")
    sequence_length = 5
    
    # Group by customer first to reduce computation
    customer_groups = df.groupby('CustomerID')
    
    sequences = []
    targets = []
    
    for _, customer_data in customer_groups:
        purchases = customer_data.sort_values('InvoiceDate')['TotalAmount'].values
        if len(purchases) >= sequence_length + 1:
            for i in range(len(purchases) - sequence_length):
                sequences.append(purchases[i:i+sequence_length])
                targets.append(purchases[i+sequence_length])
    
    return np.array(sequences), np.array(targets)

In [20]:
# Prepare sequences and targets
X, y = preprocess_data(df)

# Scale the data
scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
X_scaled = np.array([scaler.fit_transform(seq.reshape(-1, 1)).ravel() for seq in X])
y_scaled = scaler.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=42
)

# Reshape for LSTM [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build the model
def build_clv_model(sequence_length):
    model = Sequential([
        LSTM(64, input_shape=(sequence_length, 1), return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    return model

# Create and compile model
model = build_clv_model(sequence_length=5)
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Cleaning the data...
Creating customer sequences...
Epoch 1/20


  super().__init__(**kwargs)


[1m7536/7536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 1.0848 - mae: 0.2733 - val_loss: 0.9745 - val_mae: 0.2826
Epoch 2/20
[1m7536/7536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 1.0189 - mae: 0.2697 - val_loss: 0.9739 - val_mae: 0.2647
Epoch 3/20
[1m7536/7536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 0.8785 - mae: 0.2667 - val_loss: 0.9737 - val_mae: 0.2896
Epoch 4/20
[1m7536/7536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 0.9456 - mae: 0.2712 - val_loss: 0.9733 - val_mae: 0.2665
Epoch 5/20
[1m7536/7536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 1.1243 - mae: 0.2778 - val_loss: 0.9738 - val_mae: 0.2637
Epoch 6/20
[1m7536/7536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 0.9823 - mae: 0.2688 - val_loss: 0.9740 - val_mae: 0.2836
Epoch 7/20
[1m7536/7536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2

In [21]:
# Function to predict future value
def predict_future_value(customer_history, model, scaler):
    # Scale the input
    scaled_history = scaler.transform(customer_history.reshape(1, -1))
    # Reshape for LSTM
    scaled_history = scaled_history.reshape(1, scaled_history.shape[1], 1)
    # Make prediction
    scaled_prediction = model.predict(scaled_history, verbose=0)
    prediction = scaler.inverse_transform(scaled_prediction.reshape(-1, 1))
    return prediction[0][0]

In [22]:
sample_sequence = X[0]
original_value = y[0]
predicted_value = predict_future_value(sample_sequence, model, scaler)
print(f"Actual next purchase: ${original_value:.2f}")
print(f"Predicted next purchase: ${predicted_value:.2f}")

ValueError: X has 5 features, but StandardScaler is expecting 1 features as input.