In [58]:
import pandas as pd
import random
from datetime import datetime, timedelta
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from sklearn.metrics import precision_score, recall_score, f1_score



# List of top e-commerce brands in India
top_ecommerce_brands_india = [
    "Amazon", "Flipkart", "Myntra", "Snapdeal", "TataCliq", "Paytm Mall", "AJIO", "ShopClues",
    "BigBasket", "Grofers", "Reliance Fresh", "More Supermarket", "Spencer's", "Big Bazaar", "DMart", "Metro Cash and Carry",
    "PharmEasy", "Medlife", "1mg", "Netmeds", "Apollo Pharmacy", "MedPlus",
    "Nykaa", "Purplle", "Zivame", "Lenskart", "Beardo", "Mamaearth", "Wow Skin Science", "Bombay Shaving Company",
    "The Man Company", "SUGAR Cosmetics", "Kama Ayurveda", "Forest Essentials", "Colorbar", "Lakme", "VLCC",
    "Lotus Herbals", "Biotique", "Khadi Natural",
    "FirstCry", "BabyChakra", "Hopscotch",
    "Pepperfry", "Urban Ladder", "Home Centre", "Hometown", "Livspace",
    "Croma", "Reliance Digital", "Vijay Sales", "Poorvika Mobiles", "Sangeetha Mobiles", "Pai International",
    "Ezone", "Gadget 360", "Headphone Zone",
    "Zomato", "Swiggy", "Faasos", "FreshMenu", "Box8", "Behrouz Biryani", "Dunzo",
    "CureFit", "HealthifyMe", "1Wellness", "Healthkart",
    "Amazon", "Flipkart", "Snapdeal", "Crossword", "SapnaOnline", "Infibeam", "BookMyShow",
    "CaratLane", "Bluestone", "Voylla",
    "MakeMyTrip", "Yatra", "Cleartrip", "Goibibo", "RedBus", "IRCTC",
    "Heads Up For Tails", "Petsworld", "DogSpot",
    "UrbanClap", "Furlenco", "Rentomojo", "Craftsvilla", "FabIndia", "Limeroad", "Clovia", "Bewakoof"
]

# Number of transactions and unique customers
num_transactions = 50000
num_customers = 5000

# Generate customer IDs
customer_ids = [f"CUST_{i+1}" for i in range(num_customers)]

# Generate synthetic transaction data
data = []
for _ in range(num_transactions):
    customer_id = random.choice(customer_ids)
    brand = random.choice(top_ecommerce_brands_india)
    txn_date = datetime.now() - timedelta(days=random.randint(1, 365))  # Random date within the past year
    data.append([customer_id, brand, txn_date.strftime('%Y-%m-%d')])

# Convert to a DataFrame
df = pd.DataFrame(data, columns=['user_id', 'brand', 'transaction_date'])

# Display the DataFrame
print(df)


         user_id            brand transaction_date
0      CUST_3979           Swiggy       2023-10-31
1       CUST_678         Bewakoof       2024-05-14
2      CUST_2199  The Man Company       2023-09-17
3      CUST_2011         Colorbar       2024-06-04
4       CUST_879          CureFit       2023-10-03
...          ...              ...              ...
49995  CUST_1651          Goibibo       2023-07-08
49996  CUST_4412      Craftsvilla       2023-12-26
49997  CUST_4147        Mamaearth       2024-03-05
49998  CUST_1406         Flipkart       2024-01-23
49999  CUST_1505         Lenskart       2023-09-19

[50000 rows x 3 columns]


In [59]:
# Encode the brands
label_enc = LabelEncoder()
df['brand_encoded'] = label_enc.fit_transform(df['brand'])

# Prepare sequences for each customer
def prepare_sequences(df, n_steps):
    sequences = []
    for _, group in df.groupby('user_id'):
        brand_sequence = list(group['brand_encoded'])
        for i in range(1, len(brand_sequence)):
            if i + n_steps <= len(brand_sequence):
                sequences.append(brand_sequence[i:i+n_steps])
    return np.array(sequences)

n_steps = 6  # Number of time steps to consider
sequences = prepare_sequences(df, n_steps)

In [30]:
print(df)
print(sequences)

         user_id        brand transaction_date  brand_encoded
0      CUST_4790       Swiggy       2024-03-20             80
1       CUST_914        DMart       2023-12-16             24
2      CUST_3344        IRCTC       2024-05-22             45
3      CUST_4001   MakeMyTrip       2024-03-25             54
4      CUST_3828     Flipkart       2023-08-22             31
...          ...          ...              ...            ...
49995  CUST_3579  SapnaOnline       2024-02-17             76
49996  CUST_1291   BabyChakra       2024-05-04              5
49997  CUST_2628        Nykaa       2024-01-13             62
49998  CUST_1806      CureFit       2024-04-01             23
49999  CUST_1377    ShopClues       2024-04-26             77

[50000 rows x 4 columns]
[[38 82 13 43 48 37]
 [82 13 43 48 37 51]
 [13 43 48 37 51 12]
 ...
 [85 49 29  6 34 79]
 [49 29  6 34 79 42]
 [29  6 34 79 42 88]]


In [60]:

# Pad sequences to have the same length
max_length = max([len(seq) for seq in sequences])
sequences_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# The pad_sequences function is a utility function provided by Keras
# that is used to ensure that all sequences in a list have the same length
# Split sequences into input (X) and output (y)
X, y0 = sequences_padded[:, :-1], sequences_padded[:, -1]

# One-hot encode the output
y = to_categorical(y0, num_classes=len(label_enc.classes_))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(label_enc.classes_), output_dim=50, input_length=X.shape[1]))
model.add(LSTM(100))
model.add(Dense(len(label_enc.classes_), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [32]:
print(X)
print(y0)

[[38 82 13 43 48]
 [82 13 43 48 37]
 [13 43 48 37 51]
 ...
 [85 49 29  6 34]
 [49 29  6 34 79]
 [29  6 34 79 42]]
[37 51 12 ... 79 42 88]


In [33]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape  )

print(X_train)
print(X_test)
print(y_train)
print(y_test)

(16405, 5)
(4102, 5)
(16405, 92)
(4102, 92)
[[73 33 58 72 71]
 [79 19 71 20 53]
 [ 9 22  4 79  9]
 ...
 [62 80 69  2 82]
 [38 47 34 72 72]
 [11 25  3 70 42]]
[[69 45  8 10 44]
 [28 18 89 24 31]
 [49 54 56 17 58]
 ...
 [18  2  1 45 69]
 [38 57  7 83 78]
 [35 66  4 70  6]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [61]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d5685517430>

In [62]:
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_test_decoded = np.argmax(y_test, axis=1)



In [77]:
print(y_test_decoded)

[45 34 74 ... 13 69 45]


In [78]:
print(y_pred)

[74 43 46 ... 86  3 77]


In [75]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test_decoded, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[0 1 0 ... 0 0 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 0 1]
 ...
 [0 0 1 ... 0 1 0]
 [0 0 2 ... 0 0 0]
 [0 0 1 ... 0 1 2]]


In [76]:
print(conf_matrix.shape)

(92, 92)


In [84]:
precision = precision_score(y_test_decoded, y_pred, average='weighted')
recall = recall_score(y_test_decoded, y_pred, average='weighted')
f1 = f1_score(y_test_decoded, y_pred, average='weighted')

print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Precision: 0.010211932550116815, Recall: 0.01240272373540856, F1 Score: 0.008605248175606817


  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
# Predict the next n unique brands for each customer
def predict_next_n_unique_brands(model, sequence, n):
    predictions = []
    predicted_classes = set()  # Set to keep track of predicted classes
    while len(predictions) < n:
        prediction = model.predict(np.array(sequence).reshape(1, -1))[0]
        predicted_class = np.argmax(prediction)
        if predicted_class not in predicted_classes:
            predictions.append(label_enc.inverse_transform([predicted_class])[0])
            predicted_classes.add(predicted_class)
        sequence = np.append(sequence[1:], predicted_class)
    return predictions

# Example: Predict the next 5 unique brands for the first customer in the test set
sequence = X_test[0]
predicted_brands = predict_next_n_unique_brands(model, sequence, 5)
print("Predicted Brands:", predicted_brands)


Predicted Brands: ['SUGAR Cosmetics', 'Goibibo', 'Amazon', 'Yatra', 'Faasos']


In [71]:
# Create an empty list to store individual customer results
results = []

# Iterate over the first 10 unique customers
for user_id in df['user_id'].unique()[:10]:
    # Filter data for the current customer
    user_data = df[df['user_id'] == user_id]

    # Prepare sequences for the current customer
    sequences = prepare_sequences(user_data, n_steps)
    sequences_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Predict the next 5 brands
    if len(sequences_padded) > 0:
        first_sequence = sequences_padded[0]
        next_5_predicted_brands = predict_next_n_unique_brands(model, first_sequence, 5)
    else:
        next_5_predicted_brands = []

    # Append the results for the current customer to the list
    results.append({
        'user_id': user_id,
        'brands_interacted': user_data['brand'].tolist(),
        'next_5_predicted_brands': next_5_predicted_brands
    })

# Create a DataFrame from the list of results
results_df = pd.concat([pd.DataFrame([r]) for r in results], ignore_index=True)

# Display the results
print(results_df)


     user_id                                  brands_interacted  \
0  CUST_3979  [Swiggy, AJIO, Hometown, Spencer's, RedBus, Ka...   
1   CUST_678  [Bewakoof, Ezone, SapnaOnline, Behrouz Biryani...   
2  CUST_2199  [The Man Company, DogSpot, Reliance Digital, U...   
3  CUST_2011  [Colorbar, Crossword, Sangeetha Mobiles, Clear...   
4   CUST_879  [CureFit, Kama Ayurveda, FirstCry, Urban Ladde...   
5  CUST_3904  [Poorvika Mobiles, VLCC, Medlife, Hopscotch, P...   
6  CUST_2521  [Crossword, Spencer's, Snapdeal, More Supermar...   
7  CUST_2366         [CureFit, DogSpot, Medlife, Khadi Natural]   
8  CUST_3265  [Bewakoof, More Supermarket, UrbanClap, Apollo...   
9  CUST_1025  [Lakme, Reliance Digital, Dunzo, CaratLane, Bl...   

                             next_5_predicted_brands  
0       [Biotique, CureFit, Purplle, Zomato, Clovia]  
1    [Flipkart, Faasos, Zivame, Rentomojo, Hometown]  
2           [Amazon, Nykaa, Clovia, AJIO, CaratLane]  
3  [CaratLane, Lakme, Forest Essentials, T